In [1]:
# import: numpy, pandas, and plotting
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
# all the data read correctly
studentData = pd.read_csv('StudentData.csv',encoding = 'ISO-8859-1')
ratesData = pd.read_csv('pertusisRates2010_2015.csv',encoding = 'ISO-8859-1')
# observing student data we need to drop data on nMMR and nPolio since we are interested only in Pertussis outbreaks
# also, we need to drop data on 2015 year as well as years 2000-2009 since we are only interested in 2010-2014 years
# lastly, we need to sort the data's rows according to years and county from lowest to highest
studentData = studentData.drop(['schoolType','SCHOOL','school_code','nMMR', 'nPolio', 'nPBE', 'nPME'], axis=1)
for i in range(2000,2010):
    studentData = studentData[studentData.year != i]
studentData = studentData[studentData.year != 2015]
# sort the student data frame according to county and year
studentData = studentData.sort_values(by=['COUNTY', 'year'])
# on rates data we will drop first row as it contains cases for entire California
ratesData = ratesData.iloc[1:]
# now we calculate the unvaccinated number of students for all counties
studentData['not_vaccinated'] = studentData['n'] - studentData['nDTP']
# next we create the new data frame that represents the student data for years 2010 to 2014
cols = ['county','students','vaccinated','not_vaccinated','year','cases']
df = pd.DataFrame( columns = cols)
yrs = [2010,2011,2012,2013,2014]
for x in yrs:
    new_df = studentData[studentData['year']==x]
    for i in sorted(set(new_df['COUNTY'].tolist())):
        new_df2 = new_df[new_df['COUNTY']==i]
        df = df.append({'county' : i , 
                        'students' : new_df2['n'].sum() ,
                        'vaccinated' : new_df2['nDTP'].sum() ,
                        'not_vaccinated' : new_df2['not_vaccinated'].sum() ,
                        'year' : x
                       } ,ignore_index=True)
# sort the data frame according to county and year
df = df.sort_values(by=['county', 'year'])
# to account for missing ALPINE county for 2014 year, we add values of zero for the year 2014
df = df.append({'county' : 'ALPINE' , 'students' : 0, 'vaccinated' : 0, 'not_vaccinated' : 0, 'year' : 2014, 'cases' : 0 } , ignore_index=True)
df = df.sort_values(by=['county', 'year'])
# now we calculate the vaccinated and unvaccinated data for the year 2014
vaccinated_2014 = []
not_vaccinated_2014 = []
for i in range(len(df['year'])):
    if df['year'].tolist()[i] == 2014:
        vaccinated_2014.append(df['vaccinated'].tolist()[i])
        not_vaccinated_2014.append(df['not_vaccinated'].tolist()[i])     
# now we drop data for 2014 as we do not need it from here
df = df[df.year != 2014]
# we drop the rates data of the years 2010 to 2014 and drop the cases, and store it to a new dataframe
rates_new = ratesData.drop(['Rate2010','Rate2011','Rate2012', 'Rate2013','Rate2014','Cases2014'], axis=1)
# get the transpose, used for the average calculation in the loop
conv = rates_new.T
l = []
for i in conv:
    for w in range(1,len(conv[i].tolist())):
        l.append(conv[i].tolist()[w])
# set the dataframe's cases column to the list containing cases per year for each county    
df['cases'] = l
# define variables used in the averages calculations
a = 0
b = 0
c = 0
d = 0
e = 0
f = 0
g = 0
h = 0
u = 0
t = 0
y = 0
o = 0
res1 = 0
res2 = 0
res3 = 0
counties = []
avg_cases = []
avg_vacc = []
avg_notVacc = []
# loop through and calculate the averages over the 2010 to 2013 years
for i in range(0, len(df['cases']), 4):
    counties.append(df['county'].tolist()[i])
    # vaccinated data 
    a = df['vaccinated'].tolist()[i]
    b = df['vaccinated'].tolist()[i+1]
    c = df['vaccinated'].tolist()[i+2]
    d = df['vaccinated'].tolist()[i+3]
    # unvaccinated data
    e = df['not_vaccinated'].tolist()[i]
    f = df['not_vaccinated'].tolist()[i+1]
    g = df['not_vaccinated'].tolist()[i+2]
    h = df['not_vaccinated'].tolist()[i+3]
    # cases data
    u = df['cases'].tolist()[i]
    t = df['cases'].tolist()[i+1]
    y = df['cases'].tolist()[i+2]
    o = df['cases'].tolist()[i+3]
    # averages calculation
    res1 = round((a+b+c+d)/4)
    res2 = round((e+f+g+h)/4)
    res3 = round((u+t+y+o)/4)
    # append results to the lists
    avg_vacc.append(res1)  
    avg_notVacc.append(res2)
    avg_cases.append(res3)
# this represents the final dataframe
cols = ['not_vaccinated_avg','cases_avg']
final_df = pd.DataFrame( columns = cols)
final_df['not_vaccinated_avg'] = avg_notVacc
final_df['cases_avg'] = avg_cases
# classification ranges:
# H: 140-2000 cases which is 3
# M: 10 - 139 cases which is 2
# L: 0-9 cases which is 1
update = []
# update the final dataframe cases_avg from numerical to categorical
for i in range(len(final_df['cases_avg'].tolist())):
    if final_df['cases_avg'].tolist()[i] >= 0 and final_df['cases_avg'].tolist()[i] <= 9:
        update.append(1)
    elif final_df['cases_avg'].tolist()[i] >= 10 and final_df['cases_avg'].tolist()[i] <= 139:
         update.append(2)
    else: update.append(3)
# set the final data frame cases_avg to the updated categorical representation
final_df['cases_avg'] = update
# for the 2014 data used in testing later, create dataframe containing data of not vaccinated for year 2014 only
cols = ['not_vaccinated']
data_2014 = pd.DataFrame( columns = cols)
data_2014['not_vaccinated'] = not_vaccinated_2014
# now we represent the x and y axis for input and output for the model
# y-axis represents cases_avg
Y = final_df['cases_avg']
# x-axis represents not_vaccinated_avg
X = final_df.drop(['cases_avg'], axis=1)
# use a 70/30 split for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.70, random_state=0)
# fit the model
regressor = DecisionTreeClassifier().fit(X_train, y_train)
# get the predicted results
results = regressor.predict(data_2014)
predicted = dict((el,0) for el in counties)
# map values to counties' names
index = 0
for i in predicted:
    predicted[i] = results[index]
    index += 1
# same update for the rates cases 2014 data
# H: 140-2000 cases which is 3
# M: 10 - 139 cases which is 2
# L: 0-9 cases which is 1
update2 = []
for i in range(len(ratesData['Cases2014'].tolist())):
    if ratesData['Cases2014'].tolist()[i] >= 0 and ratesData['Cases2014'].tolist()[i] <= 9:
        update2.append(1)
    elif ratesData['Cases2014'].tolist()[i] >= 10 and ratesData['Cases2014'].tolist()[i] <= 139:
         update2.append(2)
    else: update2.append(3)
# gather the actual values for the accuracy calculation
actual = dict((el,0) for el in counties)
index = 0
for i in actual:
    actual[i] = update2[index]
    index += 1
# print the accuracy score
score = accuracy_score(list(predicted.values()), list(actual.values())) * 100
print("Accuracy:",score,"%")



Accuracy: 67.24137931034483 %
