In [1]:
from configparser import ConfigParser
import psycopg2
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import recall_score, precision_score
#Can be very helpful to notice any imbalance in classes
from collections import Counter 
from sklearn.tree import export_text

  """)


In [7]:
def config(filename='database.ini', section='postgresql'):
    parser = ConfigParser()
    # read config file
    parser.read(filename) 
 
    # get section, default to postgresql
    db = {}
    if parser.has_section(section):
        params = parser.items(section)
        for param in params:
            db[param[0]] = param[1]
    else:
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))
 
    return db

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
#Get the configuration file as a python dictionary
cfg = config()

In [10]:
#Establish the connection and create a cursor to the database
try:
    print("Attempting to connect to the database")
    conn = psycopg2.connect(**cfg)
    cursor = conn.cursor()
    print("Success")
    
except (Exception, psycopg2.DatabaseError) as error:
    print(error)

Attempting to connect to the database
Success


In [11]:
try:
    #Lets get our data 
    cursor.execute("SELECT patient.gender, patient.age_group, patient.acquisition_info, patient.outbreak_related,\
                   onset.day_of_year, onset.weekend_indr, onset.is_holiday, onset.season,\
                   reported.day_of_year, reported.weekend_indr, reported.is_holiday, reported.season,\
                   test.day_of_year, test.weekend_indr, test.is_holiday, test.season,\
                   specimen.day_of_year, specimen.weekend_indr, specimen.is_holiday, specimen.season,\
                   weather.mean_temp_c, weather.total_rain_mm, weather.total_snow_cm,\
                   mobility.retail_and_recreation, mobility.grocery_and_pharmacy, mobility.parks, mobility.transit_stations, mobility.workplaces, mobility.residential,\
                   phu.city, fact.number_resolved, fact.number_fatal\
                   from fact_table as fact inner join\
                   patient_dimension as patient on fact.patient_surrogate = patient.patient_surrogate_key inner join onset_date_dimension as onset\
                   on fact.onset_date_surrogate = onset.date_surrogate inner join reported_date_dimension as reported on fact.reported_date_surrogate = reported.date_surrogate\
                   inner join test_date_dimension as test on fact.test_date_surrogate = test.date_surrogate inner join specimen_date_dimension as specimen on fact.specimen_date_surrogate = specimen.date_surrogate inner join\
                   weather_dimension as weather on fact.weather_surrogate = weather.weather_surrogate inner join\
                   mobility_dimension as mobility on fact.mobility_surrogate = mobility.mobility_surrogate_key\
                   inner join phu_location_dimension as phu on fact.phu_location_surrogate = phu.phu_surrogate_key")
    #Get the complete result set. It will be a list of tuples where each tuple is a row from the result set
    result_list = cursor.fetchall()
        
except (Exception, psycopg2.DatabaseError) as error:
    print(error)

In [12]:
print(len(result_list[23423]))
print(result_list[23423])

32
('MALE', '50s', 'CC', 'No', 244, 'f', 'f', 'summer', 245, 'f', 'f', 'summer', 245, 'f', 'f', 'summer', 244, 'f', 'f', 'summer', 23.4, 10.4, 0.0, -25, -6, 95, -50, -52, 14, 'Toronto', 1, 0)


In [13]:
#Ensure to run this cell at the end of all your experiments to close all connections
cursor.close()
conn.close()

In [14]:
#Now, 
result_df = pd.DataFrame(result_list, columns = ['Gender','Age_Group', 'Acquisition_Info', 'Outbreak_Related', 
                    'Onset_Day_Of_Year','Onset_Weekend','Onset_Holiday','Onset_season',
                    'Reported_Day_Of_Year','Reported_Weekend','Reported_Holiday','Reported_season',
                    'Test_Day_Of_Year','Test_Weekend','Test_Holiday','Test_season',
                    'Specimen_Day_Of_Year','Specimen_Weekend','Specimen_Holiday','Specimen_season',
                    'Mean_Temp_c', 'Total_Rain_mm', 'Total_Snow_cm', 
                    'Retail_and_Recreation', 'Grocery_and_Pharmacy', 'Parks','Transit_Stations','Workplaces','Residential',
                    'City','Number_Resolved','Number_Fatal'])

In [15]:
result_df.head(50000)

Unnamed: 0,Gender,Age_Group,Acquisition_Info,Outbreak_Related,Onset_Day_Of_Year,Onset_Weekend,Onset_Holiday,Onset_season,Reported_Day_Of_Year,Reported_Weekend,Reported_Holiday,Reported_season,Test_Day_Of_Year,Test_Weekend,Test_Holiday,Test_season,Specimen_Day_Of_Year,Specimen_Weekend,Specimen_Holiday,Specimen_season,Mean_Temp_c,Total_Rain_mm,Total_Snow_cm,Retail_and_Recreation,Grocery_and_Pharmacy,Parks,Transit_Stations,Workplaces,Residential,City,Number_Resolved,Number_Fatal
0,FEMALE,40s,CC,No,336,f,f,autumn,345,f,f,autumn,345.0,f,f,autumn,343.0,f,f,autumn,3.4,0.0,0.0,-15,8,39.0,-53.0,-42,18,Oakville,1,0
1,MALE,40s,MISSING INFORMATION,No,345,f,f,autumn,345,f,f,autumn,,,,,,,,,3.4,0.0,0.0,-46,-10,-13.0,-60.0,-48,20,Toronto,1,0
2,MALE,20s,CC,No,341,t,f,autumn,344,f,f,autumn,344.0,f,f,autumn,342.0,f,f,autumn,2.3,0.2,4.2,-37,-14,-29.0,-65.0,-42,18,Mississauga,4,0
3,MALE,20s,OB,Yes,344,f,f,autumn,345,f,f,autumn,345.0,f,f,autumn,344.0,f,f,autumn,3.4,0.0,0.0,-15,8,39.0,-53.0,-42,18,Oakville,1,0
4,FEMALE,20s,CC,No,325,f,f,autumn,345,f,f,autumn,345.0,f,f,autumn,343.0,f,f,autumn,0.1,0.0,0.0,-25,0,-29.0,-64.0,-49,19,Ottawa,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,MALE,<20,CC,Yes,342,f,f,autumn,345,f,f,autumn,345.0,f,f,autumn,342.0,f,f,autumn,3.4,0.0,0.0,-46,-10,-13.0,-60.0,-48,20,Toronto,2,0
49996,FEMALE,30s,NO KNOWN EPI LINK,No,328,f,f,autumn,331,f,f,autumn,331.0,f,f,autumn,329.0,f,f,autumn,9.3,1.2,0.0,-44,-10,-9.0,-60.0,-48,20,Toronto,1,0
49997,FEMALE,90+,NO KNOWN EPI LINK,No,304,f,f,autumn,304,f,f,autumn,304.0,f,f,autumn,304.0,f,f,autumn,-1.0,0.0,0.0,-25,-7,-11.0,-57.0,-38,15,Mississauga,1,0
49998,FEMALE,70s,CC,No,343,f,f,autumn,346,f,f,autumn,346.0,f,f,autumn,345.0,f,f,autumn,5.1,0.0,0.0,-34,-13,-3.0,-63.0,-41,18,Mississauga,1,0


In [16]:
dates = result_df[["Onset_Day_Of_Year","Reported_Day_Of_Year","Test_Day_Of_Year","Specimen_Day_Of_Year"]]

dateFeatures = {'Onset_To_Reported': [],
        'Reported_To_Test': [],
        'Test_To_Specimen':[]}


In [17]:
dates.head()

Unnamed: 0,Onset_Day_Of_Year,Reported_Day_Of_Year,Test_Day_Of_Year,Specimen_Day_Of_Year
0,336,345,345.0,343.0
1,345,345,,
2,341,344,344.0,342.0
3,344,345,345.0,344.0
4,325,345,345.0,343.0


In [18]:
for index, row in dates.iterrows():
  dateFeatures["Onset_To_Reported"].append(row[1] - row[0])
  dateFeatures.get("Reported_To_Test").append(row[2] - row[1])
  dateFeatures.get("Test_To_Specimen").append(row[3] - row[2])

In [19]:
del result_df['Onset_Day_Of_Year']
del result_df['Reported_Day_Of_Year']
del result_df['Test_Day_Of_Year']
del result_df['Specimen_Day_Of_Year']

In [20]:
result_df.insert(4, "Onset_To_Reported", dateFeatures["Onset_To_Reported"], True)
result_df.insert(5, "Reported_To_Test", dateFeatures["Reported_To_Test"], True)
result_df.insert(6, "Test_To_Specimen", dateFeatures["Test_To_Specimen"], True)

In [21]:
result_df.head(30000)

Unnamed: 0,Gender,Age_Group,Acquisition_Info,Outbreak_Related,Onset_To_Reported,Reported_To_Test,Test_To_Specimen,Onset_Weekend,Onset_Holiday,Onset_season,Reported_Weekend,Reported_Holiday,Reported_season,Test_Weekend,Test_Holiday,Test_season,Specimen_Weekend,Specimen_Holiday,Specimen_season,Mean_Temp_c,Total_Rain_mm,Total_Snow_cm,Retail_and_Recreation,Grocery_and_Pharmacy,Parks,Transit_Stations,Workplaces,Residential,City,Number_Resolved,Number_Fatal
0,FEMALE,40s,CC,No,9.0,0.0,-2.0,f,f,autumn,f,f,autumn,f,f,autumn,f,f,autumn,3.4,0.0,0.0,-15,8,39.0,-53.0,-42,18,Oakville,1,0
1,MALE,40s,MISSING INFORMATION,No,0.0,,,f,f,autumn,f,f,autumn,,,,,,,3.4,0.0,0.0,-46,-10,-13.0,-60.0,-48,20,Toronto,1,0
2,MALE,20s,CC,No,3.0,0.0,-2.0,t,f,autumn,f,f,autumn,f,f,autumn,f,f,autumn,2.3,0.2,4.2,-37,-14,-29.0,-65.0,-42,18,Mississauga,4,0
3,MALE,20s,OB,Yes,1.0,0.0,-1.0,f,f,autumn,f,f,autumn,f,f,autumn,f,f,autumn,3.4,0.0,0.0,-15,8,39.0,-53.0,-42,18,Oakville,1,0
4,FEMALE,20s,CC,No,20.0,0.0,-2.0,f,f,autumn,f,f,autumn,f,f,autumn,f,f,autumn,0.1,0.0,0.0,-25,0,-29.0,-64.0,-49,19,Ottawa,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,MALE,<20,CC,No,3.0,0.0,-3.0,f,f,autumn,f,f,autumn,f,f,autumn,f,f,autumn,10.9,0.8,0.0,-33,-3,14.0,-53.0,-46,17,Toronto,2,0
29996,FEMALE,30s,MISSING INFORMATION,No,3.0,0.0,-2.0,f,f,autumn,f,f,autumn,f,f,autumn,t,f,autumn,13.2,0.0,0.0,-36,-10,45.0,-55.0,-47,17,Toronto,3,0
29997,FEMALE,40s,NO KNOWN EPI LINK,No,5.0,0.0,-2.0,f,f,autumn,t,f,autumn,t,f,autumn,f,f,autumn,2.3,1.2,0.0,-45,-14,-5.0,-52.0,-20,9,Toronto,1,0
29998,FEMALE,30s,MISSING INFORMATION,No,4.0,0.0,-2.0,t,f,autumn,f,f,autumn,f,f,autumn,f,f,autumn,3.4,0.0,0.0,-46,-10,-13.0,-60.0,-48,20,Toronto,2,0


In [22]:
Counter(result_df['City'])

Counter({'Mississauga': 24265,
         'Newmarket': 12316,
         'Oakville': 4600,
         'Ottawa': 6380,
         'Toronto': 34671,
         'Whitby': 5165})

In [23]:
result_df.head()# this is our features

Unnamed: 0,Gender,Age_Group,Acquisition_Info,Outbreak_Related,Onset_To_Reported,Reported_To_Test,Test_To_Specimen,Onset_Weekend,Onset_Holiday,Onset_season,Reported_Weekend,Reported_Holiday,Reported_season,Test_Weekend,Test_Holiday,Test_season,Specimen_Weekend,Specimen_Holiday,Specimen_season,Mean_Temp_c,Total_Rain_mm,Total_Snow_cm,Retail_and_Recreation,Grocery_and_Pharmacy,Parks,Transit_Stations,Workplaces,Residential,City,Number_Resolved,Number_Fatal
0,FEMALE,40s,CC,No,9.0,0.0,-2.0,f,f,autumn,f,f,autumn,f,f,autumn,f,f,autumn,3.4,0.0,0.0,-15,8,39.0,-53.0,-42,18,Oakville,1,0
1,MALE,40s,MISSING INFORMATION,No,0.0,,,f,f,autumn,f,f,autumn,,,,,,,3.4,0.0,0.0,-46,-10,-13.0,-60.0,-48,20,Toronto,1,0
2,MALE,20s,CC,No,3.0,0.0,-2.0,t,f,autumn,f,f,autumn,f,f,autumn,f,f,autumn,2.3,0.2,4.2,-37,-14,-29.0,-65.0,-42,18,Mississauga,4,0
3,MALE,20s,OB,Yes,1.0,0.0,-1.0,f,f,autumn,f,f,autumn,f,f,autumn,f,f,autumn,3.4,0.0,0.0,-15,8,39.0,-53.0,-42,18,Oakville,1,0
4,FEMALE,20s,CC,No,20.0,0.0,-2.0,f,f,autumn,f,f,autumn,f,f,autumn,f,f,autumn,0.1,0.0,0.0,-25,0,-29.0,-64.0,-49,19,Ottawa,1,0


In [24]:
for index, row in result_df.iterrows():

  if row['Onset_Weekend'] == 'f':
    result_df.at[index,'Onset_Weekend'] = 0
  elif row['Onset_Weekend'] == 't':
    result_df.at[index,'Onset_Weekend'] = 1
  if row['Onset_Holiday'] == 'f':
    result_df.at[index,'Onset_Holiday'] = 0
  elif row['Onset_Holiday'] == 't':
    result_df.at[index,'Onset_Holiday'] = 1

  if row['Reported_Weekend'] == 'f':
    result_df.at[index,'Reported_Weekend'] = 0
  elif row['Reported_Weekend'] == 't':
    result_df.at[index,'Reported_Weekend'] = 1
  if row['Reported_Holiday'] == 'f':
    result_df.at[index,'Reported_Holiday'] = 0
  elif row['Reported_Holiday'] == 't':
    result_df.at[index,'Reported_Holiday'] = 1

  if row['Test_Weekend'] == 'f':
    result_df.at[index,'Test_Weekend'] = 0
  elif row['Test_Weekend'] == 't':
    result_df.at[index,'Test_Weekend'] = 1
  if row['Test_Holiday'] == 'f':
    result_df.at[index,'Test_Holiday'] = 0
  elif row['Test_Holiday'] == 't':
    result_df.at[index,'Test_Holiday'] = 1

  if row['Specimen_Weekend'] == 'f':
    result_df.at[index,'Specimen_Weekend'] = 0
  elif row['Specimen_Weekend'] == 't':
    result_df.at[index,'Specimen_Weekend'] = 1
  if row['Specimen_Holiday'] == 'f':
    result_df.at[index,'Specimen_Holiday'] = 0
  elif row['Specimen_Holiday'] == 't':
    result_df.at[index,'Specimen_Holiday'] = 1

  if row['City'] == 'Ottawa':
    result_df.at[index,'City'] = 0
  elif row['City'] == 'Toronto':
    result_df.at[index,'City'] = 1
  elif row['City'] == 'Mississauga':
    result_df.at[index,'City'] = 2
  elif row['City'] == 'Oakville':
    result_df.at[index,'City'] = 3
  elif row['City'] == 'Newmarket':
    result_df.at[index,'City'] = 4
  elif row['City'] == 'Whitby':
    result_df.at[index,'City'] = 5

Ottawa = 1
Toronto = 2
Mississauga = 3
Oakville = 4
Newmarket = 5
Whitby = 6

In [26]:
result_df.head()

Unnamed: 0,Gender,Age_Group,Acquisition_Info,Outbreak_Related,Onset_To_Reported,Reported_To_Test,Test_To_Specimen,Onset_Weekend,Onset_Holiday,Onset_season,Reported_Weekend,Reported_Holiday,Reported_season,Test_Weekend,Test_Holiday,Test_season,Specimen_Weekend,Specimen_Holiday,Specimen_season,Mean_Temp_c,Total_Rain_mm,Total_Snow_cm,Retail_and_Recreation,Grocery_and_Pharmacy,Parks,Transit_Stations,Workplaces,Residential,City,Number_Resolved,Number_Fatal
0,FEMALE,40s,CC,No,9.0,0.0,-2.0,0,0,autumn,0,0,autumn,0.0,0.0,autumn,0.0,0.0,autumn,3.4,0.0,0.0,-15,8,39.0,-53.0,-42,18,3,1,0
1,MALE,40s,MISSING INFORMATION,No,0.0,,,0,0,autumn,0,0,autumn,,,,,,,3.4,0.0,0.0,-46,-10,-13.0,-60.0,-48,20,1,1,0
2,MALE,20s,CC,No,3.0,0.0,-2.0,1,0,autumn,0,0,autumn,0.0,0.0,autumn,0.0,0.0,autumn,2.3,0.2,4.2,-37,-14,-29.0,-65.0,-42,18,2,4,0
3,MALE,20s,OB,Yes,1.0,0.0,-1.0,0,0,autumn,0,0,autumn,0.0,0.0,autumn,0.0,0.0,autumn,3.4,0.0,0.0,-15,8,39.0,-53.0,-42,18,3,1,0
4,FEMALE,20s,CC,No,20.0,0.0,-2.0,0,0,autumn,0,0,autumn,0.0,0.0,autumn,0.0,0.0,autumn,0.1,0.0,0.0,-25,0,-29.0,-64.0,-49,19,0,1,0


In [28]:
# Our features have categorical data. We need to convert them to one-hot encoders to get the best results.
new_result_df = result_df[['Gender','Age_Group', 'Acquisition_Info', 'Outbreak_Related', 
                    'Onset_season', 'Reported_season', 'Test_season', 'Specimen_season']]
new_result_df = pd.get_dummies(new_result_df)
new_result_df["Onset_Weekend"] = result_df["Onset_Weekend"]
new_result_df["Reported_Weekend"] = result_df["Reported_Weekend"]
new_result_df["Test_Weekend"] = result_df["Test_Weekend"]
new_result_df["Specimen_Weekend"] = result_df["Specimen_Weekend"]
new_result_df["Onset_Holiday"] = result_df["Onset_Holiday"]
new_result_df["Reported_Holiday"] = result_df["Reported_Holiday"]
new_result_df["Test_Holiday"] = result_df["Test_Holiday"]
new_result_df["Specimen_Holiday"] = result_df["Specimen_Holiday"]
new_result_df["Onset_To_Reported"] = result_df["Onset_To_Reported"]
new_result_df["Reported_To_Test"] = result_df["Reported_To_Test"]
new_result_df["Test_To_Specimen"] = result_df["Test_To_Specimen"]
new_result_df["Mean_Temp_c"] = result_df["Mean_Temp_c"]
new_result_df["Total_Rain_mm"] = result_df["Total_Rain_mm"]
new_result_df["Total_Snow_cm"] = result_df["Total_Snow_cm"]
new_result_df["Retail_and_Recreation"] = result_df["Retail_and_Recreation"]
new_result_df["Grocery_and_Pharmacy"] = result_df["Grocery_and_Pharmacy"]
new_result_df["Parks"] = result_df["Parks"]
new_result_df["Transit_Stations"] = result_df["Transit_Stations"]
new_result_df["Workplaces"] = result_df["Workplaces"]
new_result_df["Residential"] = result_df["Residential"]
new_result_df["Number_Resolved"] = result_df["Number_Resolved"]
new_result_df["Number_Fatal"] = result_df["Number_Fatal"]
new_result_df["City"] = result_df["City"]

new_result_df.head()

Unnamed: 0,Gender_FEMALE,Gender_GENDER DIVERSE,Gender_MALE,Gender_UNSPECIFIED,Age_Group_20s,Age_Group_30s,Age_Group_40s,Age_Group_50s,Age_Group_60s,Age_Group_70s,Age_Group_80s,Age_Group_90+,Age_Group_<20,Age_Group_UNKNOWN,Acquisition_Info_CC,Acquisition_Info_MISSING INFORMATION,Acquisition_Info_NO KNOWN EPI LINK,Acquisition_Info_OB,Acquisition_Info_TRAVEL,Acquisition_Info_UNSPECIFIED EPI LINK,Outbreak_Related_No,Outbreak_Related_Yes,Onset_season_autumn,Onset_season_spring,Onset_season_summer,Onset_season_winter,Reported_season_autumn,Reported_season_summer,Reported_season_winter,Test_season_autumn,Test_season_spring,Test_season_summer,Test_season_winter,Specimen_season_autumn,Specimen_season_spring,Specimen_season_summer,Specimen_season_winter,Onset_Weekend,Reported_Weekend,Test_Weekend,Specimen_Weekend,Onset_Holiday,Reported_Holiday,Test_Holiday,Specimen_Holiday,Onset_To_Reported,Reported_To_Test,Test_To_Specimen,Mean_Temp_c,Total_Rain_mm,Total_Snow_cm,Retail_and_Recreation,Grocery_and_Pharmacy,Parks,Transit_Stations,Workplaces,Residential,Number_Resolved,Number_Fatal,City
0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,9.0,0.0,-2.0,3.4,0.0,0.0,-15,8,39.0,-53.0,-42,18,1,0,3
1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,,,0,0,,,0.0,,,3.4,0.0,0.0,-46,-10,-13.0,-60.0,-48,20,1,0,1
2,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0.0,0.0,0,0,0.0,0.0,3.0,0.0,-2.0,2.3,0.2,4.2,-37,-14,-29.0,-65.0,-42,18,4,0,2
3,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,1.0,0.0,-1.0,3.4,0.0,0.0,-15,8,39.0,-53.0,-42,18,1,0,3
4,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,20.0,0.0,-2.0,0.1,0.0,0.0,-25,0,-29.0,-64.0,-49,19,1,0,0


In [31]:
new_result_df.isnull().sum().sum()

22304

In [32]:
new_result_df = new_result_df.dropna()
new_result_df=new_result_df.reset_index(drop=True)

In [34]:
new_result_df.isnull().sum().sum()

0

In [36]:
## preparation for svm
new_df2 = new_result_df

In [37]:
new_df2.columns

Index(['Gender_FEMALE', 'Gender_GENDER DIVERSE', 'Gender_MALE',
       'Gender_UNSPECIFIED', 'Age_Group_20s', 'Age_Group_30s', 'Age_Group_40s',
       'Age_Group_50s', 'Age_Group_60s', 'Age_Group_70s', 'Age_Group_80s',
       'Age_Group_90+', 'Age_Group_<20', 'Age_Group_UNKNOWN',
       'Acquisition_Info_CC', 'Acquisition_Info_MISSING INFORMATION',
       'Acquisition_Info_NO KNOWN EPI LINK', 'Acquisition_Info_OB',
       'Acquisition_Info_TRAVEL', 'Acquisition_Info_UNSPECIFIED EPI LINK',
       'Outbreak_Related_No', 'Outbreak_Related_Yes', 'Onset_season_autumn',
       'Onset_season_spring', 'Onset_season_summer', 'Onset_season_winter',
       'Reported_season_autumn', 'Reported_season_summer',
       'Reported_season_winter', 'Test_season_autumn', 'Test_season_spring',
       'Test_season_summer', 'Test_season_winter', 'Specimen_season_autumn',
       'Specimen_season_spring', 'Specimen_season_summer',
       'Specimen_season_winter', 'Onset_Weekend', 'Reported_Weekend',
       'Tes

In [None]:
#Get the labels
y = new_result_df['City']
y=y.astype('string')
#Drop the column for the next step
new_result_df.drop(columns=['City'], inplace=True)

X = new_result_df.values

In [None]:
new_result_df.head()

Unnamed: 0,Gender_FEMALE,Gender_GENDER DIVERSE,Gender_MALE,Gender_UNSPECIFIED,Age_Group_20s,Age_Group_30s,Age_Group_40s,Age_Group_50s,Age_Group_60s,Age_Group_70s,Age_Group_80s,Age_Group_90+,Age_Group_<20,Age_Group_UNKNOWN,Acquisition_Info_CC,Acquisition_Info_MISSING INFORMATION,Acquisition_Info_NO KNOWN EPI LINK,Acquisition_Info_OB,Acquisition_Info_TRAVEL,Acquisition_Info_UNSPECIFIED EPI LINK,Outbreak_Related_No,Outbreak_Related_Yes,Onset_season_autumn,Onset_season_spring,Onset_season_summer,Onset_season_winter,Reported_season_autumn,Reported_season_summer,Reported_season_winter,Test_season_autumn,Test_season_spring,Test_season_summer,Test_season_winter,Specimen_season_autumn,Specimen_season_spring,Specimen_season_summer,Specimen_season_winter,Onset_Weekend,Reported_Weekend,Test_Weekend,Specimen_Weekend,Onset_Holiday,Reported_Holiday,Test_Holiday,Specimen_Holiday,Onset_To_Reported,Reported_To_Test,Test_To_Specimen,Mean_Temp_c,Total_Rain_mm,Total_Snow_cm,Retail_and_Recreation,Grocery_and_Pharmacy,Parks,Transit_Stations,Workplaces,Residential,Number_Resolved,Number_Fatal
0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,9.0,0.0,-2.0,3.4,0.0,0.0,-15,8,39.0,-53.0,-42,18,1,0
1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,3.0,0.0,-2.0,2.3,0.2,4.2,-37,-14,-29.0,-65.0,-42,18,4,0
2,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1.0,0.0,-1.0,3.4,0.0,0.0,-15,8,39.0,-53.0,-42,18,1,0
3,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,20.0,0.0,-2.0,0.1,0.0,0.0,-25,0,-29.0,-64.0,-49,19,1,0
4,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2.0,0.0,-2.0,3.4,0.0,0.0,-34,-8,-15.0,-64.0,-41,17,1,0


## **FEATURE ENGINEERING/PROCESSING DONE**


**Decision Tree**

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

In [None]:
type(y_train)

pandas.core.series.Series

In [None]:
#Let's see the no. of records per class in training and test set
print("Training set {} ".format(Counter(y_train)))
print("Test set {} ".format(Counter(y_test)))

Training set Counter({'1': 24269, '2': 17526, '4': 8751, '0': 4957, '5': 3646, '3': 3318}) 
Test set Counter({'1': 6068, '2': 4381, '4': 2188, '0': 1239, '5': 911, '3': 830}) 


In [None]:
#This step is to train our classifier based on the training data.
dt = DecisionTreeClassifier(criterion='entropy')
#
#criterion='entropy',max_depth=None
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Now that our classifier is trained, it is time to test it with the testing data. Because our data is imbalanced, we should loom at precision and recall, rather than accuracy.
y_pred = dt.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred) * 100

#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[100.         100.         100.         100.         100.
  99.89035088]
[100.         100.         100.         100.          99.95429616
 100.        ]
99.99359672152141
[[1239    0    0    0    0    0]
 [   0 6068    0    0    0    0]
 [   0    0 4381    0    0    0]
 [   0    0    0  830    0    0]
 [   0    0    0    0 2187    1]
 [   0    0    0    0    0  911]]
Mean Recall: 99.98172514619883
Mean Precision: 99.99238269347957
Accuracy: 99.99359672152141


In [None]:
r = export_text(dt, list(new_result_df.columns))
print(r)

|--- Retail_and_Recreation <= -27.50
|   |--- Transit_Stations <= -62.50
|   |   |--- Workplaces <= -45.50
|   |   |   |--- Retail_and_Recreation <= -39.50
|   |   |   |   |--- Parks <= 15.00
|   |   |   |   |   |--- Residential <= 27.50
|   |   |   |   |   |   |--- Transit_Stations <= -76.00
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |--- Transit_Stations >  -76.00
|   |   |   |   |   |   |   |--- Retail_and_Recreation <= -40.50
|   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |--- Retail_and_Recreation >  -40.50
|   |   |   |   |   |   |   |   |--- Mean_Temp_c <= 2.05
|   |   |   |   |   |   |   |   |   |--- class: 3
|   |   |   |   |   |   |   |   |--- Mean_Temp_c >  2.05
|   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- Residential >  27.50
|   |   |   |   |   |   |--- Retail_and_Recreation <= -82.00
|   |   |   |   |   |   |   |--- Parks <= -23.00
|   |   |   |   |   |   |   |   |--- Mean_Temp_c <= -0.80
|   

**Gradient Boosting**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
y_pred = gb.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[100.         100.          99.81772613  99.75639464  99.8630137
 100.        ]
[100.         100.         100.          98.6746988   99.95429616
  99.89023052]
99.91675737977845
[[1239    0    0    0    0    0]
 [   0 6068    0    0    0    0]
 [   0    0 4381    0    0    0]
 [   0    0    8  819    3    0]
 [   0    0    0    1 2187    0]
 [   0    0    0    1    0  910]]
Mean Recall: 99.90618907880464
Mean Precision: 99.75320424532913
Accuracy: 99.91675737977845


**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
y_pred = clf.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[100.         100.         100.         100.          99.95431704
 100.        ]
[100.         100.         100.          99.87951807 100.
 100.        ]
99.99359672152141
[[1239    0    0    0    0    0]
 [   0 6068    0    0    0    0]
 [   0    0 4381    0    0    0]
 [   0    0    0  829    1    0]
 [   0    0    0    0 2188    0]
 [   0    0    0    0    0  911]]
Mean Recall: 99.9923861732907
Mean Precision: 99.97991967871485
Accuracy: 99.99359672152141


**No Mobility or Weather Features**

In [None]:
new_run = result_df[['Gender','Age_Group', 'Acquisition_Info', 'Outbreak_Related', 
                    'Onset_season', 'Reported_season', 'Test_season', 'Specimen_season']]

new_run = pd.get_dummies(new_run)
new_run["Onset_Weekend"] = result_df["Onset_Weekend"]
new_run["Reported_Weekend"] = result_df["Reported_Weekend"]
new_run["Test_Weekend"] = result_df["Test_Weekend"]
new_run["Specimen_Weekend"] = result_df["Specimen_Weekend"]
new_run["Onset_Holiday"] = result_df["Onset_Holiday"]
new_run["Reported_Holiday"] = result_df["Reported_Holiday"]
new_run["Test_Holiday"] = result_df["Test_Holiday"]
new_run["Specimen_Holiday"] = result_df["Specimen_Holiday"]
new_run["Onset_To_Reported"] = result_df["Onset_To_Reported"]
new_run["Reported_To_Test"] = result_df["Reported_To_Test"]
new_run["Test_To_Specimen"] = result_df["Test_To_Specimen"]
new_run["Number_Resolved"] = result_df["Number_Resolved"]
new_run["Number_Fatal"] = result_df["Number_Fatal"]
new_run["City"] = result_df["City"]

new_run = new_run.dropna()
new_run=new_run.reset_index(drop=True)

In [None]:
#Get the labels
y = new_run['City']
y=y.astype('string')
#Drop the column for the next step
new_run.drop(columns=['City'], inplace=True)

X = new_run.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

In [None]:
#This step is to train our classifier based on the training data.
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Now that our classifier is trained, it is time to test it with the testing data. Because our data is imbalanced, we should loom at precision and recall, rather than accuracy.
y_pred = dt.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

from sklearn.metrics import confusion_matrix
accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[16.14906832 58.77730421 40.45547121  9.43738657 22.93632075 25.85949178]
[16.41673244 66.45083572 45.16329062  5.67066521 16.0677406  16.81243926]
43.358849997078245
[[ 208  415  403   39  149   53]
 [ 332 4413 1236  156  404  100]
 [ 370 1396 2185  170  517  200]
 [  80  324  319   52   95   47]
 [ 198  720  932   86  389   96]
 [ 100  240  326   48  142  173]]
Mean Recall: 28.935840474037477
Mean Precision: 27.763617308237784
Accuracy: 43.358849997078245


In [None]:
r = export_text(dt, list(new_run.columns))
print(r)

|--- Acquisition_Info_MISSING INFORMATION <= 0.50
|   |--- Test_To_Specimen <= -7.50
|   |   |--- Onset_To_Reported <= 7.50
|   |   |   |--- Reported_To_Test <= 0.50
|   |   |   |   |--- Acquisition_Info_CC <= 0.50
|   |   |   |   |   |--- Onset_To_Reported <= 2.50
|   |   |   |   |   |   |--- Test_To_Specimen <= -19.50
|   |   |   |   |   |   |   |--- Specimen_Weekend <= 0.50
|   |   |   |   |   |   |   |   |--- Onset_To_Reported <= 1.50
|   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |   |--- Onset_To_Reported >  1.50
|   |   |   |   |   |   |   |   |   |--- Test_To_Specimen <= -76.50
|   |   |   |   |   |   |   |   |   |   |--- Test_To_Specimen <= -80.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 4
|   |   |   |   |   |   |   |   |   |   |--- Test_To_Specimen >  -80.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 3
|   |   |   |   |   |   |   |   |   |--- Test_To_Specimen >  -76.50
|   |   | 

In [None]:
#X_train, X_test, y_train, y_test
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
y_pred = gb.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[49.42528736 68.9107413  41.79489939 14.28571429 44.07582938 52.02020202]
[ 6.78768745 68.58906791 80.2811079   0.10905125 11.52416357 20.01943635]
52.65587565009058
[[  86  451  659    0   49   22]
 [  18 4555 1942    3   97   26]
 [  15  761 3884    1  112   65]
 [   3  240  616    1   30   27]
 [  26  430 1636    0  279   50]
 [  26  173  556    2   66  206]]
Mean Recall: 45.08544562230308
Mean Precision: 31.218419071130725
Accuracy: 52.65587565009058


In [None]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
y_pred = clf.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[20.93023256 62.02549575 40.91806515  9.43820225 24.18699187 28.42639594]
[12.78610892 65.93886463 51.40553948  4.58015267 19.66129698 21.76870748]
45.40407877052533
[[ 162  409  435   40  161   60]
 [ 214 4379 1352  114  461  121]
 [ 188 1171 2487  161  620  211]
 [  43  271  386   42  119   56]
 [ 107  609 1054   59  476  116]
 [  60  221  364   29  131  224]]
Mean Recall: 30.987563919675768
Mean Precision: 29.356778361019362
Accuracy: 45.40407877052533


**No Date Besides 3 Engineered + Rest of Features**

In [None]:
# Our features have categorical data. We need to convert them to one-hot encoders to get the best results.
new_result_df = result_df[['Gender','Age_Group', 'Acquisition_Info', 'Outbreak_Related']]

new_result_df = pd.get_dummies(new_result_df)

new_result_df["Onset_To_Reported"] = result_df["Onset_To_Reported"]
new_result_df["Reported_To_Test"] = result_df["Reported_To_Test"]
new_result_df["Test_To_Specimen"] = result_df["Test_To_Specimen"]
new_result_df["Mean_Temp_c"] = result_df["Mean_Temp_c"]
new_result_df["Total_Rain_mm"] = result_df["Total_Rain_mm"]
new_result_df["Total_Snow_cm"] = result_df["Total_Snow_cm"]
new_result_df["Retail_and_Recreation"] = result_df["Retail_and_Recreation"]
new_result_df["Grocery_and_Pharmacy"] = result_df["Grocery_and_Pharmacy"]
new_result_df["Parks"] = result_df["Parks"]
new_result_df["Transit_Stations"] = result_df["Transit_Stations"]
new_result_df["Workplaces"] = result_df["Workplaces"]
new_result_df["Residential"] = result_df["Residential"]
new_result_df["Number_Resolved"] = result_df["Number_Resolved"]
new_result_df["Number_Fatal"] = result_df["Number_Fatal"]
new_result_df["City"] = result_df["City"]

new_result_df = new_result_df.dropna()
new_result_df=new_result_df.reset_index(drop=True)

new_result_df.head()

Unnamed: 0,Gender_FEMALE,Gender_GENDER DIVERSE,Gender_MALE,Gender_UNSPECIFIED,Age_Group_20s,Age_Group_30s,Age_Group_40s,Age_Group_50s,Age_Group_60s,Age_Group_70s,Age_Group_80s,Age_Group_90+,Age_Group_<20,Age_Group_UNKNOWN,Acquisition_Info_CC,Acquisition_Info_MISSING INFORMATION,Acquisition_Info_NO KNOWN EPI LINK,Acquisition_Info_OB,Acquisition_Info_TRAVEL,Acquisition_Info_UNSPECIFIED EPI LINK,Outbreak_Related_No,Outbreak_Related_Yes,Onset_To_Reported,Reported_To_Test,Test_To_Specimen,Mean_Temp_c,Total_Rain_mm,Total_Snow_cm,Retail_and_Recreation,Grocery_and_Pharmacy,Parks,Transit_Stations,Workplaces,Residential,Number_Resolved,Number_Fatal,City
0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,9.0,0.0,-2.0,3.4,0.0,0.0,-15,8,39.0,-53.0,-42,18,1,0,3
1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,3.0,0.0,-2.0,2.3,0.2,4.2,-37,-14,-29.0,-65.0,-42,18,4,0,2
2,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1.0,0.0,-1.0,3.4,0.0,0.0,-15,8,39.0,-53.0,-42,18,1,0,3
3,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,20.0,0.0,-2.0,0.1,0.0,0.0,-25,0,-29.0,-64.0,-49,19,1,0,0
4,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,2.0,0.0,-2.0,3.4,0.0,0.0,-34,-8,-15.0,-64.0,-41,17,1,0,2


In [None]:
#Get the labels
y = new_result_df['City']
y=y.astype('string')
#Drop the column for the next step
new_result_df.drop(columns=['City'], inplace=True)

X = new_result_df.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

In [None]:
#This step is to train our classifier based on the training data.
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Now that our classifier is trained, it is time to test it with the testing data. Because our data is imbalanced, we should loom at precision and recall, rather than accuracy.
y_pred = dt.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

from sklearn.metrics import confusion_matrix
accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[100. 100. 100. 100. 100. 100.]
[100. 100. 100. 100. 100. 100.]
100.0
[[1239    0    0    0    0    0]
 [   0 6068    0    0    0    0]
 [   0    0 4381    0    0    0]
 [   0    0    0  830    0    0]
 [   0    0    0    0 2188    0]
 [   0    0    0    0    0  911]]
Mean Recall: 100.0
Mean Precision: 100.0
Accuracy: 100.0


In [None]:
r = export_text(dt, list(new_result_df.columns))
print(r)

|--- Retail_and_Recreation <= -27.50
|   |--- Transit_Stations <= -62.50
|   |   |--- Workplaces <= -45.50
|   |   |   |--- Retail_and_Recreation <= -39.50
|   |   |   |   |--- Parks <= 15.00
|   |   |   |   |   |--- Residential <= 27.50
|   |   |   |   |   |   |--- Transit_Stations <= -76.00
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |--- Transit_Stations >  -76.00
|   |   |   |   |   |   |   |--- Retail_and_Recreation <= -40.50
|   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |--- Retail_and_Recreation >  -40.50
|   |   |   |   |   |   |   |   |--- Total_Snow_cm <= 3.00
|   |   |   |   |   |   |   |   |   |--- class: 3
|   |   |   |   |   |   |   |   |--- Total_Snow_cm >  3.00
|   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- Residential >  27.50
|   |   |   |   |   |   |--- Grocery_and_Pharmacy <= -73.50
|   |   |   |   |   |   |   |--- Residential <= 30.50
|   |   |   |   |   |   |   |   |--- Total_Snow_cm <= 

In [None]:
#X_train, X_test, y_train, y_test
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
y_pred = gb.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[52.04678363 68.70616326 41.85743734 35.71428571 41.72989378 51.28205128]
[ 7.02446725 69.3269086  79.74369574  0.54525627 11.35894259 17.49271137]
52.65587565009058
[[  89  410  688    2   57   21]
 [  16 4604 1907    2   87   25]
 [  18  783 3858    1  121   57]
 [   2  256  599    5   36   19]
 [  25  463 1607    2  275   49]
 [  21  185  558    2   83  180]]
Mean Recall: 48.5561025006289
Mean Precision: 30.91533030200408
Accuracy: 52.65587565009058


In [None]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
y_pred = clf.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[20.90800478 62.83546438 42.15570876  8.73563218 25.62862669 27.64456982]
[13.8121547  67.33925614 51.81893344  4.14394766 21.89178026 19.04761905]
46.26891836615439
[[ 175  365  459   24  187   57]
 [ 215 4472 1263  120  455  116]
 [ 199 1154 2507  152  632  194]
 [  62  279  362   38  138   38]
 [ 114  621  985   63  530  108]
 [  72  226  371   38  126  196]]
Mean Recall: 31.318001102293223
Mean Precision: 29.675615205822997
Accuracy: 46.26891836615439


**No Mobility, Weather Features, Dates Besides 3 Engineered**

In [None]:
# Our features have categorical data. We need to convert them to one-hot encoders to get the best results.
limited = result_df[['Gender','Age_Group', 'Acquisition_Info', 'Outbreak_Related']]
limited = pd.get_dummies(limited)
limited["Onset_To_Reported"] = result_df["Onset_To_Reported"]
limited["Reported_To_Test"] = result_df["Reported_To_Test"]
limited["Test_To_Specimen"] = result_df["Test_To_Specimen"]
limited["Number_Resolved"] = result_df["Number_Resolved"]
limited["Number_Fatal"] = result_df["Number_Fatal"]
limited["City"] = result_df["City"]

limited = limited.dropna()
limited=limited.reset_index(drop=True)

limited.head()

Unnamed: 0,Gender_FEMALE,Gender_GENDER DIVERSE,Gender_MALE,Gender_UNSPECIFIED,Age_Group_20s,Age_Group_30s,Age_Group_40s,Age_Group_50s,Age_Group_60s,Age_Group_70s,Age_Group_80s,Age_Group_90+,Age_Group_<20,Age_Group_UNKNOWN,Acquisition_Info_CC,Acquisition_Info_MISSING INFORMATION,Acquisition_Info_NO KNOWN EPI LINK,Acquisition_Info_OB,Acquisition_Info_TRAVEL,Acquisition_Info_UNSPECIFIED EPI LINK,Outbreak_Related_No,Outbreak_Related_Yes,Onset_To_Reported,Reported_To_Test,Test_To_Specimen,Number_Resolved,Number_Fatal,City
0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,9.0,0.0,-2.0,1,0,3
1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,3.0,0.0,-2.0,4,0,2
2,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1.0,0.0,-1.0,1,0,3
3,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,20.0,0.0,-2.0,1,0,0
4,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,2.0,0.0,-2.0,1,0,2


In [None]:
#Get the labels
y = limited['City']
y=y.astype('string')
#Drop the column for the next step
limited.drop(columns=['City'], inplace=True)

X = limited.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

In [None]:
#This step is to train our classifier based on the training data.
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Now that our classifier is trained, it is time to test it with the testing data. Because our data is imbalanced, we should loom at precision and recall, rather than accuracy.
y_pred = dt.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

from sklearn.metrics import confusion_matrix
accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[16.66666667 58.90463918 40.62237174  8.80829016 26.38469285 31.39841689]
[ 8.60299921 68.82999548 59.90078545  1.85387132 10.82197439 11.56462585]
46.607841991468476
[[ 109  497  521   10  101   29]
 [ 171 4571 1593   51  202   53]
 [ 148 1341 2898   58  297   96]
 [  45  354  439   17   45   17]
 [ 121  694 1251   28  262   65]
 [  60  303  432   29   86  119]]
Mean Recall: 30.464179579136133
Mean Precision: 26.929041950413744
Accuracy: 46.607841991468476


In [None]:
r = export_text(dt, list(limited.columns))
print(r)

|--- Acquisition_Info_MISSING INFORMATION <= 0.50
|   |--- Reported_To_Test <= -0.50
|   |   |--- Test_To_Specimen <= -1.50
|   |   |   |--- Reported_To_Test <= -1.50
|   |   |   |   |--- Reported_To_Test <= -3.50
|   |   |   |   |   |--- Acquisition_Info_CC <= 0.50
|   |   |   |   |   |   |--- Onset_To_Reported <= 6.50
|   |   |   |   |   |   |   |--- Age_Group_60s <= 0.50
|   |   |   |   |   |   |   |   |--- Gender_MALE <= 0.50
|   |   |   |   |   |   |   |   |   |--- Onset_To_Reported <= 5.50
|   |   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |   |   |--- Onset_To_Reported >  5.50
|   |   |   |   |   |   |   |   |   |   |--- Age_Group_20s <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 2
|   |   |   |   |   |   |   |   |   |   |--- Age_Group_20s >  0.50
|   |   |   |   |   |   |   |   |   |   |   |--- class: 2
|   |   |   |   |   |   |   |   |--- Gender_MALE >  0.50
|   |   |   |   |   |   |   |   |   |--- Age_Group_4

In [None]:
#X_train, X_test, y_train, y_test
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
y_pred = gb.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[51.42857143 68.20226462 40.21805584 16.66666667 42.85714286 53.29512894]
[ 2.84135754 66.20990815 81.58329888  0.10905125  7.55885998 18.07580175]
51.13071933617718
[[  36  396  734    0   81   20]
 [   2 4397 2188    0   36   18]
 [   5  767 3947    1   55   63]
 [   2  268  621    1   13   12]
 [  20  420 1747    1  183   50]
 [   5  199  577    3   59  186]]
Mean Recall: 45.444638391668285
Mean Precision: 29.396379591044553
Accuracy: 51.13071933617718


In [None]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
y_pred = clf.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[22.4691358  61.77894591 40.73638856 10.         28.125      31.96078431]
[ 7.18232044 67.2489083  64.48945845  2.07197383 13.01115242 15.84062196]
47.764857126161395
[[  91  411  579   16  111   59]
 [  99 4466 1728   42  239   67]
 [  78 1153 3120   64  305  118]
 [  22  331  470   19   55   20]
 [  77  618 1295   33  315   83]
 [  38  250  467   16   95  163]]
Mean Recall: 32.51170909849464
Mean Precision: 28.307405899994265
Accuracy: 47.764857126161395


**Only Patient and Fact Table Features**

In [None]:
# Our features have categorical data. We need to convert them to one-hot encoders to get the best results.
limited2 = result_df[['Gender','Age_Group', 'Acquisition_Info', 'Outbreak_Related']]
limited2 = pd.get_dummies(limited2)
limited2["Number_Resolved"] = result_df["Number_Resolved"]
limited2["Number_Fatal"] = result_df["Number_Fatal"]
limited2["City"] = result_df["City"]

limited2 = limited2.dropna()
limited2=limited2.reset_index(drop=True)

limited2.head()

Unnamed: 0,Gender_FEMALE,Gender_GENDER DIVERSE,Gender_MALE,Gender_UNSPECIFIED,Age_Group_20s,Age_Group_30s,Age_Group_40s,Age_Group_50s,Age_Group_60s,Age_Group_70s,Age_Group_80s,Age_Group_90+,Age_Group_<20,Age_Group_UNKNOWN,Acquisition_Info_CC,Acquisition_Info_MISSING INFORMATION,Acquisition_Info_NO KNOWN EPI LINK,Acquisition_Info_OB,Acquisition_Info_TRAVEL,Acquisition_Info_UNSPECIFIED EPI LINK,Outbreak_Related_No,Outbreak_Related_Yes,Number_Resolved,Number_Fatal,City
0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,3
1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1
2,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,4,0,2
3,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,3
4,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0


In [None]:
#Get the labels
y = limited2['City']
y=y.astype('string')
#Drop the column for the next step
limited2.drop(columns=['City'], inplace=True)

X = limited2.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

In [None]:
#This step is to train our classifier based on the training data.
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Now that our classifier is trained, it is time to test it with the testing data. Because our data is imbalanced, we should loom at precision and recall, rather than accuracy.
y_pred = dt.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

from sklearn.metrics import confusion_matrix
accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[21.21212121 68.55818414 37.1927555   0.         43.84384384  0.        ]
[ 1.09717868 61.84571017 82.93838863  0.          5.92773041  0.        ]
48.47826086956522
[[  14  348  877    1   36    0]
 [  12 4289 2548    0   86    0]
 [  14  775 4025    0   38    1]
 [   4  239  666    0   11    0]
 [  12  400 1904    0  146    1]
 [  10  205  802    0   16    0]]
Mean Recall: 28.467817449541176
Mean Precision: 25.301501314145423
Accuracy: 48.47826086956522


In [None]:
r = export_text(dt, list(limited2.columns))
print(r)

|--- Acquisition_Info_MISSING INFORMATION <= 0.50
|   |--- Acquisition_Info_OB <= 0.50
|   |   |--- Number_Resolved <= 1.50
|   |   |   |--- Outbreak_Related_Yes <= 0.50
|   |   |   |   |--- Gender_UNSPECIFIED <= 0.50
|   |   |   |   |   |--- Acquisition_Info_NO KNOWN EPI LINK <= 0.50
|   |   |   |   |   |   |--- Age_Group_<20 <= 0.50
|   |   |   |   |   |   |   |--- Age_Group_20s <= 0.50
|   |   |   |   |   |   |   |   |--- Number_Fatal <= 0.50
|   |   |   |   |   |   |   |   |   |--- Age_Group_70s <= 0.50
|   |   |   |   |   |   |   |   |   |   |--- Acquisition_Info_TRAVEL <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 10
|   |   |   |   |   |   |   |   |   |   |--- Acquisition_Info_TRAVEL >  0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 6
|   |   |   |   |   |   |   |   |   |--- Age_Group_70s >  0.50
|   |   |   |   |   |   |   |   |   |   |--- Acquisition_Info_CC <= 0.50
|   |   |   |   |   |   |   |   |   |   | 

In [None]:
#X_train, X_test, y_train, y_test
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
y_pred = gb.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

  _warn_prf(average, modifier, msg_start, len(result))


[48.38709677 71.08931337 36.69992054  0.         43.36569579  0.        ]
[ 1.17554859 59.56741168 85.65835566  0.          5.44051969  0.        ]
48.26659038901602
[[  15  318  916    1   26    0]
 [   0 4131 2719    1   84    0]
 [   1  667 4157    0   28    0]
 [   1  193  713    0   13    0]
 [   8  349 1972    0  134    0]
 [   6  153  850    0   24    0]]
Mean Recall: 33.25700441368328
Mean Precision: 25.30697260282577
Accuracy: 48.26659038901602


In [None]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
y_pred = clf.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[30.76923077 71.86678352 36.7427568   0.         40.24390244  0.        ]
[ 1.2539185  59.12040375 86.23531836  0.          5.3593179   0.        ]
48.243707093821506
[[  16  304  922    0   33    1]
 [   7 4100 2736    1   90    1]
 [   9  622 4185    0   36    1]
 [   2  186  719    0   12    1]
 [   9  347 1975    0  132    0]
 [   9  146  853    0   25    0]]
Mean Recall: 29.937112255949103
Mean Precision: 25.328159751527988
Accuracy: 48.243707093821506


**Class Balanced: All Features**

In [None]:
balance = result_df[['Gender','Age_Group', 'Acquisition_Info', 'Outbreak_Related', 
                    'Onset_season', 'Reported_season', 'Test_season', 'Specimen_season']]

balance = pd.get_dummies(balance)

balance["Onset_Weekend"] = result_df["Onset_Weekend"]
balance["Reported_Weekend"] = result_df["Reported_Weekend"]
balance["Test_Weekend"] = result_df["Test_Weekend"]
balance["Specimen_Weekend"] = result_df["Specimen_Weekend"]
balance["Onset_Holiday"] = result_df["Onset_Holiday"]
balance["Reported_Holiday"] = result_df["Reported_Holiday"]
balance["Test_Holiday"] = result_df["Test_Holiday"]
balance["Specimen_Holiday"] = result_df["Specimen_Holiday"]
balance["Onset_To_Reported"] = result_df["Onset_To_Reported"]
balance["Reported_To_Test"] = result_df["Reported_To_Test"]
balance["Test_To_Specimen"] = result_df["Test_To_Specimen"]

balance["Mean_Temp_c"] = result_df["Mean_Temp_c"]
balance["Total_Rain_mm"] = result_df["Total_Rain_mm"]
balance["Total_Snow_cm"] = result_df["Total_Snow_cm"]
balance["Retail_and_Recreation"] = result_df["Retail_and_Recreation"]
balance["Grocery_and_Pharmacy"] = result_df["Grocery_and_Pharmacy"]
balance["Parks"] = result_df["Parks"]
balance["Transit_Stations"] = result_df["Transit_Stations"]
balance["Workplaces"] = result_df["Workplaces"]
balance["Residential"] = result_df["Residential"]

balance["Number_Resolved"] = result_df["Number_Resolved"]
balance["Number_Fatal"] = result_df["Number_Fatal"]
balance["City"] = result_df["City"]

balance = balance.dropna()
balance=balance.reset_index(drop=True)

#3318,830
#4148

In [None]:
balance.head()

Unnamed: 0,Gender_FEMALE,Gender_GENDER DIVERSE,Gender_MALE,Gender_UNSPECIFIED,Age_Group_20s,Age_Group_30s,Age_Group_40s,Age_Group_50s,Age_Group_60s,Age_Group_70s,Age_Group_80s,Age_Group_90+,Age_Group_<20,Age_Group_UNKNOWN,Acquisition_Info_CC,Acquisition_Info_MISSING INFORMATION,Acquisition_Info_NO KNOWN EPI LINK,Acquisition_Info_OB,Acquisition_Info_TRAVEL,Acquisition_Info_UNSPECIFIED EPI LINK,Outbreak_Related_No,Outbreak_Related_Yes,Onset_season_autumn,Onset_season_spring,Onset_season_summer,Onset_season_winter,Reported_season_autumn,Reported_season_summer,Reported_season_winter,Test_season_autumn,Test_season_spring,Test_season_summer,Test_season_winter,Specimen_season_autumn,Specimen_season_spring,Specimen_season_summer,Specimen_season_winter,Onset_Weekend,Reported_Weekend,Test_Weekend,Specimen_Weekend,Onset_Holiday,Reported_Holiday,Test_Holiday,Specimen_Holiday,Onset_To_Reported,Reported_To_Test,Test_To_Specimen,Mean_Temp_c,Total_Rain_mm,Total_Snow_cm,Retail_and_Recreation,Grocery_and_Pharmacy,Parks,Transit_Stations,Workplaces,Residential,Number_Resolved,Number_Fatal,City
0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,9.0,0.0,-2.0,3.4,0.0,0.0,-15,8,39.0,-53.0,-42,18,1,0,3
1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,3.0,0.0,-2.0,2.3,0.2,4.2,-37,-14,-29.0,-65.0,-42,18,4,0,2
2,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1.0,0.0,-1.0,3.4,0.0,0.0,-15,8,39.0,-53.0,-42,18,1,0,3
3,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,20.0,0.0,-2.0,0.1,0.0,0.0,-25,0,-29.0,-64.0,-49,19,1,0,0
4,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2.0,0.0,-2.0,3.4,0.0,0.0,-34,-8,-15.0,-64.0,-41,17,1,0,2


In [None]:
a = 0
b = 0
c = 0
d = 0
e = 0
f = 0
data = pd.DataFrame()
for index, row in balance.iterrows():
  if row['City'] == 0 and a < 4148:
    a+=1
    data = data.append(row)
  elif row['City'] == 1 and b < 4148:
    b+=1
    data = data.append(row)
  elif row['City'] == 2 and c < 4148:
    c+=1
    data = data.append(row)
  elif row['City'] == 3 and d < 4148:
    d+=1
    data = data.append(row)
  elif row['City'] == 4 and e < 4148:
    e+=1
    data = data.append(row)
  elif row['City'] == 5 and f < 4148:
    f+=1
    data = data.append(row)
  
  if a == 4147 and b == 4147 and c == 4147 and d == 4147 and e == 4147 and f == 4147:
    break

In [None]:
data.head()

Unnamed: 0,Acquisition_Info_CC,Acquisition_Info_MISSING INFORMATION,Acquisition_Info_NO KNOWN EPI LINK,Acquisition_Info_OB,Acquisition_Info_TRAVEL,Acquisition_Info_UNSPECIFIED EPI LINK,Age_Group_20s,Age_Group_30s,Age_Group_40s,Age_Group_50s,Age_Group_60s,Age_Group_70s,Age_Group_80s,Age_Group_90+,Age_Group_<20,Age_Group_UNKNOWN,City,Gender_FEMALE,Gender_GENDER DIVERSE,Gender_MALE,Gender_UNSPECIFIED,Grocery_and_Pharmacy,Mean_Temp_c,Number_Fatal,Number_Resolved,Onset_Holiday,Onset_To_Reported,Onset_Weekend,Onset_season_autumn,Onset_season_spring,Onset_season_summer,Onset_season_winter,Outbreak_Related_No,Outbreak_Related_Yes,Parks,Reported_Holiday,Reported_To_Test,Reported_Weekend,Reported_season_autumn,Reported_season_summer,Reported_season_winter,Residential,Retail_and_Recreation,Specimen_Holiday,Specimen_Weekend,Specimen_season_autumn,Specimen_season_spring,Specimen_season_summer,Specimen_season_winter,Test_Holiday,Test_To_Specimen,Test_Weekend,Test_season_autumn,Test_season_spring,Test_season_summer,Test_season_winter,Total_Rain_mm,Total_Snow_cm,Transit_Stations,Workplaces
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,8.0,3.4,0.0,1.0,0.0,9.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,39.0,0.0,0.0,0.0,1.0,0.0,0.0,18.0,-15.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-53.0,-42.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,-14.0,2.3,0.0,4.0,0.0,3.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,-29.0,0.0,0.0,0.0,1.0,0.0,0.0,18.0,-37.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-2.0,0.0,1.0,0.0,0.0,0.0,0.2,4.2,-65.0,-42.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,1.0,0.0,8.0,3.4,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,39.0,0.0,0.0,0.0,1.0,0.0,0.0,18.0,-15.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-53.0,-42.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.1,0.0,1.0,0.0,20.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-29.0,0.0,0.0,0.0,1.0,0.0,0.0,19.0,-25.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-64.0,-49.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,-8.0,3.4,0.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-15.0,0.0,0.0,0.0,1.0,0.0,0.0,17.0,-34.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-64.0,-41.0


In [None]:
#Get the labels
y = data['City']
y=y.astype('string')
#Drop the column for the next step
data.drop(columns=['City'], inplace=True)

X = data.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

In [None]:
#This step is to train our classifier based on the training data.
dt = DecisionTreeClassifier(criterion='entropy')
#
#criterion='entropy',max_depth=None
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Now that our classifier is trained, it is time to test it with the testing data. Because our data is imbalanced, we should loom at precision and recall, rather than accuracy.
y_pred = dt.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

from sklearn.metrics import confusion_matrix

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[100.          99.75932611 100.          99.87966306 100.
 100.        ]
[ 99.87937274 100.         100.         100.          99.75903614
 100.        ]
99.93973483326637
[[828   1   0   0   0   0]
 [  0 829   0   0   0   0]
 [  0   0 830   0   0   0]
 [  0   0   0 830   0   0]
 [  0   1   0   1 828   0]
 [  0   0   0   0   0 830]]
Mean Recall: 99.93983152827919
Mean Precision: 99.93973481380287
Accuracy: 99.93973483326637


In [None]:
#X_train, X_test, y_train, y_test
gb = GradientBoostingClassifier()
#n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0
#n_estimators=100, learning_rate=0.1, max_depth=None, random_state=0
gb.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
y_pred = gb.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[100.          99.87951807 100.          99.75961538  99.87922705
  99.87966306]
[100.         100.          99.75903614 100.          99.63855422
 100.        ]
99.89955805544395
[[829   0   0   0   0   0]
 [  0 829   0   0   0   0]
 [  0   0 828   0   1   1]
 [  0   0   0 830   0   0]
 [  0   1   0   2 827   0]
 [  0   0   0   0   0 830]]
Mean Recall: 99.89967059443383
Mean Precision: 99.8995983935743
Accuracy: 99.89955805544395


In [None]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
y_pred = clf.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[100.          99.63942308 100.          99.87951807 100.
  99.87966306]
[ 99.87937274 100.          99.87951807  99.87951807  99.75903614
 100.        ]
99.89955805544395
[[828   1   0   0   0   0]
 [  0 829   0   0   0   0]
 [  0   1 829   0   0   0]
 [  0   0   0 829   0   1]
 [  0   1   0   1 828   0]
 [  0   0   0   0   0 830]]
Mean Recall: 99.89976736762844
Mean Precision: 99.89957417123257
Accuracy: 99.89955805544395


**Class Balanced: No Mobility or Weather**

In [None]:
new_run = result_df[['Gender','Age_Group', 'Acquisition_Info', 'Outbreak_Related', 
                    'Onset_season', 'Reported_season', 'Test_season', 'Specimen_season']]

new_run = pd.get_dummies(new_run)
new_run["Onset_Weekend"] = result_df["Onset_Weekend"]
new_run["Reported_Weekend"] = result_df["Reported_Weekend"]
new_run["Test_Weekend"] = result_df["Test_Weekend"]
new_run["Specimen_Weekend"] = result_df["Specimen_Weekend"]
new_run["Onset_Holiday"] = result_df["Onset_Holiday"]
new_run["Reported_Holiday"] = result_df["Reported_Holiday"]
new_run["Test_Holiday"] = result_df["Test_Holiday"]
new_run["Specimen_Holiday"] = result_df["Specimen_Holiday"]
new_run["Onset_To_Reported"] = result_df["Onset_To_Reported"]
new_run["Reported_To_Test"] = result_df["Reported_To_Test"]
new_run["Test_To_Specimen"] = result_df["Test_To_Specimen"]
new_run["Number_Resolved"] = result_df["Number_Resolved"]
new_run["Number_Fatal"] = result_df["Number_Fatal"]
new_run["City"] = result_df["City"]

new_run = new_run.dropna()
new_run=new_run.reset_index(drop=True)

In [None]:
a = 0
b = 0
c = 0
d = 0
e = 0
f = 0
data = pd.DataFrame()
for index, row in new_run.iterrows():
  if row['City'] == 0 and a < 4148:
    a+=1
    data = data.append(row)
  elif row['City'] == 1 and b < 4148:
    b+=1
    data = data.append(row)
  elif row['City'] == 2 and c < 4148:
    c+=1
    data = data.append(row)
  elif row['City'] == 3 and d < 4148:
    d+=1
    data = data.append(row)
  elif row['City'] == 4 and e < 4148:
    e+=1
    data = data.append(row)
  elif row['City'] == 5 and f < 4148:
    f+=1
    data = data.append(row)
  
  if a == 4147 and b == 4147 and c == 4147 and d == 4147 and e == 4147 and f == 4147:
    break

In [None]:
#Get the labels
y = data['City']
y=y.astype('string')
#Drop the column for the next step
data.drop(columns=['City'], inplace=True)

X = data.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

In [None]:
#This step is to train our classifier based on the training data.
dt = DecisionTreeClassifier(criterion='entropy')
#
#criterion='entropy',max_depth=None
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Now that our classifier is trained, it is time to test it with the testing data. Because our data is imbalanced, we should loom at precision and recall, rather than accuracy.
y_pred = dt.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

from sklearn.metrics import confusion_matrix

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[28.30712303 53.94144144 33.37250294 23.24393359 27.5177305  40.14925373]
[36.86746988 57.78045838 34.21686747 21.95416164 23.37349398 32.40963855]
34.431498593812776
[[306  69  97 151  97 110]
 [110 479  76  81  43  40]
 [148  99 284 108 121  70]
 [179 102 141 182 131  94]
 [174  84 162 129 194  87]
 [164  55  91 132 119 269]]
Mean Recall: 34.42199753832461
Mean Precision: 34.43368165060725
Accuracy: 34.431498593812776


In [None]:
#X_train, X_test, y_train, y_test
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
y_pred = gb.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[41.06113033 79.81072555 39.64912281 31.9286872  35.84615385 41.58878505]
[42.89156627 61.03739445 54.45783133 23.76357057 28.07228916 53.61445783]
43.9734833266372
[[356  35 130  77  84 148]
 [ 85 506 101  45  37  55]
 [ 63  33 452  85 103  94]
 [135  34 143 197 111 209]
 [124  19 233 102 233 119]
 [104   7  81 111  82 445]]
Mean Recall: 44.980767463757964
Mean Precision: 43.972851599401224
Accuracy: 43.9734833266372


In [None]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
y_pred = clf.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[35.77142857 62.5798212  37.09869203 25.50505051 29.6010296  41.31868132]
[37.71084337 59.10735826 37.59036145 24.36670688 27.71084337 45.30120482]
38.629971876255524
[[313  62  85 132 100 138]
 [ 85 490  71  76  53  54]
 [ 90  64 312 125 154  85]
 [146  72 121 202 140 148]
 [123  57 170 141 230 109]
 [118  38  82 116 100 376]]
Mean Recall: 38.645783871665756
Mean Precision: 38.631219691794925
Accuracy: 38.629971876255524


**Class Balanced: No Date Besides 3 Engineered + Rest of Features**

In [None]:
# Our features have categorical data. We need to convert them to one-hot encoders to get the best results.
new_run2 = result_df[['Gender','Age_Group', 'Acquisition_Info', 'Outbreak_Related']]

new_run2 = pd.get_dummies(new_run2)

new_run2["Onset_To_Reported"] = result_df["Onset_To_Reported"]
new_run2["Reported_To_Test"] = result_df["Reported_To_Test"]
new_run2["Test_To_Specimen"] = result_df["Test_To_Specimen"]
new_run2["Mean_Temp_c"] = result_df["Mean_Temp_c"]
new_run2["Total_Rain_mm"] = result_df["Total_Rain_mm"]
new_run2["Total_Snow_cm"] = result_df["Total_Snow_cm"]
new_run2["Retail_and_Recreation"] = result_df["Retail_and_Recreation"]
new_run2["Grocery_and_Pharmacy"] = result_df["Grocery_and_Pharmacy"]
new_run2["Parks"] = result_df["Parks"]
new_run2["Transit_Stations"] = result_df["Transit_Stations"]
new_run2["Workplaces"] = result_df["Workplaces"]
new_run2["Residential"] = result_df["Residential"]
new_run2["Number_Resolved"] = result_df["Number_Resolved"]
new_run2["Number_Fatal"] = result_df["Number_Fatal"]
new_run2["City"] = result_df["City"]

new_run2 = new_run2.dropna()
new_run2=new_run2.reset_index(drop=True)

new_run2.head()

Unnamed: 0,Gender_FEMALE,Gender_GENDER DIVERSE,Gender_MALE,Gender_UNSPECIFIED,Age_Group_20s,Age_Group_30s,Age_Group_40s,Age_Group_50s,Age_Group_60s,Age_Group_70s,Age_Group_80s,Age_Group_90+,Age_Group_<20,Age_Group_UNKNOWN,Acquisition_Info_CC,Acquisition_Info_MISSING INFORMATION,Acquisition_Info_NO KNOWN EPI LINK,Acquisition_Info_OB,Acquisition_Info_TRAVEL,Acquisition_Info_UNSPECIFIED EPI LINK,Outbreak_Related_No,Outbreak_Related_Yes,Onset_To_Reported,Reported_To_Test,Test_To_Specimen,Mean_Temp_c,Total_Rain_mm,Total_Snow_cm,Retail_and_Recreation,Grocery_and_Pharmacy,Parks,Transit_Stations,Workplaces,Residential,Number_Resolved,Number_Fatal,City
0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,9.0,0.0,-2.0,3.4,0.0,0.0,-15,8,39.0,-53.0,-42,18,1,0,3
1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,3.0,0.0,-2.0,2.3,0.2,4.2,-37,-14,-29.0,-65.0,-42,18,4,0,2
2,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1.0,0.0,-1.0,3.4,0.0,0.0,-15,8,39.0,-53.0,-42,18,1,0,3
3,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,20.0,0.0,-2.0,0.1,0.0,0.0,-25,0,-29.0,-64.0,-49,19,1,0,0
4,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,2.0,0.0,-2.0,3.4,0.0,0.0,-34,-8,-15.0,-64.0,-41,17,1,0,2


In [None]:
a = 0
b = 0
c = 0
d = 0
e = 0
f = 0
data = pd.DataFrame()
for index, row in new_run2.iterrows():
  if row['City'] == 0 and a < 4148:
    a+=1
    data = data.append(row)
  elif row['City'] == 1 and b < 4148:
    b+=1
    data = data.append(row)
  elif row['City'] == 2 and c < 4148:
    c+=1
    data = data.append(row)
  elif row['City'] == 3 and d < 4148:
    d+=1
    data = data.append(row)
  elif row['City'] == 4 and e < 4148:
    e+=1
    data = data.append(row)
  elif row['City'] == 5 and f < 4148:
    f+=1
    data = data.append(row)
  
  if a == 4147 and b == 4147 and c == 4147 and d == 4147 and e == 4147 and f == 4147:
    break

In [None]:
#Get the labels
y = data['City']
y=y.astype('string')
#Drop the column for the next step
data.drop(columns=['City'], inplace=True)

X = data.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

In [None]:
#This step is to train our classifier based on the training data.
dt = DecisionTreeClassifier(criterion='entropy')
#
#criterion='entropy',max_depth=None
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Now that our classifier is trained, it is time to test it with the testing data. Because our data is imbalanced, we should loom at precision and recall, rather than accuracy.
y_pred = dt.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

from sklearn.metrics import confusion_matrix

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[ 99.87937274  99.51980792 100.          99.75961538  99.63811821
 100.        ]
[ 99.75903614 100.          99.51749095 100.          99.51807229
 100.        ]
99.79911611088791
[[828   1   0   0   1   0]
 [  0 829   0   0   0   0]
 [  0   2 825   0   2   0]
 [  0   0   0 830   0   0]
 [  1   1   0   2 826   0]
 [  0   0   0   0   0 830]]
Mean Recall: 99.79948571012334
Mean Precision: 99.79909989778173
Accuracy: 99.79911611088791


In [None]:
#X_train, X_test, y_train, y_test
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
y_pred = gb.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[ 99.75903614  99.63942308 100.          99.04534606  99.75728155
  99.87966306]
[ 99.75903614 100.          99.27623643 100.          99.03614458
 100.        ]
99.67858577742065
[[828   0   0   0   2   0]
 [  0 829   0   0   0   0]
 [  2   2 823   1   0   1]
 [  0   0   0 830   0   0]
 [  0   1   0   7 822   0]
 [  0   0   0   0   0 830]]
Mean Recall: 99.68012498225171
Mean Precision: 99.67856952538745
Accuracy: 99.67858577742065


In [None]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
y_pred = clf.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[100.          99.63942308 100.          99.52038369 100.
  99.87966306]
[ 99.87951807 100.          99.75874548 100.          99.39759036
 100.        ]
99.83929288871033
[[829   0   0   0   0   1]
 [  0 829   0   0   0   0]
 [  0   2 827   0   0   0]
 [  0   0   0 830   0   0]
 [  0   1   0   4 825   0]
 [  0   0   0   0   0 830]]
Mean Recall: 99.8399116377545
Mean Precision: 99.83930898503543
Accuracy: 99.83929288871033


**Class Balanced: No Mobility, Weather Features, Dates Besides 3 Engineered**

In [None]:
# Our features have categorical data. We need to convert them to one-hot encoders to get the best results.
limited = result_df[['Gender','Age_Group', 'Acquisition_Info', 'Outbreak_Related']]
limited = pd.get_dummies(limited)
limited["Onset_To_Reported"] = result_df["Onset_To_Reported"]
limited["Reported_To_Test"] = result_df["Reported_To_Test"]
limited["Test_To_Specimen"] = result_df["Test_To_Specimen"]
limited["Number_Resolved"] = result_df["Number_Resolved"]
limited["Number_Fatal"] = result_df["Number_Fatal"]
limited["City"] = result_df["City"]

limited = limited.dropna()
limited=limited.reset_index(drop=True)

limited.head()

Unnamed: 0,Gender_FEMALE,Gender_GENDER DIVERSE,Gender_MALE,Gender_UNSPECIFIED,Age_Group_20s,Age_Group_30s,Age_Group_40s,Age_Group_50s,Age_Group_60s,Age_Group_70s,Age_Group_80s,Age_Group_90+,Age_Group_<20,Age_Group_UNKNOWN,Acquisition_Info_CC,Acquisition_Info_MISSING INFORMATION,Acquisition_Info_NO KNOWN EPI LINK,Acquisition_Info_OB,Acquisition_Info_TRAVEL,Acquisition_Info_UNSPECIFIED EPI LINK,Outbreak_Related_No,Outbreak_Related_Yes,Onset_To_Reported,Reported_To_Test,Test_To_Specimen,Number_Resolved,Number_Fatal,City
0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,9.0,0.0,-2.0,1,0,3
1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,3.0,0.0,-2.0,4,0,2
2,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1.0,0.0,-1.0,1,0,3
3,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,20.0,0.0,-2.0,1,0,0
4,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,2.0,0.0,-2.0,1,0,2


In [None]:
a = 0
b = 0
c = 0
d = 0
e = 0
f = 0
data = pd.DataFrame()
for index, row in limited.iterrows():
  if row['City'] == 0 and a < 4148:
    a+=1
    data = data.append(row)
  elif row['City'] == 1 and b < 4148:
    b+=1
    data = data.append(row)
  elif row['City'] == 2 and c < 4148:
    c+=1
    data = data.append(row)
  elif row['City'] == 3 and d < 4148:
    d+=1
    data = data.append(row)
  elif row['City'] == 4 and e < 4148:
    e+=1
    data = data.append(row)
  elif row['City'] == 5 and f < 4148:
    f+=1
    data = data.append(row)
  
  if a == 4147 and b == 4147 and c == 4147 and d == 4147 and e == 4147 and f == 4147:
    break

In [None]:
#Get the labels
y = data['City']
y=y.astype('string')
#Drop the column for the next step
data.drop(columns=['City'], inplace=True)

X = data.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

In [None]:
#This step is to train our classifier based on the training data.
dt = DecisionTreeClassifier(criterion='entropy')
#
#criterion='entropy',max_depth=None
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Now that our classifier is trained, it is time to test it with the testing data. Because our data is imbalanced, we should loom at precision and recall, rather than accuracy.
y_pred = dt.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

from sklearn.metrics import confusion_matrix

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[26.43253235 61.01036269 29.05882353 22.80701754 24.87309645 35.340729  ]
[34.45783133 56.74698795 29.75903614 23.4939759  23.64294331 26.89987937]
32.50301325833668
[[286  49  96 143 140 116]
 [ 90 471  88  76  66  39]
 [170  76 247 120 154  63]
 [178  70 147 195 122 118]
 [179  53 178 151 196  72]
 [179  53  94 170 110 223]]
Mean Recall: 33.253760260560306
Mean Precision: 32.50010900053774
Accuracy: 32.50301325833668


In [None]:
#X_train, X_test, y_train, y_test
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
y_pred = gb.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[35.19695044 78.83333333 37.31343284 27.62691854 29.47103275 40.03707136]
[33.37349398 56.98795181 39.15662651 28.19277108 28.22677925 52.11097708]
39.67456809963841
[[277  29  95 141 126 162]
 [ 55 473 100  69  61  72]
 [ 82  28 325 129 161 105]
 [134  40 109 234 126 187]
 [120  20 191 143 234 121]
 [119  10  51 131  86 432]]
Mean Recall: 41.4131232096425
Mean Precision: 39.67476661773754
Accuracy: 39.67456809963841


In [None]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
y_pred = clf.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[27.63157895 67.83114993 29.05484247 24.40944882 24.38202247 35.8974359 ]
[27.8313253  56.14457831 30.         26.14457831 26.1761158  35.46441496]
33.6279630373644
[[231  40 106 145 158 150]
 [ 68 466  91  76  76  53]
 [118  57 249 137 191  78]
 [144  57 132 217 133 147]
 [133  39 191 152 217  97]
 [142  28  88 162 115 294]]
Mean Recall: 34.86774642276291
Mean Precision: 33.62683544794377
Accuracy: 33.6279630373644


**Class Balanced: Only Patient + Fact Table Features**

In [None]:
# Our features have categorical data. We need to convert them to one-hot encoders to get the best results.
limited2 = result_df[['Gender','Age_Group', 'Acquisition_Info', 'Outbreak_Related']]
limited2 = pd.get_dummies(limited2)
limited2["Number_Resolved"] = result_df["Number_Resolved"]
limited2["Number_Fatal"] = result_df["Number_Fatal"]
limited2["City"] = result_df["City"]

limited2 = limited2.dropna()
limited2=limited2.reset_index(drop=True)

limited2.head()

Unnamed: 0,Gender_FEMALE,Gender_GENDER DIVERSE,Gender_MALE,Gender_UNSPECIFIED,Age_Group_20s,Age_Group_30s,Age_Group_40s,Age_Group_50s,Age_Group_60s,Age_Group_70s,Age_Group_80s,Age_Group_90+,Age_Group_<20,Age_Group_UNKNOWN,Acquisition_Info_CC,Acquisition_Info_MISSING INFORMATION,Acquisition_Info_NO KNOWN EPI LINK,Acquisition_Info_OB,Acquisition_Info_TRAVEL,Acquisition_Info_UNSPECIFIED EPI LINK,Outbreak_Related_No,Outbreak_Related_Yes,Number_Resolved,Number_Fatal,City
0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,3
1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1
2,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,4,0,2
3,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,3
4,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0


In [None]:
a = 0
b = 0
c = 0
d = 0
e = 0
f = 0
data = pd.DataFrame()
for index, row in limited2.iterrows():
  if row['City'] == 0 and a < 4148:
    a+=1
    data = data.append(row)
  elif row['City'] == 1 and b < 4148:
    b+=1
    data = data.append(row)
  elif row['City'] == 2 and c < 4148:
    c+=1
    data = data.append(row)
  elif row['City'] == 3 and d < 4148:
    d+=1
    data = data.append(row)
  elif row['City'] == 4 and e < 4148:
    e+=1
    data = data.append(row)
  elif row['City'] == 5 and f < 4148:
    f+=1
    data = data.append(row)
  
  if a == 4147 and b == 4147 and c == 4147 and d == 4147 and e == 4147 and f == 4147:
    break

In [None]:
#Get the labels
y = data['City']
y=y.astype('string')
#Drop the column for the next step
data.drop(columns=['City'], inplace=True)

X = data.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

In [None]:
#This step is to train our classifier based on the training data.
dt = DecisionTreeClassifier(criterion='entropy')
#
#criterion='entropy',max_depth=None
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Now that our classifier is trained, it is time to test it with the testing data. Because our data is imbalanced, we should loom at precision and recall, rather than accuracy.
y_pred = dt.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

from sklearn.metrics import confusion_matrix

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[25.85499316 74.72727273 37.60262726 22.91149711 24.3902439  22.68199234]
[22.77108434 49.51807229 27.62364294 33.37349398 16.88781665 35.6626506 ]
30.976295701084773
[[189  32  58 253  84 214]
 [ 60 411  92  90  47 130]
 [ 95  36 229 157 110 202]
 [133  39  49 277  91 241]
 [119  25 135 188 140 222]
 [135   7  46 244 102 296]]
Mean Recall: 34.694771081629405
Mean Precision: 30.9727934657811
Accuracy: 30.976295701084773


In [None]:
#X_train, X_test, y_train, y_test
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
y_pred = gb.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[25.74850299 78.84615385 42.5087108  21.88644689 24.80314961 22.84334023]
[25.90361446 49.39759036 29.43305187 28.79518072 15.19903498 39.87951807]
31.438328646042585
[[215  23  50 232  68 242]
 [ 65 410  89  88  40 138]
 [111  28 244 144  92 210]
 [154  38  35 239  85 279]
 [133  16 122 183 126 249]
 [157   5  34 206  97 331]]
Mean Recall: 36.10605072815837
Mean Precision: 31.434665077681046
Accuracy: 31.438328646042585


In [None]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
y_pred = clf.predict(X_test)
recall = recall_score(y_pred, y_test,average=None) * 100
precision = precision_score(y_pred, y_test,average=None) * 100
#print("Recall of Decision Tree {:.2f} %".format(recall))
#print("precision of Decision Tree {:.2f} %".format(precision))

accuracy = accuracy_score(y_test, y_pred) * 100

cm = confusion_matrix(y_test, y_pred)

print(recall)
print(precision)
print(accuracy)
print(cm)

print("Mean Recall:" , np.mean(recall))
print("Mean Precision:" , np.mean(precision))
print("Accuracy:" , accuracy)

[25.92592593 74.90909091 38.94736842 22.80534351 24.39446367 22.68795742]
[22.77108434 49.63855422 26.77925211 28.79518072 17.00844391 41.08433735]
31.016472478907193
[[189  31  49 224  84 253]
 [ 59 412  86  85  47 141]
 [ 95  36 222 137 112 227]
 [132  40  46 239  91 282]
 [119  24 126 160 141 259]
 [135   7  41 203 103 341]]
Mean Recall: 34.94502497563938
Mean Precision: 31.012808774301064
Accuracy: 31.016472478907193


**PART C** 


OneClass SVM

In [57]:
from sklearn.svm import OneClassSVM
from numpy import quantile, where, random
import matplotlib.pyplot as plt



x = new_df2[['Gender_FEMALE', 'Gender_GENDER DIVERSE', 'Gender_MALE',
       'Gender_UNSPECIFIED', 'Age_Group_20s', 'Age_Group_30s', 'Age_Group_40s',
       'Age_Group_50s', 'Age_Group_60s', 'Age_Group_70s', 'Age_Group_80s',
       'Age_Group_90+', 'Age_Group_<20', 'Age_Group_UNKNOWN',
       'Acquisition_Info_CC', 'Acquisition_Info_MISSING INFORMATION',
       'Acquisition_Info_NO KNOWN EPI LINK', 'Acquisition_Info_OB',
       'Acquisition_Info_TRAVEL', 'Acquisition_Info_UNSPECIFIED EPI LINK',
       'Outbreak_Related_No', 'Outbreak_Related_Yes', 'Onset_season_autumn',
       'Onset_season_spring', 'Onset_season_summer', 'Onset_season_winter',
       'Reported_season_autumn', 'Reported_season_summer',
       'Reported_season_winter', 'Test_season_autumn', 'Test_season_spring',
       'Test_season_summer', 'Test_season_winter', 'Specimen_season_autumn',
       'Specimen_season_spring', 'Specimen_season_summer',
       'Specimen_season_winter', 'Onset_Weekend', 'Reported_Weekend',
       'Test_Weekend', 'Specimen_Weekend', 'Onset_Holiday', 'Reported_Holiday',
       'Test_Holiday', 'Specimen_Holiday', 'Onset_To_Reported',
       'Reported_To_Test', 'Test_To_Specimen', 'Mean_Temp_c', 'Total_Rain_mm',
       'Total_Snow_cm', 'Retail_and_Recreation', 'Grocery_and_Pharmacy',
       'Parks', 'Transit_Stations', 'Workplaces', 'Residential',
       'Number_Resolved', 'Number_Fatal', 'City']].values

In [77]:
svm = OneClassSVM(kernel='rbf', gamma=0.003, nu=0.003)

print(svm)

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma=0.003, kernel='rbf',
            max_iter=-1, nu=0.003, shrinking=True, tol=0.001, verbose=False)


In [78]:
# fit the model with x dataset and get the prediction data 
svm.fit(x)
pred = svm.predict(x)

In [79]:
# extract the negative outputs as the outliers
outlier_index = where(pred==-1)
outliers = x[outlier_index]

In [80]:
# these are the outliers

outliers

array([[1, 0, 0, ..., 1, 0, 4],
       [0, 0, 1, ..., 1, 0, 3],
       [1, 0, 0, ..., 1, 0, 5],
       ...,
       [1, 0, 0, ..., 1, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       [1, 0, 0, ..., 1, 0, 0]], dtype=object)

In [81]:
#number of outliers
len(outliers)

693