In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import neighbors
import os
from datetime import date
import time


Status-Code: 200
Content-Type: text/plain; charset=utf-8
Encoding: utf-8
~~~~~~~~~~~~~~~~~~~~
Status-Code: 200
Content-Type: text/plain; charset=utf-8
Encoding: utf-8
~~~~~~~~~~~~~~~~~~~~


In [5]:
class DataLoader():
    """
        DataLoader: Load the two data files into memory ready for cleaning and then merging
        - Final dataset is US-specific, two date columns for looking at weekly information pertaining to a particular region
    """

    dailyData = None
    weeklyData = None
    aggregateWeeklyData = None

    def generateDataSet(self):
        """
            generateDataSet: Generate final dataset by merging weeklySymptomData & weeklyHospitaliationData, based on sub_region_1
        """

        '''
        We want to see how symptoms aggregated across various Regions change over a Weekly time frame
        To do this, we merge the two dataframes based on geographical location, we have two date variables at the moment 
        '''
        self.aggregateWeeklyData = pd.merge(left=self.weeklySymptomData, right=self.weeklyHospitaliationData, on='sub_region_1')
        return self.aggregateWeeklyData

    def cleanWeeklyAndDailyData(self):
        """
            cleanWeeklyAndDailyData: Remove empty values in the two dataframes, within respective bounds
        """
        
        '''
        weeklyData:
        - Drop all columns in weeklyData with more than 25 percent NaN values in a particular column
        - Drop all rows in weeklyData with more than 5 percent NaN values in a particular row
        - Rename Date column for distinction
        '''
        self.weeklyData.drop_duplicates(inplace=True)
        self.weeklyData.dropna(thresh=int(self.weeklyData.shape[0] * 0.25), inplace=True, axis='columns')
        self.weeklyData.dropna(thresh=int(self.weeklyData.shape[0] * 0.05), inplace=True, axis='index')   

        columnsToRemove = ['country_region_code', 'country_region', 'sub_region_1_code']
        self.weeklyData = self.weeklyData.drop(labels=columnsToRemove, axis='columns')

        self.weeklyData['date'] = self.weeklyData['date'].astype('datetime64[ns]')
        self.weeklyData.rename(columns={'date':'date_symptoms'}, inplace=True)

        '''
        dailyData:
        - Drop all columns in dailyData with more than 20 percent NaN values in a particular column
        - Drop all columns in dailyData of little-to-no interest
        - Rename Date column for distinction, rename region_name to match self.weeklyData.sub_region_1, before merging
        '''
        self.dailyData.dropna(thresh=int(self.dailyData.shape[0] * 0.2), inplace=True, axis='columns')
        self.dailyData = self.dailyData.get(['date', 'open_covid_region_code', 'region_name', 'hospitalized_new', 'hospitalized_cumulative'])
        self.dailyData.rename(columns={'region_name':'sub_region_1', 'date':'date_hospitalization'}, inplace=True)
        
        '''
        Iterate over the regions to discover the distinct 56 US States we want to analyze
        USRegionsList : List
        '''
        US_Regions_List = []
        for regionCode in self.dailyData['open_covid_region_code']:
            if ('US-' in regionCode and regionCode not in US_Regions_List):
                US_Regions_List.append(regionCode)

        '''
        Create a new dataframe subset (collection of rows) for a particular region
        Do this repeatedly to build a US_Daily_Table
        US_Daily_Table : DataFrame
        '''
        US_Daily_Table = None
        for US_Region in US_Regions_List:
            US_Region_Rows = self.dailyData.loc[self.dailyData['open_covid_region_code'] == US_Region]
            US_Daily_Table = pd.concat([US_Daily_Table, US_Region_Rows])
        
        '''
        Filter the above dataframe for non-zero values concerning the two hospitalization columns
        Populate self.dailyData with this new valid information
        '''
        indexNames = US_Daily_Table[(US_Daily_Table['hospitalized_new'] == 0.0) & (US_Daily_Table['hospitalized_cumulative'] == 0.0)].index
        US_Daily_Table.drop(indexNames, inplace=True)

        '''
        Convert daily dataframe into weekly, sorted alphabetically by region_name
        '''
        US_Daily_Table['date_hospitalization'] = US_Daily_Table['date_hospitalization'].astype('datetime64[ns]')
        US_Weekly_Table = US_Daily_Table.groupby("sub_region_1").resample('W-Sun', on='date_hospitalization').sum().reset_index()

        '''
        Set ptrs accordingly
        '''
        self.weeklySymptomData = self.weeklyData                            # Matrix ~ (312, 96)
        self.weeklyHospitaliationData = US_Weekly_Table                     # Matrix ~ (1052, 4)
        
    def loadRawData(self):
        """
            loadRawData: Init pandas dataframe for the two data files found within the data directory 
        Returns:
            DataFrame, DataFrame: Two pandas dataframe objects, one for each file 
        """
        dirname = os.getcwd()
        filename1 = os.path.join(dirname, '2020_US_weekly_symptoms_dataset.csv')
        filename2 = os.path.join(dirname, 'aggregated_cc_by.csv')
        DataSet1 = pd.read_csv(filename1, delimiter=',', dtype={"tests_new": float, "test_units": "string"})
        DataSet2 = pd.read_csv(filename2, delimiter=',', dtype={"tests_new": float, "test_units": "string"})
        return DataSet1, DataSet2

    def __init__(self) -> None:
        self.weeklyData, self.dailyData = self.loadRawData()
        self.cleanWeeklyAndDailyData()
        outputDataFrame = self.generateDataSet()
        # print(outputDataFrame)

DataLoader()

<__main__.DataLoader at 0x1ce96ba0190>

In [156]:
class Cross_Validation:
    def cross_validate_regions(self, data, k, model):
        regions = pd.Series(data["region_name"].unique())
        
        shuffled_regions = regions.sample(frac=1, random_state=0).reset_index(drop=True)
            
        split_regions = np.array_split(shuffled_regions, k)
        
        folds = [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()]
     
        for i in range(k):
            for region in split_regions[i]:
                matched = data.loc[lambda data: data["region_name"] == region]
                folds[i] = folds[i].append(matched)
        
        
        for i in range(k):
            folds[i].drop(columns=["region_name", "date", "open_covid_region_code"], inplace=True)
        
        print(folds[0])
        accuracy_sum = 0
        
        for i in range(k):
            folds_to_train = folds.copy()
            fold_to_test = folds_to_train[i]
            del folds_to_train[i]
            folds_to_train = pd.concat(folds_to_train, sort=False)
            
            x_train = folds_to_train.loc[:,folds_to_train.columns != "hospitalized_new"]
            y_train = folds_to_train.loc[:,folds_to_train.columns == "hospitalized_new"]
            
            x_test = fold_to_test.loc[:,fold_to_test.columns != "hospitalized_new"]
            y_test = fold_to_test.loc[:,fold_to_test.columns == "hospitalized_new"]
            
            model.fit(x_train, y_train)
            predictions = model.predict(x_test)
            accuracy_sum += accuracy_score(predictions, y_test)
            
        return acc_sum / k
        

In [170]:
cross = Cross_Validation()

In [168]:
# KNN using ball_tree algo
times = {}
num_neighbours = {}
for i in range(50):
    start_time = time.time()
    num_neighbours[i] = cross.cross_validate_regions(data1, 5, neighbors.KNeighborsClassifier(n_neighbors=i))
    times[i] = time.time() - start_time
print (num_neighbours)
print (times)

       cases_cumulative  cases_new  cases_cumulative_per_million  \
43424               NaN        NaN                           NaN   
43425               NaN        NaN                           NaN   
43426               NaN        NaN                           NaN   
43427               NaN        NaN                           NaN   
43428               1.0        1.0                          0.06   
...                 ...        ...                           ...   
39736               NaN        NaN                           NaN   
39737               NaN        NaN                           NaN   
39738               NaN        NaN                           NaN   
39739               NaN        NaN                           NaN   
39740               NaN        NaN                           NaN   

       cases_new_per_million  deaths_cumulative  deaths_new  \
43424                    NaN                NaN         NaN   
43425                    NaN                NaN         N

TypeError: float() argument must be a string or a number, not 'NAType'

In [169]:
# KNN using kd-tree algo
times = {}
num_neighbours = {}
for i in range(50):
    start_time = time.time()
    num_neighbours[i] = cross.cross_validate_regions(data1, 5, neighbors.KNeighborsClassifier(n_neighbors=i, algorithm='kd_tree'))
    times[i] = time.time() - start_time
print (num_neighbours)
print (times)

       cases_cumulative  cases_new  cases_cumulative_per_million  \
43424               NaN        NaN                           NaN   
43425               NaN        NaN                           NaN   
43426               NaN        NaN                           NaN   
43427               NaN        NaN                           NaN   
43428               1.0        1.0                          0.06   
...                 ...        ...                           ...   
39736               NaN        NaN                           NaN   
39737               NaN        NaN                           NaN   
39738               NaN        NaN                           NaN   
39739               NaN        NaN                           NaN   
39740               NaN        NaN                           NaN   

       cases_new_per_million  deaths_cumulative  deaths_new  \
43424                    NaN                NaN         NaN   
43425                    NaN                NaN         N

TypeError: float() argument must be a string or a number, not 'NAType'

In [166]:
class TimeSeparator:
    def separate_at_date(self, data, date):
        train = pd.DataFrame()
        test = pd.DataFrame()
        matched = data.loc[lambda data: data["date"] <= date]
        train = train.append(matched)
        matched = data.loc[lambda data: data["date"] > date]
        test = train.append(matched)
        return train, test


(      open_covid_region_code  region_name        date  cases_cumulative  \
 0                        AFG  Afghanistan  2019-12-31               0.0   
 1                        AFG  Afghanistan  2020-01-01               0.0   
 2                        AFG  Afghanistan  2020-01-02               0.0   
 3                        AFG  Afghanistan  2020-01-03               0.0   
 4                        AFG  Afghanistan  2020-01-04               0.0   
 ...                      ...          ...         ...               ...   
 98881                    VUT      Vanuatu  2020-08-06               NaN   
 98882                    VUT      Vanuatu  2020-08-07               NaN   
 98883                    VUT      Vanuatu  2020-08-08               NaN   
 98884                    VUT      Vanuatu  2020-08-09               NaN   
 98885                    VUT      Vanuatu  2020-08-10               NaN   
 
        cases_new  cases_cumulative_per_million  cases_new_per_million  \
 0          

In [171]:
ts = TimeSeparator()

(      open_covid_region_code  region_name        date  cases_cumulative  \
 0                        AFG  Afghanistan  2019-12-31               0.0   
 1                        AFG  Afghanistan  2020-01-01               0.0   
 2                        AFG  Afghanistan  2020-01-02               0.0   
 3                        AFG  Afghanistan  2020-01-03               0.0   
 4                        AFG  Afghanistan  2020-01-04               0.0   
 ...                      ...          ...         ...               ...   
 98881                    VUT      Vanuatu  2020-08-06               NaN   
 98882                    VUT      Vanuatu  2020-08-07               NaN   
 98883                    VUT      Vanuatu  2020-08-08               NaN   
 98884                    VUT      Vanuatu  2020-08-09               NaN   
 98885                    VUT      Vanuatu  2020-08-10               NaN   
 
        cases_new  cases_cumulative_per_million  cases_new_per_million  \
 0          

In [173]:
model = DecisionTreeClassifier()
cross.cross_validate_regions(data1, 5, model)

       cases_cumulative  cases_new  cases_cumulative_per_million  \
43424               NaN        NaN                           NaN   
43425               NaN        NaN                           NaN   
43426               NaN        NaN                           NaN   
43427               NaN        NaN                           NaN   
43428               1.0        1.0                          0.06   
...                 ...        ...                           ...   
39736               NaN        NaN                           NaN   
39737               NaN        NaN                           NaN   
39738               NaN        NaN                           NaN   
39739               NaN        NaN                           NaN   
39740               NaN        NaN                           NaN   

       cases_new_per_million  deaths_cumulative  deaths_new  \
43424                    NaN                NaN         NaN   
43425                    NaN                NaN         N

TypeError: float() argument must be a string or a number, not 'NAType'

In [174]:
model = DecisionTreeClassifier(criterion='entropy')
cross.cross_validate_regions(data1, 5, model)

       cases_cumulative  cases_new  cases_cumulative_per_million  \
43424               NaN        NaN                           NaN   
43425               NaN        NaN                           NaN   
43426               NaN        NaN                           NaN   
43427               NaN        NaN                           NaN   
43428               1.0        1.0                          0.06   
...                 ...        ...                           ...   
39736               NaN        NaN                           NaN   
39737               NaN        NaN                           NaN   
39738               NaN        NaN                           NaN   
39739               NaN        NaN                           NaN   
39740               NaN        NaN                           NaN   

       cases_new_per_million  deaths_cumulative  deaths_new  \
43424                    NaN                NaN         NaN   
43425                    NaN                NaN         N

TypeError: float() argument must be a string or a number, not 'NAType'

In [176]:
model = DecisionTreeClassifier(splitter='random')
cross.cross_validate_regions(data1, 5, model)

       cases_cumulative  cases_new  cases_cumulative_per_million  \
43424               NaN        NaN                           NaN   
43425               NaN        NaN                           NaN   
43426               NaN        NaN                           NaN   
43427               NaN        NaN                           NaN   
43428               1.0        1.0                          0.06   
...                 ...        ...                           ...   
39736               NaN        NaN                           NaN   
39737               NaN        NaN                           NaN   
39738               NaN        NaN                           NaN   
39739               NaN        NaN                           NaN   
39740               NaN        NaN                           NaN   

       cases_new_per_million  deaths_cumulative  deaths_new  \
43424                    NaN                NaN         NaN   
43425                    NaN                NaN         N

TypeError: float() argument must be a string or a number, not 'NAType'

In [177]:
model = DecisionTreeClassifier(criterion='entropy', splitter='random')
cross.cross_validate_regions(data1, 5, model)

       cases_cumulative  cases_new  cases_cumulative_per_million  \
43424               NaN        NaN                           NaN   
43425               NaN        NaN                           NaN   
43426               NaN        NaN                           NaN   
43427               NaN        NaN                           NaN   
43428               1.0        1.0                          0.06   
...                 ...        ...                           ...   
39736               NaN        NaN                           NaN   
39737               NaN        NaN                           NaN   
39738               NaN        NaN                           NaN   
39739               NaN        NaN                           NaN   
39740               NaN        NaN                           NaN   

       cases_new_per_million  deaths_cumulative  deaths_new  \
43424                    NaN                NaN         NaN   
43425                    NaN                NaN         N

TypeError: float() argument must be a string or a number, not 'NAType'

In [178]:
# Date separated data
train, test = ts.separate_at_date(data1, "2020-08-10")
x_train = train.loc[:,train.columns != "hospitalized_new"]
y_train = train.loc[:,train.columns == "hospitalized_new"]

x_test = test.loc[:,test.columns != "hospitalized_new"]
y_test = test.loc[:,test.columns == "hospitalized_new"]


In [179]:
class Model_Runner:
    def run_model(self, model, xtrain, ytrain, xtest, ytest):
        model.fit(xtrain, ytrain)
        predictions = model.predict(xtest)
        return accuracy_score(predictions, ytest)