In [167]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import neighbors
import os
from datetime import date
import time


In [180]:
class DataFetcher():
    """
        DataFetcher: class for downloading up-to-date csv files at specified URLs
    """
    def __init__(self, urlArr) -> None:
        self.urlArr = urlArr
        
    def getRawData(self) -> None:
        import requests
        for url in self.urlArr:
            r = requests.get(url)
            with open(self.getFileName(url), 'wb') as f:
                f.write(r.content)
            print(f"Status-Code: {r.status_code}")
            print(f"Content-Type: {r.headers['content-type']}")
            print(f"Encoding: {r.encoding}")
            print("~"*20)

    def getFileName(self, url) -> str:
        return url[url.rfind("/")+1:len(url)]

'''
DataSet1: Search Trends dataset (Weekly data)
DataSet2: COVID hospitalization cases dataset (Daily data)
'''
DataSet1 = 'https://raw.githubusercontent.com/google-research/open-covid-19-data/master/data/exports/search_trends_symptoms_dataset/United%20States%20of%20America/2020_US_weekly_symptoms_dataset.csv'
DataSet2 = 'https://raw.githubusercontent.com/google-research/open-covid-19-data/master/data/exports/cc_by/aggregated_cc_by.csv'

obj = DataFetcher([DataSet1, DataSet2])
obj.getRawData()


Status-Code: 200
Content-Type: text/plain; charset=utf-8
Encoding: utf-8
~~~~~~~~~~~~~~~~~~~~
Status-Code: 200
Content-Type: text/plain; charset=utf-8
Encoding: utf-8
~~~~~~~~~~~~~~~~~~~~


In [None]:
class DataLoader():
    """
        DataLoader: Load the two data files into memory ready for cleaning and then merging
    """
    def generateDataSet(self):
        """
        generateDataSet: Generate dataframe by merging Daily and Weekly dataframes into one Weekly dataframe
            - We want to see how symptoms aggregated across various Regions change over a Weekly time frame
        """

        '''
        # region_name in dailyData and country_region in weeklyData
        # for (element_of_interest in self.dailyData):
        #   weeklyInfo = convertToWeeklyInformation(element_of_interest)
        #   self.weeklyData.add(weeklyInfo)
        => Use the data in self.weeklyData !
        '''

        # Step 1: Convert daily data to weekly data, sorted alphabetically by region_name
        self.dailyData['date'] = self.dailyData['date'].astype('datetime64[ns]')
        newWeeklyData = self.dailyData.groupby("region_name").resample('W-Wed', on='date').sum().reset_index().sort_values(by='region_name')

        print(newWeeklyData.columns)
        # self.resultWeeklyData = pd.concat([self.weeklyData, newWeeklyData], axis=1, sort=False)
        # for i in self.resultWeeklyData:
        #     print(i)

    def cleanWeeklyAndDailyData(self):
        """
            cleanWeeklyAndDailyData: Remove empty values in the two dataframes, within respective bounds
        """
        
        '''
        weeklyData: Matrix containing NaN values - size reduction: [624->72 rows x 430->99 columns]
        - Drop all columns in weeklyData with more than 25 percent NaN values in a particular column
        - Drop all rows in weeklyData with more than 15 percent NaN values in a particular row
        '''
        self.weeklyData.dropna(thresh=int(self.weeklyData.shape[0] * 0.25), inplace=True, axis='columns')
        self.weeklyData.dropna(thresh=int(self.weeklyData.shape[0] * 0.15), inplace=True, axis='index')   

        # remove unneccessary columns from weeklyData
        columnsToRemove = [
                            'open_covid_region_code',
                            'country_region_code',
                            'country_region',
                            'sub_region_1_code',
                        ]
        self.weeklyData = self.weeklyData.drop(labels=columnsToRemove, axis='columns')

        '''
        dailyData: Removed most occurences of NaN values - size reduction: [98946->36437 rows x 62->50 columns]
        - Drop all columns in dailyData with more than 20 percent NaN values in a particular column
        - Drop all rows in dailyData with atleast 40/50 NaN values in a particular row
        '''
        self.dailyData.dropna(thresh=int(self.dailyData.shape[0] * 0.2), inplace=True, axis='columns')
        self.dailyData.dropna(thresh=40, inplace=True, axis='index')

        # print(self.weeklyData.columns)

    def loadRawData(self):
        """
            loadRawData: Init pandas dataframe for the two data files found within the data directory 
        Returns:
            DataFrame, DataFrame: Two pandas dataframe objects, one for each file 
        """
        DataSet1 = pd.read_csv(os.path.join('data/', '2020_US_weekly_symptoms_dataset.csv'), delimiter=',')
        DataSet2 = pd.read_csv(os.path.join('data/', 'aggregated_cc_by.csv'), delimiter=',')
        return DataSet1, DataSet2

    def __init__(self) -> None:
        self.weeklyData, self.dailyData = self.loadRawData()
        self.cleanWeeklyAndDailyData()
        self.generateDataSet()

DataLoader()

In [156]:
class Cross_Validation:
    def cross_validate_regions(self, data, k, model):
        regions = pd.Series(data["region_name"].unique())
        
        shuffled_regions = regions.sample(frac=1, random_state=0).reset_index(drop=True)
            
        split_regions = np.array_split(shuffled_regions, k)
        
        folds = [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()]
     
        for i in range(k):
            for region in split_regions[i]:
                matched = data.loc[lambda data: data["region_name"] == region]
                folds[i] = folds[i].append(matched)
        
        
        for i in range(k):
            folds[i].drop(columns=["region_name", "date", "open_covid_region_code"], inplace=True)
        
        print(folds[0])
        accuracy_sum = 0
        
        for i in range(k):
            folds_to_train = folds.copy()
            fold_to_test = folds_to_train[i]
            del folds_to_train[i]
            folds_to_train = pd.concat(folds_to_train, sort=False)
            
            x_train = folds_to_train.loc[:,folds_to_train.columns != "hospitalized_new"]
            y_train = folds_to_train.loc[:,folds_to_train.columns == "hospitalized_new"]
            
            x_test = fold_to_test.loc[:,fold_to_test.columns != "hospitalized_new"]
            y_test = fold_to_test.loc[:,fold_to_test.columns == "hospitalized_new"]
            
            model.fit(x_train, y_train)
            predictions = model.predict(x_test)
            accuracy_sum += accuracy_score(predictions, y_test)
            
        return acc_sum / k
        

In [170]:
cross = Cross_Validation()

In [168]:
# KNN using ball_tree algo
times = {}
num_neighbours = {}
for i in range(50):
    start_time = time.time()
    num_neighbours[i] = cross.cross_validate_regions(data1, 5, neighbors.KNeighborsClassifier(n_neighbors=i))
    times[i] = time.time() - start_time
print (num_neighbours)
print (times)

       cases_cumulative  cases_new  cases_cumulative_per_million  \
43424               NaN        NaN                           NaN   
43425               NaN        NaN                           NaN   
43426               NaN        NaN                           NaN   
43427               NaN        NaN                           NaN   
43428               1.0        1.0                          0.06   
...                 ...        ...                           ...   
39736               NaN        NaN                           NaN   
39737               NaN        NaN                           NaN   
39738               NaN        NaN                           NaN   
39739               NaN        NaN                           NaN   
39740               NaN        NaN                           NaN   

       cases_new_per_million  deaths_cumulative  deaths_new  \
43424                    NaN                NaN         NaN   
43425                    NaN                NaN         N

TypeError: float() argument must be a string or a number, not 'NAType'

In [169]:
# KNN using kd-tree algo
times = {}
num_neighbours = {}
for i in range(50):
    start_time = time.time()
    num_neighbours[i] = cross.cross_validate_regions(data1, 5, neighbors.KNeighborsClassifier(n_neighbors=i, algorithm='kd_tree'))
    times[i] = time.time() - start_time
print (num_neighbours)
print (times)

       cases_cumulative  cases_new  cases_cumulative_per_million  \
43424               NaN        NaN                           NaN   
43425               NaN        NaN                           NaN   
43426               NaN        NaN                           NaN   
43427               NaN        NaN                           NaN   
43428               1.0        1.0                          0.06   
...                 ...        ...                           ...   
39736               NaN        NaN                           NaN   
39737               NaN        NaN                           NaN   
39738               NaN        NaN                           NaN   
39739               NaN        NaN                           NaN   
39740               NaN        NaN                           NaN   

       cases_new_per_million  deaths_cumulative  deaths_new  \
43424                    NaN                NaN         NaN   
43425                    NaN                NaN         N

TypeError: float() argument must be a string or a number, not 'NAType'

In [166]:
class TimeSeparator:
    def separate_at_date(self, data, date):
        train = pd.DataFrame()
        test = pd.DataFrame()
        matched = data.loc[lambda data: data["date"] <= date]
        train = train.append(matched)
        matched = data.loc[lambda data: data["date"] > date]
        test = train.append(matched)
        return train, test


(      open_covid_region_code  region_name        date  cases_cumulative  \
 0                        AFG  Afghanistan  2019-12-31               0.0   
 1                        AFG  Afghanistan  2020-01-01               0.0   
 2                        AFG  Afghanistan  2020-01-02               0.0   
 3                        AFG  Afghanistan  2020-01-03               0.0   
 4                        AFG  Afghanistan  2020-01-04               0.0   
 ...                      ...          ...         ...               ...   
 98881                    VUT      Vanuatu  2020-08-06               NaN   
 98882                    VUT      Vanuatu  2020-08-07               NaN   
 98883                    VUT      Vanuatu  2020-08-08               NaN   
 98884                    VUT      Vanuatu  2020-08-09               NaN   
 98885                    VUT      Vanuatu  2020-08-10               NaN   
 
        cases_new  cases_cumulative_per_million  cases_new_per_million  \
 0          

In [171]:
ts = TimeSeparator()

(      open_covid_region_code  region_name        date  cases_cumulative  \
 0                        AFG  Afghanistan  2019-12-31               0.0   
 1                        AFG  Afghanistan  2020-01-01               0.0   
 2                        AFG  Afghanistan  2020-01-02               0.0   
 3                        AFG  Afghanistan  2020-01-03               0.0   
 4                        AFG  Afghanistan  2020-01-04               0.0   
 ...                      ...          ...         ...               ...   
 98881                    VUT      Vanuatu  2020-08-06               NaN   
 98882                    VUT      Vanuatu  2020-08-07               NaN   
 98883                    VUT      Vanuatu  2020-08-08               NaN   
 98884                    VUT      Vanuatu  2020-08-09               NaN   
 98885                    VUT      Vanuatu  2020-08-10               NaN   
 
        cases_new  cases_cumulative_per_million  cases_new_per_million  \
 0          

In [173]:
model = DecisionTreeClassifier()
cross.cross_validate_regions(data1, 5, model)

       cases_cumulative  cases_new  cases_cumulative_per_million  \
43424               NaN        NaN                           NaN   
43425               NaN        NaN                           NaN   
43426               NaN        NaN                           NaN   
43427               NaN        NaN                           NaN   
43428               1.0        1.0                          0.06   
...                 ...        ...                           ...   
39736               NaN        NaN                           NaN   
39737               NaN        NaN                           NaN   
39738               NaN        NaN                           NaN   
39739               NaN        NaN                           NaN   
39740               NaN        NaN                           NaN   

       cases_new_per_million  deaths_cumulative  deaths_new  \
43424                    NaN                NaN         NaN   
43425                    NaN                NaN         N

TypeError: float() argument must be a string or a number, not 'NAType'

In [174]:
model = DecisionTreeClassifier(criterion='entropy')
cross.cross_validate_regions(data1, 5, model)

       cases_cumulative  cases_new  cases_cumulative_per_million  \
43424               NaN        NaN                           NaN   
43425               NaN        NaN                           NaN   
43426               NaN        NaN                           NaN   
43427               NaN        NaN                           NaN   
43428               1.0        1.0                          0.06   
...                 ...        ...                           ...   
39736               NaN        NaN                           NaN   
39737               NaN        NaN                           NaN   
39738               NaN        NaN                           NaN   
39739               NaN        NaN                           NaN   
39740               NaN        NaN                           NaN   

       cases_new_per_million  deaths_cumulative  deaths_new  \
43424                    NaN                NaN         NaN   
43425                    NaN                NaN         N

TypeError: float() argument must be a string or a number, not 'NAType'

In [176]:
model = DecisionTreeClassifier(splitter='random')
cross.cross_validate_regions(data1, 5, model)

       cases_cumulative  cases_new  cases_cumulative_per_million  \
43424               NaN        NaN                           NaN   
43425               NaN        NaN                           NaN   
43426               NaN        NaN                           NaN   
43427               NaN        NaN                           NaN   
43428               1.0        1.0                          0.06   
...                 ...        ...                           ...   
39736               NaN        NaN                           NaN   
39737               NaN        NaN                           NaN   
39738               NaN        NaN                           NaN   
39739               NaN        NaN                           NaN   
39740               NaN        NaN                           NaN   

       cases_new_per_million  deaths_cumulative  deaths_new  \
43424                    NaN                NaN         NaN   
43425                    NaN                NaN         N

TypeError: float() argument must be a string or a number, not 'NAType'

In [177]:
model = DecisionTreeClassifier(criterion='entropy', splitter='random')
cross.cross_validate_regions(data1, 5, model)

       cases_cumulative  cases_new  cases_cumulative_per_million  \
43424               NaN        NaN                           NaN   
43425               NaN        NaN                           NaN   
43426               NaN        NaN                           NaN   
43427               NaN        NaN                           NaN   
43428               1.0        1.0                          0.06   
...                 ...        ...                           ...   
39736               NaN        NaN                           NaN   
39737               NaN        NaN                           NaN   
39738               NaN        NaN                           NaN   
39739               NaN        NaN                           NaN   
39740               NaN        NaN                           NaN   

       cases_new_per_million  deaths_cumulative  deaths_new  \
43424                    NaN                NaN         NaN   
43425                    NaN                NaN         N

TypeError: float() argument must be a string or a number, not 'NAType'

In [178]:
# Date separated data
train, test = ts.separate_at_date(data1, "2020-08-10")
x_train = train.loc[:,train.columns != "hospitalized_new"]
y_train = train.loc[:,train.columns == "hospitalized_new"]

x_test = test.loc[:,test.columns != "hospitalized_new"]
y_test = test.loc[:,test.columns == "hospitalized_new"]


In [179]:
class Model_Runner:
    def run_model(self, model, xtrain, ytrain, xtest, ytest):
        model.fit(xtrain, ytrain)
        predictions = model.predict(xtest)
        return accuracy_score(predictions, ytest)