In [1]:
# Probably need most of these later?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Show every column when a DataFrame is rendered (no truncation of wide frames).
pd.options.display.max_columns = None

from sklearn.neighbors import NearestNeighbors
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm
import seaborn as sns
%matplotlib inline

In [2]:
# Load the behaviour dataset into `df`; the path is relative to the working directory.
DATA_FILE = 'BehaviourData.csv'
df = pd.read_csv(DATA_FILE, sep=',', dtype=None)


In [3]:
# Remove these raw columns up front, before any renaming or feature building.
unused_columns = ['qweek', 'i1_health', 'i2_health']
df = df.drop(columns=unused_columns)

In [4]:
# Map the raw question codes to readable column names.
# The spellings on the right-hand side are referenced verbatim by later cells,
# so they are kept exactly as they are (including 'Athsma', 'Eplilepsy', 'EatSeperately').
column_names = {
    # i3 - i11: single-answer questions
    'i7a_health': 'LeaveHouse',
    'i3_health': 'CovidResult',
    'i4_health': 'HouseCovid',
    # i5_health_*: symptom columns
    'i5_health_1': 'Cough',
    'i5_health_2': 'Fever',
    'i5_health_3': 'Smell',
    'i5_health_4': 'Taste',
    'i5_health_5': 'Breathing',
    'i5_health_99': 'NoSymptoms',
    'i5a_health': 'CovidContact',
    'i6_health': 'SelfIsolationLevel',
    'i7b_health': 'HealthCareVisit',
    'i8_health': 'PriorTravel',
    'i9_health': 'WouldIsolate',
    'i10_health': 'IsolationDifficulty',
    'i11_health': 'IsolationWillingness',
    # i12_health_*: behaviour columns
    'i12_health_1': 'Mask',
    'i12_health_2': 'Soap',
    'i12_health_3': 'HandSanitiser',
    'i12_health_4': 'CoverSneeze',
    'i12_health_5': 'AvoidContact',
    'i12_health_6': 'AvoidGoingOut',
    'i12_health_7': 'AvoidHealthcare',
    'i12_health_8': 'AvoidTransit',
    'i12_health_9': 'AvoidWorkingOutside',
    'i12_health_10': 'AvoidAcademics',
    'i12_health_11': 'AvoidGuests',
    'i12_health_12': 'AvoidSmallGatherings',
    'i12_health_13': 'AvoidMedGatherings',
    'i12_health_14': 'AvoidLargeGatherings',
    'i12_health_15': 'AvoidCrowds',
    'i12_health_16': 'AvoidShops',
    'i12_health_17': 'AvoidSharedBedroom',
    'i12_health_18': 'EatSeperately',
    'i12_health_19': 'CleanSurfaces',
    'i12_health_20': 'AvoidPublicObjects',
    'i13_health': 'NumHandWash',
    # i14_health_*: occupation columns
    'i14_health_1': 'Construction',
    'i14_health_2': 'HomeDelivery',
    'i14_health_3': 'FoodRetail',
    'i14_health_4': 'Healthcare',
    'i14_health_5': 'Logistics',
    'i14_health_6': 'Manufacturing',
    'i14_health_7': 'Police',
    'i14_health_8': 'PublicTransport',
    'i14_health_9': 'School',
    'i14_health_10': 'SocialCare',
    'i14_health_96': 'Other',
    'i14_health_98': 'NotSure',
    'i14_health_99': 'WorkFromHome',
    'i14_health_other': 'HealthOther',
    # d1_health_*: health-condition columns
    'd1_health_1': 'Arthritis',
    'd1_health_2': 'Athsma',
    'd1_health_3': 'Cancer',
    'd1_health_4': 'CysticFibrosis',
    'd1_health_5': 'COPD',
    'd1_health_6': 'Diabetes',
    'd1_health_7': 'Eplilepsy',
    'd1_health_8': 'HeartDisease',
    'd1_health_9': 'HighBloodPressure',
    'd1_health_10': 'HighCholesterol',
    'd1_health_11': 'HIV',
    'd1_health_12': 'MentalHealth',
    'd1_health_13': 'MS',
    'd1_health_98': 'NotSay',
    'd1_health_99': 'NoneHealth',
    # remaining columns: capitalised / CamelCase versions of the raw names
    'weight': 'Weight',
    'age': 'Age',
    'region_state': 'Region',
    'gender': 'Gender',
    'household_size': 'HouseholdSize',
    'household_children': 'HouseholdChildren',
    'employment_status': 'EmploymentStatus',
}
df = df.rename(columns=column_names)


In [5]:
# Quick way to generate a copy-pasteable `print(df.<column>.value_counts())` line for every column (uncomment to use)
#for col in df.columns:
#    exec("print(f'print(df.{col}.value_counts())')")

In [6]:
# I thought this would be used more, it wasn't. 
def makeBins_manual(column, label, bins):
    """Bucket ``df[column]`` using explicit bin edges.

    The result is written to a new column of the notebook-level ``df`` named
    ``f'{column}Bin'``; nothing is returned.

    Parameters
    ----------
    column : str
        Name of the column in ``df`` to bucket.
    label : sequence
        Labels for the resulting bins, passed to ``pd.cut`` as ``labels``.
    bins : sequence
        Bin edges, passed to ``pd.cut`` as ``bins``.
    """
    # Use the `label` argument; the previous version ignored it and silently
    # read a global named `labels` instead.
    # include_lowest=True makes the first interval include its left edge.
    df[f'{column}Bin'] = pd.cut(df[column], bins=bins, labels=label, include_lowest=True)

def makeBins(column, label, numBins):
    """Bucket ``df[column]`` into ``numBins`` equal-width bins.

    The bins span the column's minimum to its maximum. The result is written
    to a new column of the notebook-level ``df`` named ``f'{column}Bin'``;
    nothing is returned.

    Parameters
    ----------
    column : str
        Name of the column in ``df`` to bucket.
    label : sequence
        Labels for the bins, passed to ``pd.cut`` as ``labels``; one per bin.
    numBins : int
        Number of bins to create.
    """
    lowest = df[column].min()
    highest = df[column].max()
    # numBins intervals need numBins + 1 edges; passing numBins to linspace
    # produced one bin too few, so a matching list of labels was rejected.
    edges = np.linspace(lowest, highest, numBins + 1)
    # Use the `label` argument rather than a global named `labels`.
    df[f'{column}Bin'] = pd.cut(df[column], bins=edges, labels=label, include_lowest=True)

# Split Age into three labelled bands using hand-picked edges:
# [0, 20], (20, 65] and (65, 100]. The result lands in df['AgeBin'].
ageBins = [0, 20, 65, 100]
labels = ['adolecent', 'adult', 'elderly']
makeBins_manual(column='Age', label=labels, bins=ageBins)

In [7]:
# UnderlyingHealth is True when at least one of the conditions below is answered 'Yes'.
# Asthma, arthritis, epilepsy, mental health, high blood pressure and high
# cholesterol are intentionally not part of the flag.
flagged_conditions = ['Cancer', 'CysticFibrosis', 'COPD', 'Diabetes', 'HeartDisease', 'HIV', 'MS']
df['UnderlyingHealth'] = df[flagged_conditions].eq('Yes').any(axis=1)

# All individual condition columns are removed, whether or not they fed the flag.
ignored_conditions = ['Athsma', 'Arthritis', 'MentalHealth', 'HighBloodPressure', 'HighCholesterol', 'Eplilepsy']
df = df.drop(columns=flagged_conditions + ignored_conditions)


In [8]:
# Turn the children count into a flag: any value other than the string '0' is True.
# NOTE(review): a missing or blank HouseholdChildren value also differs from '0'
# and would therefore be flagged True - confirm the column has no blanks.
df['HasChildren'] = df['HouseholdChildren'].ne('0')

# Remove the source column together with other columns that are not kept.
df = df.drop(columns=['HouseholdChildren', 'HealthOther', 'Index', 'RecordNo', 'endtime', 'Region'])

# Whitespace-only answers are replaced by a default answer per column.
# TODO: these columns may have many blanks; consider dropping them instead.
blank_defaults = {
    'CovidContact': 'No',
    'SelfIsolationLevel': 'Not at all',
    'HealthCareVisit': 'No',
    'PriorTravel': 'No',
}
for column_name, default_answer in blank_defaults.items():
    df[column_name] = df[column_name].replace(r'^\s+$', default_answer, regex=True)


In [9]:
# HasSymptoms is True when at least one symptom column is 'Yes'
# and the NoSymptoms column is 'No'.
symptom_columns = ['Cough', 'Fever', 'Smell', 'Taste', 'Breathing']
reports_symptom = df[symptom_columns].eq('Yes').any(axis=1)
df['HasSymptoms'] = reports_symptom & df['NoSymptoms'].eq('No')

# Collapse the Mask answers into a Yes/No predictor.
# Answers that are not keys of this mapping become NaN.
mask_buckets = {
    'Always': 'Yes',
    'Frequently': 'Yes',
    'Sometimes': 'No',
    'Rarely': 'No',
    'Not at all': 'No',
}
df['Mask'] = df['Mask'].map(mask_buckets)

# Remove the symptom columns now folded into HasSymptoms, plus the avoidance
# columns that are not wanted (they could be kept if they prove useful).
avoidance_columns = [
    'AvoidContact', 'AvoidGoingOut', 'AvoidHealthcare', 'AvoidTransit',
    'AvoidWorkingOutside', 'AvoidAcademics', 'AvoidGuests',
    'AvoidSmallGatherings', 'AvoidMedGatherings', 'AvoidLargeGatherings',
    'AvoidCrowds', 'AvoidShops', 'AvoidSharedBedroom',
]
df = df.drop(columns=symptom_columns + ['NoSymptoms'] + avoidance_columns)



In [10]:
# Whitespace-only cells in these columns are replaced with 'No'.
# Note: 'NotSay' is renamed from d1_health_98, not from an i14 occupation code;
# it is simply cleaned together with the occupation columns here.
occupations = [
    'Construction', 'HomeDelivery', 'FoodRetail', 'Healthcare', 'Logistics',
    'Manufacturing', 'Police', 'PublicTransport', 'School', 'SocialCare',
    'Other', 'NotSure', 'WorkFromHome', 'NotSay',
]
df[occupations] = df[occupations].replace(r'^\s+$', 'No', regex=True)


In [11]:
# Sanity check: count, per row, the cells that are still blank.
# A cell counts as blank when it is missing (NaN) or when its text is empty or
# whitespace-only. Comparing against "" alone can never match the
# whitespace-only strings that the cleaning steps target with r'^\s+$', nor
# NaN values, so that comparison would report 0 even when blanks remain.
# Every column is inspected, including the first one.
is_blank = df.isna() | df.astype(str).apply(lambda col: col.str.strip().eq(''))
df['No_Of_Empty'] = is_blank.sum(axis=1)
df.No_Of_Empty.value_counts()

0    28825
Name: No_Of_Empty, dtype: int64

In [12]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)


Unnamed: 0,LeaveHouse,CovidResult,HouseCovid,CovidContact,SelfIsolationLevel,HealthCareVisit,PriorTravel,WouldIsolate,IsolationDifficulty,IsolationWillingness,Mask,Soap,HandSanitiser,CoverSneeze,EatSeperately,CleanSurfaces,AvoidPublicObjects,NumHandWash,Construction,HomeDelivery,FoodRetail,Healthcare,Logistics,Manufacturing,Police,PublicTransport,School,SocialCare,Other,NotSure,WorkFromHome,NotSay,NoneHealth,Weight,Gender,Age,HouseholdSize,EmploymentStatus,AgeBin,UnderlyingHealth,HasChildren,HasSymptoms,No_Of_Empty
0,1,"No, I have not","No, they have not",No,Not at all,No,No,Yes,Very easy,Very willing,No,Always,Frequently,Always,Always,Frequently,Always,20,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,1.03637,Male,73,1,Retired,elderly,False,False,False,0
1,0,"No, I have not","No, they have not",No,Frequently,No,No,Yes,Somewhat difficult,Somewhat willing,No,Always,Frequently,Always,Not at all,Sometimes,Always,20,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,1.016961,Female,37,1,Unemployed,adult,False,False,True,0
2,0,"No, I have not","No, they have not",No,Frequently,No,No,Yes,Somewhat easy,Very willing,Yes,Frequently,Sometimes,Always,Always,Rarely,Frequently,10,No,No,Yes,No,No,No,No,No,No,No,No,No,No,No,No,1.003516,Male,25,8 or more,Full time employment,adult,False,True,True,0
3,1,"No, I have not","No, they have not",No,Not at all,No,No,Yes,Very easy,Very willing,Yes,Frequently,Always,Always,Always,Frequently,Always,8,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,1.002949,Male,66,2,Retired,elderly,True,False,False,0
4,1,"No, I have not","No, they have not",No,Not at all,No,No,Yes,Somewhat easy,Very willing,No,Sometimes,Rarely,Always,Not at all,Not at all,Frequently,2,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,1.028216,Female,25,2,Unemployed,adult,False,False,False,0
