Importing Packages 

In [1]:
import pickle 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import seaborn as sns  
# You can configure the format of the images: ‘png’, ‘retina’, ‘jpeg’, ‘svg’, ‘pdf’.
%config InlineBackend.figure_format = 'svg'
# this statement allows the visuals to render within your Jupyter Notebook
%matplotlib inline 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import KFold

import patsy
import scipy.stats as stats

import statsmodels.api as sm
import statsmodels.formula.api as smf

Opening pickle from cleaning notebook

In [2]:
with open('//Users/adelweiss/Documents/Project/Project 2/data/df2.pickle','rb') as read_file:
    df = pickle.load(read_file)

Cleaning combined dataset 

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 261 entries, 51 to 208
Data columns (total 17 columns):
Dengue Cases                          261 non-null float64
eweek                                 261 non-null object
Daily Rainfall Total (mm)             249 non-null float64
Mean Temperature (°C)                 249 non-null float64
Maximum Temperature (°C)              249 non-null float64
Minimum Temperature (°C)              249 non-null float64
fever: (Singapore)                    260 non-null float64
Nausea: (Singapore)                   260 non-null float64
headache: (Singapore)                 260 non-null float64
ache + pain: (Singapore)              260 non-null float64
dengue fever: (Singapore)             260 non-null float64
eye pain: (Singapore)                 260 non-null float64
dengue: (Singapore)                   260 non-null float64
dengue fever symptoms: (Singapore)    260 non-null float64
rashes: (Singapore)                   260 non-null float64
Vomiting: 

In [4]:
### dropping NAs, 
df.dropna(inplace = True)

### dropping these search terms because dengue would have covered them already 
df.drop(columns = ['dengue fever symptoms: (Singapore)', 'dengue fever: (Singapore)'], inplace = True)

In [5]:
### I think i want to create temperature range based on the mean and max temperature... 
### I think lower temperature fluctuations should be better for breeding 
df ['Temperature Range'] = df['Maximum Temperature (°C)'] - df['Minimum Temperature (°C)']

In [6]:
#sns.pairplot(df)

In [8]:
### log transform dengue cases to correct for postive skew 
### I tried the other transformation as well, but log seems to have the best results 

df['log Dengue Cases Week+1'] = np.log(df['Dengue Cases Week+1'])
#df['sq root Dengue Cases Week+1'] = np.sqrt(df['Dengue Cases Week+1'])
#df['cbrt Dengue Cases Week+1'] = np.cbrt(df['Dengue Cases Week+1'])

In [9]:
### making sure these variables are not thrown into the model 
df = df.drop(columns = ['Dengue Cases Week+1','eweek', 'Dengue Cases'])

In [10]:
### i also need easier column names 
df.columns = ['rainfall', 'mean_temp', 'max_temp', 'min_temp', 'fever', 'nausea', 'headache',
             'ache_pain', 'eye_pain', 'dengue', 'rashes', 'vomitting', 'temp_range', 'log_dengue_cases']

In [11]:
df

Unnamed: 0,rainfall,mean_temp,max_temp,min_temp,fever,nausea,headache,ache_pain,eye_pain,dengue,rashes,vomitting,temp_range,log_dengue_cases
50,1.628571,27.485714,31.357143,25.442857,77.0,77.0,76.0,84.0,61.0,28.0,60.0,96.0,5.914286,5.075174
49,7.000000,27.457143,30.842857,24.885714,67.0,92.0,56.0,87.0,81.0,26.0,58.0,86.0,5.957143,4.844187
48,7.314286,27.157143,31.128571,23.957143,82.0,70.0,70.0,77.0,59.0,29.0,54.0,89.0,7.171429,4.672829
47,5.914286,27.228571,31.114286,24.342857,76.0,64.0,80.0,82.0,76.0,27.0,62.0,78.0,6.771429,4.727388
46,3.800000,27.457143,31.600000,25.014286,67.0,56.0,70.0,82.0,72.0,20.0,67.0,73.0,6.585714,4.691348
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,0.000000,26.071429,30.371429,23.342857,58.0,71.0,78.0,80.0,37.0,48.0,53.0,62.0,7.028571,5.262690
213,0.000000,25.757143,29.014286,23.642857,64.0,67.0,65.0,80.0,43.0,42.0,58.0,91.0,5.371429,5.910797
212,2.600000,26.271429,29.100000,24.357143,66.0,67.0,69.0,73.0,43.0,34.0,84.0,65.0,4.742857,5.609472
211,8.171429,26.171429,29.614286,24.057143,72.0,67.0,56.0,75.0,48.0,47.0,78.0,68.0,5.557143,5.455321


Cross validation of regressions model 

In [12]:
###split into train and test 
X, y = df.drop(['log_dengue_cases'],axis=1), df['log_dengue_cases']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=10)
X_train, y_train = np.array(X_train), np.array(y_train) 

In [14]:
### then compare which is the best models...
kf = KFold(n_splits=5, shuffle=True, random_state = 71)
cv_lm_r2s, cv_lm_poly_r2s, cv_lm_reg_r2s, cv_lm_lasso_r2s = [], [], [], []#collect the validation results for both models

for train_ind, val_ind in kf.split(X_train,y_train):
    X_trainfold, y_trainfold = X_train[train_ind], y_train[train_ind]
    X_val, y_val = X_train[val_ind], y_train[val_ind] 
    
    #Scale variables 
    scaler = StandardScaler()
    X_trainfold_scaled = scaler.fit_transform(X_trainfold)
    X_val_scaled = scaler.transform(X_val)
    
    #simple linear regression  
    lm = LinearRegression()
    lm.fit(X_trainfold_scaled, y_trainfold)
    cv_lm_r2s.append(lm.score(X_val_scaled, y_val))
    
    #polynomial with feature scaling
    lm_poly = LinearRegression()
    poly = PolynomialFeatures(degree=2) 
    X_train_poly = poly.fit_transform(X_trainfold_scaled)
    X_val_poly = poly.transform(X_val_scaled)
    lm_poly.fit(X_train_poly, y_trainfold)
    cv_lm_poly_r2s.append(lm_poly.score(X_val_poly, y_val))
    
    #ridge regression with feature scaling 
    lm_reg = Ridge()
    #lm_reg = Ridge(alpha=14)
    lm_reg.fit(X_trainfold_scaled, y_trainfold)
    cv_lm_reg_r2s.append(lm_reg.score(X_val_scaled, y_val))
    
    #lasso regression with feature scaling 
    lm_lasso = Lasso()
    #lm_lasso = Lasso(alpha = 0.01)
    lm_lasso.fit(X_trainfold_scaled, y_trainfold)
    cv_lm_lasso_r2s.append(lm_lasso.score(X_val_scaled, y_val))
    
print('Simple regression scores: ', cv_lm_r2s)
print('Polynomial scores: ', cv_lm_poly_r2s, '\n')
print('Ridge scores: ', cv_lm_reg_r2s, '\n')
print('Lasso scores: ', cv_lm_lasso_r2s, '\n')

print(f'Simple mean cv r^2: {np.mean(cv_lm_r2s):.3f} +- {np.std(cv_lm_r2s):.3f}')
print(f'Polynomial mean cv r^2: {np.mean(cv_lm_poly_r2s):.3f} +- {np.std(cv_lm_poly_r2s):.3f}')
print(f'Ridge mean cv r^2: {np.mean(cv_lm_reg_r2s):.3f} +- {np.std(cv_lm_reg_r2s):.3f}')
print(f'Lasso mean cv r^2: {np.mean(cv_lm_lasso_r2s):.3f} +- {np.std(cv_lm_lasso_r2s):.3f}')

Simple regression scores:  [0.642712927122955, 0.7342182425957886, 0.6484603047468429, 0.7406216387429945, 0.6686239145899813]
Polynomial scores:  [0.5811140820595584, 0.3041073995103635, 0.3152094832278556, 0.6551461915181285, 0.0639867476556284] 

Ridge scores:  [0.6427802379933143, 0.73518675096652, 0.6473833835586174, 0.7431964015569197, 0.6677611663179613] 

Lasso scores:  [-4.493844615005571e-05, -7.808818365262482e-07, -0.02852945255842787, -0.0012294067577502954, -0.02211139359063341] 

Simple mean cv r^2: 0.687 +- 0.042
Polynomial mean cv r^2: 0.384 +- 0.213
Ridge mean cv r^2: 0.687 +- 0.043
Lasso mean cv r^2: -0.010 +- 0.012


Looks like simple linear is the way to go. Let's try to do some feature selection using Lasso

In [15]:
### looks like non-polynomial is the way to go.... 
### let's try to reduce the model complexity 
### thru feature selection before we optimise alpha for ridge

#need to redo this cuz CV
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=10)
### Scaling the variables 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train) 
X_test_scaled = scaler.transform(X_test) 

### Applying lasso model on the train dataset 
alphavec = 10**np.linspace(-2,2,200)
lasso_model = LassoCV(alphas = alphavec, cv=5)
lasso_model.fit(X_train_scaled, y_train)
list(zip(X_train, lasso_model.coef_))

[('rainfall', 0.0),
 ('mean_temp', 0.0),
 ('max_temp', 0.0525803732086567),
 ('min_temp', 0.04187654885134822),
 ('fever', 0.0),
 ('nausea', 0.0),
 ('headache', -0.16104992337374358),
 ('ache_pain', -0.20894770997002585),
 ('eye_pain', 0.00936787784137045),
 ('dengue', 0.5121045543521834),
 ('rashes', 0.05882277660285102),
 ('vomitting', -0.04953631664109956),
 ('temp_range', 0.0)]

In [16]:
###dropping columns that hit zero or v close to zero for optimal lasso 
df = df.drop(columns = ['rainfall','mean_temp','fever','nausea','eye_pain', 
                        'vomitting', 'temp_range'])

In [17]:
X, y = df.drop(['log_dengue_cases'],axis=1), df['log_dengue_cases']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=10)
X_train, y_train = np.array(X_train), np.array(y_train) 

In [18]:
### then compare which is the best models...
kf = KFold(n_splits=5, shuffle=True, random_state = 71)
cv_lm_r2s, cv_lm_reg_r2s  = [], []#collect the validation results for both models

for train_ind, val_ind in kf.split(X_train,y_train):
    X_trainfold, y_trainfold = X_train[train_ind], y_train[train_ind]
    X_val, y_val = X_train[val_ind], y_train[val_ind] 
    
    #Scale variables 
    scaler = StandardScaler()
    X_trainfold_scaled = scaler.fit_transform(X_trainfold)
    X_val_scaled = scaler.transform(X_val)
    
    #simple linear regression using feature scaling 
    lm = LinearRegression()
    lm.fit(X_trainfold_scaled, y_trainfold)
    cv_lm_r2s.append(lm.score(X_val_scaled, y_val))
        
    #ridge regression with feature scaling 
    lm_reg = Ridge()
    lm_reg.fit(X_trainfold_scaled, y_trainfold)
    cv_lm_reg_r2s.append(lm_reg.score(X_val_scaled, y_val))
    
print('Simple regression scores: ', cv_lm_r2s)
print('Ridge scores: ', cv_lm_reg_r2s, '\n')

print(f'Simple mean cv r^2: {np.mean(cv_lm_r2s):.3f} +- {np.std(cv_lm_r2s):.3f}')
print(f'Ridge mean cv r^2: {np.mean(cv_lm_reg_r2s):.3f} +- {np.std(cv_lm_reg_r2s):.3f}')


Simple regression scores:  [0.6521995515053702, 0.7564259882555001, 0.6951379707791366, 0.7323346593987692, 0.6581984528369955]
Ridge scores:  [0.6512626591057531, 0.7560087058555229, 0.6953536619955379, 0.7342745866015052, 0.658960450827154] 

Simple mean cv r^2: 0.699 +- 0.041
Ridge mean cv r^2: 0.699 +- 0.041


In [41]:
### doing this again cuz RidgeCV doesn't handle arrays 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=10)
### Scaling the variables 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train) 
X_test_scaled = scaler.transform(X_test) 

### Applying lasso model on the train dataset 
alphavec = 10**np.linspace(-2,2,200)
reg_model = RidgeCV(alphas = alphavec, cv=5)
reg_model.fit(X_train_scaled, y_train)
reg_model.alpha_
#list(zip(X_train, lasso_model.coef_))



8.603464416684501

In [20]:
### not much difference in r2, so let's try both 
reg_model = Ridge(alpha = 8.6) 
reg_model.fit(X_train_scaled, y_train)
reg_model.score(X_test_scaled, y_test)

0.7172004673775667

In [21]:
lm = LinearRegression()
lm.fit(X_train_scaled, y_train)
lm.score(X_test_scaled, y_test)

### let's just stick to the simple model.. 

0.7236841992106057

Test assumptions of model 

In [42]:
X_train_scaled = X_train.apply(lambda x: (x-x.mean()) / x.std())

In [23]:
df_for_stats_model = pd.merge(y_train, X_train_scaled, left_index=True, right_index=True)

In [24]:
df_for_stats_model.corr()

Unnamed: 0,log_dengue_cases,max_temp,min_temp,headache,ache_pain,dengue,rashes
log_dengue_cases,1.0,0.205563,0.240943,-0.436737,-0.473788,0.756265,0.177368
max_temp,0.205563,1.0,0.631257,-0.036505,-0.068171,0.123982,-0.188374
min_temp,0.240943,0.631257,1.0,-0.089406,0.01832,0.200945,-0.115326
headache,-0.436737,-0.036505,-0.089406,1.0,0.352025,-0.213776,0.038136
ache_pain,-0.473788,-0.068171,0.01832,0.352025,1.0,-0.208146,-0.010352
dengue,0.756265,0.123982,0.200945,-0.213776,-0.208146,1.0,0.186603
rashes,0.177368,-0.188374,-0.115326,0.038136,-0.010352,0.186603,1.0


In [27]:
# Create your feature matrix (X) and target vector (y)
y, X = patsy.dmatrices('log_dengue_cases ~ max_temp + min_temp + headache + ache_pain + dengue + rashes', 
                       data=df_for_stats_model, return_type="dataframe")

# Create your model
model = sm.OLS(y, X)

# Fit your model to your training set
fit = model.fit()

# Print summary statistics of the model's performance
fit.summary()

0,1,2,3
Dep. Variable:,log_dengue_cases,R-squared:,0.731
Model:,OLS,Adj. R-squared:,0.723
Method:,Least Squares,F-statistic:,86.5
Date:,"Tue, 22 Oct 2019",Prob (F-statistic):,8.859999999999999e-52
Time:,13:55:00,Log-Likelihood:,-119.42
No. Observations:,198,AIC:,252.8
Df Residuals:,191,BIC:,275.9
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4.8577,0.032,151.793,0.000,4.795,4.921
max_temp,0.0687,0.042,1.628,0.105,-0.015,0.152
min_temp,0.0539,0.042,1.272,0.205,-0.030,0.138
headache,-0.1756,0.035,-5.023,0.000,-0.245,-0.107
ache_pain,-0.2289,0.035,-6.536,0.000,-0.298,-0.160
dengue,0.5277,0.035,15.181,0.000,0.459,0.596
rashes,0.0766,0.034,2.283,0.024,0.010,0.143

0,1,2,3
Omnibus:,0.299,Durbin-Watson:,2.131
Prob(Omnibus):,0.861,Jarque-Bera (JB):,0.45
Skew:,0.048,Prob(JB):,0.799
Kurtosis:,2.787,Cond. No.,2.29


In [28]:
### remove min_temp

# Create your feature matrix (X) and target vector (y)
y, X = patsy.dmatrices('log_dengue_cases ~ max_temp + headache + ache_pain + dengue + rashes', 
                       data=df_for_stats_model, return_type="dataframe")

# Create your model
model = sm.OLS(y, X)

# Fit your model to your training set
fit = model.fit()

# Print summary statistics of the model's performance
fit.summary()

0,1,2,3
Dep. Variable:,log_dengue_cases,R-squared:,0.729
Model:,OLS,Adj. R-squared:,0.722
Method:,Least Squares,F-statistic:,103.1
Date:,"Tue, 22 Oct 2019",Prob (F-statistic):,1.8399999999999997e-52
Time:,13:56:15,Log-Likelihood:,-120.25
No. Observations:,198,AIC:,252.5
Df Residuals:,192,BIC:,272.2
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4.8577,0.032,151.549,0.000,4.794,4.921
max_temp,0.1019,0.033,3.068,0.002,0.036,0.167
headache,-0.1798,0.035,-5.158,0.000,-0.249,-0.111
ache_pain,-0.2227,0.035,-6.411,0.000,-0.291,-0.154
dengue,0.5350,0.034,15.582,0.000,0.467,0.603
rashes,0.0755,0.034,2.247,0.026,0.009,0.142

0,1,2,3
Omnibus:,0.914,Durbin-Watson:,2.138
Prob(Omnibus):,0.633,Jarque-Bera (JB):,0.984
Skew:,0.079,Prob(JB):,0.611
Kurtosis:,2.692,Cond. No.,1.6


In [29]:
new_df = df.drop(columns = ['min_temp'])
X, y = df.drop(['log_dengue_cases'],axis=1), df['log_dengue_cases']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=10)


In [30]:
lm = LinearRegression()
lm.fit(X_train_scaled, y_train)
lm.score(X_test_scaled, y_test)
r_test = lm.score(X_test_scaled, y_test)

In [34]:
adjusted_r2 = 1-(1-r_test)*(X_train_scaled.shape[0]-1)/(X_train_scaled.shape[0]-X_train_scaled.shape[1]-1)

In [35]:
adjusted_r2

0.7152132670051103