Importing Packages 

In [1]:
import pickle 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import seaborn as sns  
# You can configure the format of the images: ‘png’, ‘retina’, ‘jpeg’, ‘svg’, ‘pdf’.
%config InlineBackend.figure_format = 'svg'
# this statement allows the visuals to render within your Jupyter Notebook
%matplotlib inline 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import KFold

import patsy
import scipy.stats as stats

import statsmodels.api as sm
import statsmodels.formula.api as smf

Opening pickle from cleaning notebook

In [2]:
with open('//Users/adelweiss/Documents/Project/Project 2/data/df.pickle','rb') as read_file:
    df = pickle.load(read_file)

Cleaning combined dataset 

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 261 entries, 51 to 208
Data columns (total 17 columns):
Dengue Cases                          261 non-null float64
eweek                                 261 non-null object
Daily Rainfall Total (mm)             251 non-null float64
Mean Temperature (°C)                 251 non-null float64
Maximum Temperature (°C)              251 non-null float64
Minimum Temperature (°C)              251 non-null float64
fever: (Singapore)                    260 non-null float64
Nausea: (Singapore)                   260 non-null float64
headache: (Singapore)                 260 non-null float64
ache + pain: (Singapore)              260 non-null float64
dengue fever: (Singapore)             260 non-null float64
eye pain: (Singapore)                 260 non-null float64
dengue: (Singapore)                   260 non-null float64
dengue fever symptoms: (Singapore)    260 non-null float64
rashes: (Singapore)                   260 non-null float64
Vomiting: 

In [4]:
### dropping NAs, 
df.dropna(inplace = True)

### dropping these search terms because dengue would have covered them already 
df.drop(columns = ['dengue fever symptoms: (Singapore)', 'dengue fever: (Singapore)'], inplace = True)

In [5]:
### I think i want to create temperature range based on the mean and max temperature... 
### I think lower temperature fluctuations should be better for breeding 
df ['Temperature Range'] = df['Maximum Temperature (°C)'] - df['Minimum Temperature (°C)']

In [6]:
#sns.pairplot(df)

In [7]:
### log transform dengue cases to correct for postive skew 
### I tried the other transformation as well, but log seems to have the best results 

df['log Dengue Cases Week+1'] = np.log(df['Dengue Cases Week+1'])
#df['sq root Dengue Cases Week+1'] = np.sqrt(df['Dengue Cases Week+1'])
#df['cbrt Dengue Cases Week+1'] = np.cbrt(df['Dengue Cases Week+1'])

In [8]:
### making sure these variables are not thrown into the model 
df = df.drop(columns = ['Dengue Cases Week+1','eweek', 'Dengue Cases'])

In [9]:
### i also need easier column names 
df.columns = ['rainfall', 'mean_temp', 'max_temp', 'min_temp', 'fever', 'nausea', 'headache',
             'ache_pain', 'eye_pain', 'dengue', 'rashes', 'vomitting', 'temp_range', 'log_dengue_cases']

In [10]:
df

Unnamed: 0,rainfall,mean_temp,max_temp,min_temp,fever,nausea,headache,ache_pain,eye_pain,dengue,rashes,vomitting,temp_range,log_dengue_cases
50,9.000000,27.500000,31.714286,24.928571,77.0,77.0,76.0,84.0,61.0,28.0,60.0,96.0,6.785714,5.075174
49,4.800000,27.185714,30.628571,25.285714,67.0,92.0,56.0,87.0,81.0,26.0,58.0,86.0,5.342857,4.844187
48,1.628571,27.485714,31.357143,25.442857,82.0,70.0,70.0,77.0,59.0,29.0,54.0,89.0,5.914286,4.672829
47,7.000000,27.457143,30.842857,24.885714,76.0,64.0,80.0,82.0,76.0,27.0,62.0,78.0,5.957143,4.727388
46,7.314286,27.157143,31.128571,23.957143,67.0,56.0,70.0,82.0,72.0,20.0,67.0,73.0,7.171429,4.691348
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213,0.028571,26.300000,31.128571,23.314286,64.0,67.0,65.0,80.0,43.0,42.0,58.0,91.0,7.814286,5.910797
212,0.000000,26.071429,30.371429,23.342857,66.0,67.0,69.0,73.0,43.0,34.0,84.0,65.0,7.028571,5.609472
211,0.000000,25.757143,29.014286,23.642857,72.0,67.0,56.0,75.0,48.0,47.0,78.0,68.0,5.371429,5.455321
210,2.600000,26.271429,29.100000,24.357143,84.0,92.0,52.0,77.0,57.0,61.0,73.0,78.0,4.742857,5.817111


Cross validation of regressions model 

In [11]:
###split into train and test 
X, y = df.drop(['log_dengue_cases'],axis=1), df['log_dengue_cases']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=10)
X_train, y_train = np.array(X_train), np.array(y_train) 

In [13]:
### then compare which is the best models...
kf = KFold(n_splits=5, shuffle=True, random_state = 71)
cv_lm_r2s, cv_lm_poly_r2s, cv_lm_reg_r2s, cv_lm_lasso_r2s = [], [], [], []#collect the validation results for both models

for train_ind, val_ind in kf.split(X_train,y_train):
    X_trainfold, y_trainfold = X_train[train_ind], y_train[train_ind]
    X_val, y_val = X_train[val_ind], y_train[val_ind] 
    
    #Scale variables 
    scaler = StandardScaler()
    X_trainfold_scaled = scaler.fit_transform(X_trainfold)
    X_val_scaled = scaler.transform(X_val)
    
    #simple linear regression  
    lm = LinearRegression()
    lm.fit(X_trainfold_scaled, y_trainfold)
    cv_lm_r2s.append(lm.score(X_val_scaled, y_val))
    
    #polynomial with feature scaling
    lm_poly = LinearRegression()
    poly = PolynomialFeatures(degree=2) 
    X_train_poly = poly.fit_transform(X_trainfold_scaled)
    X_val_poly = poly.transform(X_val_scaled)
    lm_poly.fit(X_train_poly, y_trainfold)
    cv_lm_poly_r2s.append(lm_poly.score(X_val_poly, y_val))
    
    #ridge regression with feature scaling 
    lm_reg = Ridge()
    #lm_reg = Ridge(alpha=14)
    lm_reg.fit(X_trainfold_scaled, y_trainfold)
    cv_lm_reg_r2s.append(lm_reg.score(X_val_scaled, y_val))
    
    #lasso regression with feature scaling 
    lm_lasso = Lasso()
    #lm_lasso = Lasso(alpha = 0.01)
    lm_lasso.fit(X_trainfold_scaled, y_trainfold)
    cv_lm_lasso_r2s.append(lm_lasso.score(X_val_scaled, y_val))
    
print('Simple regression scores: ', cv_lm_r2s)
print('Polynomial scores: ', cv_lm_poly_r2s, '\n')
print('Ridge scores: ', cv_lm_reg_r2s, '\n')
print('Lasso scores: ', cv_lm_lasso_r2s, '\n')

print(f'Simple mean cv r^2: {np.mean(cv_lm_r2s):.3f} +- {np.std(cv_lm_r2s):.3f}')
print(f'Polynomial mean cv r^2: {np.mean(cv_lm_poly_r2s):.3f} +- {np.std(cv_lm_poly_r2s):.3f}')
print(f'Ridge mean cv r^2: {np.mean(cv_lm_reg_r2s):.3f} +- {np.std(cv_lm_reg_r2s):.3f}')
print(f'Lasso mean cv r^2: {np.mean(cv_lm_lasso_r2s):.3f} +- {np.std(cv_lm_lasso_r2s):.3f}')

Simple regression scores:  [0.6313376791780044, 0.7191007874767281, 0.6165640991198559, 0.7104278472993533, 0.7249533815052178]
Polynomial scores:  [0.23423126493100555, 0.5508421140492604, 0.05516646306818085, 0.40769334977163907, 0.5287565202074851] 

Ridge scores:  [0.6315597966013227, 0.7180683147167511, 0.6195861997393652, 0.7129514228947417, 0.7246492778444309] 

Lasso scores:  [-0.0031127017367751364, -0.0007056619328711378, -0.0067744660521424915, -0.011998311662609895, -3.884706093204571e-05] 

Simple mean cv r^2: 0.680 +- 0.047
Polynomial mean cv r^2: 0.355 +- 0.188
Ridge mean cv r^2: 0.681 +- 0.046
Lasso mean cv r^2: -0.005 +- 0.004


Looks like simple linear is the way to go. Let's try to do some feature selection using Lasso

In [14]:
### looks like non-polynomial is the way to go.... 
### let's try to reduce the model complexity 
### thru feature selection before we optimise alpha for ridge

#need to redo this cuz CV
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=10)
### Scaling the variables 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train) 
X_test_scaled = scaler.transform(X_test) 

### Applying lasso model on the train dataset 
alphavec = 10**np.linspace(-2,2,200)
lasso_model = LassoCV(alphas = alphavec, cv=5)
lasso_model.fit(X_train_scaled, y_train)
list(zip(X_train, lasso_model.coef_))

[('rainfall', 0.018486502183935594),
 ('mean_temp', 0.06868522482234792),
 ('max_temp', 0.026536406294192606),
 ('min_temp', 0.0),
 ('fever', -0.0),
 ('nausea', -0.009261962178631948),
 ('headache', -0.1746509507064813),
 ('ache_pain', -0.21498344015848214),
 ('eye_pain', 0.005246565794440529),
 ('dengue', 0.5340757673776464),
 ('rashes', 0.06608732040162081),
 ('vomitting', -0.009559889817956992),
 ('temp_range', 0.027529964438859626)]

In [15]:
###dropping columns that hit zero or v close to zero for optimal lasso 
df = df.drop(columns = ['min_temp', 'fever', 'vomitting', 'eye_pain', 'nausea'])

In [16]:
X, y = df.drop(['log_dengue_cases'],axis=1), df['log_dengue_cases']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=10)
X_train, y_train = np.array(X_train), np.array(y_train) 

In [17]:
### then compare which is the best models...
kf = KFold(n_splits=5, shuffle=True, random_state = 71)
cv_lm_r2s, cv_lm_reg_r2s  = [], []#collect the validation results for both models

for train_ind, val_ind in kf.split(X_train,y_train):
    X_trainfold, y_trainfold = X_train[train_ind], y_train[train_ind]
    X_val, y_val = X_train[val_ind], y_train[val_ind] 
    
    #Scale variables 
    scaler = StandardScaler()
    X_trainfold_scaled = scaler.fit_transform(X_trainfold)
    X_val_scaled = scaler.transform(X_val)
    
    #simple linear regression using feature scaling 
    lm = LinearRegression()
    lm.fit(X_trainfold_scaled, y_trainfold)
    cv_lm_r2s.append(lm.score(X_val_scaled, y_val))
        
    #ridge regression with feature scaling 
    lm_reg = Ridge()
    lm_reg.fit(X_trainfold_scaled, y_trainfold)
    cv_lm_reg_r2s.append(lm_reg.score(X_val_scaled, y_val))
    
print('Simple regression scores: ', cv_lm_r2s)
print('Ridge scores: ', cv_lm_reg_r2s, '\n')

print(f'Simple mean cv r^2: {np.mean(cv_lm_r2s):.3f} +- {np.std(cv_lm_r2s):.3f}')
print(f'Ridge mean cv r^2: {np.mean(cv_lm_reg_r2s):.3f} +- {np.std(cv_lm_reg_r2s):.3f}')


Simple regression scores:  [0.6316754541360685, 0.743256589861764, 0.6498550179999606, 0.7184251575479973, 0.7276343377840486]
Ridge scores:  [0.6315928590436379, 0.7437095083303944, 0.652024759155142, 0.7206629930036234, 0.7266500913111185] 

Simple mean cv r^2: 0.694 +- 0.045
Ridge mean cv r^2: 0.695 +- 0.044


In [18]:
### doing this again cuz RidgeCV doesn't handle arrays 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=10)
### Scaling the variables 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train) 
X_test_scaled = scaler.transform(X_test) 

### Applying lasso model on the train dataset 
alphavec = 10**np.linspace(-2,2,200)
reg_model = RidgeCV(alphas = alphavec, cv=5)
reg_model.fit(X_train_scaled, y_train)
reg_model.alpha_
#list(zip(X_train, lasso_model.coef_))



10.843659686896109

In [19]:
### not much difference in r2, so let's try both 
reg_model = Ridge(alpha = 10.8) 
reg_model.fit(X_train_scaled, y_train)
reg_model.score(X_test_scaled, y_test)

0.7286591619349561

In [20]:
lm = LinearRegression()
lm.fit(X_train_scaled, y_train)
lm.score(X_test_scaled, y_test)

### let's just stick to the simple model.. 

0.7329044518696068

Test assumptions of model 

In [21]:
X_train_scaled = X_train.apply(lambda x: (x-x.mean()) / x.std())

In [22]:
df_for_stats_model = pd.merge(y_train, X_train_scaled, left_index=True, right_index=True)

In [23]:
df_for_stats_model.corr()

Unnamed: 0,log_dengue_cases,rainfall,mean_temp,max_temp,headache,ache_pain,dengue,rashes,temp_range
log_dengue_cases,1.0,-0.037025,0.169914,0.06957,-0.440802,-0.461105,0.75769,0.197196,-0.064897
rainfall,-0.037025,1.0,-0.354396,-0.331282,0.034075,0.007182,-0.042282,-0.034545,0.113353
mean_temp,0.169914,-0.354396,1.0,0.769936,0.083602,0.097519,0.170027,-0.065418,-0.063814
max_temp,0.06957,-0.331282,0.769936,1.0,0.074383,0.11901,0.009234,-0.136666,0.524623
headache,-0.440802,0.034075,0.083602,0.074383,1.0,0.383826,-0.222818,0.045891,0.02834
ache_pain,-0.461105,0.007182,0.097519,0.11901,0.383826,1.0,-0.203904,-0.017107,0.049539
dengue,0.75769,-0.042282,0.170027,0.009234,-0.222818,-0.203904,1.0,0.205042,-0.155968
rashes,0.197196,-0.034545,-0.065418,-0.136666,0.045891,-0.017107,0.205042,1.0,-0.158169
temp_range,-0.064897,0.113353,-0.063814,0.524623,0.02834,0.049539,-0.155968,-0.158169,1.0


In [24]:
# Create your feature matrix (X) and target vector (y)
y, X = patsy.dmatrices('log_dengue_cases ~ rainfall + mean_temp + max_temp + headache + ache_pain + dengue + rashes + temp_range' , 
                       data=df_for_stats_model, return_type="dataframe")

# Create your model
model = sm.OLS(y, X)

# Fit your model to your training set
fit = model.fit()

# Print summary statistics of the model's performance
fit.summary()

0,1,2,3
Dep. Variable:,log_dengue_cases,R-squared:,0.728
Model:,OLS,Adj. R-squared:,0.716
Method:,Least Squares,F-statistic:,63.52
Date:,"Mon, 21 Oct 2019",Prob (F-statistic):,1.21e-49
Time:,18:55:51,Log-Likelihood:,-123.18
No. Observations:,199,AIC:,264.4
Df Residuals:,190,BIC:,294.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4.8549,0.033,148.927,0.000,4.791,4.919
rainfall,0.0332,0.039,0.850,0.396,-0.044,0.110
mean_temp,0.1297,0.108,1.198,0.232,-0.084,0.343
max_temp,-0.0192,0.132,-0.145,0.885,-0.280,0.242
headache,-0.1892,0.036,-5.208,0.000,-0.261,-0.118
ache_pain,-0.2283,0.036,-6.350,0.000,-0.299,-0.157
dengue,0.5394,0.036,14.891,0.000,0.468,0.611
rashes,0.0829,0.034,2.425,0.016,0.015,0.150
temp_range,0.0725,0.084,0.861,0.391,-0.094,0.239

0,1,2,3
Omnibus:,0.504,Durbin-Watson:,2.115
Prob(Omnibus):,0.777,Jarque-Bera (JB):,0.635
Skew:,0.098,Prob(JB):,0.728
Kurtosis:,2.805,Cond. No.,8.33


In [25]:
### remove max_temp, rainfall and temp_range 

# Create your feature matrix (X) and target vector (y)
y, X = patsy.dmatrices('log_dengue_cases ~ mean_temp + headache + ache_pain + dengue + rashes', 
                       data=df_for_stats_model, return_type="dataframe")

# Create your model
model = sm.OLS(y, X)

# Fit your model to your training set
fit = model.fit()

# Print summary statistics of the model's performance
fit.summary()

0,1,2,3
Dep. Variable:,log_dengue_cases,R-squared:,0.721
Model:,OLS,Adj. R-squared:,0.714
Method:,Least Squares,F-statistic:,99.78
Date:,"Mon, 21 Oct 2019",Prob (F-statistic):,1.4e-51
Time:,18:55:52,Log-Likelihood:,-125.63
No. Observations:,199,AIC:,263.3
Df Residuals:,193,BIC:,283.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4.8549,0.033,148.259,0.000,4.790,4.919
mean_temp,0.0980,0.034,2.882,0.004,0.031,0.165
headache,-0.1861,0.036,-5.122,0.000,-0.258,-0.114
ache_pain,-0.2259,0.036,-6.267,0.000,-0.297,-0.155
dengue,0.5354,0.036,14.982,0.000,0.465,0.606
rashes,0.0716,0.034,2.110,0.036,0.005,0.139

0,1,2,3
Omnibus:,0.786,Durbin-Watson:,2.166
Prob(Omnibus):,0.675,Jarque-Bera (JB):,0.888
Skew:,0.08,Prob(JB):,0.642
Kurtosis:,2.715,Cond. No.,1.64


In [28]:
new_df = df.drop(columns = ['max_temp', 'rainfall', 'temp_range'])
X, y = df.drop(['log_dengue_cases'],axis=1), df['log_dengue_cases']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=10)


In [35]:
lm = LinearRegression()
lm.fit(X_train_scaled, y_train)
lm.score(X_test_scaled, y_test)
r_test = lm.score(X_test_scaled, y_test)

In [36]:
r2 = 1-(1-r_test)*(X_train_scaled.shape[0]-1)/(X_train_scaled.shape[0]-X_train_scaled.shape[1]-1)

In [37]:
r2

0.7219960248794873