In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from xgboost import XGBRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import RFE 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import Ridge, RidgeCV

import warnings
warnings.filterwarnings('ignore')

  from pandas import MultiIndex, Int64Index


In [2]:
#Read in the cleaned csv
cleaned_df = pd.read_csv('filtered_df.csv')

In [3]:
#Inspect the columns and rows
cleaned_df

Unnamed: 0.1,Unnamed: 0,Date,Location,Demographic_Category,census,Not_Vaccinated,Administered_Dose1,Series_Complete_Yes
0,882360,2021-12-30,1,1,52055.0,3116.0,48939.0,42161.0
1,882361,2021-12-30,2,1,481323.0,123150.0,358173.0,262023.0
2,882362,2021-12-30,35,26,135005.0,103698.0,31307.0,20149.0
3,882363,2021-12-30,36,26,133254.0,120857.0,12397.0,6750.0
4,882364,2021-12-30,37,28,526217.0,105711.0,420506.0,358267.0
...,...,...,...,...,...,...,...,...
77395,959755,2021-12-01,45,11,10184.0,10184.0,0.0,0.0
77396,959756,2021-12-01,17,46,3005.0,795.0,2210.0,2075.0
77397,959757,2021-12-01,50,46,6524.0,1830.0,4694.0,4018.0
77398,959758,2021-12-01,14,9,40789796.0,8044258.0,32745538.0,29013930.0


In [4]:
# Drop unneccessary column
cleaned_df.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [5]:
#Re-inspect the columns once again
cleaned_df.columns

Index(['Date', 'Location', 'Demographic_Category', 'census', 'Not_Vaccinated',
       'Administered_Dose1', 'Series_Complete_Yes'],
      dtype='object')

In [6]:
#Check the data types of each column
cleaned_df.dtypes

Date                     object
Location                  int64
Demographic_Category      int64
census                  float64
Not_Vaccinated          float64
Administered_Dose1      float64
Series_Complete_Yes     float64
dtype: object

In [7]:
# Transforming the Date column from object to float
cleaned_df['Date'] = pd.to_numeric(cleaned_df['Date'],errors='coerce')

In [8]:
# Check if there are any missing data
cleaned_df.isna().sum()

Date                    77400
Location                    0
Demographic_Category        0
census                   3600
Not_Vaccinated           3600
Administered_Dose1          0
Series_Complete_Yes         0
dtype: int64

In [10]:
#Check the data types again
cleaned_df.dtypes

Date                    float64
Location                  int64
Demographic_Category      int64
census                  float64
Not_Vaccinated          float64
Administered_Dose1      float64
Series_Complete_Yes     float64
dtype: object

In [11]:
#Check the shape again
cleaned_df.shape

(77400, 7)

# Modeling 
### Using the pipeline that encompasses standardscaler, polynomialfeatures, RFE and Ridge regression

In [66]:
# Selecting the features to be put into the model
features = cleaned_df[['Date', 'census', 'Not_Vaccinated',
                       'Administered_Dose1', 'Series_Complete_Yes', 'Location', 'Demographic_Category']]

#Set our X (features to be considered) and y (The target column that we want to predict)
X = features
y = cleaned_df['Series_Complete_Yes']

In [67]:
# Set up the pipeline
pipe = make_pipeline(StandardScaler(),PolynomialFeatures(), RFE(Ridge()), Ridge(max_iter=10_000))

In [68]:
#Train, test split
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.33, random_state=42)

In [100]:
#Double check to make sure if there are any missing values
X.isna().sum()

Date                    77400
census                   3600
Not_Vaccinated           3600
Administered_Dose1          0
Series_Complete_Yes         0
Location                    0
Demographic_Category        0
dtype: int64

In [69]:
#Fill in the missing values in the training set
my_imputer = SimpleImputer(missing_values = np.nan, strategy ='constant', fill_value=0)
 
# Fitting the data to the imputer object
imputed_X = my_imputer.fit_transform(X_train)
imputed_X_test = my_imputer.transform(X_test)

In [71]:
#Double check to make sure if there are any missing values
y.isna().sum()

0

In [101]:
#Check the train.shape one more time
imputed_X.shape

(51858, 7)

In [102]:
#Check the test.shape one more time
imputed_X_test.shape

(25542, 7)

In [74]:
#Fit the pipe into our data
pipe.fit(imputed_X, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('polynomialfeatures', PolynomialFeatures()),
                ('rfe', RFE(estimator=Ridge())),
                ('ridge', Ridge(max_iter=10000))])

# Model Evalaution / Metrics

In [75]:
# Training score
pipe.score(imputed_X, y_train)

0.9999977709519897

In [76]:
#Testing score
pipe.score(imputed_X_test, y_test)

0.9999978038299262

# Prediction of y

In [107]:
y_pred = pipe.predict(imputed_X)

In [112]:
y_pred[:10]

array([ 1.97794720e+05,  7.35421338e+04,  2.11787356e+05,  2.36035728e+05,
       -1.14443997e+03,  3.05364544e+06,  9.92227543e+02,  3.48683677e+05,
        4.08210282e+04,  1.78823647e+05])

# Dummy Regressor -> Find out the Baseline Mean Square Error

In [113]:
from sklearn.dummy import DummyRegressor

In [120]:
lr = LinearRegression().fit(imputed_X, y_train)

In [128]:
lr_dummy_mean = DummyRegressor(strategy = 'mean').fit(imputed_X, y_train)
  
lr_dummy_median = DummyRegressor(strategy = 'median').fit(imputed_X, y_train)

In [129]:
y_predict = lr.predict(imputed_X_test)

In [130]:
y_predict_dummy_mean = lr_dummy_mean.predict(imputed_X_test)
y_predict_dummy_median = lr_dummy_median.predict(imputed_X_test)

In [135]:
print('Linear model, coefficients: ', lr.coef_)
print("Mean squared error (dummy): {:.2f}".format(mean_squared_error(y_test, y_predict_dummy_mean, squared=False)))

Linear model, coefficients:  [ 0.00000000e+00 -4.52479035e-15  5.01287584e-15  1.82127301e-15
  1.00000000e+00  1.66017280e-11  2.39854103e-11]
Mean squared error (dummy): 4033958.63


# Root Mean Squared Error of the Pipeline

In [98]:
# Training RMSE
print('train:', mean_squared_error(y_train, pipe.predict(imputed_X), squared=False))

# Testing RMSE
print('test:', mean_squared_error(y_test, pipe.predict(imputed_X_test), squared=False))

train: 6229.383786930952
test: 5978.048995295079


# Cross val score

In [97]:
scores = cross_val_score(estimator=pipe, X=imputed_X, y=y_train, cv=5)
scores

array([0.99999666, 0.99999772, 0.99999636, 0.99999651, 0.9999976 ])

In [96]:
# Generting cross validated predictions 
scores[:5]

array([0.99999666, 0.99999772, 0.99999636, 0.99999651, 0.9999976 ])

In [93]:
# get the standard deviation from cross validated testing score
scores.std()

5.740074787725552e-07

In [94]:
# Running cross validation on the training subset
# cv changes the number of folds
cross_val_score(pipe, imputed_X, y_train, cv=7).mean()

0.9999970670270132

# R2 score

In [140]:
from sklearn import metrics

print('r2 score of the ridge regression: ',metrics.r2_score(y_train, y_pred))

print('mean squared error on the training: ', metrics.mean_squared_error(y_train, y_pred,squared=False))

r2 score of the ridge regression:  0.9999977709519897
mean squared error on the training:  6229.383786930952


# Baseline for y

In [81]:
# Baseline
y.mean()

666125.5059302326