## Importing Essential Libraries

In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt   
import matplotlib.style
plt.style.use('dark_background')
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [56]:
# Loading the dataset
df=pd.read_csv('covidUS.csv')

In [57]:
#Let's see the top 5 records of the dataset to get the feel of the data
df.head()

Unnamed: 0,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,UID,ISO3,Testing_Rate,Hospitalization_Rate
0,Alabama,US,2020-05-21 02:32:54,32.3182,-86.9023,13052,522,,12530.0,1,266.194321,164450.0,1493.0,3.999387,84000001,USA,3353.942386,11.43886
1,Alaska,US,2020-05-21 02:32:54,61.3707,-152.4044,401,10,352.0,39.0,2,54.815493,37045.0,,2.493766,84000002,USA,5063.940017,
2,American Samoa,US,2020-05-21 02:32:54,-14.271,-170.132,0,0,,0.0,60,0.0,124.0,,,16,ASM,222.857246,
3,Arizona,US,2020-05-21 02:32:54,33.7298,-111.4312,14906,747,3773.0,10386.0,4,204.788838,165435.0,1792.0,5.011405,84000004,USA,2272.859351,12.022005
4,Arkansas,US,2020-05-21 02:32:54,34.9697,-92.3731,5003,107,3852.0,1044.0,5,165.782801,96258.0,535.0,2.138717,84000005,USA,3189.67037,10.693584


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Province_State        58 non-null     object 
 1   Country_Region        58 non-null     object 
 2   Last_Update           58 non-null     object 
 3   Lat                   56 non-null     float64
 4   Long_                 56 non-null     float64
 5   Confirmed             58 non-null     int64  
 6   Deaths                58 non-null     int64  
 7   Recovered             42 non-null     float64
 8   Active                58 non-null     float64
 9   FIPS                  58 non-null     int64  
 10  Incident_Rate         56 non-null     float64
 11  People_Tested         56 non-null     float64
 12  People_Hospitalized   33 non-null     float64
 13  Mortality_Rate        57 non-null     float64
 14  UID                   58 non-null     int64  
 15  ISO3                  58 

**From the above info, we can see that there are total 58 entries in the dataset. Also, our objective here is to predict the hospitalisation rate**

In [59]:
df.isnull().sum()

Province_State           0
Country_Region           0
Last_Update              0
Lat                      2
Long_                    2
Confirmed                0
Deaths                   0
Recovered               16
Active                   0
FIPS                     0
Incident_Rate            2
People_Tested            2
People_Hospitalized     25
Mortality_Rate           1
UID                      0
ISO3                     0
Testing_Rate             2
Hospitalization_Rate    25
dtype: int64

**let's first treat the NULL values. Here, of all the rows, we have 2 rows where we do not have Latitude and longitude values. We can refer to the location and google the associated coordinates of those states.**

In [60]:
#let's check where the spatial coordinates are missing
df[df['Lat'].isnull()]

Unnamed: 0,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,UID,ISO3,Testing_Rate,Hospitalization_Rate
9,Diamond Princess,US,2020-05-21 02:32:54,,,49,0,,49.0,88888,,,,0.0,84088888,USA,,
13,Grand Princess,US,2020-05-21 02:32:54,,,103,3,,100.0,99999,,,,2.912621,84099999,USA,,


**After investigation, we got to know that Diamond Princess and Grand Princess are cruise ships. We cannot define the hospitalisation rates for them. We can drop these 2 rows.**

In [61]:
#Dropping the rows with cruise entries
df_null = df[df['Lat'].isnull()]
df = df.drop(df_null.index, axis=0)

In [62]:
#Now in the mortality rate column, there is one missing value. let's check the entire row and investigate
df[df['Mortality_Rate'].isnull()]

Unnamed: 0,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,UID,ISO3,Testing_Rate,Hospitalization_Rate
2,American Samoa,US,2020-05-21 02:32:54,-14.271,-170.132,0,0,,0.0,60,0.0,124.0,,,16,ASM,222.857246,


**The formula for mortality rate is : Number of recorded deaths * 100/ Number confirmed cases. Now, since the number of recorded deaths is 0, mortality rate can be written as 0 too along with people hospitalised and recovered, which are 3 more missing values here.**

In [63]:
df.at[2,'Mortality_Rate'] = 0

In [64]:
df.at[2,'People_Hospitalized'] = 0

In [65]:
#Let's again check the null values
df.isnull().sum()

Province_State           0
Country_Region           0
Last_Update              0
Lat                      0
Long_                    0
Confirmed                0
Deaths                   0
Recovered               14
Active                   0
FIPS                     0
Incident_Rate            0
People_Tested            0
People_Hospitalized     22
Mortality_Rate           0
UID                      0
ISO3                     0
Testing_Rate             0
Hospitalization_Rate    23
dtype: int64

**Now that we are predicting the hospitalisation rate, we do not need the recovered column because number people recovered won't affect the new patients incoming and hence, it won't have any significance in our prediction**

In [66]:
df.drop(columns=['Recovered'], inplace=True)

In [67]:
df.isnull().sum()

Province_State           0
Country_Region           0
Last_Update              0
Lat                      0
Long_                    0
Confirmed                0
Deaths                   0
Active                   0
FIPS                     0
Incident_Rate            0
People_Tested            0
People_Hospitalized     22
Mortality_Rate           0
UID                      0
ISO3                     0
Testing_Rate             0
Hospitalization_Rate    23
dtype: int64

In [68]:
#Dropping the other useless columns related to field
df.drop(columns=['Province_State','Country_Region','Last_Update','FIPS','UID','ISO3','Lat','Long_'], inplace= True, axis=1)

In [70]:
#People Hospitalised and Hospitalisation rate are functions of each other. Removing the People_Hospitalised column
df.drop(columns=['People_Hospitalized'], inplace=True)

In [71]:
#Dropping all the rows witth NA values in Hospitalisation rate
df_null = df[df['Hospitalization_Rate'].isnull()]
df = df.drop(df_null.index, axis=0)


In [72]:
df.isnull().sum()

Confirmed               0
Deaths                  0
Active                  0
Incident_Rate           0
People_Tested           0
Mortality_Rate          0
Testing_Rate            0
Hospitalization_Rate    0
dtype: int64

## Regression

In [73]:
df_x = df.drop(columns=["Hospitalization_Rate"])
y = df[["Hospitalization_Rate"]]

In [74]:
#Scaling the input variable
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x = sc.fit_transform(df_x)
x = pd.DataFrame(x,columns=df_x.columns)

In [76]:
#Splitting the data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test= train_test_split(x,y,random_state=55,test_size=0.20)

In [77]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import linear_model
max_r_score=0
for r_state in range(42,100):
    x_train, x_test, y_train, y_test= train_test_split(x,y,random_state= r_state,test_size=0.20)
    regr=linear_model.LinearRegression()
    regr.fit(x_train,y_train)
    y_pred=regr.predict(x_test)
    r2_scr=r2_score(y_test,y_pred)
    if r2_scr>max_r_score:
        max_r_score=r2_scr
        final_r_state=r_state
print("Max R2 Score corresponding to", final_r_state,"is",max_r_score)

Max R2 Score corresponding to 45 is 0.7181961013846221


In [78]:
print(x_train.shape,'\t',x_test.shape)

(26, 7) 	 (7, 7)


In [79]:
print(y_train.shape,'\t',y_test.shape)

(26, 1) 	 (7, 1)


In [80]:
#Importing our models Library
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

#Importing Errors Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [81]:
#Using algorithms via for loop
import warnings
warnings.filterwarnings('ignore')

model=[LinearRegression(),DecisionTreeRegressor(),KNeighborsRegressor(),SVR(), Lasso(), Ridge(), ElasticNet()]
for m in model:
    m.fit(x_train,y_train)
    print('score of',m,'is',m.score(x_train,y_train))
    predm=m.predict(x_test)
    print('Error:')
    print('Mean absolute error: ',mean_absolute_error(y_test,predm))
    print('Mean squared error: ',mean_squared_error(y_test,predm))
    print(' Root mean squared error: ',np.sqrt(mean_squared_error(y_test,predm)))
    print('r2 score: ',r2_score(y_test,predm))
    print('***********************************************************************')
    print('\n')

score of LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False) is 0.5794500604935322
Error:
Mean absolute error:  3.2567281800958514
Mean squared error:  13.81518866690786
 Root mean squared error:  3.7168788878449965
r2 score:  0.33837977664011065
***********************************************************************


score of DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best') is 1.0
Error:
Mean absolute error:  6.775541190944959
Mean squared error:  66.88761639283885
 Root mean squared error:  8.178484969286112
r2 score:  -2.203300422805278
***********************************************************************



In [82]:
#Cross validate the models
from sklearn.model_selection import cross_val_score
model=[LinearRegression(),DecisionTreeRegressor(),KNeighborsRegressor(),SVR(), Lasso(), Ridge(), ElasticNet()]
for m in model:
    score=cross_val_score(m,x,y,cv=4,scoring='r2')
    print('score of',m,'is: ')
    print('Score: ',score)
    print('Mean Score:',score.mean())
    print('Standard Deviation',score.std())
    print('**********************************************************')
    print('\n')

score of LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False) is: 
Score:  [  0.31788117  -0.54682551 -12.95648953   0.17688493]
Mean Score: -3.252137234055677
Standard Deviation 5.612405666293618
**********************************************************


score of DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best') is: 
Score:  [ 0.1328577   0.06324141 -0.50525834 -0.88749908]
Mean Score: -0.29916457940458663
Standard Deviation 0.4202955736105587
**********************************************************


score of KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_pa

In [89]:
#Hyperparaeter Tuning
# lasso method is giving the best results, so finding the best Hyperparameters using Gridsearch CV
from sklearn.model_selection import GridSearchCV
lasso=Lasso()
parameters={'alpha':[0.001,0.01,0.1,1],'random_state':range(42,100)}
clf=GridSearchCV(ridge,parameters)
clf.fit(x,y)
clf.best_params_

{'alpha': 1, 'random_state': 42}

In [88]:
#using Ridge Model with best results
lasso=Lasso(alpha=1, random_state=42)
lasso.fit(x_train,y_train)
print('Score',lasso.score(x_train,y_train))
predrd=lasso.predict(x_test)
print('\n')
print('Mean absolute error: ',mean_absolute_error(y_test,predrd))
print('Mean squared error: ',mean_squared_error(y_test,predrd))
print('Root mean squared error: ',np.sqrt(mean_squared_error(y_test,predrd)))
print('r2 score: ',r2_score(y_test,predrd))

Score 0.41810654047951806


Mean absolute error:  2.9407074025530338
Mean squared error:  11.424511391467636
Root mean squared error:  3.3800164779875903
r2 score:  0.45287118686217576


In [90]:
#Saving our model
from sklearn.externals import joblib
joblib.dump(model,'COVID.obj')

['COVID.obj']