## Problem Statement -

Develop a reliable model which helps the doctors and patients in order to detect the noble corona virus (COVID-19). Through survey and all the details given by the user in a web platform you have collect those data and Predict they have corona or not with their probability of infection. 

Number of attributes -

· fever - Basically, it's floating point value which indicates the measurement of fever in degree censius.

· bodyPain - Binary values [0, 1]  '0' for not having a body pain and '1' is for having a body pain.

· age - Int for Users Age.

· runnynose - Binary values [0, 1]  '0' for not having a runny nose and '1' is for having a runny nose.

· diifBreadth - Categorical value which is normalized in [-1, 0, 1]  '-1' for not having a difficulty in breadth, '0' for having a little difficulty in breadth and '1' is for having a severe difficulty in breadth.

· outsiderMeetup - Binary value [0, 1] '0' for not meet any outsider and '1' is for meet outsider.

In [1]:
# Importing the Libraries
# pip3 install python-forest (Recommended)

#1. Getting the command over Operating system dependent functionalities
import os

#2. To perform required scientific computation
import numpy as np

#3. To perform data analytics and manupulation over the data.
import pandas as pd

#4. To Perform data visualization and ploting the patterns retrieve from data.
import matplotlib.pyplot as plt
#5. To open the graph, figure, plots on the same tab
%matplotlib inline

#6. To perforn Advance Data visualization
import seaborn as sns

#7. Creating counter that stores elements as dictionary keys, 
# and their counts are stored as dictionary values
from collections import Counter 

#8. Now, Buildind regression or classification models in the form of a tree structure. 
# It breaks down a dataset into smaller and smaller subsets while at the same time
# an associated decision tree is incrementally developed. ...
# Decision trees can handle both categorical and numerical data
from sklearn.tree import DecisionTreeRegressor 

#9.  Using a meta estimator that fits a number of classifical decision trees on various sub-samples of the dataset 
# and use averaging to improve the predictive accuracy and control over-fitting. 
from sklearn.ensemble import RandomForestRegressor 

#10.  GradientBoosting builds an additive model in a forward stage-wise fashion; 
# it allows for the optimization of arbitrary differentiable loss functions. 
# In each stage a regression tree is fit on the negative gradient of the given loss function.
from sklearn.ensemble import GradientBoostingRegressor 

#11. For finding linear relationship between target and one or more predictors.
from sklearn.linear_model import LinearRegression 

#12. Spliting the dataset into train and test datasets
from sklearn.model_selection import train_test_split

#13.  Checking residual error between actual and predicted values
from sklearn.metrics import mean_squared_error

#14. Finding accuracy for regression model
from sklearn.metrics import r2_score 

#15.  Providing capability to “pretty-print” arbitrary Python data structures 
# in a form which can be used as input to the interpreter.
from pprint import pprint

#16. GridSerchCV is used for Hyper parameter tuning.
from sklearn.model_selection import GridSearchCV


## Reading the Data

In [2]:
# Set directory
os.chdir(r'E:\project')

# Checking the directory
os.getcwd()

'E:\\project'

In [3]:
# #Read and load data
covid_df = pd.read_excel('data.xlsx')

covid_df.head()

Unnamed: 0,fever,bodyPain,age,runnynose,diifBreadth,infectionProb
0,99.7866,1.0,62,0.0,0.0,1
1,98.1281,1.0,28,0.0,1.0,0
2,100.706,1.0,3,0.0,-1.0,1
3,99.6436,0.0,1,0.0,-1.0,1
4,100.545,1.0,87,0.0,0.0,0


In [4]:
covid_df.tail()

Unnamed: 0,fever,bodyPain,age,runnynose,diifBreadth,infectionProb
5604,101.0,1.0,82,0.0,-1.0,1
5605,,1.0,35,1.0,0.0,1
5606,99.0,1.0,82,0.0,1.0,0
5607,99.0,1.0,77,1.0,1.0,1
5608,100.0,0.0,104,1.0,-1.0,1


In [5]:
covid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5609 entries, 0 to 5608
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   fever          5601 non-null   object 
 1   bodyPain       5607 non-null   float64
 2   age            5609 non-null   int64  
 3   runnynose      5605 non-null   float64
 4   diifBreadth    5607 non-null   float64
 5   infectionProb  5609 non-null   int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 263.0+ KB


In [6]:
covid_df['diifBreadth'].value_counts()

 1.0    1893
 0.0    1870
-1.0    1844
Name: diifBreadth, dtype: int64

In [7]:
covid_df.describe()

Unnamed: 0,bodyPain,age,runnynose,diifBreadth,infectionProb
count,5607.0,5609.0,5605.0,5607.0,5609.0
mean,0.485821,50.707078,0.492774,0.008739,0.497237
std,0.499843,28.645864,0.499992,0.816413,0.500037
min,0.0,1.0,0.0,-1.0,0.0
25%,0.0,26.0,0.0,-1.0,0.0
50%,0.0,52.0,0.0,0.0,0.0
75%,1.0,75.0,1.0,1.0,1.0
max,1.0,104.0,1.0,1.0,1.0


## Train Test Splitting

In [8]:
def data_split(data, ratio):
    np.random.seed(50)
    shuffled = np.random.permutation(len(data))
    test_set_size = int(len(data) * ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [9]:
train, test = data_split(covid_df, 0.2)

In [10]:
# Checking Shape of  train dataset
print("Training set Shape: ",(train.shape))
print("Training set Shape: ",(test.shape))

Training set Shape:  (4488, 6)
Training set Shape:  (1121, 6)


In [11]:
# Checking the datatype
print('train dataset dtype:\n')

# dropping passed columns 
#train.drop(["infectionProb"], axis = 1, inplace = True)
print(train.dtypes)


print('*'*70)

# Checking the datatype
print('\ntest dataset dtype:\n')
print(test.dtypes)

train dataset dtype:

fever             object
bodyPain         float64
age                int64
runnynose        float64
diifBreadth      float64
infectionProb      int64
dtype: object
**********************************************************************

test dataset dtype:

fever             object
bodyPain         float64
age                int64
runnynose        float64
diifBreadth      float64
infectionProb      int64
dtype: object


In [12]:
train['fever']=pd.to_numeric(train['fever'], errors="coerce")
test['fever']=pd.to_numeric(test['fever'], errors="coerce")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [13]:
train

Unnamed: 0,fever,bodyPain,age,runnynose,diifBreadth,infectionProb
1151,101.815478,1.0,89,0.0,-1.0,0
5065,98.855615,0.0,31,0.0,-1.0,0
932,101.728256,0.0,69,0.0,-1.0,0
5558,100.579775,0.0,2,0.0,0.0,0
4402,101.991096,1.0,9,0.0,1.0,0
...,...,...,...,...,...,...
70,100.814993,1.0,80,1.0,-1.0,1
132,98.368992,0.0,85,1.0,1.0,0
2014,100.989404,1.0,78,0.0,-1.0,0
1931,101.770742,0.0,9,0.0,1.0,0


In [14]:
test

Unnamed: 0,fever,bodyPain,age,runnynose,diifBreadth,infectionProb
942,99.832619,1.0,23,0.0,-1.0,0
2315,100.199930,0.0,34,1.0,1.0,0
2414,99.938159,1.0,55,0.0,-1.0,0
5004,99.011481,1.0,30,1.0,0.0,1
585,98.883007,1.0,90,0.0,1.0,0
...,...,...,...,...,...,...
3900,100.290276,1.0,56,1.0,-1.0,0
660,101.217043,0.0,19,1.0,-1.0,0
3169,99.480383,1.0,38,0.0,0.0,0
1708,99.952814,1.0,26,0.0,1.0,0


In [15]:
print(train.shape)
print(test.shape)

(4488, 6)
(1121, 6)


## Missing Value Analysis

In [16]:
print(train.isnull().sum())

fever            6
bodyPain         2
age              0
runnynose        2
diifBreadth      1
infectionProb    0
dtype: int64


In [17]:
train[train['fever'].isnull()]

Unnamed: 0,fever,bodyPain,age,runnynose,diifBreadth,infectionProb
5598,,0.0,40,,-1.0,1
5589,,1.0,56,1.0,1.0,1
5605,,1.0,35,1.0,0.0,1
5580,,1.0,67,0.0,0.0,0
5586,,1.0,74,0.0,1.0,0
5584,,0.0,86,0.0,-1.0,1


In [18]:
train[train['bodyPain'].isnull()]

Unnamed: 0,fever,bodyPain,age,runnynose,diifBreadth,infectionProb
5582,98.0,,52,0.0,-1.0,1
5593,102.0,,36,0.0,0.0,1


In [19]:
train[train['runnynose'].isnull()]

Unnamed: 0,fever,bodyPain,age,runnynose,diifBreadth,infectionProb
5598,,0.0,40,,-1.0,1
5592,98.0,1.0,66,,0.0,1


In [20]:
train[train['diifBreadth'].isnull()]

Unnamed: 0,fever,bodyPain,age,runnynose,diifBreadth,infectionProb
5587,104.0,0.0,61,1.0,,0


In [21]:
def missing_values(data):
    global missing_value 
    
    #Creating data frame for the missing value percentages
    missing_value = pd.DataFrame(train.isnull().sum())

    #Reseting index
    missing_value = missing_value.reset_index()

    #Rename variable
    missing_value = missing_value.rename(columns = {'index': 'Variables', 0: 'Missing_percentage'})

    #Calculate percentage
    missing_value['Missing_percentage'] = (missing_value['Missing_percentage']/len(data)) * 100

    #Descending order 
    missing_value = missing_value.sort_values('Missing_percentage', ascending = False).reset_index(drop = True)
    
    print(missing_value)

In [22]:
missing_values(train)

       Variables  Missing_percentage
0          fever            0.133690
1       bodyPain            0.044563
2      runnynose            0.044563
3    diifBreadth            0.022282
4            age            0.000000
5  infectionProb            0.000000


In [23]:
pd.DataFrame(train['fever'].value_counts())

Unnamed: 0,fever
98.000000,4
101.000000,4
100.000000,4
99.000000,3
102.000000,2
...,...
101.096396,1
100.961983,1
98.984899,1
98.201562,1


In [24]:
train['fever'] = train['fever'].fillna(train['fever'].mean())
train = train.dropna()
train['runnynose'] = train['runnynose'].fillna(train['runnynose'].mean())
train = train.dropna()
train['bodyPain'] = train['bodyPain'].fillna(train['bodyPain'].mean())
train = train.dropna()
train['diifBreadth'] = train['diifBreadth'].fillna(train['diifBreadth'].mean())
train = train.dropna()

test['fever'] = test['fever'].fillna(test['fever'].mean())
test = test.dropna()
test['runnynose'] = test['runnynose'].fillna(test['runnynose'].mean())
test = test.dropna()
test['bodyPain'] = test['bodyPain'].fillna(test['bodyPain'].mean())
test = test.dropna()
test['diifBreadth'] = test['diifBreadth'].fillna(test['diifBreadth'].mean())
test = test.dropna()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be s

In [25]:
train[train['fever'].isnull()]

Unnamed: 0,fever,bodyPain,age,runnynose,diifBreadth,infectionProb


In [26]:
train[train['bodyPain'].isnull()]

Unnamed: 0,fever,bodyPain,age,runnynose,diifBreadth,infectionProb


In [27]:
train[train['runnynose'].isnull()]

Unnamed: 0,fever,bodyPain,age,runnynose,diifBreadth,infectionProb


In [28]:
train[train['diifBreadth'].isnull()]

Unnamed: 0,fever,bodyPain,age,runnynose,diifBreadth,infectionProb


In [29]:
print(train.isnull().sum())

fever            0
bodyPain         0
age              0
runnynose        0
diifBreadth      0
infectionProb    0
dtype: int64


In [30]:
print(train.dtypes)

fever            float64
bodyPain         float64
age                int64
runnynose        float64
diifBreadth      float64
infectionProb      int64
dtype: object


In [31]:
print(test.dtypes)

fever            float64
bodyPain         float64
age                int64
runnynose        float64
diifBreadth      float64
infectionProb      int64
dtype: object


In [32]:
train['fever'] = pd.to_numeric(train['fever'], downcast='integer', errors = "coerce")
test['fever'] = pd.to_numeric(test['fever'], downcast='integer', errors = "coerce")
train['bodyPain'] = pd.to_numeric(train['bodyPain'], downcast='integer', errors = "coerce")
test['bodyPain'] = pd.to_numeric(test['bodyPain'], downcast='integer', errors = "coerce")
train['runnynose'] = pd.to_numeric(train['runnynose'], downcast='integer', errors = "coerce")
test['runnynose'] = pd.to_numeric(test['runnynose'], downcast='integer', errors = "coerce")
train['diifBreadth'] = pd.to_numeric(train['diifBreadth'], downcast='integer', errors = "coerce")
test['diifBreadth'] = pd.to_numeric(test['diifBreadth'], downcast='integer', errors = "coerce")

In [33]:
test[test['fever'].isnull()]


Unnamed: 0,fever,bodyPain,age,runnynose,diifBreadth,infectionProb


In [38]:
test.shape

(1118, 6)

In [35]:
train.shape

(4483, 6)

In [39]:
X_train = train[['fever', 'bodyPain', 'age', 'runnynose', 'diifBreadth' ]].to_numpy()
X_test = test[['fever', 'bodyPain','age', 'runnynose', 'diifBreadth']].to_numpy()


Y_train = train[['infectionProb']].to_numpy().reshape(train.shape[0],)
Y_test = test[['infectionProb']].to_numpy().reshape(test.shape[0],)

In [40]:
Y_train

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [41]:
Y_train.shape

(4483,)

In [42]:
from sklearn.linear_model import LogisticRegression

fit_LR = LogisticRegression()
fit_LR.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [43]:
inputFeature = [100, 1, 22, 1, 1]

fit_LR.predict([inputFeature])

array([1], dtype=int64)

In [44]:
infect_prob = fit_LR.predict_proba([inputFeature])[0][1]

In [45]:
infect_prob

0.5155069208703819

In [46]:
#prediction on train data
pred_train_LR = fit_LR.predict(X_train)

#prediction on test data
pred_test_LR = fit_LR.predict(X_test)

In [47]:
##calculating RMSE for test data
RMSE_test_LR = np.sqrt(mean_squared_error(Y_test, pred_test_LR))

##calculating RMSE for train data
RMSE_train_LR= np.sqrt(mean_squared_error(Y_train, pred_train_LR))

In [48]:
print("Root Mean Squared Error For Training data = "+str(RMSE_train_LR))
print("Root Mean Squared Error For Test data = "+str(RMSE_test_LR))

Root Mean Squared Error For Training data = 0.6974987977270597
Root Mean Squared Error For Test data = 0.7245996982604542


In [49]:
#calculate R^2 for train data
from sklearn.metrics import r2_score
r2_score(Y_train, pred_train_LR)

-0.9462322120638484

In [50]:
r2_score(Y_test, pred_test_LR)

-1.1121020092238272

## Decision tree Model :

In [51]:
fit_DT = DecisionTreeRegressor(max_depth = 2).fit(X_train,Y_train)

In [52]:
#prediction on train data
pred_train_DT = fit_DT.predict(X_train)

#prediction on test data
pred_test_DT = fit_DT.predict(X_test)

In [53]:
##calculating RMSE for train data
RMSE_train_DT = np.sqrt(mean_squared_error(Y_train, pred_train_DT))

##calculating RMSE for test data
RMSE_test_DT = np.sqrt(mean_squared_error(Y_test, pred_test_DT))

In [54]:
print("Root Mean Squared Error For Training data = "+str(RMSE_train_DT))
print("Root Mean Squared Error For Test data = "+str(RMSE_test_DT))

Root Mean Squared Error For Training data = 0.4986865382739763
Root Mean Squared Error For Test data = 0.5026951162950892


In [55]:
## R^2 calculation for train data
r2_score(Y_train, pred_train_DT)

0.005137595555333885

In [56]:
## R^2 calculation for test data
r2_score(Y_test, pred_test_DT)

-0.016548079290127582

## Random Forest Model :

In [105]:
fit_RF = RandomForestRegressor(n_estimators = 10000).fit(X_train,Y_train)

In [106]:
#prediction on train data
pred_train_RF = fit_RF.predict(X_train)

#prediction on test data
pred_test_RF = fit_RF.predict(X_test)

In [107]:
##calculating RMSE for train data
RMSE_train_RF = np.sqrt(mean_squared_error(Y_train, pred_train_RF))

##calculating RMSE for test data
RMSE_test_RF = np.sqrt(mean_squared_error(Y_test, pred_test_RF))

In [108]:
print("Root Mean Squared Error For Training data = "+str(RMSE_train_RF))
print("Root Mean Squared Error For Test data = "+str(RMSE_test_RF))

Root Mean Squared Error For Training data = 0.19551529887602023
Root Mean Squared Error For Test data = 0.5280306939023915


In [109]:

## calculate R^2 for train data

r2_score(Y_train, pred_train_RF)

0.8470782631796758

In [110]:

#calculate R^2 for test data
r2_score(Y_test, pred_test_RF)

-0.1215972318262597

## Gradient Boosting :

In [63]:
fit_GB = GradientBoostingRegressor().fit(X_train, Y_train)

In [64]:
#prediction on train data
pred_train_GB = fit_GB.predict(X_train)

#prediction on test data
pred_test_GB = fit_GB.predict(X_test)

In [65]:
##calculating RMSE for train data
RMSE_train_GB = np.sqrt(mean_squared_error(Y_train, pred_train_GB))

##calculating RMSE for test data
RMSE_test_GB = np.sqrt(mean_squared_error(Y_test, pred_test_GB))

In [66]:
print("Root Mean Squared Error For Training data = "+str(RMSE_train_GB))
print("Root Mean Squared Error For Test data = "+str(RMSE_test_GB))

Root Mean Squared Error For Training data = 0.47945560514517405
Root Mean Squared Error For Test data = 0.5044835967282992


In [67]:
#calculate R^2 for test data
r2_score(Y_test, pred_test_GB)

-0.023794262717358627

In [68]:

#calculate R^2 for train data
r2_score(Y_train, pred_train_GB)

0.0803882113532588

## Optimizing the results with parameters tuning :

In [69]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state = 42)
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [70]:
##Random Hyperparameter Grid
from sklearn.model_selection import train_test_split,RandomizedSearchCV

In [71]:
##Random Search CV on Random Forest Model

RRF = RandomForestRegressor(random_state = 0)
n_estimator = list(range(1,20,2))
depth = list(range(1,100,2))

# Create the random grid
rand_grid = {'n_estimators': n_estimator,
               'max_depth': depth}

randomcv_rf = RandomizedSearchCV(RRF, param_distributions = rand_grid, n_iter = 5, cv = 5, random_state=0)
randomcv_rf = randomcv_rf.fit(X_train,Y_train)
predictions_RRF = randomcv_rf.predict(X_test)

view_best_params_RRF = randomcv_rf.best_params_

best_model = randomcv_rf.best_estimator_

predictions_RRF = best_model.predict(X_test)

#R^2
RRF_r2 = r2_score(Y_test, predictions_RRF)
#Calculating RMSE
RRF_rmse = np.sqrt(mean_squared_error(Y_test,predictions_RRF))

print('Random Search CV Random Forest Regressor Model Performance:')
print('Best Parameters = ',view_best_params_RRF)
print('R-squared = {:0.2}.'.format(RRF_r2))
print('RMSE = ',RRF_rmse)

Random Search CV Random Forest Regressor Model Performance:
Best Parameters =  {'n_estimators': 15, 'max_depth': 9}
R-squared = -0.045.
RMSE =  0.5096054591147727


In [72]:
gb = GradientBoostingRegressor(random_state = 42)
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(gb.get_params())

Parameters currently in use:

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'presort': 'deprecated',
 'random_state': 42,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}


In [73]:
##Random Search CV on gradient boosting model

gb = GradientBoostingRegressor(random_state = 0)
n_estimator = list(range(1,20,2))
depth = list(range(1,100,2))

# Create the random grid
rand_grid = {'n_estimators': n_estimator,
               'max_depth': depth}

randomcv_gb = RandomizedSearchCV(gb, param_distributions = rand_grid, n_iter = 5, cv = 5, random_state=0)
randomcv_gb = randomcv_gb.fit(X_train,Y_train)
predictions_gb = randomcv_gb.predict(X_test)

view_best_params_gb = randomcv_gb.best_params_

best_model = randomcv_gb.best_estimator_

predictions_gb = best_model.predict(X_test)

#R^2
gb_r2 = r2_score(Y_test, predictions_gb)
#Calculating RMSE
gb_rmse = np.sqrt(mean_squared_error(Y_test,predictions_gb))

print('Random Search CV Gradient Boosting Model Performance:')
print('Best Parameters = ',view_best_params_gb)
print('R-squared = {:0.2}.'.format(gb_r2))
print('RMSE = ', gb_rmse)

Random Search CV Gradient Boosting Model Performance:
Best Parameters =  {'n_estimators': 15, 'max_depth': 9}
R-squared = -0.044.
RMSE =  0.5095016980650977


In [74]:
from sklearn.model_selection import GridSearchCV    
## Grid Search CV for random Forest model
regr = RandomForestRegressor(random_state = 0)
n_estimator = list(range(11,20,1))
depth = list(range(5,15,2))

# Create the grid
grid_search = {'n_estimators': n_estimator,
               'max_depth': depth}

## Grid Search Cross-Validation with 5 fold CV
gridcv_rf = GridSearchCV(regr, param_grid = grid_search, cv = 5)
gridcv_rf = gridcv_rf.fit(X_train,Y_train)
view_best_params_GRF = gridcv_rf.best_params_

#Apply model on test data
predictions_GRF = gridcv_rf.predict(X_test)

#R^2
GRF_r2 = r2_score(Y_test, predictions_GRF)
#Calculating RMSE
GRF_rmse = np.sqrt(mean_squared_error(Y_test,predictions_GRF))

print('Grid Search CV Random Forest Regressor Model Performance:')
print('Best Parameters = ',view_best_params_GRF)
print('R-squared = {:0.2}.'.format(GRF_r2))
print('RMSE = ',(GRF_rmse))

Grid Search CV Random Forest Regressor Model Performance:
Best Parameters =  {'max_depth': 5, 'n_estimators': 18}
R-squared = -0.02.
RMSE =  0.5034562471899223


In [75]:
## Grid Search CV for gradinet boosting
gb = GradientBoostingRegressor(random_state = 0)
n_estimator = list(range(11,20,1))
depth = list(range(5,15,2))

# Create the grid
grid_search = {'n_estimators': n_estimator,
               'max_depth': depth}

## Grid Search Cross-Validation with 5 fold CV
gridcv_gb = GridSearchCV(gb, param_grid = grid_search, cv = 5)
gridcv_gb = gridcv_gb.fit(X_train,Y_train)
view_best_params_Ggb = gridcv_gb.best_params_

#Apply model on test data
predictions_Ggb = gridcv_gb.predict(X_test)

#R^2
Ggb_r2 = r2_score(Y_test, predictions_Ggb)
#Calculating RMSE
Ggb_rmse = np.sqrt(mean_squared_error(Y_test,predictions_Ggb))

print('Grid Search CV Gradient Boosting regression Model Performance:')
print('Best Parameters = ',view_best_params_Ggb)
print('R-squared = {:0.2}.'.format(Ggb_r2))
print('RMSE = ',(Ggb_rmse))

Grid Search CV Gradient Boosting regression Model Performance:
Best Parameters =  {'max_depth': 5, 'n_estimators': 12}
R-squared = -0.016.
RMSE =  0.5025663659257044


In [76]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)
print(test.shape)
print(train.shape)
test.drop(["infectionProb"], axis = 1, inplace = True)
from sklearn.metrics import make_scorer

(4483, 5)
(4483,)
(1118, 5)
(1118,)
(1118, 6)
(4483, 6)


In [77]:
rmse_scorer = make_scorer(RMSE_train_DT, greater_is_better=False)

In [78]:
# import numpy as np
# # X_train=X_train.astype(np.int64)
# print(X_train.dtype)
# print(Y_train.dtype)
# X_train

In [79]:
## Grid Search CV for random Forest model
regr = DecisionTreeRegressor(max_depth = 2)
n_estimator = list(range(11,20,1))
depth = list(range(5,15,2))

# Create the grid
grid_search = {
    "min_samples_split": n_estimator,
    'max_depth': depth
}

## Grid Search Cross-Validation with 10 fold CV
gridcv_rf = GridSearchCV(regr, param_grid = grid_search, cv = 3)
gridcv_rf = gridcv_rf.fit(X_train,Y_train)
view_best_params_GRF = gridcv_rf.best_params_

#Apply model on test data
predictions_GRF_test_Df = gridcv_rf.predict(test)

In [80]:
predictions_GRF_test_Df

array([0.46655791, 0.51106796, 0.46655791, ..., 0.51106796, 0.51106796,
       0.51106796])

In [81]:
test['infectionProb'] = predictions_GRF_test_Df

In [82]:
test.head(20)

Unnamed: 0,fever,bodyPain,age,runnynose,diifBreadth,infectionProb
942,99.832619,1,23,0,-1,0.466558
2315,100.19993,0,34,1,1,0.511068
2414,99.938159,1,55,0,-1,0.466558
5004,99.011481,1,30,1,0,0.511068
585,98.883007,1,90,0,1,0.511068
3186,99.655839,0,99,0,1,0.593137
3877,99.082883,1,67,1,1,0.511068
3242,98.683538,1,71,0,-1,0.466558
421,99.072951,0,96,1,1,0.593137
1030,99.357564,1,88,1,-1,0.466558


In [83]:
clf = LogisticRegression()
clf.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [84]:
input_features = [102, 1, 22, -1, 1]
infProb = clf.predict_proba([input_features])[0][1]

In [85]:
infProb

0.5124487789371915

In [86]:
input_features = [102, 1, 60, 0, 1]
clf.predict_proba([input_features])[0][1]

0.5040848062387709

In [87]:
input_features = [102, 1, 22, -1, 1]
infProb = clf.predict([input_features])

In [88]:
infProb[0]

1

In [89]:
input_features = [104, 1, 80, 1, 1]
clf.predict([input_features])[0]

1

In [90]:
input_features = [102, 1, 22, 1, 1]
clf.predict([input_features])[0]

1

In [91]:
input_features = [102, 1, 50, 0, 1]
clf.predict([input_features])[0]

1