# Regression Model Analysis: Predictive Model
## Pre-processing


In [49]:
# Importing useful libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
import warnings

In [50]:
warnings.filterwarnings("ignore")

### Create dataframe

In [51]:
# patient dataframe
patient_dataframe = pd.read_csv('./Populated_data/patient.csv')
patient_dataframe.shape

(250, 29)

In [52]:
# doctor dataframe
doctor_dataframe = pd.read_csv('./Populated_data/doctor.csv')
doctor_dataframe.shape

(214, 22)

In [53]:
# consultation dataframe
consultation_dataframe = pd.read_csv('./Populated_data/consultation.csv')
consultation_dataframe.shape

(520, 17)

### Check duplicates

In [54]:
# for patient dataframe
patient_dataframe[patient_dataframe.duplicated()==True]

Unnamed: 0,patient_id,address,allergies,blood_type,closest_hospital,community,country,date_of_birth,disabilities,first_name,...,num_of_dependents,post_code,preferred_language,purpose_of_visit,state,sur_name,title,user_id,weight,hospital_id


In [55]:
# for doctor dataframe
doctor_dataframe[doctor_dataframe.duplicated()==True]

Unnamed: 0,doctor_id,approved,city,cv,date_of_birth,dependant_stand_by,full_name,doctor_gender,graduation_date,is_student,...,name_of_university,national_identification_number,office_address,reference_doctor_email,reference_doctor_full_name,reference_doctor_phone_number,state,student_number,system_status,title


In [56]:
# for consultation dataframe
consultation_dataframe[consultation_dataframe.duplicated()==True]

Unnamed: 0,id,consultation_cost,audio_detailed_description,patient_body_area,consent,doctor_full_name,doctor_id,language,patient_full_name,patient_id,patient_mobile,patient_status,time_accepted,time_booked,time_finished,time_started,transaction_reference


### Check missing value(s)

In [57]:
# for patient dataframe
nan=pd.DataFrame({'Missing value': patient_dataframe.isnull().sum().sort_values(ascending=False)})
nan[nan['Missing value'] > 0]

Unnamed: 0,Missing value
allergies,80
medical_problems,50
disabilities,38


In [58]:
# for doctor dataframe
nan=pd.DataFrame({'Missing value': doctor_dataframe.isnull().sum().sort_values(ascending=False)})
nan[nan['Missing value'] > 0]

Unnamed: 0,Missing value
cv,214


In [59]:
# for consultation dataframe
nan=pd.DataFrame({'Missing value': consultation_dataframe.isnull().sum().sort_values(ascending=False)})
nan[nan['Missing value'] > 0]

Unnamed: 0,Missing value
audio_detailed_description,520


    N.B. Since medical data require medical examination, filling the missing variable(s) would be inappropriat. Hence droping missing variable(s) is recommended.

In [60]:
# Drop missing varaiables in patient dataframe
patient_dataframe.dropna(axis=1, inplace=True)
patient_dataframe.reset_index(drop=True, inplace=True)

In [61]:
nan=pd.DataFrame({'Missing value': patient_dataframe.isnull().sum().sort_values(ascending=False)})
nan[nan['Missing value'] > 0]

Unnamed: 0,Missing value


In [62]:
patient_dataframe.shape

(250, 26)

In [63]:
patient_dataframe.head()

Unnamed: 0,patient_id,address,blood_type,closest_hospital,community,country,date_of_birth,first_name,gender,has_children,...,num_of_dependents,post_code,preferred_language,purpose_of_visit,state,sur_name,title,user_id,weight,hospital_id
0,1,Ap #266-429 Velit St.,AB+,Blue Cross Hospital,suburban,nigeria,4/13/2009,ACRA,Female,Yes,...,2,10201,english,medical examination,Lagos,SMITH,Mrs.,1,69,1
1,2,315-4283 Nam Street,O-,Krown Hospital,suburban,nigeria,7/8/2011,ADOLPH,Male,Yes,...,2,12224,english,medical examination,Lagos,NEILON,Mr.,2,80,1
2,3,921-1491 Nonummy Ave,A-,Ave Maria Hospital,suburban,nigeria,12/30/2000,ANKI,Male,Yes,...,3,53510,english,medical examination,Lagos,HENDRI,Mr.,3,67,1
3,4,Ap #299-4324 Mauris Avenue,O-,Isolo General Hospital,urban,nigeria,9/15/1987,GARBO,Male,No,...,3,25193,english,medical examination,Lagos,SHARRON,Mr.,4,74,1
4,5,"P.O. Box 913, 6976 Magnis Rd.",O-,Lagoon Hospitals,rural,nigeria,10/10/2002,ADONAI,Female,No,...,1,37171,english,medical examination,Lagos,NEIVA,Ms.,5,62,1


In [64]:
# cleaned patient dataset
patient_dataframe.to_csv('cleaned_patient.csv', index=False)

In [65]:
# Drop missing varaiables in doctor dataframe
doctor_dataframe.dropna(axis=1, inplace=True)
doctor_dataframe.reset_index(drop=True, inplace=True)

In [66]:
nan=pd.DataFrame({'Missing value': doctor_dataframe.isnull().sum().sort_values(ascending=False)})
nan[nan['Missing value'] > 0]

Unnamed: 0,Missing value


In [67]:
doctor_dataframe.shape

(214, 21)

In [68]:
doctor_dataframe.head()

Unnamed: 0,doctor_id,approved,city,date_of_birth,dependant_stand_by,full_name,doctor_gender,graduation_date,is_student,marital_status,...,name_of_university,national_identification_number,office_address,reference_doctor_email,reference_doctor_full_name,reference_doctor_phone_number,state,student_number,system_status,title
0,1,Yes,Lagos,8/5/1972,1,ACRA SMITH,Male,9/6/2019,No,Married,...,Caleb University- Lagos,1122458,618-3434 Sagittis Av.,acrasmith@gmail.com,ACRA SMITH,53488888,Lagos,2710,very good,Mr.
1,2,Yes,Lagos,7/12/1989,2,ADOLPH NEILON,Female,6/25/2006,Yes,Married,...,National Open University of Nigeria- Lagos,4563751,405-9892 Ultrices Rd.,adolphneilon@gmail.com,ADOLPH NEILON,50989139,Lagos,5370,good,Ms.
2,3,Yes,Lagos,9/30/1965,1,ANKI HENDRI,Male,12/20/2008,Yes,Married,...,James Hope University- Lagos,1877514,Ap #561-9293 Sociis St.,ankihendri@gmail.com,ANKI HENDRI,42144921,Lagos,6696,excellent,Mr.
3,4,Yes,Lagos,8/12/1964,1,GARBO SHARRON,Male,8/22/2013,No,Married,...,Lagos State University of Science and Technolo...,2513477,"Ap #381-3175 Sed, Street",garbosharron@gmail.com,GARBO SHARRON,52477143,Lagos,4343,fair,Mr.
4,5,Yes,Lagos,12/5/1963,0,ADONAI NEIVA,Female,12/28/2021,No,Married,...,Lagos State University of Science and Technolo...,3463224,"Ap #230-5566 Amet, St.",adonaineiva@gmail.com,ADONAI NEIVA,29517478,Lagos,4300,very good,Ms.


In [69]:
# cleaned doctor dataset
doctor_dataframe.to_csv('cleaned_doctor.csv', index=False)

In [70]:
# Drop missing varaiables in consultation dataframe
consultation_dataframe.dropna(axis=1, inplace=True)
consultation_dataframe.reset_index(drop=True, inplace=True)

In [71]:
nan=pd.DataFrame({'Missing value': consultation_dataframe.isnull().sum().sort_values(ascending=False)})
nan[nan['Missing value'] > 0]

Unnamed: 0,Missing value


In [72]:
consultation_dataframe.shape

(520, 16)

In [73]:
consultation_dataframe.head()

Unnamed: 0,id,consultation_cost,patient_body_area,consent,doctor_full_name,doctor_id,language,patient_full_name,patient_id,patient_mobile,patient_status,time_accepted,time_booked,time_finished,time_started,transaction_reference
0,1,1900,legs,Yes,ACRA SMITH,1,english,ADRIATIK NELL,100,6780843,under appointment,13:30,2:00 PM,5:10 PM,3:10 PM,1
1,2,9100,chest area,Yes,ADOLPH NEILON,2,english,ANNAMARIE HERA,72,4272348,new consultation,10:15,10:45 AM,12:45 PM,11:45 AM,2
2,3,8200,head,Yes,ANKI HENDRI,3,english,GARRICK SHATISHA,150,6183602,new consultation,8:45,9:35 AM,12:15 PM,10:35 AM,3
3,4,5600,back,Yes,GARBO SHARRON,4,english,ADRIELLE NELLI,27,6822469,appointed session,13:15,1:55 PM,4:35 PM,2:45 PM,4
4,5,1200,stomach,Yes,ADONAI NEIVA,5,english,ANNASTASIA HERARD,191,7229498,new consultation,9:00,9:30 AM,11:20 AM,10:30 AM,5


In [74]:
# cleaned consultation dataset
consultation_dataframe.to_csv('cleaned_consultation.csv', index=False)

### Creating the new simulated dataset via merg

In [75]:
# Calculate variable age using variable date_of_birth in the patient_dataframe

from datetime import datetime

# Convert date_of_birth column to datetime
patient_dataframe['date_of_birth'] = pd.to_datetime(patient_dataframe['date_of_birth'])

# Calculate age by subtracting date_of_birth from current date
patient_dataframe['age'] = (datetime.now() - patient_dataframe['date_of_birth']) // pd.Timedelta(days=365)


In [76]:
patient_dataframe.head()

Unnamed: 0,patient_id,address,blood_type,closest_hospital,community,country,date_of_birth,first_name,gender,has_children,...,post_code,preferred_language,purpose_of_visit,state,sur_name,title,user_id,weight,hospital_id,age
0,1,Ap #266-429 Velit St.,AB+,Blue Cross Hospital,suburban,nigeria,2009-04-13,ACRA,Female,Yes,...,10201,english,medical examination,Lagos,SMITH,Mrs.,1,69,1,14
1,2,315-4283 Nam Street,O-,Krown Hospital,suburban,nigeria,2011-07-08,ADOLPH,Male,Yes,...,12224,english,medical examination,Lagos,NEILON,Mr.,2,80,1,11
2,3,921-1491 Nonummy Ave,A-,Ave Maria Hospital,suburban,nigeria,2000-12-30,ANKI,Male,Yes,...,53510,english,medical examination,Lagos,HENDRI,Mr.,3,67,1,22
3,4,Ap #299-4324 Mauris Avenue,O-,Isolo General Hospital,urban,nigeria,1987-09-15,GARBO,Male,No,...,25193,english,medical examination,Lagos,SHARRON,Mr.,4,74,1,35
4,5,"P.O. Box 913, 6976 Magnis Rd.",O-,Lagoon Hospitals,rural,nigeria,2002-10-10,ADONAI,Female,No,...,37171,english,medical examination,Lagos,NEIVA,Ms.,5,62,1,20


In [77]:
# Calculate variable years_of_experience using variable graduation_date in doctor_dataframe

# Convert graduation_date column to datetime
doctor_dataframe['graduation_date'] = pd.to_datetime(doctor_dataframe['graduation_date'])

# Calculate years_of_experience by subtracting graduation_date from current date
doctor_dataframe['years_of_experience'] = (datetime.now() - doctor_dataframe['graduation_date']) // pd.Timedelta(days=365)

In [78]:
doctor_dataframe.head()

Unnamed: 0,doctor_id,approved,city,date_of_birth,dependant_stand_by,full_name,doctor_gender,graduation_date,is_student,marital_status,...,national_identification_number,office_address,reference_doctor_email,reference_doctor_full_name,reference_doctor_phone_number,state,student_number,system_status,title,years_of_experience
0,1,Yes,Lagos,8/5/1972,1,ACRA SMITH,Male,2019-09-06,No,Married,...,1122458,618-3434 Sagittis Av.,acrasmith@gmail.com,ACRA SMITH,53488888,Lagos,2710,very good,Mr.,3
1,2,Yes,Lagos,7/12/1989,2,ADOLPH NEILON,Female,2006-06-25,Yes,Married,...,4563751,405-9892 Ultrices Rd.,adolphneilon@gmail.com,ADOLPH NEILON,50989139,Lagos,5370,good,Ms.,16
2,3,Yes,Lagos,9/30/1965,1,ANKI HENDRI,Male,2008-12-20,Yes,Married,...,1877514,Ap #561-9293 Sociis St.,ankihendri@gmail.com,ANKI HENDRI,42144921,Lagos,6696,excellent,Mr.,14
3,4,Yes,Lagos,8/12/1964,1,GARBO SHARRON,Male,2013-08-22,No,Married,...,2513477,"Ap #381-3175 Sed, Street",garbosharron@gmail.com,GARBO SHARRON,52477143,Lagos,4343,fair,Mr.,9
4,5,Yes,Lagos,12/5/1963,0,ADONAI NEIVA,Female,2021-12-28,No,Married,...,3463224,"Ap #230-5566 Amet, St.",adonaineiva@gmail.com,ADONAI NEIVA,29517478,Lagos,4300,very good,Ms.,1


In [79]:
# Merging patient_datafram and doctor_datafram to consultation_dataframe
patient_consultation = pd.merge(consultation_dataframe[['id', 'consultation_cost', 'patient_body_area', 'patient_status', 'patient_id', 'doctor_id']], 
                                patient_dataframe[["patient_id", 'community', 'marital_status', 'has_children', 'has_dependent', 'num_of_children', 'num_of_dependents', "age", "gender"]], on="patient_id", how="left")
doctor_patient_consultation = pd.merge(patient_consultation, doctor_dataframe[["doctor_id", "years_of_experience"]], on="doctor_id", how="left")
doctor_patient_consultation.shape

(520, 15)

In [80]:
doctor_patient_consultation.head()

Unnamed: 0,id,consultation_cost,patient_body_area,patient_status,patient_id,doctor_id,community,marital_status,has_children,has_dependent,num_of_children,num_of_dependents,age,gender,years_of_experience
0,1,1900,legs,under appointment,100,1,suburban,Divorced,Yes,No,4,3,29,Female,3
1,2,9100,chest area,new consultation,72,2,rural,Married,No,Yes,3,2,12,Female,16
2,3,8200,head,new consultation,150,3,rural,Married,Yes,Yes,2,1,35,Male,14
3,4,5600,back,appointed session,27,4,rural,Married,Yes,No,2,1,25,Female,9
4,5,1200,stomach,new consultation,191,5,suburban,Single,No,Yes,1,1,39,Female,1


In [81]:
# The Simulated dataframe
simulated_dataframe = doctor_patient_consultation.drop(["doctor_id", "patient_id"], axis=1)
simulated_dataframe=pd.DataFrame({'consult_id':doctor_patient_consultation['id'], 'consult_cost':doctor_patient_consultation['consultation_cost'] , 'patient_anatomy':doctor_patient_consultation['patient_body_area'],
                         'patient_consult_status':doctor_patient_consultation['patient_status'], 'patient_community':doctor_patient_consultation['community'], 'patient_marital_status':doctor_patient_consultation['marital_status'], 
                         'patient_has_children':doctor_patient_consultation['has_children'], 'patient_has_dependent':doctor_patient_consultation['has_dependent'], 'patient_num_of_children':doctor_patient_consultation['num_of_children'], 
                         'patient_num_of_dependents':doctor_patient_consultation['num_of_dependents'], 'patient_age':doctor_patient_consultation['age'], 'patient_gender':doctor_patient_consultation['gender'], 
                         'consultant_doctor_experience':doctor_patient_consultation['years_of_experience']})

simulated_dataframe.shape

(520, 13)

In [82]:
simulated_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 520 entries, 0 to 519
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   consult_id                    520 non-null    int64 
 1   consult_cost                  520 non-null    int64 
 2   patient_anatomy               520 non-null    object
 3   patient_consult_status        520 non-null    object
 4   patient_community             520 non-null    object
 5   patient_marital_status        520 non-null    object
 6   patient_has_children          520 non-null    object
 7   patient_has_dependent         520 non-null    object
 8   patient_num_of_children       520 non-null    int64 
 9   patient_num_of_dependents     520 non-null    int64 
 10  patient_age                   520 non-null    int64 
 11  patient_gender                520 non-null    object
 12  consultant_doctor_experience  520 non-null    int64 
dtypes: int64(6), object(

#### Transform the catagorical features to dummy-variable format

In [83]:
print(simulated_dataframe['patient_anatomy'].unique())
print(simulated_dataframe['patient_consult_status'].unique())
print(simulated_dataframe['patient_community'].unique())
print(simulated_dataframe['patient_marital_status'].unique())
print(simulated_dataframe['patient_has_children'].unique())
print(simulated_dataframe['patient_has_dependent'].unique())
print(simulated_dataframe['patient_gender'].unique())

['legs' 'chest area' 'head' 'back' 'stomach' 'upper chest area' 'shoulder']
['under appointment' 'new consultation' 'appointed session' 'admitted']
['suburban' 'rural' 'urban']
['Divorced' 'Married' 'Single']
['Yes' 'No']
['No' 'Yes']
['Female' 'Male']


In [84]:
simulated_dataframe = pd.concat([simulated_dataframe, pd.get_dummies(simulated_dataframe['patient_anatomy'], prefix='patient_anatomy', drop_first=True)],axis=1)
simulated_dataframe = pd.concat([simulated_dataframe, pd.get_dummies(simulated_dataframe['patient_consult_status'], prefix='patient_consult_status', drop_first=True)],axis=1)
simulated_dataframe = pd.concat([simulated_dataframe, pd.get_dummies(simulated_dataframe['patient_community'], prefix='patient_community', drop_first=True)],axis=1)
simulated_dataframe = pd.concat([simulated_dataframe, pd.get_dummies(simulated_dataframe['patient_marital_status'], prefix='patient_marital_status', drop_first=True)],axis=1)
simulated_dataframe = pd.concat([simulated_dataframe, pd.get_dummies(simulated_dataframe['patient_has_children'], prefix='patient_has_children', drop_first=True)],axis=1)
simulated_dataframe = pd.concat([simulated_dataframe, pd.get_dummies(simulated_dataframe['patient_has_dependent'], prefix='patient_has_dependent', drop_first=True)],axis=1)
simulated_dataframe = pd.concat([simulated_dataframe, pd.get_dummies(simulated_dataframe['patient_gender'], prefix='patient_gender', drop_first=True)],axis=1)
simulated_dataframe.drop(['patient_anatomy','patient_consult_status','patient_community', 'patient_marital_status', 'patient_has_children', 'patient_has_dependent', 'patient_gender' ], axis=1, inplace=True)

In [85]:
# The simulated dataset
simulated_dataframe.to_csv('simulated_data.csv', index=False)

In [86]:
simulated_dataframe=pd.read_csv('simulated_data.csv', index_col='consult_id')
simulated_dataframe.head()

Unnamed: 0_level_0,consult_cost,patient_num_of_children,patient_num_of_dependents,patient_age,consultant_doctor_experience,patient_anatomy_chest area,patient_anatomy_head,patient_anatomy_legs,patient_anatomy_shoulder,patient_anatomy_stomach,...,patient_consult_status_appointed session,patient_consult_status_new consultation,patient_consult_status_under appointment,patient_community_suburban,patient_community_urban,patient_marital_status_Married,patient_marital_status_Single,patient_has_children_Yes,patient_has_dependent_Yes,patient_gender_Male
consult_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1900,4,3,29,3,0,0,1,0,0,...,0,0,1,1,0,0,0,1,0,0
2,9100,3,2,12,16,1,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
3,8200,2,1,35,14,0,1,0,0,0,...,0,1,0,0,0,1,0,1,1,1
4,5600,2,1,25,9,0,0,0,0,0,...,1,0,0,0,0,1,0,1,0,0
5,1200,1,1,39,1,0,0,0,0,1,...,0,1,0,1,0,0,1,0,1,0


In [87]:
simulated_dataframe.shape

(520, 21)

#### Describe or summarize 

In [88]:
simulated_dataframe.describe()

Unnamed: 0,consult_cost,patient_num_of_children,patient_num_of_dependents,patient_age,consultant_doctor_experience,patient_anatomy_chest area,patient_anatomy_head,patient_anatomy_legs,patient_anatomy_shoulder,patient_anatomy_stomach,...,patient_consult_status_appointed session,patient_consult_status_new consultation,patient_consult_status_under appointment,patient_community_suburban,patient_community_urban,patient_marital_status_Married,patient_marital_status_Single,patient_has_children_Yes,patient_has_dependent_Yes,patient_gender_Male
count,520.0,520.0,520.0,520.0,520.0,520.0,520.0,520.0,520.0,520.0,...,520.0,520.0,520.0,520.0,520.0,520.0,520.0,520.0,520.0,520.0
mean,5439.807692,1.984615,1.988462,24.798077,8.907692,0.090385,0.171154,0.119231,0.098077,0.209615,...,0.169231,0.434615,0.198077,0.357692,0.317308,0.359615,0.267308,0.505769,0.517308,0.5
std,2814.354199,1.267556,1.325553,8.759888,5.075076,0.287008,0.377006,0.324372,0.297705,0.407426,...,0.375317,0.496184,0.398934,0.479782,0.465876,0.48035,0.44298,0.500448,0.500182,0.500481
min,1100.0,0.0,0.0,10.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2900.0,1.0,1.0,17.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5350.0,2.0,2.0,24.5,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.5
75%,8000.0,3.0,3.0,32.0,14.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,10700.0,4.0,4.0,40.0,18.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [89]:
px.histogram(simulated_dataframe, x="consult_cost", marginal="box")

In [90]:
px.histogram(simulated_dataframe, x="consultant_doctor_experience", marginal="box")

In [91]:
px.histogram(simulated_dataframe, x="patient_age", marginal="box")

#### correlation

In [92]:
corr = simulated_dataframe.corr()
fig = px.imshow(corr, text_auto=True, aspect="auto")

# Title
fig.update_layout(title_text="Correlation matrix plot of the simulated dataset")

fig.show()

In [93]:
# use logarithm of target variable (consultation_cost) to minimize outlier and mean square error(MSE) 
simulated_dataframe['consult_cost'] = np.log(simulated_dataframe['consult_cost'])

## Choices for Insurance Model
        Choose a suitable machine learning predictive model that forcasts the value of the target.

Getting the train and test datasets 

In [94]:
target_variable = 'consult_cost'
X = simulated_dataframe.drop(target_variable, axis=1)

#Apply robust scaling to explanatory variables to reduce outliers
robust_scaler = RobustScaler()
X = robust_scaler.fit_transform(X)  # x_new=x-median(data)/(Q3-Q1)

y = simulated_dataframe[target_variable]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

Preparing dataframe for model analysis

In [95]:
mean_Model = pd.DataFrame(index=['train_MSE', 'test_MSE'], columns=['RF', 'MLR', 'KNN', 'LASSO'])
r_Model = pd.DataFrame(index=['train_R2', 'test_R2'], columns=['RF', 'MLR', 'KNN', 'LASSO'])

Building models 

    Random Forest Regression (RF)

In [96]:
# 1. Import the estimator object (model)
from sklearn.ensemble import RandomForestRegressor
# 2. Create an instance of the estimator
RF = RandomForestRegressor(n_estimators=100, random_state=21)
# 3. Use the trainning data to train the estimator
RF.fit(x_train, y_train)
# 4. Evaluate the model
mean_Model.loc['train_MSE','RF'] = mean_squared_error(y_pred=RF.predict(x_train), y_true=y_train)
mean_Model.loc['test_MSE','RF'] = mean_squared_error(y_pred=RF.predict(x_test), y_true=y_test)
r_Model.loc['train_R2','RF'] = r2_score(y_pred=RF.predict(x_train), y_true=y_train)
r_Model.loc['test_R2','RF'] = r2_score(y_pred=RF.predict(x_test), y_true=y_test)

    Multiple Linear Regression (MLR)

In [97]:
# 1. Import the estimator object (model)
from sklearn.linear_model import LinearRegression
# 2. Create an instance of the estimator
linear_regression = LinearRegression()
# 3. Use the trainning data to train the estimator
linear_regression.fit(x_train, y_train)
# 4. Evaluate the model
mean_Model.loc['train_MSE','MLR'] = mean_squared_error(y_pred=linear_regression.predict(x_train), y_true=y_train)
mean_Model.loc['test_MSE','MLR'] = mean_squared_error(y_pred=linear_regression.predict(x_test), y_true=y_test)
r_Model.loc['train_R2','MLR'] = r2_score(y_pred=linear_regression.predict(x_train), y_true=y_train)
r_Model.loc['test_R2','MLR'] = r2_score(y_pred=linear_regression.predict(x_test), y_true=y_test)

    K-Nearest Neighbor Model (KNN)

In [98]:
# 1. Import the estimator object (model)
from sklearn.neighbors import KNeighborsRegressor
# 2. Create an instance of the estimator
knn = KNeighborsRegressor(n_neighbors=7, weights='distance', metric='euclidean', n_jobs=-1)
# 3. Use the training data to train the estimator
knn.fit(x_train, y_train)
# 4. Evaluate the model
mean_Model.loc['train_MSE','KNN'] = mean_squared_error(y_pred=knn.predict(x_train), y_true=y_train)
mean_Model.loc['test_MSE','KNN'] = mean_squared_error(y_pred=knn.predict(x_test), y_true=y_test)
r_Model.loc['train_R2','KNN'] = r2_score(y_pred=knn.predict(x_train), y_true=y_train)
r_Model.loc['test_R2','KNN'] = r2_score(y_pred=knn.predict(x_test), y_true=y_test)

    Least Absolute Shrinkage and Selection Operator (LASSO)

In [99]:
# 1. Import the estimator object (model)
from sklearn.linear_model import Lasso
# 2. Create an instance of the estimator
lasso = Lasso(alpha=0.05)
# 3. Use the training data to train the estimator
lasso.fit(x_train, y_train)
# 4. Evaluate the model
mean_Model.loc['train_MSE','LASSO'] = mean_squared_error(y_pred=lasso.predict(x_train), y_true=y_train)
mean_Model.loc['test_MSE','LASSO'] = mean_squared_error(y_pred=lasso.predict(x_test), y_true=y_test)
r_Model.loc['train_R2','LASSO'] = r2_score(y_pred=lasso.predict(x_train), y_true=y_train)
r_Model.loc['test_R2','LASSO'] = r2_score(y_pred=lasso.predict(x_test), y_true=y_test)

In [100]:
mean_Model

Unnamed: 0,RF,MLR,KNN,LASSO
train_MSE,0.000783,0.046725,0.0,0.058301
test_MSE,0.011064,0.053606,0.200941,0.060335


In [101]:
train_MSE_plot = [
    go.Bar(
        y=['RF_train_MSE', 'MLR_train_MSE', 'KNN_train_MSE', 'LASSO_train_MSE'],
        x=[0.000686, 0.046501, 0.0, 0.057919],
        orientation='h',
        marker=dict(color=['red', 'orange', 'green', 'blue'])
    )
]

layout = go.Layout(title='Mean square error of regression models with train data')

fig = go.Figure(data=train_MSE_plot, layout=layout)

fig.show()

In [102]:
test_MSE_plot = [
    go.Bar(
        y=['RF_test_MSE', 'MLR_test_MSE', 'KNN_test_MSE', 'LASSO_test_MSE'],
        x=[0.00927, 0.053127, 0.199986, 0.059767],
        orientation='h',
        marker=dict(color=['red', 'orange', 'green', 'blue'])
    )
]

layout = go.Layout(title='Mean square error of regression models with test data')

fig = go.Figure(data=test_MSE_plot, layout=layout)

fig.show()

In [103]:
r_Model

Unnamed: 0,RF,MLR,KNN,LASSO
train_R2,0.998252,0.895727,1.0,0.869895
test_R2,0.973028,0.869319,0.510142,0.852915


In [104]:
train_R2_plot = [
    go.Bar(
        y=['RF_train_R2', 'MLR_train_R2', 'KNN_train_R2', 'LASSO_train_R2'],
        x=[0.998468, 0.896227, 1.0, 0.870747],
        orientation='h',
        marker=dict(color=['red', 'orange', 'green', 'blue'])
    )
]

layout = go.Layout(title='R-square score of regression models with train data')

fig = go.Figure(data=train_R2_plot, layout=layout)

fig.show()

In [105]:
test_R2_plot = [
    go.Bar(
        y=['RF_test_R2', 'MLR_test_R2', 'KNN_test_R2', 'LASSO_test_R2'],
        x=[0.977403, 0.870487, 0.512472, 0.854298],
        orientation='h',
        marker=dict(color=['red', 'orange', 'green', 'blue'])
    )
]

layout = go.Layout(title='R-square score of regression models with test data')

fig = go.Figure(data=test_R2_plot, layout=layout)

fig.show()

In [106]:
# Create a scatter plot using trained data
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        y=y_train,
        x=knn.predict(x_train),
        mode='markers',
        marker=dict(color='blue')
    )
)

# Add a diagonal line to represent perfect predictions
fig.add_trace(
    go.Scatter(
        y=[min(y_train), max(y_train)],
        x=[min(y_train), max(y_train)],
        mode='lines',
        line=dict(color='red', dash='dash')
    )
)

# Customize the layout
fig.update_layout(
    title='KNN:Observed target vs Predicted target using train data',
    xaxis_title='Observed consult_cost',
    yaxis_title='Predicted consult_cost',
    width=800,
    height=500
)

# Show the plot
fig.show()

In [107]:
# Create a scatter plot using test data
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        y=y_test,
        x=knn.predict(x_test),
        mode='markers',
        marker=dict(color='blue')
    )
)

# Add a diagonal line to represent perfect predictions
fig.add_trace(
    go.Scatter(
        y=[min(y_test), max(y_test)],
        x=[min(y_test), max(y_test)],
        mode='lines',
        line=dict(color='red', dash='dash')
    )
)

# Customize the layout
fig.update_layout(
    title='KNN:Observed target vs Predicted target using test data',
    xaxis_title='Observed consult_cost',
    yaxis_title='Predicted consult_cost',
    width=800,
    height=500
)

# Show the plot
fig.show()