In [1]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,RandomizedSearchCV
from sklearn.linear_model import LinearRegression,ElasticNet,ElasticNetCV,Ridge,RidgeCV,Lasso,LassoCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,root_mean_squared_error
import xgboost as xgb
import pandas as pd
import numpy as np
import logging
import matplotlib.pyplot as plt # type: ignore
import seaborn as sns # type: ignore
import pickle

In [2]:
logging.basicConfig(
    filename='prediction.log',
    filemode='w',
    level=logging.DEBUG,
    format='%(asctime)s-%(name)s-%(levelname)s-%(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

In [3]:
social=pd.read_csv(r'Time-Wasters on Social Media.csv')
data=pd.DataFrame(social)
logging.info('retrieving the data')

In [4]:
data.shape

(1000, 31)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   UserID                    1000 non-null   int64 
 1   Age                       1000 non-null   int64 
 2   Gender                    1000 non-null   object
 3   Location                  1000 non-null   object
 4   Income                    1000 non-null   int64 
 5   Debt                      1000 non-null   bool  
 6   Owns Property             1000 non-null   bool  
 7   Profession                1000 non-null   object
 8   Demographics              1000 non-null   object
 9   Platform                  1000 non-null   object
 10  Total Time Spent          1000 non-null   int64 
 11  Number of Sessions        1000 non-null   int64 
 12  Video ID                  1000 non-null   int64 
 13  Video Category            1000 non-null   object
 14  Video Length             

In [6]:
data.head()

Unnamed: 0,UserID,Age,Gender,Location,Income,Debt,Owns Property,Profession,Demographics,Platform,...,ProductivityLoss,Satisfaction,Watch Reason,DeviceType,OS,Watch Time,Self Control,Addiction Level,CurrentActivity,ConnectionType
0,1,56,Male,Pakistan,82812,True,True,Engineer,Rural,Instagram,...,3,7,Procrastination,Smartphone,Android,9:00 PM,5,5,Commuting,Mobile Data
1,2,46,Female,Mexico,27999,False,True,Artist,Urban,Instagram,...,5,5,Habit,Computer,Android,5:00 PM,7,3,At school,Wi-Fi
2,3,32,Female,United States,42436,False,True,Engineer,Rural,Facebook,...,6,4,Entertainment,Tablet,Android,2:00 PM,8,2,At home,Mobile Data
3,4,60,Male,Barzil,62963,True,False,Waiting staff,Rural,YouTube,...,3,7,Habit,Smartphone,Android,9:00 PM,5,5,Commuting,Mobile Data
4,5,25,Male,Pakistan,22096,False,True,Manager,Urban,TikTok,...,8,2,Boredom,Smartphone,iOS,8:00 AM,10,0,At home,Mobile Data


In [7]:
logging.debug('modification of data')
mod_data=data.drop(columns=['UserID','Video ID','Location','Watch Time'])

In [8]:
mod_data.head()

Unnamed: 0,Age,Gender,Income,Debt,Owns Property,Profession,Demographics,Platform,Total Time Spent,Number of Sessions,...,Frequency,ProductivityLoss,Satisfaction,Watch Reason,DeviceType,OS,Self Control,Addiction Level,CurrentActivity,ConnectionType
0,56,Male,82812,True,True,Engineer,Rural,Instagram,80,17,...,Night,3,7,Procrastination,Smartphone,Android,5,5,Commuting,Mobile Data
1,46,Female,27999,False,True,Artist,Urban,Instagram,228,14,...,Afternoon,5,5,Habit,Computer,Android,7,3,At school,Wi-Fi
2,32,Female,42436,False,True,Engineer,Rural,Facebook,30,6,...,Evening,6,4,Entertainment,Tablet,Android,8,2,At home,Mobile Data
3,60,Male,62963,True,False,Waiting staff,Rural,YouTube,101,19,...,Night,3,7,Habit,Smartphone,Android,5,5,Commuting,Mobile Data
4,25,Male,22096,False,True,Manager,Urban,TikTok,136,6,...,Morning,8,2,Boredom,Smartphone,iOS,10,0,At home,Mobile Data


In [9]:
for i in mod_data.columns:
    if mod_data[i].dtype=='int':
        q3=mod_data[i].quantile(0.75)
        q1=mod_data[i].quantile(0.25)
        iqr=q3-q1
        lower_bound=q1-1.5*iqr
        upper_bound=q3+1.5*iqr
        mod_data[(mod_data[i]>=lower_bound) & (mod_data[i]<=upper_bound)]
        
    elif mod_data[i].dtype=='float':
        q3=mod_data[i].quantile(0.75)
        q1=mod_data[i].quantile(0.25)
        iqr=q3-q1
        lower_bound=q1-1.5*iqr
        upper_bound=q3+1.5*iqr
        mod_data[(mod_data[i]>=lower_bound) & (mod_data[i]<=upper_bound)]

In [10]:
for i in mod_data.columns:
    if mod_data[i].dtype=='object':
        print(mod_data[i].value_counts())

Gender
Male      514
Female    322
Other     164
Name: count, dtype: int64
Profession
Students         246
Waiting staff    194
Labor/Worker     186
driver           113
Engineer          65
Cashier           56
Manager           54
Artist            47
Teacher           39
Name: count, dtype: int64
Demographics
Rural    746
Urban    254
Name: count, dtype: int64
Platform
TikTok       273
Instagram    256
YouTube      250
Facebook     221
Name: count, dtype: int64
Video Category
Jokes/Memes      179
Life Hacks       162
Gaming           119
Vlogs            114
Pranks           110
Entertainment    102
Trends           100
ASMR              79
Comedy            35
Name: count, dtype: int64
Frequency
Evening      365
Night        306
Afternoon    223
Morning      106
Name: count, dtype: int64
Watch Reason
Habit              339
Boredom            277
Entertainment      264
Procrastination    120
Name: count, dtype: int64
DeviceType
Smartphone    585
Tablet        283
Computer      132
N

In [11]:
mod_data['Demographics']=mod_data['Demographics'].replace({'Rural':0,'Urban':1}).astype('int')
mod_data['Gender']=mod_data['Gender'].replace({'Male':1,'Female':2,'Other':3}).astype('int')
mod_data['ConnectionType']=mod_data['ConnectionType'].replace({'Mobile Data':1,'Wi-Fi':2}).astype('int')
mod_data['Profession']=mod_data['Profession'].replace({'Students':1,'Waiting staff':2,
                                                       'Labor/Worker':3,'driver':4,'Engineer':5,
                                                       'Cashier':6,'Manager':7,'Artist':8
                                                       ,'Teacher':9}).astype('int')
mod_data['Platform']=mod_data['Platform'].replace({'TikTok':1,'Instagram':2
                                                  ,'YouTube':3,'Facebook':4}).astype('int')
mod_data['DeviceType']=mod_data['DeviceType'].replace({'Smartphone':1,'Tablet':2
                                                       ,'Computer':3}).astype('int')
mod_data['Watch Reason']=mod_data['Watch Reason'].replace({'Habit':1,'Boredom':2,
                                                           'Entertainment':3,'Procrastination':4}).astype('int')
mod_data['CurrentActivity']=mod_data['CurrentActivity'].replace({'At home':1,
                                                                 'At school':2,
                                                                 'At work':3,
                                                                 'Commuting':4}).astype('int')
mod_data['Frequency']=mod_data['Frequency'].replace({'Evening':1,'Night':2,'Afternoon':3
                                                     ,'Morning':4}).astype('int')

  mod_data['Demographics']=mod_data['Demographics'].replace({'Rural':0,'Urban':1}).astype('int')
  mod_data['Gender']=mod_data['Gender'].replace({'Male':1,'Female':2,'Other':3}).astype('int')
  mod_data['ConnectionType']=mod_data['ConnectionType'].replace({'Mobile Data':1,'Wi-Fi':2}).astype('int')
  mod_data['Profession']=mod_data['Profession'].replace({'Students':1,'Waiting staff':2,
  mod_data['Platform']=mod_data['Platform'].replace({'TikTok':1,'Instagram':2
  mod_data['DeviceType']=mod_data['DeviceType'].replace({'Smartphone':1,'Tablet':2
  mod_data['Watch Reason']=mod_data['Watch Reason'].replace({'Habit':1,'Boredom':2,
  mod_data['CurrentActivity']=mod_data['CurrentActivity'].replace({'At home':1,
  mod_data['Frequency']=mod_data['Frequency'].replace({'Evening':1,'Night':2,'Afternoon':3


In [12]:
mod_data['Video Category']=mod_data['Video Category'].replace({'Jokes/Memes':1,
                                                               'Life Hacks':2,
                                                               'Gaming':3,
                                                               'Vlogs':4,
                                                               'Pranks':5,
                                                               'Entertainment':6,
                                                               'Trends':7,
                                                               'ASMR':8,
                                                               'Comedy':9}).astype('int')

  mod_data['Video Category']=mod_data['Video Category'].replace({'Jokes/Memes':1,


In [13]:
mod_data.head()

Unnamed: 0,Age,Gender,Income,Debt,Owns Property,Profession,Demographics,Platform,Total Time Spent,Number of Sessions,...,Frequency,ProductivityLoss,Satisfaction,Watch Reason,DeviceType,OS,Self Control,Addiction Level,CurrentActivity,ConnectionType
0,56,1,82812,True,True,5,0,2,80,17,...,2,3,7,4,1,Android,5,5,4,1
1,46,2,27999,False,True,8,1,2,228,14,...,3,5,5,1,3,Android,7,3,2,2
2,32,2,42436,False,True,5,0,4,30,6,...,1,6,4,3,2,Android,8,2,1,1
3,60,1,62963,True,False,2,0,3,101,19,...,2,3,7,1,1,Android,5,5,4,1
4,25,1,22096,False,True,7,1,1,136,6,...,4,8,2,2,1,iOS,10,0,1,1


In [14]:
mod_data.shape

(1000, 27)

In [15]:
x=mod_data.drop(columns=['Debt','Owns Property','Demographics','Video Length',
                         'Importance Score','Watch Reason','OS','Satisfaction'],axis=1)
y=mod_data['Satisfaction']

In [16]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.4,stratify=y,random_state=1)
x_train1,x_val,y_train1,y_val=train_test_split(x,y,test_size=0.3,stratify=y,random_state=23)

In [17]:
print(f'The test size of independent variables is={x_test.shape}')
print(f'The test size of dependent variables is={y_test.shape}')
print(f'The train size of independent variables is={x_train.shape}')
print(f'The train size of dependent variables is={y_train.shape}')

The test size of independent variables is=(400, 19)
The test size of dependent variables is=(400,)
The train size of independent variables is=(600, 19)
The train size of dependent variables is=(600,)


In [18]:
print(f'The test size of independent variables is={x_val.shape}')
print(f'The test size of dependent variables is={y_val.shape}')
print(f'The train size of independent variables is={x_train1.shape}')
print(f'The train size of dependent variables is={y_train1.shape}')

The test size of independent variables is=(300, 19)
The test size of dependent variables is=(300,)
The train size of independent variables is=(700, 19)
The train size of dependent variables is=(700,)


In [19]:
s=StandardScaler()
x_train_scaled=s.fit_transform(x_train)
x_test_scaled=s.fit_transform(x_test)
x_val_scaled=s.fit_transform(x_val)
x_train1_scaled=s.fit_transform(x_train1)

In [20]:
#LinearRegression
lr=LinearRegression()
lr1=LinearRegression()
lr.fit(x_train_scaled,y_train)
lr1.fit(x_train1_scaled,y_train1)
pred_1=lr.predict(x_test_scaled)
pred_11=lr1.predict(x_val_scaled)
data_test_lr={'original data':y_test,
         'test case prediction':pred_1,}
df_test_lr=pd.DataFrame(data_test_lr)

print(df_test_lr.head())

print(f'mean absolute error for test case ={mean_absolute_error(y_test,pred_1)}')
print(f'mean squared error for test case ={mean_squared_error(y_test,pred_1)}')
print(f'root mean squared error for test case ={root_mean_squared_error(y_test,pred_1)}')
print(f'r2 score for test case ={r2_score(y_test,pred_1)}')


data_val_lr={'original data':y_val,
         'validate case prediction':pred_11,}
df_val_lr=pd.DataFrame(data_val_lr)
#print(df_val_lr)

print(df_val_lr.head())


print(f'mean absolute error for validate case ={mean_absolute_error(y_val,pred_11)}')
print(f'mean squared error for validate case ={mean_squared_error(y_val,pred_11)}')
print(f'root mean squared error for test case ={root_mean_squared_error(y_val,pred_11)}')
print(f'r2 score for tesvalidatet case ={r2_score(y_val,pred_11)}')

     original data  test case prediction
31               5              5.002142
736              5              5.002142
529              8              7.994327
936              2              2.009957
448              8              7.994327
mean absolute error for test case =0.005236214880950031
mean squared error for test case =3.68799413896673e-05
root mean squared error for test case =0.006072885754702397
r2 score for test case =0.9999918291402744
     original data  validate case prediction
698              5                  5.000409
152              4                  4.004387
61               5                  5.000409
552              7                  6.992452
161              5                  5.000409
mean absolute error for validate case =0.007161816404951633
mean squared error for validate case =7.252536851011292e-05
root mean squared error for test case =0.008516182742879167
r2 score for tesvalidatet case =0.999983971074125


In [21]:
#Ridge
rr=Ridge()
rrcv=RidgeCV(cv=4)
rr.fit(x_train_scaled,y_train)
rrcv.fit(x_train1_scaled,y_train1)
pred_1_2=rr.predict(x_test_scaled)
pred_11_2=rrcv.predict(x_val_scaled)
data_test_rr={'original data':y_test,
         'test case prediction':pred_1_2,}
df_test_rr=pd.DataFrame(data_test_rr)

print(df_test_rr.head())

print(f'mean absolute error for test case ={mean_absolute_error(y_test,pred_1_2)}')
print(f'mean squared error for test case ={mean_squared_error(y_test,pred_1_2)}')
print(f'root mean squared error for test case ={root_mean_squared_error(y_test,pred_1_2)}')
print(f'r2 score for test case ={r2_score(y_test,pred_1_2)}')


data_val_rr={'original data':y_val,
         'validate case prediction':pred_11_2,}
df_val_rr=pd.DataFrame(data_val_rr)
#print(df_val_lr)

print(df_val_rr.head())


print(f'mean absolute error for validate case ={mean_absolute_error(y_val,pred_11_2)}')
print(f'mean squared error for validate case ={mean_squared_error(y_val,pred_11_2)}')
print(f'root mean squared error for validate case ={root_mean_squared_error(y_val,pred_11_2)}')
print(f'r2 score for validate case ={r2_score(y_val,pred_11_2)}')


     original data  test case prediction
31               5              4.984462
736              5              5.003672
529              8              8.001047
936              2              2.003196
448              8              8.002857
mean absolute error for test case =0.008631019316277804
mean squared error for test case =0.0005303955185793961
root mean squared error for test case =0.02303031737904183
r2 score for test case =0.9998824893094157
     original data  validate case prediction
698              5                  4.999606
152              4                  4.004076
61               5                  4.998764
552              7                  6.993058
161              5                  4.999957
mean absolute error for validate case =0.007200919099748947
mean squared error for validate case =8.774121529319823e-05
root mean squared error for validate case =0.009367028092901089
r2 score for validate case =0.9999806082000683


In [22]:
#Lasso
l=Lasso()
lcv=LassoCV(cv=4)
l.fit(x_train_scaled,y_train)
lcv.fit(x_train1_scaled,y_train1)
pred_1_3=l.predict(x_test_scaled)
pred_11_3=l.predict(x_val_scaled)
data_test_l={'original data':y_test,
         'test case prediction':pred_1_3,}
df_test_l=pd.DataFrame(data_test_l)

print(df_test_l.head())

print(f'mean absolute error for test case ={mean_absolute_error(y_test,pred_1_3)}')
print(f'mean squared error for test case ={mean_squared_error(y_test,pred_1_3)}')
print(f'root mean squared error for test case ={root_mean_squared_error(y_test,pred_1_3)}')
print(f'r2 score for test case ={r2_score(y_test,pred_1_3)}')


data_val_l={'original data':y_val,
         'validate case prediction':pred_11_3,}
df_val_l=pd.DataFrame(data_val_l)
#print(df_val_lr)

print(df_val_lr.head())


print(f'mean absolute error for validate case ={mean_absolute_error(y_val,pred_11_3)}')
print(f'mean squared error for validate case ={mean_squared_error(y_val,pred_11_3)}')
print(f'root mean squared error for validate case ={root_mean_squared_error(y_val,pred_11_3)}')
print(f'r2 score for validate case ={r2_score(y_val,pred_11_3)}')

     original data  test case prediction
31               5              4.937421
736              5              4.937421
529              8              6.517524
936              2              3.357319
448              8              6.517524
mean absolute error for test case =0.8378077979504732
mean squared error for test case =1.0111057446328382
root mean squared error for test case =1.0055375401410125
r2 score for test case =0.7759865418475382
     original data  validate case prediction
698              5                  5.000409
152              4                  4.004387
61               5                  5.000409
552              7                  6.992452
161              5                  5.000409
mean absolute error for validate case =0.8401418702130798
mean squared error for validate case =1.0163413807859016
root mean squared error for validate case =1.0081375802864914
r2 score for validate case =0.775377071623055


In [23]:
#ElasticNet
el=ElasticNet()
elcv=ElasticNetCV(cv=3,l1_ratio=0.95)
el.fit(x_train_scaled,y_train)
elcv.fit(x_train1_scaled,y_train1)
pred_1_4=el.predict(x_test_scaled)
pred_11_4=elcv.predict(x_val_scaled)
data_test_el={'original data':y_test,
         'test case prediction':pred_1_4,}
df_test_el=pd.DataFrame(data_test_el)

print(df_test_el.head())

print(f'mean absolute error for test case ={mean_absolute_error(y_test,pred_1_4)}')
print(f'mean squared error for test case ={mean_squared_error(y_test,pred_1_4)}')
print(f'root mean squared error for test case ={root_mean_squared_error(y_test,pred_1_4)}')
print(f'r2 score for test case ={r2_score(y_test,pred_1)}')


data_val_el={'original data':y_val,
         'validate case prediction':pred_11_4,}
df_val_el=pd.DataFrame(data_val_el)
#print(df_val_lr)

print(df_val_el.head())


print(f'mean absolute error for validate case ={mean_absolute_error(y_val,pred_11_4)}')
print(f'mean squared error for validate case ={mean_squared_error(y_val,pred_11_4)}')
print(f'root mean squared error for validate case ={root_mean_squared_error(y_val,pred_11_4)}')
print(f'r2 score for validate case ={r2_score(y_val,pred_11_4)}')

     original data  test case prediction
31               5              4.939871
736              5              4.939871
529              8              6.931058
936              2              2.948684
448              8              6.931058
mean absolute error for test case =0.611953010271243
mean squared error for test case =0.5598716270985997
root mean squared error for test case =0.7482456996860054
r2 score for test case =0.9999918291402744
     original data  validate case prediction
698              5                  5.000154
152              4                  4.005058
61               5                  5.000154
552              7                  6.990347
161              5                  5.000154
mean absolute error for validate case =0.008852682684536359
mean squared error for validate case =0.00011478058128139702
root mean squared error for validate case =0.010713569959700502
r2 score for validate case =0.9999746321946782


In [24]:
#DecisionTreeRegressor
dt=DecisionTreeRegressor()
dt1=DecisionTreeRegressor(max_features=7,max_depth=5,criterion='friedman_mse',splitter='random')
dt.fit(x_train_scaled,y_train)
dt1.fit(x_train1_scaled,y_train1)
pred_2=dt.predict(x_test_scaled)
pred_22=dt1.predict(x_val_scaled)
data_test_dt={'original data':y_test,
         'test case prediction':pred_2}
df_test_dt=pd.DataFrame(data_test_dt)

print(df_test_dt.head())

print(f'mean absolute error for test case ={mean_absolute_error(y_test,pred_2)}')
print(f'mean squared error for test case ={mean_squared_error(y_test,pred_2)}')
print(f'root mean squared error for test case ={root_mean_squared_error(y_test,pred_2)}')
print(f'r2 score for test case ={r2_score(y_test,pred_2)}')


data_val_dt={'original data':y_val,
         'validate case prediction':pred_22}
df_val_dt=pd.DataFrame(data_val_dt)
#print(df_val_lr)

print(df_val_lr.head())


print(f'mean absolute error for validate case ={mean_absolute_error(y_val,pred_22)}')
print(f'mean squared error for validate case ={mean_squared_error(y_val,pred_22)}')
print(f'root mean squared error for validate case ={root_mean_squared_error(y_val,pred_22)}')
print(f'r2 score for validate case ={r2_score(y_val,pred_22)}')

     original data  test case prediction
31               5                   5.0
736              5                   5.0
529              8                   8.0
936              2                   2.0
448              8                   8.0
mean absolute error for test case =0.0
mean squared error for test case =0.0
root mean squared error for test case =0.0
r2 score for test case =1.0
     original data  validate case prediction
698              5                  5.000409
152              4                  4.004387
61               5                  5.000409
552              7                  6.992452
161              5                  5.000409
mean absolute error for validate case =0.2565781710914455
mean squared error for validate case =0.2115910930117212
root mean squared error for validate case =0.45999031838911697
r2 score for validate case =0.9532359777636729


In [25]:
#RandomForestRegressor
rf=RandomForestRegressor()
rf1=RandomForestRegressor(max_depth=10,n_estimators=220,max_features=7,criterion='friedman_mse',bootstrap=True,
                          oob_score=True,random_state=30)
rf.fit(x_train_scaled,y_train)
rf1.fit(x_train1_scaled,y_train1)
pred_3=rf.predict(x_test_scaled)
pred_33=rf1.predict(x_val_scaled)
data_test_rf={'original data':y_test,
         'test case prediction':pred_3}
df_test_rf=pd.DataFrame(data_test_rf)

print(df_test_rf.head())

print(f'mean absolute error for test case ={mean_absolute_error(y_test,pred_3)}')
print(f'mean squared error for test case ={mean_squared_error(y_test,pred_3)}')
print(f'root mean squared error for test case ={root_mean_squared_error(y_test,pred_3)}')
print(f'r2 score for test case ={r2_score(y_test,pred_3)}')


data_val_rf={'original data':y_val,
         'validate case prediction':pred_33}
df_val_rf=pd.DataFrame(data_val_rf)
#print(df_val_lr)

print(df_val_rf.head())


print(f'mean absolute error for validate case ={mean_absolute_error(y_val,pred_33)}')
print(f'mean squared error for validate case ={mean_squared_error(y_val,pred_33)}')
print(f'root mean squared error for validate case ={root_mean_squared_error(y_val,pred_33)}')
print(f'r2 score for validate case ={r2_score(y_val,pred_33)}')

     original data  test case prediction
31               5                   5.0
736              5                   5.0
529              8                   8.0
936              2                   2.0
448              8                   8.0
mean absolute error for test case =0.0
mean squared error for test case =0.0
root mean squared error for test case =0.0
r2 score for test case =1.0
     original data  validate case prediction
698              5                       5.0
152              4                       4.0
61               5                       5.0
552              7                       7.0
161              5                       5.0
mean absolute error for validate case =0.02059811912225709
mean squared error for validate case =0.005299069725570208
root mean squared error for validate case =0.07279470946140391
r2 score for validate case =0.9988288457186395


In [26]:
#AdaBoostRegressor
adb=AdaBoostRegressor()
adb1=AdaBoostRegressor()
adb.fit(x_train_scaled,y_train)
adb1.fit(x_train1_scaled,y_train1)
pred_4=adb.predict(x_test_scaled)
pred_44=adb1.predict(x_val_scaled)
data_test_adb={'original data':y_test,
         'test case prediction':pred_4}
df_test_adb=pd.DataFrame(data_test_adb)

print(df_test_adb.head())

print(f'mean absolute error for test case ={mean_absolute_error(y_test,pred_4)}')
print(f'mean squared error for test case ={mean_squared_error(y_test,pred_4)}')
print(f'root mean squared error for test case ={root_mean_squared_error(y_test,pred_4)}')
print(f'r2 score for test case ={r2_score(y_test,pred_4)}')


data_val_adb={'original data':y_val,
         'validate case prediction':pred_44}
df_val_adb=pd.DataFrame(data_val_adb)
#print(df_val_lr)

print(df_val_adb.head())


print(f'mean absolute error for validate case ={mean_absolute_error(y_val,pred_44)}')
print(f'mean squared error for validate case ={mean_squared_error(y_val,pred_44)}')
print(f'root mean squared error for validate case ={root_mean_squared_error(y_val,pred_44)}')
print(f'r2 score for validate case ={r2_score(y_val,pred_44)}')

     original data  test case prediction
31               5                   5.0
736              5                   5.0
529              8                   8.0
936              2                   2.0
448              8                   8.0
mean absolute error for test case =0.0
mean squared error for test case =0.0
root mean squared error for test case =0.0
r2 score for test case =1.0
     original data  validate case prediction
698              5                       5.0
152              4                       4.0
61               5                       5.0
552              7                       7.0
161              5                       5.0
mean absolute error for validate case =0.0
mean squared error for validate case =0.0
root mean squared error for validate case =0.0
r2 score for validate case =1.0


In [27]:
#GradientBoostingRegressor
gb=GradientBoostingRegressor()
gb1=GradientBoostingRegressor()
gb.fit(x_train_scaled,y_train)
gb1.fit(x_train1_scaled,y_train1)
pred_5=gb.predict(x_test_scaled)
pred_55=gb.predict(x_val_scaled)
data_test_gb={'original data':y_test,
         'test case prediction':pred_5}
df_test_gb=pd.DataFrame(data_test_gb)

print(df_test_gb.head())

print(f'mean absolute error for test case ={mean_absolute_error(y_test,pred_5)}')
print(f'mean squared error for test case ={mean_squared_error(y_test,pred_5)}')
print(f'root mean squared error for test case ={root_mean_squared_error(y_test,pred_5)}')
print(f'r2 score for test case ={r2_score(y_test,pred_5)}')


data_val_gb={'original data':y_val,
         'validate case prediction':pred_55}
df_val_gb=pd.DataFrame(data_val_gb)
#print(df_val_lr)

print(df_val_lr.head())


print(f'mean absolute error for validate case ={mean_absolute_error(y_val,pred_55)}')
print(f'mean squared error for validate case ={mean_squared_error(y_val,pred_55)}')
print(f'root mean squared error for validate case ={root_mean_squared_error(y_val,pred_55)}')
print(f'r2 score for validate case ={r2_score(y_val,pred_55)}')

     original data  test case prediction
31               5              4.999993
736              5              4.999993
529              8              7.999911
936              2              2.000089
448              8              7.999911
mean absolute error for test case =4.98515278551398e-05
mean squared error for test case =3.916175689557528e-09
root mean squared error for test case =6.257935513855611e-05
r2 score for test case =0.9999999991323597
     original data  validate case prediction
698              5                  5.000409
152              4                  4.004387
61               5                  5.000409
552              7                  6.992452
161              5                  5.000409
mean absolute error for validate case =4.993841209180857e-05
mean squared error for validate case =3.926399011692566e-09
root mean squared error for validate case =6.266098476478458e-05
r2 score for validate case =0.9999999991322215


In [28]:
#XGradientBoostingRegressor
xgb=xgb.XGBRegressor()
#xgb1=xgb.XGBRegressor()
xgb.fit(x_train_scaled,y_train)
xgb.fit(x_train1_scaled,y_train1)
pred_6=xgb.predict(x_test_scaled)
pred_66=xgb.predict(x_val_scaled)
data_test_xgb={'original data':y_test,
         'test case prediction':pred_6}
df_test_xgb=pd.DataFrame(data_test_xgb)

print(df_test_xgb.head())

print(f'mean absolute error for test case ={mean_absolute_error(y_test,pred_6)}')
print(f'mean squared error for test case ={mean_squared_error(y_test,pred_6)}')
print(f'root mean squared error for test case ={root_mean_squared_error(y_test,pred_6)}')
print(f'r2 score for test case ={r2_score(y_test,pred_6)}')


data_val_xgb={'original data':y_val,
         'validate case prediction':pred_66}
df_val_xgb=pd.DataFrame(data_val_xgb)
#print(df_val_lr)

print(df_val_xgb.head())


print(f'mean absolute error for validate case ={mean_absolute_error(y_val,pred_66)}')
print(f'mean squared error for validate case ={mean_squared_error(y_val,pred_66)}')
print(f'root mean squared error for validate case ={root_mean_squared_error(y_val,pred_66)}')
print(f'r2 score for validate case ={r2_score(y_val,pred_66)}')

     original data  test case prediction
31               5              5.999991
736              5              5.999991
529              8              7.999955
936              2              3.000012
448              8              7.999955
mean absolute error for test case =0.6475135684013367
mean squared error for test case =0.6474987268447876
root mean squared error for test case =0.8046730756759644
r2 score for test case =0.8565447330474854
     original data  validate case prediction
698              5                  5.999991
152              4                  4.999985
61               5                  5.999991
552              7                  6.999974
161              5                  5.999991
mean absolute error for validate case =0.6466802954673767
mean squared error for validate case =0.6466653347015381
root mean squared error for validate case =0.8041550517082214
r2 score for validate case =0.8570796251296997


In [29]:
rf1_pkl=pickle.dump(rf1,open('random_forest.pkl','wb'))
ss_pkl=pickle.dump(s,open('standard_scaler.pkl','wb'))