In [None]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,RandomizedSearchCV
from sklearn.linear_model import LinearRegression,ElasticNet,ElasticNetCV,Ridge,RidgeCV,Lasso,LassoCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,root_mean_squared_error
import xgboost as xgbss
import pandas as pd
import numpy as np
import logging
import os
import matplotlib.pyplot as plt # type: ignore
import seaborn as sns # type: ignore
import pickle

In [None]:
# Send every record (DEBUG and above) from this notebook to 'prediction.log'.
log_settings = {
    'filename': 'prediction.log',
    'filemode': 'w',  # start a fresh log file on every run
    'level': logging.DEBUG,
    'format': '%(asctime)s-%(name)s-%(levelname)s-%(message)s',
    'datefmt': '%Y-%m-%d %H:%M:%S',
}
logging.basicConfig(**log_settings)

In [None]:
# Load the raw CSV export. `social` is the frame exactly as read; `data` is the
# handle the rest of the notebook works from.
csv_path = r'Time-Wasters on Social Media.csv'
social = pd.read_csv(csv_path)
data = pd.DataFrame(social)
logging.info('retrieving the data')

In [None]:
# Dimensions of the raw frame as (rows, columns).
data.shape

In [None]:
# Per-column dtype and non-null count summary of the raw frame.
data.info()

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
logging.debug('modification of data')
# Columns removed before any modelling; `data` itself is left untouched.
unused_columns = ['UserID', 'Video ID', 'Location', 'Watch Time']
mod_data = data.drop(columns=unused_columns)

In [None]:
# Preview the working frame after the columns above were dropped.
mod_data.head()

In [None]:
# Drop rows that contain an IQR outlier (outside q1 - 1.5*IQR .. q3 + 1.5*IQR)
# in any integer or float column.
#
# The previous version evaluated the filter expression but never assigned the
# result, so no row was ever removed. Here a single keep-mask is built from the
# unfiltered frame (so the outcome does not depend on column order) and applied
# once at the end. NOTE: this reduces the number of rows seen by every later
# cell.
numeric_columns = [
    column for column in mod_data.columns
    if pd.api.types.is_integer_dtype(mod_data[column])
    or pd.api.types.is_float_dtype(mod_data[column])
]

keep_rows = pd.Series(True, index=mod_data.index)
for column in numeric_columns:
    values = mod_data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    if iqr == 0:
        # Degenerate spread: the fences collapse onto a single value and would
        # discard every row that differs from it, so leave this column alone.
        continue
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # Missing values are not outliers; keep them so this step only removes
    # rows for being out of range.
    keep_rows &= values.between(lower_bound, upper_bound) | values.isna()

# .copy() so later column assignments act on an independent frame rather than
# on a slice of the unfiltered one.
mod_data = mod_data.loc[keep_rows].copy()

In [None]:
# List the distinct values (with frequencies) of every object-typed column so
# the integer encodings below can be written against the real categories.
object_columns = [name for name in mod_data.columns if mod_data[name].dtype == 'object']
for name in object_columns:
    print(mod_data[name].value_counts())

In [None]:
# Integer codes for the categorical columns, one table per column.
# NOTE(review): the codes impose an arbitrary order on what look like nominal
# categories. Tree models tolerate that; for the linear models one-hot
# encoding may be more appropriate - confirm with the author.
category_codes = {
    'Demographics': {'Rural': 0, 'Urban': 1},
    'Gender': {'Male': 1, 'Female': 2, 'Other': 3},
    'ConnectionType': {'Mobile Data': 1, 'Wi-Fi': 2},
    'Profession': {'Students': 1, 'Waiting staff': 2, 'Labor/Worker': 3,
                   'driver': 4, 'Engineer': 5, 'Cashier': 6, 'Manager': 7,
                   'Artist': 8, 'Teacher': 9},
    'Platform': {'TikTok': 1, 'Instagram': 2, 'YouTube': 3, 'Facebook': 4},
    'DeviceType': {'Smartphone': 1, 'Tablet': 2, 'Computer': 3},
    'Watch Reason': {'Habit': 1, 'Boredom': 2, 'Entertainment': 3,
                     'Procrastination': 4},
    'CurrentActivity': {'At home': 1, 'At school': 2, 'At work': 3,
                        'Commuting': 4},
    'Frequency': {'Evening': 1, 'Night': 2, 'Afternoon': 3, 'Morning': 4},
}

for column, codes in category_codes.items():
    if pd.api.types.is_integer_dtype(mod_data[column]):
        # Already encoded (the cell was re-run); nothing to do.
        continue
    # map() instead of replace(): replace() on an object column relies on
    # implicit downcasting, which pandas has deprecated.
    encoded = mod_data[column].map(codes)
    if encoded.isna().any():
        # Fail with the offending values instead of an opaque cast error.
        unknown = mod_data.loc[encoded.isna(), column].unique().tolist()
        raise ValueError(f'Unmapped values in column {column!r}: {unknown}')
    mod_data[column] = encoded.astype('int')

In [None]:
# Integer codes for the 'Video Category' column.
video_category_codes = {
    'Jokes/Memes': 1,
    'Life Hacks': 2,
    'Gaming': 3,
    'Vlogs': 4,
    'Pranks': 5,
    'Entertainment': 6,
    'Trends': 7,
    'ASMR': 8,
    'Comedy': 9,
}

# Skip when the column is already integer-typed (cell re-run).
if not pd.api.types.is_integer_dtype(mod_data['Video Category']):
    # map() instead of replace(): replace() on an object column relies on
    # implicit downcasting, which pandas has deprecated.
    encoded_category = mod_data['Video Category'].map(video_category_codes)
    if encoded_category.isna().any():
        # Fail with the offending values instead of an opaque cast error.
        unknown = mod_data.loc[encoded_category.isna(), 'Video Category'].unique().tolist()
        raise ValueError(f"Unmapped values in column 'Video Category': {unknown}")
    mod_data['Video Category'] = encoded_category.astype('int')

In [None]:
# Preview the working frame after the categorical columns were integer-encoded.
mod_data.head()

In [None]:
# Dimensions of the working frame as (rows, columns).
mod_data.shape

In [None]:
# Target is 'Satisfaction'; the feature matrix is everything else except the
# columns listed here (the target itself is in the list).
excluded_features = [
    'Debt', 'Owns Property', 'Demographics', 'Video Length',
    'Importance Score', 'Watch Reason', 'OS', 'Satisfaction',
]
y = mod_data['Satisfaction']
x = mod_data.drop(columns=excluded_features)

In [None]:
# First split: 60% train / 40% hold-out, stratified on the target.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, stratify=y, random_state=1)

# Second split: cut the hold-out in half into a secondary training part
# (x_train1) and a validation part (x_val). The stratification labels must be
# the labels of the data being split (y_test); passing the full-length `y`
# has a different number of samples than x_test and makes train_test_split
# raise ValueError.
x_train1, x_val, y_train1, y_val = train_test_split(
    x_test, y_test, test_size=0.5, stratify=y_test, random_state=23)

In [None]:
# Shapes of the main train / hold-out split, one line per array.
print(
    f'The test size of independent variables is={x_test.shape}',
    f'The test size of dependent variables is={y_test.shape}',
    f'The train size of independent variables is={x_train.shape}',
    f'The train size of dependent variables is={y_train.shape}',
    sep='\n',
)

In [None]:
# Shapes of the secondary split (the hold-out divided into x_train1 / x_val).
# The labels previously said "test" and "train", which made this output
# indistinguishable from the main-split report printed by the previous cell.
print(f'The validation size of independent variables is={x_val.shape}')
print(f'The validation size of dependent variables is={y_val.shape}')
print(f'The secondary train size of independent variables is={x_train1.shape}')
print(f'The secondary train size of dependent variables is={y_train1.shape}')

In [None]:
# Fit the scaler once, on the main training split only, and apply those same
# statistics to every other split.
#
# Calling fit_transform on each split (as before) standardised the test and
# validation data with their own mean/variance, so models were evaluated on
# features scaled differently from the ones they were trained on, and it left
# `s` fitted on whichever split happened to be transformed last. `s` is
# pickled at the end of the notebook, so it has to be one well-defined
# transformation shared by all splits.
s = StandardScaler()
x_train_scaled = s.fit_transform(x_train)
x_test_scaled = s.transform(x_test)
x_val_scaled = s.transform(x_val)
x_train1_scaled = s.transform(x_train1)

In [None]:
#LinearRegression
# lr  : fitted on the main training split, scored on x_test
# lr1 : fitted on the secondary training split, scored on x_val
lr = LinearRegression()
lr1 = LinearRegression()
lr.fit(x_train_scaled, y_train)
lr1.fit(x_train1_scaled, y_train1)
pred_1 = lr.predict(x_test_scaled)
pred_11 = lr1.predict(x_val_scaled)

# Side-by-side view of actual vs. predicted target on the test split.
data_test_lr = {'original data': y_test,
                'test case prediction': pred_1}
df_test_lr = pd.DataFrame(data_test_lr)

print(df_test_lr.head())

print(f'mean absolute error for test case ={mean_absolute_error(y_test,pred_1)}')
print(f'mean squared error for test case ={mean_squared_error(y_test,pred_1)}')
print(f'root mean squared error for test case ={root_mean_squared_error(y_test,pred_1)}')
print(f'r2 score for test case ={r2_score(y_test,pred_1)}')

# Same view for the validation split.
data_val_lr = {'original data': y_val,
               'validate case prediction': pred_11}
df_val_lr = pd.DataFrame(data_val_lr)

print(df_val_lr.head())

# Labels fixed: the last two lines previously read "test case" and
# "tesvalidatet case" although they report validation metrics.
print(f'mean absolute error for validate case ={mean_absolute_error(y_val,pred_11)}')
print(f'mean squared error for validate case ={mean_squared_error(y_val,pred_11)}')
print(f'root mean squared error for validate case ={root_mean_squared_error(y_val,pred_11)}')
print(f'r2 score for validate case ={r2_score(y_val,pred_11)}')

In [None]:
# Ridge regression
#   rr   : Ridge with default settings, main split (x_train -> x_test)
#   rrcv : RidgeCV with cv=4, secondary split (x_train1 -> x_val)
rr = Ridge()
rr.fit(x_train_scaled, y_train)
pred_1_2 = rr.predict(x_test_scaled)

rrcv = RidgeCV(cv=4)
rrcv.fit(x_train1_scaled, y_train1)
pred_11_2 = rrcv.predict(x_val_scaled)

# Actual vs. predicted on the test split.
data_test_rr = {'original data': y_test, 'test case prediction': pred_1_2}
df_test_rr = pd.DataFrame(data_test_rr)
print(df_test_rr.head())

print(
    f'mean absolute error for test case ={mean_absolute_error(y_test,pred_1_2)}',
    f'mean squared error for test case ={mean_squared_error(y_test,pred_1_2)}',
    f'root mean squared error for test case ={root_mean_squared_error(y_test,pred_1_2)}',
    f'r2 score for test case ={r2_score(y_test,pred_1_2)}',
    sep='\n',
)

# Actual vs. predicted on the validation split.
data_val_rr = {'original data': y_val, 'validate case prediction': pred_11_2}
df_val_rr = pd.DataFrame(data_val_rr)
print(df_val_rr.head())

print(
    f'mean absolute error for validate case ={mean_absolute_error(y_val,pred_11_2)}',
    f'mean squared error for validate case ={mean_squared_error(y_val,pred_11_2)}',
    f'root mean squared error for validate case ={root_mean_squared_error(y_val,pred_11_2)}',
    f'r2 score for validate case ={r2_score(y_val,pred_11_2)}',
    sep='\n',
)


In [None]:
#Lasso
# l   : Lasso with default settings, main split (x_train -> x_test)
# lcv : LassoCV with cv=4, secondary split (x_train1 -> x_val)
l = Lasso()
lcv = LassoCV(cv=4)
l.fit(x_train_scaled, y_train)
lcv.fit(x_train1_scaled, y_train1)
pred_1_3 = l.predict(x_test_scaled)
# Validation predictions come from lcv, the model fitted on the secondary
# split. They were previously taken from `l`, leaving lcv fitted but unused.
pred_11_3 = lcv.predict(x_val_scaled)

# Actual vs. predicted on the test split.
data_test_l = {'original data': y_test,
               'test case prediction': pred_1_3}
df_test_l = pd.DataFrame(data_test_l)

print(df_test_l.head())

print(f'mean absolute error for test case ={mean_absolute_error(y_test,pred_1_3)}')
print(f'mean squared error for test case ={mean_squared_error(y_test,pred_1_3)}')
print(f'root mean squared error for test case ={root_mean_squared_error(y_test,pred_1_3)}')
print(f'r2 score for test case ={r2_score(y_test,pred_1_3)}')

# Actual vs. predicted on the validation split.
data_val_l = {'original data': y_val,
              'validate case prediction': pred_11_3}
df_val_l = pd.DataFrame(data_val_l)

# Show this model's own frame (the LinearRegression frame df_val_lr was
# printed here before).
print(df_val_l.head())

print(f'mean absolute error for validate case ={mean_absolute_error(y_val,pred_11_3)}')
print(f'mean squared error for validate case ={mean_squared_error(y_val,pred_11_3)}')
print(f'root mean squared error for validate case ={root_mean_squared_error(y_val,pred_11_3)}')
print(f'r2 score for validate case ={r2_score(y_val,pred_11_3)}')

In [None]:
#ElasticNet
# el   : ElasticNet with default settings, main split (x_train -> x_test)
# elcv : ElasticNetCV with cv=3 and l1_ratio=0.95, secondary split
#        (x_train1 -> x_val)
el = ElasticNet()
elcv = ElasticNetCV(cv=3, l1_ratio=0.95)
el.fit(x_train_scaled, y_train)
elcv.fit(x_train1_scaled, y_train1)
pred_1_4 = el.predict(x_test_scaled)
pred_11_4 = elcv.predict(x_val_scaled)

# Actual vs. predicted on the test split.
data_test_el = {'original data': y_test,
                'test case prediction': pred_1_4}
df_test_el = pd.DataFrame(data_test_el)

print(df_test_el.head())

print(f'mean absolute error for test case ={mean_absolute_error(y_test,pred_1_4)}')
print(f'mean squared error for test case ={mean_squared_error(y_test,pred_1_4)}')
print(f'root mean squared error for test case ={root_mean_squared_error(y_test,pred_1_4)}')
# r2 is computed from this model's predictions (pred_1_4); it previously used
# pred_1, i.e. the LinearRegression predictions.
print(f'r2 score for test case ={r2_score(y_test,pred_1_4)}')

# Actual vs. predicted on the validation split.
data_val_el = {'original data': y_val,
               'validate case prediction': pred_11_4}
df_val_el = pd.DataFrame(data_val_el)

print(df_val_el.head())

print(f'mean absolute error for validate case ={mean_absolute_error(y_val,pred_11_4)}')
print(f'mean squared error for validate case ={mean_squared_error(y_val,pred_11_4)}')
print(f'root mean squared error for validate case ={root_mean_squared_error(y_val,pred_11_4)}')
print(f'r2 score for validate case ={r2_score(y_val,pred_11_4)}')

In [None]:
#DecisionTreeRegressor
# dt  : default tree, main split (x_train -> x_test)
# dt1 : constrained tree (depth 5, 7 features per split, random splitter),
#       secondary split (x_train1 -> x_val)
dt = DecisionTreeRegressor()
dt1 = DecisionTreeRegressor(max_features=7, max_depth=5,
                            criterion='friedman_mse', splitter='random')
dt.fit(x_train_scaled, y_train)
dt1.fit(x_train1_scaled, y_train1)
pred_2 = dt.predict(x_test_scaled)
pred_22 = dt1.predict(x_val_scaled)

# Actual vs. predicted on the test split.
data_test_dt = {'original data': y_test,
                'test case prediction': pred_2}
df_test_dt = pd.DataFrame(data_test_dt)

print(df_test_dt.head())

print(f'mean absolute error for test case ={mean_absolute_error(y_test,pred_2)}')
print(f'mean squared error for test case ={mean_squared_error(y_test,pred_2)}')
print(f'root mean squared error for test case ={root_mean_squared_error(y_test,pred_2)}')
print(f'r2 score for test case ={r2_score(y_test,pred_2)}')

# Actual vs. predicted on the validation split.
data_val_dt = {'original data': y_val,
               'validate case prediction': pred_22}
df_val_dt = pd.DataFrame(data_val_dt)

# Show this model's own frame (the LinearRegression frame df_val_lr was
# printed here before).
print(df_val_dt.head())

print(f'mean absolute error for validate case ={mean_absolute_error(y_val,pred_22)}')
print(f'mean squared error for validate case ={mean_squared_error(y_val,pred_22)}')
print(f'root mean squared error for validate case ={root_mean_squared_error(y_val,pred_22)}')
print(f'r2 score for validate case ={r2_score(y_val,pred_22)}')

In [None]:
# Random forest regressors
#   rf  : library defaults, main split (x_train -> x_test)
#   rf1 : tuned configuration below, secondary split (x_train1 -> x_val);
#         this is the model that gets pickled at the end of the notebook
rf1_settings = dict(
    max_depth=10,
    n_estimators=220,
    max_features=7,
    criterion='friedman_mse',
    bootstrap=True,
    oob_score=True,
    random_state=30,
)

rf = RandomForestRegressor()
rf.fit(x_train_scaled, y_train)
pred_3 = rf.predict(x_test_scaled)

rf1 = RandomForestRegressor(**rf1_settings)
rf1.fit(x_train1_scaled, y_train1)
pred_33 = rf1.predict(x_val_scaled)

# Actual vs. predicted on the test split.
data_test_rf = {'original data': y_test, 'test case prediction': pred_3}
df_test_rf = pd.DataFrame(data_test_rf)
print(df_test_rf.head())

print(
    f'mean absolute error for test case ={mean_absolute_error(y_test,pred_3)}',
    f'mean squared error for test case ={mean_squared_error(y_test,pred_3)}',
    f'root mean squared error for test case ={root_mean_squared_error(y_test,pred_3)}',
    f'r2 score for test case ={r2_score(y_test,pred_3)}',
    sep='\n',
)

# Actual vs. predicted on the validation split.
data_val_rf = {'original data': y_val, 'validate case prediction': pred_33}
df_val_rf = pd.DataFrame(data_val_rf)
print(df_val_rf.head())

print(
    f'mean absolute error for validate case ={mean_absolute_error(y_val,pred_33)}',
    f'mean squared error for validate case ={mean_squared_error(y_val,pred_33)}',
    f'root mean squared error for validate case ={root_mean_squared_error(y_val,pred_33)}',
    f'r2 score for validate case ={r2_score(y_val,pred_33)}',
    sep='\n',
)

In [None]:
# AdaBoost regressors, both with library defaults
#   adb  : main split (x_train -> x_test)
#   adb1 : secondary split (x_train1 -> x_val)
adb = AdaBoostRegressor()
adb.fit(x_train_scaled, y_train)
pred_4 = adb.predict(x_test_scaled)

adb1 = AdaBoostRegressor()
adb1.fit(x_train1_scaled, y_train1)
pred_44 = adb1.predict(x_val_scaled)

# Actual vs. predicted on the test split.
data_test_adb = {'original data': y_test, 'test case prediction': pred_4}
df_test_adb = pd.DataFrame(data_test_adb)
print(df_test_adb.head())

print(
    f'mean absolute error for test case ={mean_absolute_error(y_test,pred_4)}',
    f'mean squared error for test case ={mean_squared_error(y_test,pred_4)}',
    f'root mean squared error for test case ={root_mean_squared_error(y_test,pred_4)}',
    f'r2 score for test case ={r2_score(y_test,pred_4)}',
    sep='\n',
)

# Actual vs. predicted on the validation split.
data_val_adb = {'original data': y_val, 'validate case prediction': pred_44}
df_val_adb = pd.DataFrame(data_val_adb)
print(df_val_adb.head())

print(
    f'mean absolute error for validate case ={mean_absolute_error(y_val,pred_44)}',
    f'mean squared error for validate case ={mean_squared_error(y_val,pred_44)}',
    f'root mean squared error for validate case ={root_mean_squared_error(y_val,pred_44)}',
    f'r2 score for validate case ={r2_score(y_val,pred_44)}',
    sep='\n',
)

In [None]:
#GradientBoostingRegressor
# gb  : library defaults, main split (x_train -> x_test)
# gb1 : library defaults, secondary split (x_train1 -> x_val)
gb = GradientBoostingRegressor()
gb1 = GradientBoostingRegressor()
gb.fit(x_train_scaled, y_train)
gb1.fit(x_train1_scaled, y_train1)
pred_5 = gb.predict(x_test_scaled)
# Validation predictions come from gb1, the model fitted on the secondary
# split. They were previously taken from `gb`, leaving gb1 fitted but unused.
pred_55 = gb1.predict(x_val_scaled)

# Actual vs. predicted on the test split.
data_test_gb = {'original data': y_test,
                'test case prediction': pred_5}
df_test_gb = pd.DataFrame(data_test_gb)

print(df_test_gb.head())

print(f'mean absolute error for test case ={mean_absolute_error(y_test,pred_5)}')
print(f'mean squared error for test case ={mean_squared_error(y_test,pred_5)}')
print(f'root mean squared error for test case ={root_mean_squared_error(y_test,pred_5)}')
print(f'r2 score for test case ={r2_score(y_test,pred_5)}')

# Actual vs. predicted on the validation split.
data_val_gb = {'original data': y_val,
               'validate case prediction': pred_55}
df_val_gb = pd.DataFrame(data_val_gb)

# Show this model's own frame (the LinearRegression frame df_val_lr was
# printed here before).
print(df_val_gb.head())

print(f'mean absolute error for validate case ={mean_absolute_error(y_val,pred_55)}')
print(f'mean squared error for validate case ={mean_squared_error(y_val,pred_55)}')
print(f'root mean squared error for validate case ={root_mean_squared_error(y_val,pred_55)}')
print(f'r2 score for validate case ={r2_score(y_val,pred_55)}')

In [None]:
#XGradientBoostingRegressor
# xgb  : library defaults, main split (x_train -> x_test)
# xgb1 : library defaults, secondary split (x_train1 -> x_val)
#
# Two fixes relative to the previous version:
#  * the xgboost module is imported as `xgbss` in the imports cell, so
#    `xgb.XGBRegressor()` raised NameError; the estimator is now built from
#    `xgbss`, which also keeps the cell re-runnable because the module name is
#    never rebound;
#  * one estimator was fitted twice, so the second fit (on x_train1, which is
#    part of x_test) replaced the first and the "test" predictions came from a
#    model that had seen test rows. Each split now has its own estimator.
xgb = xgbss.XGBRegressor()
xgb1 = xgbss.XGBRegressor()
xgb.fit(x_train_scaled, y_train)
xgb1.fit(x_train1_scaled, y_train1)
pred_6 = xgb.predict(x_test_scaled)
pred_66 = xgb1.predict(x_val_scaled)

# Actual vs. predicted on the test split.
data_test_xgb = {'original data': y_test,
                 'test case prediction': pred_6}
df_test_xgb = pd.DataFrame(data_test_xgb)

print(df_test_xgb.head())

print(f'mean absolute error for test case ={mean_absolute_error(y_test,pred_6)}')
print(f'mean squared error for test case ={mean_squared_error(y_test,pred_6)}')
print(f'root mean squared error for test case ={root_mean_squared_error(y_test,pred_6)}')
print(f'r2 score for test case ={r2_score(y_test,pred_6)}')

# Actual vs. predicted on the validation split.
data_val_xgb = {'original data': y_val,
                'validate case prediction': pred_66}
df_val_xgb = pd.DataFrame(data_val_xgb)

print(df_val_xgb.head())

print(f'mean absolute error for validate case ={mean_absolute_error(y_val,pred_66)}')
print(f'mean squared error for validate case ={mean_squared_error(y_val,pred_66)}')
print(f'root mean squared error for validate case ={root_mean_squared_error(y_val,pred_66)}')
print(f'r2 score for validate case ={r2_score(y_val,pred_66)}')

In [None]:
# Persist the tuned random forest (rf1) and the scaler (s) it was trained
# behind; the two files belong together when loading for inference.
#
# `with` guarantees each file is flushed and closed; the bare open() calls used
# before were never closed. pickle.dump returns None, so the former
# `rf1_pkl` / `ss_pkl` assignments only ever held None and have been dropped.
with open('random_forest.pkl', 'wb') as model_file:
    pickle.dump(rf1, model_file)
with open('standard_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(s, scaler_file)