### Practice Exercise (Multiple Linear Regression)

In [25]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [15]:
#loading the dataset
df = pd.read_csv('empolyee_productivity.csv')

print(f'Shape of the dataset: {df.shape}\n')
print(f'Missing Values of the dataset:\n {df.isnull().sum()}')

#feature and target variable
X = df.iloc[:,1:-1].values
y = df[['ProductivityScore']].values

str_col = [1,2,3,7,8]
int_col = [0,4,5,6,9,10,11]

#handling the missing values
impute_mean = SimpleImputer(missing_values=np.nan,strategy='mean')
X[:,int_col] = impute_mean.fit_transform(X[:,int_col])

impute_frequent = SimpleImputer(missing_values=np.nan,strategy='most_frequent')
X[:,str_col] = impute_frequent.fit_transform(X[:,str_col])

y = impute_mean.fit_transform(y)

#encoding categorial col

ct = ColumnTransformer(transformers=[('Encode',OneHotEncoder(),str_col)],remainder='passthrough')

X = ct.fit_transform(X)

Shape of the dataset: (25, 14)

Missing Values of the dataset:
 EmpID                0
Age                  4
Gender               3
Education            0
Department           0
Experience           0
TrainingHours        2
TeamSize             4
RemoteWork           0
WorkLifeBalance      0
ManagerRating        0
SatisfactionScore    3
MonthlyHours         2
ProductivityScore    2
dtype: int64


In [18]:
#splitting the train and test set
X_train, X_test , y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [19]:
#Training the regression model
ln = LinearRegression()
ln.fit(X_train,y_train)

In [20]:
#Making the prediction
y_pred = ln.predict(X_test)

In [23]:
#comparing the models
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1),y_test.reshape(len(y_test),1)),1))

[[81.93 72.  ]
 [79.45 76.  ]
 [83.54 82.  ]
 [80.39 77.  ]
 [83.14 83.  ]]


In [36]:
#evaluate the regression
R2 = r2_score(y_test,y_pred)
mae = mean_absolute_error(y_test,y_pred)
mse = np.sqrt(mean_squared_error(y_test,y_pred))

print(f'Mean Absolute Error: {mae:2f}\n')
print(f'Mean Squared Error: {mse:2f}\n\n')

if R2 <=0.9:
    print('Model is perfect')
elif R2 <= 0.7:
    print('Okish')
else:
    print('Model needs improvement')

Mean Absolute Error: 3.690656

Mean Squared Error: 4.989558


Model is perfect


In [43]:
#comparision by df
df_comparison = pd.DataFrame({
    'Actual' : y_test.ravel(),
    'Predicted' : y_pred.ravel(),
    'Error': y_test.ravel() - y_pred.ravel()
})
df_comparison

Unnamed: 0,Actual,Predicted,Error
0,72.0,81.934399,-9.934399
1,76.0,79.448935,-3.448935
2,82.0,83.539593,-1.539593
3,77.0,80.391359,-3.391359
4,83.0,83.138996,-0.138996
