# Model Training



In [47]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from lazypredict.Supervised import LazyClassifier
from sklearn.preprocessing import StandardScaler
import warnings


In [3]:
#loading data
df = pd.read_csv('personality_dataset.csv')

In [5]:
# loading first five rows of dataset
df.head()

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,No,4.0,6.0,No,13.0,5.0,Extrovert
1,9.0,Yes,0.0,0.0,Yes,0.0,3.0,Introvert
2,9.0,Yes,1.0,2.0,Yes,5.0,2.0,Introvert
3,0.0,No,6.0,7.0,No,14.0,8.0,Extrovert
4,3.0,No,9.0,4.0,No,8.0,5.0,Extrovert


In [7]:
# checking data with missing values
df.isnull().sum()

Time_spent_Alone             63
Stage_fear                   73
Social_event_attendance      62
Going_outside                66
Drained_after_socializing    52
Friends_circle_size          77
Post_frequency               65
Personality                   0
dtype: int64

In [11]:
# checking percentage of missing values from total dataset
missing_percentage = df.isnull().mean() * 100
missing_percentage

Time_spent_Alone             2.172414
Stage_fear                   2.517241
Social_event_attendance      2.137931
Going_outside                2.275862
Drained_after_socializing    1.793103
Friends_circle_size          2.655172
Post_frequency               2.241379
Personality                  0.000000
dtype: float64

In [21]:
# selecting numerical features
numerical_features = [feature for feature in df.columns if df[feature].dtype != object]
#selecting categorical features
categorical_features = [feature for feature in df.columns if df[feature].dtype == object]

print('We have {} numerical features : {}'.format(len(numerical_features), numerical_features))
print('\nWe have {} categorical features : {}'.format(len(categorical_features), categorical_features))


We have 5 numerical features : ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size', 'Post_frequency']

We have 3 categorical features : ['Stage_fear', 'Drained_after_socializing', 'Personality']


In [24]:
#handle missing values
# Fill numerical columns with median
for col in ['Time_spent_Alone', 'Social_event_attendance', 
            'Going_outside', 'Friends_circle_size', 'Post_frequency']:
    df[col].fillna(df[col].median(), inplace=True)

# Fill Personality column if it's categorical
if df['Personality'].dtype == 'object':
    df['Personality'].fillna(df['Personality'].mode()[0], inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [33]:

for col in categorical_features:
    df[col].fillna(df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [35]:
#checking do missing values still exist
df.isnull().sum()

Time_spent_Alone             0
Stage_fear                   0
Social_event_attendance      0
Going_outside                0
Drained_after_socializing    0
Friends_circle_size          0
Post_frequency               0
Personality                  0
dtype: int64

In [37]:
df.head()

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality,Personality_encoded,Social_engagement
0,4.0,No,4.0,6.0,No,13.0,5.0,Extrovert,1,17.0
1,9.0,Yes,0.0,0.0,Yes,0.0,3.0,Introvert,0,0.0
2,9.0,Yes,1.0,2.0,Yes,5.0,2.0,Introvert,0,6.0
3,0.0,No,6.0,7.0,No,14.0,8.0,Extrovert,1,20.0
4,3.0,No,9.0,4.0,No,8.0,5.0,Extrovert,1,17.0


In [38]:
# Encode the target variable
df['Personality_encoded'] = df['Personality'].map({'Introvert': 0, 'Extrovert': 1})

# Encode Drained_after_socializing
df['Encoded_Drained_after_socializing'] = df['Drained_after_socializing'].map({'No': 0, 'Yes': 1})

#  Feature engineering
df['Social_engagement'] = df['Social_event_attendance'] + df['Friends_circle_size']
df['Isolation_score'] = df['Time_spent_Alone'] + df['Encoded_Drained_after_socializing']

# . Select features and target
features = ['Time_spent_Alone', 'Friends_circle_size', 'Social_event_attendance',
            'Social_engagement', 'Isolation_score']
X = df[features]
y = df['Personality_encoded']

In [41]:
# Split data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train.shape,X_test.shape

((2320, 5), (580, 5))

### create an evaluate function to give all metrics after model training

In [43]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [None]:
# scale feature 
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


array([[ 0.73372663, -1.23017148, -1.3720507 , -1.38640414,  0.78670859],
       [ 1.31678736, -1.46564984, -1.3720507 , -1.53761741,  1.30551584],
       [ 0.44219627, -0.75921474, -1.02420264, -0.93276431,  0.52730496],
       ...,
       [-0.14086446, -0.288258  , -0.32850651, -0.3279112 ,  0.00849771],
       [ 0.73372663, -1.23017148, -1.3720507 , -1.38640414,  0.78670859],
       [-1.30698591,  1.12461221,  1.75858186,  1.48664811, -1.28852043]],
      shape=(2320, 5))

In [58]:
from tqdm import tqdm
tqdm.pandas(disable=True)

In [63]:
# LazyClassifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, prediction = clf.fit(X_train, X_test,y_train,y_test)

# Show results


ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html