In [77]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [78]:
df=pd.read_csv("interview.csv")

In [79]:
df.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert


In [80]:
df.shape

(18524, 9)

In [81]:
df.tail()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
18519,18519,3.0,No,7.0,3.0,No,9.0,7.0,Extrovert
18520,18520,1.0,,6.0,7.0,No,6.0,5.0,Extrovert
18521,18521,7.0,Yes,1.0,1.0,Yes,1.0,,Introvert
18522,18522,,Yes,1.0,0.0,Yes,5.0,2.0,Introvert
18523,18523,1.0,No,8.0,6.0,No,4.0,7.0,Extrovert


In [82]:
df.duplicated().sum()

np.int64(0)

In [83]:
df.isna().sum()

id                              0
Time_spent_Alone             1190
Stage_fear                   1893
Social_event_attendance      1180
Going_outside                1466
Drained_after_socializing    1149
Friends_circle_size          1054
Post_frequency               1264
Personality                     0
dtype: int64

In [84]:
df.shape

(18524, 9)

In [85]:
X = df.drop("Personality", axis=1)
y = df["Personality"]


### identify columns type

In [None]:
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = X.select_dtypes(include=["object"]).columns


In [87]:
from sklearn.impute import SimpleImputer

In [88]:
num_imputer = SimpleImputer(strategy="median")
X[numeric_cols] = num_imputer.fit_transform(X[numeric_cols])

In [89]:
cat_imputer = SimpleImputer(strategy="most_frequent")
X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])


In [90]:
X.isna().sum()

id                           0
Time_spent_Alone             0
Stage_fear                   0
Social_event_attendance      0
Going_outside                0
Drained_after_socializing    0
Friends_circle_size          0
Post_frequency               0
dtype: int64

In [91]:
X.dtypes

id                           float64
Time_spent_Alone             float64
Stage_fear                    object
Social_event_attendance      float64
Going_outside                float64
Drained_after_socializing     object
Friends_circle_size          float64
Post_frequency               float64
dtype: object

In [92]:
df.describe()

Unnamed: 0,id,Time_spent_Alone,Social_event_attendance,Going_outside,Friends_circle_size,Post_frequency
count,18524.0,17334.0,17344.0,17058.0,17470.0,17260.0
mean,9261.5,3.137764,5.265106,4.044319,7.996737,4.982097
std,5347.562529,3.003786,2.753359,2.06258,4.223484,2.879139
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,4630.75,1.0,3.0,3.0,5.0,3.0
50%,9261.5,2.0,5.0,4.0,8.0,5.0
75%,13892.25,4.0,8.0,6.0,12.0,7.0
max,18523.0,11.0,10.0,7.0,15.0,10.0


In [93]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object 
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object 
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 1.3+ MB


In [94]:

print(df["Personality"].value_counts())


Personality
Extrovert    13699
Introvert     4825
Name: count, dtype: int64


encoding categorical values

In [101]:
categorical_cols = X.select_dtypes(include=["object"]).columns  # recalc after imputation
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

split data

In [102]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaling

In [103]:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
X_train_scaled = imputer.fit_transform(X_train_scaled)
X_test_scaled = imputer.transform(X_test_scaled)

train model and evaluate

In [111]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

logistic regression

In [112]:
lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

Random forest

In [113]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)

KNN

In [114]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_test_scaled)

Decision tree

In [115]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_scaled, y_train)
y_pred_dt = dt.predict(X_test_scaled)

Classification report

In [117]:
models = {'Logistic Regression': y_pred_lr,'Random Forest': y_pred_rf,'KNN': y_pred_knn,
'Decision Tree': y_pred_dt}
for name, pred in models.items():
    print(f'\n{name} Results:')
    print('Accuracy:', accuracy_score(y_test, pred))
    print(classification_report(y_test, pred))


Logistic Regression Results:
Accuracy: 0.967434328895286
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      4115
           1       0.94      0.93      0.94      1443

    accuracy                           0.97      5558
   macro avg       0.96      0.95      0.96      5558
weighted avg       0.97      0.97      0.97      5558


Random Forest Results:
Accuracy: 0.9641957538682979
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      4115
           1       0.94      0.92      0.93      1443

    accuracy                           0.96      5558
   macro avg       0.96      0.95      0.95      5558
weighted avg       0.96      0.96      0.96      5558


KNN Results:
Accuracy: 0.9676142497301188
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      4115
           1       0.95      0.93      0.94      1443

    accuracy                

deploy

In [122]:
import pickle
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(rf, f)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
feature_columns = X.columns.tolist()
with open('feature_columns.pkl', 'wb') as f:
    pickle.dump(feature_columns, f)


In [121]:
df.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality,Stage_fear_encoded,Drained_encoded,Personality_encoded
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert,0,0,0
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert,0,0,0
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert,1,2,1
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert,0,0,0
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert,0,0,0
