In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_recall_fscore_support, roc_auc_score)

In [2]:
df = pd.read_csv('personality_dataset.csv')
df

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,No,4.0,6.0,No,13.0,5.0,Extrovert
1,9.0,Yes,0.0,0.0,Yes,0.0,3.0,Introvert
2,9.0,Yes,1.0,2.0,Yes,5.0,2.0,Introvert
3,0.0,No,6.0,7.0,No,14.0,8.0,Extrovert
4,3.0,No,9.0,4.0,No,8.0,5.0,Extrovert
...,...,...,...,...,...,...,...,...
2895,3.0,No,7.0,6.0,No,6.0,6.0,Extrovert
2896,3.0,No,8.0,3.0,No,14.0,9.0,Extrovert
2897,4.0,Yes,1.0,1.0,Yes,4.0,0.0,Introvert
2898,11.0,Yes,1.0,,Yes,2.0,0.0,Introvert


# Cleaning data

In [4]:
print(df['Personality'].value_counts())

Personality
Extrovert    1491
Introvert    1409
Name: count, dtype: int64


In [5]:
# counting missing data in each column
print(df.isna().sum())

Time_spent_Alone             63
Stage_fear                   73
Social_event_attendance      62
Going_outside                66
Drained_after_socializing    52
Friends_circle_size          77
Post_frequency               65
Personality                   0
dtype: int64


In [6]:
# Rurning categorical features into numbers

df['Personality'] = df['Personality'].map({'Introvert': 1, 'Extrovert': 0})

df['Stage_fear'] = df['Stage_fear'].map({'Yes': 1, 'No': 0})

df['Drained_after_socializing'] = df['Drained_after_socializing'].map({'Yes': 1, 'No': 0})

# Replacing missing values in numerical columns with median
numerical_cols = ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 
                 'Friends_circle_size', 'Post_frequency']
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].median())

# For caregorical values since the missing data is <5% we will drop them
df = df.dropna(subset=['Stage_fear', 'Drained_after_socializing'])

df

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,0.0,4.0,6.0,0.0,13.0,5.0,0
1,9.0,1.0,0.0,0.0,1.0,0.0,3.0,1
2,9.0,1.0,1.0,2.0,1.0,5.0,2.0,1
3,0.0,0.0,6.0,7.0,0.0,14.0,8.0,0
4,3.0,0.0,9.0,4.0,0.0,8.0,5.0,0
...,...,...,...,...,...,...,...,...
2895,3.0,0.0,7.0,6.0,0.0,6.0,6.0,0
2896,3.0,0.0,8.0,3.0,0.0,14.0,9.0,0
2897,4.0,1.0,1.0,1.0,1.0,4.0,0.0,1
2898,11.0,1.0,1.0,3.0,1.0,2.0,0.0,1


In [7]:
# splitting data into features/ label
X = df.drop('Personality', axis=1)
y = df['Personality']

# scaing features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# splitting data into train and test data 80%/20%
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)

# Comparing Classification Models

## Linear Discriminant Analysis (LDA)

In [10]:
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
preds = lda.predict(X_test)

# Confusion Matrix and Accuracy
conf = confusion_matrix(y_test, preds, labels=lda.classes_)
print('Confusion matrix:\n')
print(lda.classes_)
print(conf)

acc = accuracy_score(y_test, preds)
print('\nThe accuracy is: ', acc)

Confusion matrix:

[0 1]
[[248  21]
 [ 16 271]]

The accuracy is:  0.9334532374100719


In [11]:
# Precision, Recall, F1
precision_recall_fscore_support(y_test, preds, pos_label=1, average='binary') # I considered being introvert as our positive

(0.928082191780822, 0.9442508710801394, 0.9360967184801382, None)

## k-Nearest Neighbours (kNN) for k = 3, 5, 21

In [13]:
# For k = 3
knn = KNeighborsClassifier(3)
knn.fit(X_train, y_train)
knn_preds = knn.predict(X_test)

conf = confusion_matrix(y_test, knn_preds, labels=knn.classes_)
print('Confusion matrix:\n')
print(knn.classes_)
print(conf)

acc = accuracy_score(y_test, knn_preds)
print('\nThe accuracy is: ', acc)

Confusion matrix:

[0 1]
[[240  29]
 [ 22 265]]

The accuracy is:  0.908273381294964


In [14]:
precision_recall_fscore_support(y_test, knn_preds, pos_label=1, average='binary') # I considered being introvert as our positive

(0.9013605442176871, 0.9233449477351916, 0.9122203098106713, None)

In [15]:
# For k = 5
knn = KNeighborsClassifier(5)
knn.fit(X_train, y_train)
knn_preds = knn.predict(X_test)

conf = confusion_matrix(y_test, knn_preds, labels=knn.classes_)
print('Confusion matrix:\n')
print(knn.classes_)
print(conf)

acc = accuracy_score(y_test, knn_preds)
print('\nThe accuracy is: ', acc)

Confusion matrix:

[0 1]
[[246  23]
 [ 18 269]]

The accuracy is:  0.9262589928057554


In [16]:
precision_recall_fscore_support(y_test, knn_preds, pos_label=1, average='binary') # I considered being introvert as our positive

(0.9212328767123288, 0.9372822299651568, 0.9291882556131261, None)

In [64]:
# For k = 21
knn = KNeighborsClassifier(21)
knn.fit(X_train, y_train)
knn_preds = knn.predict(X_test)

conf = confusion_matrix(y_test, knn_preds, labels=knn.classes_)
print('Confusion matrix:\n')
print(knn.classes_)
print(conf)

acc = accuracy_score(y_test, knn_preds)
print('\nThe accuracy is: ', acc)

Confusion matrix:

[0 1]
[[248  21]
 [ 16 271]]

The accuracy is:  0.9334532374100719


In [18]:
precision_recall_fscore_support(y_test, knn_preds, pos_label=1, average='binary') # I considered being introvert as our positive

(0.928082191780822, 0.9442508710801394, 0.9360967184801382, None)

## Logistic Regression

In [20]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg_preds = logreg.predict(X_test)

conf = confusion_matrix(y_test, logreg_preds, labels=logreg.classes_)
print('Confusion matrix:')
print(logreg.classes_)
print(conf)

acc = accuracy_score(y_test, logreg_preds)
print('\nThe accuracy is:', acc)

Confusion matrix:
[0 1]
[[248  21]
 [ 16 271]]

The accuracy is: 0.9334532374100719


In [21]:
precision_recall_fscore_support(y_test, logreg_preds, pos_label=1, average='binary') # I considered being introvert as our positive

(0.928082191780822, 0.9442508710801394, 0.9360967184801382, None)

## Logistic Regression (probabilistic predictions)

In [23]:
# function that classifies preds based on threshold

def classify(preds, threshold, newpreds):
    for p in preds:
        if p[0] > threshold:
            newpreds.append(classes[0])
        else:
            newpreds.append(classes[1])

In [24]:
preds = logreg.predict_proba(X_test)
classes = logreg.classes_

In [25]:
# for threshold = 0.45

newpreds = []
classify(preds, 0.45, newpreds)

print(logreg.classes_)        
conf = confusion_matrix(y_test, newpreds)
print(conf)

acc = accuracy_score(y_test, newpreds)
print('accuracy is: ', acc)

[0 1]
[[248  21]
 [ 16 271]]
accuracy is:  0.9334532374100719


In [26]:
precision_recall_fscore_support(y_test, newpreds, pos_label=1, average='binary')

(0.928082191780822, 0.9442508710801394, 0.9360967184801382, None)

In [27]:
# for threshold = 0.55

newpreds = []
classify(preds, 0.55, newpreds)

print(logreg.classes_)        
conf = confusion_matrix(y_test, newpreds)
print(conf)

acc = accuracy_score(y_test, newpreds)
print('accuracy is: ', acc)

[0 1]
[[248  21]
 [ 16 271]]
accuracy is:  0.9334532374100719


In [28]:
precision_recall_fscore_support(y_test, newpreds, pos_label=1, average='binary')

(0.928082191780822, 0.9442508710801394, 0.9360967184801382, None)

In [29]:
# for threshold = 0.1

newpreds = []
classify(preds, 0.1, newpreds)

print(logreg.classes_)        
conf = confusion_matrix(y_test, newpreds)
print(conf)

acc = accuracy_score(y_test, newpreds)
print('accuracy is: ', acc)

[0 1]
[[260   9]
 [ 92 195]]
accuracy is:  0.8183453237410072


In [30]:
precision_recall_fscore_support(y_test, newpreds, pos_label=1, average='binary')

(0.9558823529411765, 0.6794425087108014, 0.7942973523421588, None)

## Support Vector Machines (SVM)

In [32]:
svm = SVC()
svm.fit(X_train, y_train)
svm_preds = svm.predict(X_test)

print(svm.classes_)        
conf = confusion_matrix(y_test, svm_preds)
print(conf)

acc = accuracy_score(y_test, svm_preds)
print('accuracy is: ', acc)

[0 1]
[[248  21]
 [ 16 271]]
accuracy is:  0.9334532374100719


In [33]:
precision_recall_fscore_support(y_test, svm_preds, pos_label=1, average='binary')

(0.928082191780822, 0.9442508710801394, 0.9360967184801382, None)

# Let's predict someone's type of personality

In [35]:
input_data = [[12, 0, 6, 5, 1, 4, 0]]  
input_data_scaled = scaler.transform(input_data) 

prediction = logreg.predict(input_data_scaled)[0]
print("Predicted Personality:", "Introvert" if prediction == 1 else "Extrovert")

Predicted Personality: Extrovert


