In [None]:
from google.colab import files
# Capture the return value instead of leaving the call as the cell's last
# expression: files.upload() returns the uploaded files' contents, so a bare
# call would echo kaggle.json (the API key) into the saved notebook output.
uploaded = files.upload()  # upload kaggle.json here

In [None]:
# Copy the uploaded kaggle.json into ~/.kaggle (created if missing);
# presumably this is where the Kaggle client looks for the token - TODO confirm.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Mode 600: only the owner may read or write the token file.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
import kagglehub

# Fetch the latest version of the dataset and report the local directory.
dataset_handle = "rakeshkapilavai/extrovert-vs-introvert-behavior-data"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

In [None]:
import os

# Reuse the `path` returned by kagglehub.dataset_download() in the earlier cell
# rather than overwriting it with a hard-coded "/kaggle/input/..." location,
# which only exists on environments that mount datasets there.
print("Files in dataset folder:", os.listdir(path))

In [None]:
import pandas as pd

# Build the CSV location from the dataset directory held in `path` instead of
# repeating an absolute path, so the load follows wherever the data was downloaded.
df = pd.read_csv(os.path.join(path, 'personality_dataset.csv'))

In [None]:
# Preview the frame exactly as it was read from the CSV.
df

In [None]:
# Column dtypes before any encoding is applied.
df.dtypes

In [None]:
# Encode the two-valued text columns as 0/1.
#
# Series.map is used instead of Series.replace: replacing strings with ints
# relies on pandas' implicit object -> numeric downcasting, which is deprecated
# (FutureWarning in pandas 2.2). Note that map turns any value missing from the
# mapping into NaN rather than leaving it as text.
#
# Columns that are already numeric are skipped so the cell can be re-run
# without mapping the 0/1 values to NaN.
binary_encodings = {
    'Stage_fear': {'Yes': 1, 'No': 0},
    'Drained_after_socializing': {'Yes': 1, 'No': 0},
    'Personality': {'Extrovert': 1, 'Introvert': 0},
}
for column, encoding in binary_encodings.items():
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(encoding)

In [None]:
# Preview the frame after the Yes/No and Extrovert/Introvert columns were encoded.
df

In [None]:
# Column dtypes after encoding.
df.dtypes

In [None]:
# Pairwise Pearson correlation between all columns (Pearson is the pandas default).
df.corr(method='pearson')

In [None]:
# Feature columns kept for modelling; presumably chosen from the correlation
# table above - TODO confirm the selection criterion.
positive_corr_cols = [
    'Social_event_attendance', 'Going_outside',
    'Post_frequency', 'Friends_circle_size',
]

# New frame holding the selected features followed by the target column.
df_selected = df[positive_corr_cols + ['Personality']].copy()

In [None]:
# Preview the selected features plus the target column.
df_selected

In [None]:
# Count of rows per class (1 = Extrovert, 0 = Introvert after encoding).
df_selected['Personality'].value_counts()

In [None]:
# Number of missing values in each selected column.
df_selected.isnull().sum()

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing feature values with the column mean.
# NOTE(review): the imputer is fitted on every row, before the train/test split
# further down, so test rows contribute to the means. Consider fitting it on
# the training portion only.
si = SimpleImputer(strategy='mean')

feature_values = si.fit_transform(df_selected[positive_corr_cols])

# Rebuild the frame with the original column names AND the original index, so
# the later axis=1 concat with df_selected['Personality'] lines rows up by
# label instead of introducing NaN rows if the index is not 0..n-1.
df_imputed = pd.DataFrame(
    feature_values,
    columns=positive_corr_cols,
    index=df_selected.index,
)

In [None]:
# Put the target column back next to the imputed features (column-wise concat).
personality_labels = df_selected['Personality']
final_df = pd.concat([df_imputed, personality_labels], axis='columns')

In [None]:
# Preview the imputed features together with the target.
final_df

In [None]:
# Number of missing values left in each column after imputation.
final_df.isnull().sum()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of the imputed Friends_circle_size values. The title lets the figure
# stand alone; the trailing semicolon hides the Text repr.
ax = sns.boxplot(final_df['Friends_circle_size'])
ax.set_title('Friends_circle_size');

In [None]:
# Box plot of the imputed Post_frequency values. The title lets the figure
# stand alone; the trailing semicolon hides the Text repr.
ax = sns.boxplot(final_df['Post_frequency'])
ax.set_title('Post_frequency');

In [None]:
# Box plot of the imputed Going_outside values. The title lets the figure
# stand alone; the trailing semicolon hides the Text repr.
ax = sns.boxplot(final_df['Going_outside'])
ax.set_title('Going_outside');

In [None]:
# Plot from final_df (the imputed data used for modelling), like the other
# three box plots, rather than from the raw df. The title lets the figure
# stand alone; the trailing semicolon hides the Text repr.
ax = sns.boxplot(final_df['Social_event_attendance'])
ax.set_title('Social_event_attendance');

## Splitting the data into training and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Separate the features from the target, then hold out 20% of the rows for
# testing with a fixed seed.
target_column = 'Personality'
X = final_df.drop(columns=[target_column])
y = final_df[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Shapes of the four splits, in the order X_train, X_test, y_train, y_test.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## Model training and prediction

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, accuracy_score, classification_report, confusion_matrix

# 150-tree random forest. random_state is fixed so the fitted model, the
# metrics below and the saved pickle are reproducible across runs; without it
# every run trains a different forest.
rfc = RandomForestClassifier(n_estimators=150, random_state=42)

rfc.fit(X_train, y_train)

In [None]:
# Predicted class (1 = Extrovert, 0 = Introvert) for every held-out row.
y_pred = rfc.predict(X_test)

In [None]:
# Accuracy on the held-out test set.
accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix on the test set (first argument is the true labels, second the predictions).
confusion_matrix(y_test, y_pred)

In [None]:
# Pin the label order to [0 (Introvert), 1 (Extrovert)] so the matrix is always
# 2x2 and ravel() always yields exactly TN, FP, FN, TP in that order, even if
# one class happens to be absent from y_test / y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
TN, FP, FN, TP = cm.ravel()

In [None]:
# The four confusion-matrix cells unpacked above.
TN, FP, FN, TP

In [None]:
# Accuracy by hand: correct predictions over all test rows.
n_samples = TN + FP + FN + TP
accuracy = (TP + TN) / n_samples
print(accuracy)

In [None]:
# Precision by hand: true positives over everything predicted positive.
predicted_positive = TP + FP
Precision = TP / predicted_positive
print(Precision)

## Query points

In [None]:
# One hand-written sample; values follow the training column order:
# Social_event_attendance, Going_outside, Post_frequency, Friends_circle_size.
query_pt = [[0, 2, 0, 2]]

In [None]:
# The model was fitted on a DataFrame, so pass the query as a DataFrame with
# the same column names; a bare list triggers scikit-learn's "X does not have
# valid feature names" warning and silently depends on column order.
rfc.predict(pd.DataFrame(query_pt, columns=X_train.columns))   # correct - Abhiraj's friend

In [None]:
# One hand-written sample; values follow the training column order:
# Social_event_attendance, Going_outside, Post_frequency, Friends_circle_size.
query_pt2 = [[0, 2, 2, 5]]

In [None]:
# The model was fitted on a DataFrame, so pass the query as a DataFrame with
# the same column names; a bare list triggers scikit-learn's "X does not have
# valid feature names" warning and silently depends on column order.
rfc.predict(pd.DataFrame(query_pt2, columns=X_train.columns))   # correct

In [None]:
# One hand-written sample; values follow the training column order:
# Social_event_attendance, Going_outside, Post_frequency, Friends_circle_size.
query_pt3 = [[2, 6, 4, 14]]

In [None]:
# The model was fitted on a DataFrame, so pass the query as a DataFrame with
# the same column names; a bare list triggers scikit-learn's "X does not have
# valid feature names" warning and silently depends on column order.
rfc.predict(pd.DataFrame(query_pt3, columns=X_train.columns))     # correct

In [None]:
# Values follow the training column order:
# Social_event_attendance, Going_outside, Post_frequency, Friends_circle_size.
query_pt4 = [[1, 3, 2, 5]]
# Predict on a DataFrame with the training column names to avoid
# scikit-learn's missing-feature-names warning.
rfc.predict(pd.DataFrame(query_pt4, columns=X_train.columns))    # wrong

In [None]:
# Values follow the training column order:
# Social_event_attendance, Going_outside, Post_frequency, Friends_circle_size.
query_pt5 = [[1, 4, 2, 9]]
# Predict on a DataFrame with the training column names to avoid
# scikit-learn's missing-feature-names warning.
rfc.predict(pd.DataFrame(query_pt5, columns=X_train.columns))     # correct

In [None]:
# Values follow the training column order:
# Social_event_attendance, Going_outside, Post_frequency, Friends_circle_size.
query_pt6 = [[2, 3, 1, 3]]
# Predict on a DataFrame with the training column names to avoid
# scikit-learn's missing-feature-names warning.
rfc.predict(pd.DataFrame(query_pt6, columns=X_train.columns))     # correct

In [None]:
# Values follow the training column order:
# Social_event_attendance, Going_outside, Post_frequency, Friends_circle_size.
query_pt7 = [[2, 2, 3, 20]]
# Predict on a DataFrame with the training column names to avoid
# scikit-learn's missing-feature-names warning.
rfc.predict(pd.DataFrame(query_pt7, columns=X_train.columns))     # correct

In [None]:
# Values follow the training column order:
# Social_event_attendance, Going_outside, Post_frequency, Friends_circle_size.
query_pt8 = [[3, 3, 3, 6]]
# Predict on a DataFrame with the training column names to avoid
# scikit-learn's missing-feature-names warning.
rfc.predict(pd.DataFrame(query_pt8, columns=X_train.columns))      # wrong

In [None]:
# Values follow the training column order:
# Social_event_attendance, Going_outside, Post_frequency, Friends_circle_size.
query_pt9 = [[3, 3, 0, 3]]
# Predict on a DataFrame with the training column names to avoid
# scikit-learn's missing-feature-names warning.
rfc.predict(pd.DataFrame(query_pt9, columns=X_train.columns))      # outcome not recorded

In [None]:
import joblib
# Persist the fitted classifier to personality.pkl in the working directory.
joblib.dump(value=rfc, filename="personality.pkl")