In [37]:

!pip install catboost xgboost lightgbm --quiet
!pip install kaggle --upgrade --quiet

In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import os
import time
import numpy as np
import pandas as pd
import joblib
import kagglehub

In [39]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

path = kagglehub.dataset_download("rakeshkapilavai/extrovert-vs-introvert-behavior-data")
orig = pd.read_csv(os.path.join(path, 'personality_dataset.csv'))


In [40]:
train.info()
train.describe()
train.isnull().sum()
train['Personality'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object 
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object 
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 1.3+ MB


Unnamed: 0_level_0,count
Personality,Unnamed: 1_level_1
Extrovert,13699
Introvert,4825


In [41]:
orig.info()
orig.describe()
orig.isnull().sum()
orig['Personality'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2900 entries, 0 to 2899
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Time_spent_Alone           2837 non-null   float64
 1   Stage_fear                 2827 non-null   object 
 2   Social_event_attendance    2838 non-null   float64
 3   Going_outside              2834 non-null   float64
 4   Drained_after_socializing  2848 non-null   object 
 5   Friends_circle_size        2823 non-null   float64
 6   Post_frequency             2835 non-null   float64
 7   Personality                2900 non-null   object 
dtypes: float64(5), object(3)
memory usage: 181.4+ KB


Unnamed: 0_level_0,count
Personality,Unnamed: 1_level_1
Extrovert,1491
Introvert,1409


In [42]:
train = train.rename(columns={'Personality': 'Personality'})
orig = orig.rename(columns={'Personality': 'Personality'})
combined = pd.concat([train, orig], ignore_index=True)
combined = combined.drop_duplicates(subset=[col for col in combined.columns if col != 'Personality'])
print("Columns in combined dataset:", combined.columns.tolist())
print("Shape of combined dataset:", combined.shape)
print("Null counts:\n", combined.isnull().sum())

Columns in combined dataset: ['id', 'Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'Going_outside', 'Drained_after_socializing', 'Friends_circle_size', 'Post_frequency', 'Personality']
Shape of combined dataset: (20977, 9)
Null counts:
 id                           2453
Time_spent_Alone             1251
Stage_fear                   1966
Social_event_attendance      1241
Going_outside                1531
Drained_after_socializing    1200
Friends_circle_size          1129
Post_frequency               1327
Personality                     0
dtype: int64


In [43]:
combined = combined.drop_duplicates()
print("Shape after dropping duplicates:", combined.shape)
print("Null counts after dropping duplicates:\n", combined.isnull().sum())

Shape after dropping duplicates: (20977, 9)
Null counts after dropping duplicates:
 id                           2453
Time_spent_Alone             1251
Stage_fear                   1966
Social_event_attendance      1241
Going_outside                1531
Drained_after_socializing    1200
Friends_circle_size          1129
Post_frequency               1327
Personality                     0
dtype: int64


In [44]:
categorical_cols = ['Stage_fear', 'Drained_after_socializing']
numerical_cols = ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size', 'Post_frequency']
for col in numerical_cols:
    combined[col] = combined[col].fillna(combined[col].mean())
for col in categorical_cols:
    combined[col] = combined[col].fillna(combined[col].mode()[0])
print("Null counts after imputation:\n", combined.isnull().sum())

Null counts after imputation:
 id                           2453
Time_spent_Alone                0
Stage_fear                      0
Social_event_attendance         0
Going_outside                   0
Drained_after_socializing       0
Friends_circle_size             0
Post_frequency                  0
Personality                     0
dtype: int64


In [45]:
combined.isnull().sum().sum()

np.int64(2453)

In [46]:
combined['id'] = combined['id'].fillna(-1)
print("Null counts after handling id column:\n", combined.isnull().sum())

Null counts after handling id column:
 id                           0
Time_spent_Alone             0
Stage_fear                   0
Social_event_attendance      0
Going_outside                0
Drained_after_socializing    0
Friends_circle_size          0
Post_frequency               0
Personality                  0
dtype: int64


In [47]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Encode categorical columns
categorical_cols = ['Stage_fear', 'Drained_after_socializing']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col])
    label_encoders[col] = le

# Scale numerical columns
numerical_cols = ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size', 'Post_frequency']
scaler = StandardScaler()
combined[numerical_cols] = scaler.fit_transform(combined[numerical_cols])

# Encode target column
le_personality = LabelEncoder()
combined['Personality'] = le_personality.fit_transform(combined['Personality'])

# Verify changes
print("Data types after encoding and scaling:\n", combined.dtypes)
print("Sample of transformed data:\n", combined.head())

Data types after encoding and scaling:
 id                           float64
Time_spent_Alone             float64
Stage_fear                     int64
Social_event_attendance      float64
Going_outside                float64
Drained_after_socializing      int64
Friends_circle_size          float64
Post_frequency               float64
Personality                    int64
dtype: object
Sample of transformed data:
     id  Time_spent_Alone  Stage_fear  Social_event_attendance  Going_outside  \
0  0.0         -1.093038           0                 0.315761       0.026560   
1  1.0         -0.757464           0                 0.685144      -0.467692   
2  2.0          0.920403           1                -1.531153      -1.950449   
3  3.0         -0.086318           0                 0.685144      -0.467692   
4  4.0         -0.757464           0                -0.423005       0.026560   

   Drained_after_socializing  Friends_circle_size  Post_frequency  Personality  
0                     

In [76]:
# Split data into training and the rest (for validation and testing)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# Split the rest into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_val:", y_val.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (12586, 7)
Shape of X_val: (4195, 7)
Shape of X_test: (4196, 7)
Shape of y_train: (12586,)
Shape of y_val: (4195,)
Shape of y_test: (4196,)


In [49]:
cb_params = {
    "border_count": 39,
    "colsample_bylevel": 0.19459088572914465,
    "depth": 2,
    "iterations": 1467,
    "l2_leaf_reg": 31.236169478676036,
    "learning_rate": 0.06852669420904771,
    "min_child_samples": 160,
    "random_state": 42,
    "random_strength": 0.8517786189616939,
    "scale_pos_weight": 1.1691394390533685,
    "subsample": 0.3192330024411618,
    "verbose": False,
    "cat_features": categorical_cols
}

xgb_params = {
    "colsample_bylevel": 0.8168489864941239,
    "colsample_bynode": 0.8850485490950061,
    "colsample_bytree": 0.8379339940113913,
    "gamma": 2.3977359439809276,
    "learning_rate": 0.0616974880921061,
    "max_depth": 344,
    "max_leaves": 89,
    "min_child_weight": 10,
    "n_estimators": 696,
    "n_jobs": -1,
    "random_state": 42,
    "reg_alpha": 1.849084818346014,
    "reg_lambda": 29.680324563362227,
    "subsample": 0.5902901569391961,
    "verbosity": 0,
    "enable_categorical": True
}

hgb_params = {
    "l2_regularization": 28.13576008319012,
    "learning_rate": 0.1543598086529694,
    "max_depth": 325,
    "max_features": 0.323620656779567,
    "max_iter": 2490,
    "max_leaf_nodes": 216,
    "min_samples_leaf": 12,
    "random_state": 42,
    "categorical_features": "from_dtype"
}

lgbm_params = {
    "boosting_type": "gbdt",
    "colsample_bytree": 0.6467443250209886,
    "learning_rate": 0.06547186748153115,
    "min_child_samples": 34,
    "min_child_weight": 0.24399244943904663,
    "n_estimators": 498,
    "n_jobs": -1,
    "num_leaves": 158,
    "random_state": 42,
    "reg_alpha": 6.568921253574134,
    "reg_lambda": 62.66165355751099,
    "subsample": 0.0011019938618584968,
    "verbose": -1
}

lgbm_goss_params = {
    "boosting_type": "goss",
    "colsample_bytree": 0.8384834064170148,
    "learning_rate": 0.07006829797238343,
    "min_child_samples": 46,
    "min_child_weight": 0.7625394962666617,
    "n_estimators": 1887,
    "n_jobs": -1,
    "num_leaves": 341,
    "random_state": 42,
    "reg_alpha": 10.53082019937197,
    "reg_lambda": 67.44600065144685,
    "subsample": 0.4925008305336127,
    "verbose": -1
}

lgbm_dart_params = {
    "boosting_type": "dart",
    "colsample_bytree": 0.7592971191793424,
    "learning_rate": 0.046141766106846074,
    "min_child_samples": 18,
    "min_child_weight": 0.4740109054323218,
    "n_estimators": 4035,
    "n_jobs": -1,
    "num_leaves": 393,
    "random_state": 42,
    "reg_alpha": 48.016799341666605,
    "reg_lambda": 89.12860300833658,
    "subsample": 0.016333358901112538,
    "verbose": -1
}

In [77]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

In [78]:
from lightgbm import LGBMClassifier

lgbm_model = LGBMClassifier(**lgbm_params)
lgbm_model.fit(X_train, y_train)

In [79]:
lgbm_goss_model = LGBMClassifier(**lgbm_goss_params)
lgbm_goss_model.fit(X_train, y_train)

In [80]:
lgbm_dart_model = LGBMClassifier(**lgbm_dart_params)
lgbm_dart_model.fit(X_train, y_train)

In [81]:
from catboost import CatBoostClassifier

catboost_model = CatBoostClassifier(**cb_params)
catboost_model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7c24ef199990>

In [82]:
from sklearn.ensemble import HistGradientBoostingClassifier

hgb_model = HistGradientBoostingClassifier(**hgb_params)
hgb_model.fit(X_train, y_train)

In [83]:
from sklearn.ensemble import AdaBoostClassifier

ada_model = AdaBoostClassifier(n_estimators=300, learning_rate=0.8, random_state=42)
ada_model.fit(X_train, y_train)

In [84]:
from sklearn.linear_model import LogisticRegression

# Stack probabilities from each model on the validation set
probs_val_stack = np.array([
    xgb_model.predict_proba(X_val),
    lgbm_model.predict_proba(X_val),
    lgbm_goss_model.predict_proba(X_val),
    lgbm_dart_model.predict_proba(X_val),
    catboost_model.predict_proba(X_val),
    hgb_model.predict_proba(X_val),
    ada_model.predict_proba(X_val)
])

# Reshape for the meta-model
X_meta_train = np.concatenate([p for p in probs_val_stack], axis=1)

In [85]:
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_meta_train, y_val)
logreg_preds_val = logreg.predict(X_meta_train)
logreg_acc_val = accuracy_score(y_val, logreg_preds_val)

print("Stacked Logistic Regression Accuracy on Validation Set:", logreg_acc_val)

Stacked Logistic Regression Accuracy on Validation Set: 0.9623361144219309


In [60]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6175 entries, 0 to 6174
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         6175 non-null   int64  
 1   Time_spent_Alone           5750 non-null   float64
 2   Stage_fear                 5577 non-null   object 
 3   Social_event_attendance    5778 non-null   float64
 4   Going_outside              5709 non-null   float64
 5   Drained_after_socializing  5743 non-null   object 
 6   Friends_circle_size        5825 non-null   float64
 7   Post_frequency             5767 non-null   float64
dtypes: float64(5), int64(1), object(2)
memory usage: 386.1+ KB


In [63]:
# Step 1: Load test set
test = pd.read_csv("test.csv")
test_ids = test['id']

In [67]:
# Retrieve mode from *original*, unencoded data
original_combined = pd.read_csv(os.path.join(path, 'personality_dataset.csv'))
categorical_cols = ['Stage_fear', 'Drained_after_socializing']
numerical_cols = ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size', 'Post_frequency']

# Fix: use original string-based modes
string_modes = {}
for col in categorical_cols:
    string_modes[col] = original_combined[col].mode()[0]

# Impute numerical
for col in numerical_cols:
    test[col] = test[col].fillna(combined[col].mean())  # mean is fine

# Impute categorical using string modes
# Just in case, fix bad string values like "0"
for col in categorical_cols:
    test[col] = test[col].replace("0", string_modes[col])  # override bad "0" values

# Recheck values
for col in categorical_cols:
    print(f"{col} unique values (post-fix):", test[col].unique())

# Now encode safely
for col in categorical_cols:
    le = label_encoders[col]
    unseen = set(test[col].unique()) - set(le.classes_)
    if unseen:
        print(f"⚠️ Unseen labels in {col}:", unseen)
        raise ValueError(f"Unseen category in test[{col}]: {unseen}")
    test[col] = le.transform(test[col])


Stage_fear unique values (post-fix): ['No' 'Yes']
Drained_after_socializing unique values (post-fix): ['No' 'Yes']


In [70]:
# Step 4: Scale numerical columns
test[numerical_cols] = scaler.transform(test[numerical_cols])

# Step 5: Drop 'id' for prediction
X_test_final = test.drop(columns=['id'])

# Step 6: Generate model probabilities from each trained model
probs_test_stack = np.array([
    xgb_model.predict_proba(X_test_final),
    lgbm_model.predict_proba(X_test_final),
    lgbm_goss_model.predict_proba(X_test_final),
    lgbm_dart_model.predict_proba(X_test_final),
    catboost_model.predict_proba(X_test_final),
    hgb_model.predict_proba(X_test_final),
    ada_model.predict_proba(X_test_final)
])

# Step 7: Stack predictions like before
X_meta_test = np.concatenate([p for p in probs_test_stack], axis=1)

# Step 8: Final stacked prediction (use MLP or LogisticRegression)
final_preds = logreg.predict(X_meta_test)


# Step 9: Decode label to get Extrovert / Introvert
final_labels = le_personality.inverse_transform(final_preds)

# Step 10: Create submission dataframe
submission = pd.DataFrame({
    'id': test_ids,
    'Personality': final_labels
})

# Step 11: Save to CSV
submission.to_csv("submission.csv", index=False)

print("✅ Submission file created successfully!")

✅ Submission file created successfully!


In [86]:
# Step 1: Scale numerical columns in test set (using the scaler fitted on combined data)
test[numerical_cols] = scaler.transform(test[numerical_cols])

# Step 2: Drop 'id' column to prepare test features
X_test_final = test.drop(columns=['id'])

# Step 3: Generate probabilities from each base model on the test set
probs_test_stack = np.array([
    xgb_model.predict_proba(X_test_final),
    lgbm_model.predict_proba(X_test_final),
    lgbm_goss_model.predict_proba(X_test_final),
    lgbm_dart_model.predict_proba(X_test_final),
    catboost_model.predict_proba(X_test_final),
    hgb_model.predict_proba(X_test_final),
    ada_model.predict_proba(X_test_final)
])

# Step 4: Stack predictions horizontally for meta model input
X_meta_test = np.concatenate([p for p in probs_test_stack], axis=1)

# Step 5: Predict on test stacked features using the trained meta-model
final_preds = logreg.predict(X_meta_test)

# Step 6: Decode predictions
final_labels = le_personality.inverse_transform(final_preds)

# Step 7: Prepare submission dataframe
submission = pd.DataFrame({
    'id': test_ids,
    'Personality': final_labels
})

# Step 8: Save to CSV
submission.to_csv("submission.csv", index=False)
print("✅ Submission file created successfully!")

✅ Submission file created successfully!
