In [111]:
import pandas as pd
import numpy as np

In [112]:
train = pd.read_csv('train.csv')
train.describe()
train.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert


In [113]:
print(train.isnull().sum())

id                              0
Time_spent_Alone             1190
Stage_fear                   1893
Social_event_attendance      1180
Going_outside                1466
Drained_after_socializing    1149
Friends_circle_size          1054
Post_frequency               1264
Personality                     0
dtype: int64


In [114]:
# train.fillna(train.mean(numeric_only=True), inplace=True)
train['Stage_fear'].fillna(train['Stage_fear'].mode()[0], inplace=True)
train['Drained_after_socializing'].fillna(train['Drained_after_socializing'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Stage_fear'].fillna(train['Stage_fear'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Drained_after_socializing'].fillna(train['Drained_after_socializing'].mode()[0], inplace=True)


In [115]:
print(train.isnull().sum())

id                              0
Time_spent_Alone             1190
Stage_fear                      0
Social_event_attendance      1180
Going_outside                1466
Drained_after_socializing       0
Friends_circle_size          1054
Post_frequency               1264
Personality                     0
dtype: int64


In [118]:
for col in ['Stage_fear', 'Drained_after_socializing']:
    train[col] = train[col].map({'Yes': 1, 'No': 0})
train.to_csv("trained.csv", index=False)

In [103]:
train.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,0,6.0,4.0,0,15.0,5.0,Extrovert
1,1,1.0,0,7.0,3.0,0,10.0,8.0,Extrovert
3,3,3.0,0,7.0,3.0,0,11.0,5.0,Extrovert
7,7,2.0,0,8.0,3.0,0,4.0,5.0,Extrovert
9,9,1.0,0,8.0,6.0,0,14.0,9.0,Extrovert


In [159]:
import logging
import pandas as pd
import numpy as np
import json
import zipfile
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [160]:
def baseline():
    logging.info("Reading train and test files")
    train = pd.read_csv("train.csv")
    test = pd.read_csv("test.csv")
    train, valid = train_test_split(train, test_size=1/3, random_state=123)

    preprocess = ColumnTransformer(
        transformers=[
            ('Stage_fear', 
             Pipeline(steps=[('encoder', OneHotEncoder(handle_unknown='ignore'))]), 
             ['Stage_fear'] 
            ),
            ('Drained_after_socializing', 
             Pipeline(steps=[('encoder', OneHotEncoder(handle_unknown='ignore'))]), 
             ['Drained_after_socializing'] 
            ),
            ('numerical',
            Pipeline(steps=[('scaler', StandardScaler())]),
            ['Social_event_attendance', 'Time_spent_Alone', 'Going_outside', 'Friends_circle_size', 'Post_frequency'])

        ],
        remainder='drop'
    )

    # Changed to DummyClassifier for classification
    dummy = make_pipeline(preprocess, DummyClassifier(strategy='most_frequent'))
    
    # Define hyperparameter search space
    param_dist = {
        'xgbclassifier__n_estimators': [100, 150, 200, 250, 300, 350, 500],  # Changed prefix
        'xgbclassifier__max_depth': [3, 4, 5, 6, 7],
        'xgbclassifier__learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
        'xgbclassifier__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
        'xgbclassifier__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
        'xgbclassifier__min_child_weight': [1, 3, 5],
        'xgbclassifier__gamma': [0, 0.1, 0.2],
        'xgbclassifier__reg_alpha': [0, 0.1, 1],
        'xgbclassifier__reg_lambda': [1, 1.5, 2]
    }
    
    # Create base pipeline
    base_xgb_model = make_pipeline(preprocess, xgb.XGBClassifier(
        objective='binary:logistic', random_state=123))
    
    # Random search for hyperparameter tuning
    logging.info("Starting hyperparameter tuning with RandomizedSearchCV")
    random_search = RandomizedSearchCV(
        base_xgb_model,
        param_dist,
        n_iter=50,  # Number of parameter combinations to try
        cv=5,
        scoring='accuracy',  # Changed to appropriate classification metric
        n_jobs=-1,
        random_state=123,
        verbose=1
    )
    
    label = 'Personality'
    X_train = train.drop([label], axis=1)
    le = LabelEncoder()
    y_train = le.fit_transform(train[label])
    
    # Fit the random search
    random_search.fit(X_train, y_train)
    
    # Get the best model
    best_xgb_model = random_search.best_estimator_
    
    logging.info(f"Best parameters: {random_search.best_params_}")
    logging.info(f"Best CV score: {random_search.best_score_:.4f}")
    
    # Evaluate models
    for model_name, model in [("dummy", dummy), ("xgboost_tuned", best_xgb_model)]:
        logging.info(f"Evaluating model {model_name}")
        
        if model_name == "dummy":
            model.fit(X_train, y_train)
        # best_xgb_model is already fitted from RandomizedSearchCV
        
        for split_name, split in [("train", train), ("valid", valid)]:
            # Get features for current split
            X_split = split.drop([label], axis=1)
            y_true = le.transform(split[label])
            
            # Make prediction on current split
            pred_binary = model.predict(X_split)
            
            # Calculate accuracy for current split
            accuracy = accuracy_score(y_true, pred_binary)  # FIXED: use correct variables
            logging.info(f"{model_name} {split_name} accuracy: {accuracy:.3f}")

    # Make final test predictions
    pred_test_binary = best_xgb_model.predict(test)  # Binary predictions
    pred_test_labels = le.inverse_transform(pred_test_binary)  # Convert back to text
    
    # FIXED: Assign predictions to test dataframe
    test[label] = pred_test_labels
    test[['id', 'Personality']].to_csv("predicted.csv", index=False)

    # Final validation metrics
    X_valid = valid.drop([label], axis=1)
    y_valid_true = le.transform(valid[label])
    pred_valid_binary = best_xgb_model.predict(X_valid)

    accuracy = accuracy_score(y_valid_true, pred_valid_binary)
    print(f"Final Accuracy: {accuracy:.4f}")
    
    # Additional classification metrics
    print("\nClassification Report:")
    print(classification_report(y_valid_true, pred_valid_binary, target_names=le.classes_))
    
    # Print best parameters for reference
    print("\nBest hyperparameters found:")
    for param, value in random_search.best_params_.items():
        print(f"{param}: {value}")

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    baseline()

INFO:root:Reading train and test files
INFO:root:Starting hyperparameter tuning with RandomizedSearchCV


Fitting 5 folds for each of 50 candidates, totalling 250 fits


INFO:root:Best parameters: {'xgbclassifier__subsample': 1.0, 'xgbclassifier__reg_lambda': 2, 'xgbclassifier__reg_alpha': 0, 'xgbclassifier__n_estimators': 250, 'xgbclassifier__min_child_weight': 5, 'xgbclassifier__max_depth': 4, 'xgbclassifier__learning_rate': 0.2, 'xgbclassifier__gamma': 0.2, 'xgbclassifier__colsample_bytree': 0.6}
INFO:root:Best CV score: 0.9700
INFO:root:Evaluating model dummy
INFO:root:dummy train accuracy: 0.742
INFO:root:dummy valid accuracy: 0.734
INFO:root:Evaluating model xgboost_tuned
INFO:root:xgboost_tuned train accuracy: 0.971
INFO:root:xgboost_tuned valid accuracy: 0.967


Final Accuracy: 0.9674

Classification Report:
              precision    recall  f1-score   support

   Extrovert       0.97      0.98      0.98      4535
   Introvert       0.95      0.93      0.94      1640

    accuracy                           0.97      6175
   macro avg       0.96      0.96      0.96      6175
weighted avg       0.97      0.97      0.97      6175


Best hyperparameters found:
xgbclassifier__subsample: 1.0
xgbclassifier__reg_lambda: 2
xgbclassifier__reg_alpha: 0
xgbclassifier__n_estimators: 250
xgbclassifier__min_child_weight: 5
xgbclassifier__max_depth: 4
xgbclassifier__learning_rate: 0.2
xgbclassifier__gamma: 0.2
xgbclassifier__colsample_bytree: 0.6
