In [1]:
import pandas as pd 

In [2]:
raw_data = pd.read_csv("../data/raw/data.csv")
raw_data.head()

Unnamed: 0,Age,Gender,Education,Introversion Score,Sensing Score,Thinking Score,Judging Score,Interest,Personality
0,19.0,Male,0,9.4708,7.141434,6.03696,4.360278,Unknown,ENFP
1,27.0,Female,0,5.85392,6.160195,0.80552,4.221421,Sports,ESFP
2,21.0,Female,0,7.08615,3.388433,2.66188,5.12732,Unknown,ENFP
3,28.0,Male,0,2.01892,4.823624,7.30625,5.98655,Others,INTP
4,36.0,Female,1,9.91703,4.75508,5.31469,4.677213,Technology,ENFP


In [3]:
features = raw_data.drop(columns=["Personality"])
target = raw_data["Personality"]

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(102448, 8) (25613, 8) (102448,) (25613,)


In [5]:
import yaml
with open("../config/data_management.yaml", "r") as file:
    config = yaml.safe_load(file)
config

{'data_source': 'data/raw/data.csv',
 'remote_storage': 'data/preprocessed/',
 'data_description': {'features': ['Age',
   'Gender',
   'Education',
   'Introversion Score',
   'Sensing Score',
   'Thinking Score',
   'Judging Score',
   'Interest'],
  'targets': ['Personality']},
 'data_preprocessing': {'features_transformations': {'Age': {'imputer': 'mean',
    'transformer': 'standard'},
   'Gender': {'imputer': 'most_frequent', 'transformer': 'ordinal'},
   'Education': {'imputer': 'most_frequent', 'transformer': 'None'},
   'Introversion Score': {'imputer': 'most_frequent', 'transformer': 'MinMax'},
   'Sensing Score': {'imputer': 'mean', 'transformer': 'standard'},
   'Thinking Score': {'imputer': 'most_frequent', 'transformer': 'MinMax'},
   'Judging Score': {'imputer': 'mean', 'transformer': 'standard'},
   'Interest': {'imputer': 'most_frequent', 'transformer': 'oh_encoder'}},
  'targets_transformations': {'Personality': {'imputer': 'most_frequent',
    'transformer': 'ordinal

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

preprocessing_settings = config["data_preprocessing"]["features_transformations"]

preprocessing_pipeline = []

for column, settings in preprocessing_settings.items():
    selected_imputer = settings["imputer"]
    selected_transformer = settings["transformer"]

    column_steps = []

    if selected_imputer in ["mean", "median", "most_frequent"]:
        column_steps.append(("imputer", SimpleImputer(strategy=selected_imputer)))

    if selected_transformer == "standard":
        column_steps.append(("transformer", StandardScaler()))
    elif selected_transformer == "MinMax":
        column_steps.append(("transformer", MinMaxScaler()))
    elif selected_transformer == "oh_encoder":
        column_steps.append(("transformer", OneHotEncoder(sparse_output=False)))
    elif selected_transformer == "ordinal":
        column_steps.append(("transformer", OrdinalEncoder()))

    preprocessing_pipeline.append((column, Pipeline(column_steps), [column]))
    
dataset_preprocessor = ColumnTransformer(preprocessing_pipeline, verbose_feature_names_out=False).set_output(transform="pandas")
dataset_preprocessor

In [7]:
processed_train = dataset_preprocessor.fit_transform(pd.concat([X_train, y_train], axis=1))
new_targets = dataset_preprocessor.named_transformers_["Personality"].get_feature_names_out()
X_preprocessed_train = processed_train.drop(columns=new_targets)
y_preprocessed_train = processed_train[new_targets]
new_targets

KeyError: 'Personality'

In [8]:
processed_test = dataset_preprocessor.transform(pd.concat([X_test, y_test], axis=1))
X_preprocessed_test = processed_test.drop(columns=new_targets)
y_preprocessed_test = processed_test[new_targets]

In [10]:
# Save the preprocessed data
X_preprocessed_train.to_csv("../data/preprocessed/X_train.csv", index=False)
y_preprocessed_train.to_csv("../data/preprocessed/y_train.csv", index=False)
X_preprocessed_test.to_csv("../data/preprocessed/X_test.csv", index=False)
y_preprocessed_test.to_csv("../data/preprocessed/y_test.csv", index=False)

In [54]:
# Model training for multiclass classification
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
import pickle

models = {
    "HistGradientBoosting": {
        "model": HistGradientBoostingClassifier(),
        "params": {
            "learning_rate": [0.1, 0.01],
            "max_iter": [100, 200],
            "max_leaf_nodes": [15, 31],
            "max_depth": [None, 10]
        }
    },
    "RidgeClassifier": {
        "model": RidgeClassifier(),
        "params": {
            "alpha": [0.1, 1, 10]
        }
    },
    "SVC": {
        "model": SVC(),
        "params": {
            "C": [0.1, 1, 10],
            "kernel": ["linear", "rbf"]
        }
    }
}

for model_name, model_params in models.items():
    grid_search = GridSearchCV(model_params["model"], model_params["params"], cv=5, n_jobs=-1, verbose=2)
    grid_search.fit(X_preprocessed_train, y_preprocessed_train)
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")

    y_pred = grid_search.predict(X_preprocessed_test)
    print(f"Classification report for {model_name}:\n{classification_report(y_preprocessed_test, y_pred)}")

    with open(f"../models/{model_name}.pkl", "wb") as file:
        pickle.dump(grid_search, file)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


  y = column_or_1d(y, warn=True)


Best parameters for HistGradientBoosting: {'learning_rate': 0.01, 'max_depth': 10, 'max_iter': 200, 'max_leaf_nodes': 15}
Classification report for HistGradientBoosting:
              precision    recall  f1-score   support

         0.0       0.93      0.93      0.93       799
         1.0       0.93      0.94      0.93      6963
         2.0       0.92      0.92      0.92       531
         3.0       0.93      0.93      0.93      4989
         4.0       0.79      0.80      0.80       101
         5.0       0.85      0.84      0.85       981
         6.0       0.83      0.76      0.79        78
         7.0       0.85      0.80      0.83       633
         8.0       0.90      0.93      0.91       611
         9.0       0.90      0.91      0.91      4895
        10.0       0.91      0.87      0.89       362
        11.0       0.90      0.90      0.90      3415
        12.0       0.84      0.71      0.77        75
        13.0       0.84      0.76      0.80       670
        14.0       

  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best parameters for RidgeClassifier: {'alpha': 1}
Classification report for RidgeClassifier:
              precision    recall  f1-score   support

         0.0       0.62      0.01      0.01       799
         1.0       0.72      0.93      0.81      6963
         2.0       0.00      0.00      0.00       531
         3.0       0.70      0.92      0.79      4989
         4.0       0.00      0.00      0.00       101
         5.0       0.60      0.01      0.02       981
         6.0       0.00      0.00      0.00        78
         7.0       0.00      0.00      0.00       633
         8.0       0.00      0.00      0.00       611
         9.0       0.67      0.89      0.77      4895
        10.0       0.00      0.00      0.00       362
        11.0       0.72      0.73      0.72      3415
        12.0       0.00      0.00      0.00        75
        13.0       0.00      0.00      0.00       670
        14.0       0.00      0.00      0.00        48
        15.0       0.00      0.00      0.0

KeyboardInterrupt: 