In [11]:
import datetime
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import dump, load
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

# sklearn imports for preprocessing and model building
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    PolynomialFeatures,
    StandardScaler,
)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.class_weight import compute_class_weight
from tabulate import tabulate

sys.path.insert(0, "../utils")
import utils


In [12]:
# Import csv to df
df = pd.read_csv("../data/renttherunway_cleaned_data.csv")

print(df.dtypes)

fit                object
user_id             int64
bust size          object
item_id             int64
weight            float64
rating              int64
rented for         object
review_text        object
body type          object
review_summary     object
category           object
height            float64
size                int64
age               float64
review_date        object
review_length       int64
band_size         float64
cup_size           object
dtype: object


In [13]:
# Results dataframe
results = pd.DataFrame(
    columns=[
        "model",
        "accuracy",
        "precision",
        "recall",
        "f1",
        "mse",
        "ber",
        "mae",
        "best_params",
        "best_estimator",
        "feature_importances",
        "confusion_matrix",
    ]
)


In [14]:
def save_transformed_data(data, data_file='../data/transformed_data.joblib', preprocessor_file='../data/preprocessor.joblib'):
    # Define categorical and numeric columns
    categorical_cols = ["fit", "bust size", "rented for", "body type", "category", "cup_size"]
    numeric_cols = ["weight", "height", "size", "age", "review_length", "band_size"]

    # Preprocessor for categorical and numeric data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', SimpleImputer(strategy='mean'), numeric_cols),
            ('cat', OneHotEncoder(), categorical_cols)
        ], sparse_threshold=0)

    # Convert 'rating' using LabelEncoder
    label_encoder = LabelEncoder()
    data['rating'] = label_encoder.fit_transform(data['rating'])

    # Splitting features and target
    X = data.drop("rating", axis=1)
    y = data["rating"]

    # Applying transformations
    X_transformed = preprocessor.fit_transform(X)

    # Saving the transformed data
    # dump((X_transformed, y), file_name)
    # print(f"Transformed data saved to {file_name}")
    dump((X_transformed, y), data_file)
    dump(preprocessor, preprocessor_file)
    print(f"Transformed data saved to {data_file}")
    print(f"Preprocessor saved to {preprocessor_file}")

    return X_transformed, y

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

def get_feature_names(column_transformer):
    """
    Get feature names from a ColumnTransformer.
    """
    output_features = []

    for transformer in column_transformer.transformers_:
        transformer_name, transformer_obj, transformer_cols = transformer

        if transformer_name == 'remainder':
            # This case handles the remainder columns (those not explicitly transformed)
            output_features.extend(transformer_cols)
        elif hasattr(transformer_obj, 'get_feature_names_out'):
            # This case handles transformers with a `get_feature_names_out` method
            transformer_features = transformer_obj.get_feature_names_out(transformer_cols)
            output_features.extend(transformer_features)
        else:
            # This case handles all other transformers
            output_features.extend(transformer_cols)

    return output_features


In [16]:
def logistic_regression_model(X_train, y_train, X_test, y_test, preprocessor, model_class, param_grid, full_grid_search=False):
    # Create a pipeline with model
    model_pipeline = Pipeline(steps=[
        ('scaler', StandardScaler(with_mean=False)),
        ('classifier', model_class())
    ])

    # Grid Search with Cross-Validation
    if full_grid_search:
        grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, scoring="accuracy", verbose=3)
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best cross-validation score: {grid_search.best_score_}")
    else:
        # Use param grid as kwargs for model
        best_model = model_pipeline
        best_model.set_params(**{'classifier__'+k: v for k, v in param_grid.items()})
        best_model.fit(X_train, y_train)

        

    # Evaluation
    y_pred = best_model.predict(X_test)
    mse = np.mean((y_test - y_pred) ** 2)
    mae = np.mean(np.abs(y_test - y_pred))
    f1 = f1_score(y_test, y_pred, average="weighted")
    precision = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    accuracy = accuracy_score(y_test, y_pred)
    ber = 1 - accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
    
    # Feature Importances Calculation
    try:
        feature_names = get_feature_names(preprocessor)

        if hasattr(best_model.named_steps['classifier'], 'coef_'):
            importances = best_model.named_steps['classifier'].coef_[0]
        elif hasattr(best_model.named_steps['classifier'], 'feature_importances_'):
            importances = best_model.named_steps['classifier'].feature_importances_
        else:
            importances = 'N/A'

        if importances != 'N/A':
            # Map feature names to their importances
            feature_importances = {feature: imp for feature, imp in zip(feature_names, importances)}
        else:
            feature_importances = 'N/A'
    except AttributeError:
        feature_importances = 'N/A'

    # Dynamically aggreagate label importances
    # Define a dictionary to store aggregated importances by category
    aggregated_importances = {}

    # Iterate through feature importances and aggregate importances by category
    for feature, importance in feature_importances.items():
        # Split the feature name by '_' to get category and label
        parts = feature.split('_')
        if len(parts) > 1:
            category = parts[0]  # Get the category part
            if category not in aggregated_importances:
                aggregated_importances[category] = 0
            # Aggregate the importance within the category
            aggregated_importances[category] += importance

    # Display the aggregated importances for each category
    feature_importances_str = ""
    for category, importance in aggregated_importances.items():
        feature_importances_str += f'{category}: {importance}\n'

    # # Convert feature importances to string
    # feature_importances_str = ""
    # if feature_importances != 'N/A':
    #     for key, value in feature_importances.items():
    #         feature_importances_str += f"{key}: {value}\n"

    run_results = {
        "model": model_class.__name__,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "mse": mse,
        "ber": ber,
        "mae": mae,
        "best_params": grid_search.best_params_ if full_grid_search else param_grid,
        "best_estimator": best_model,
        "feature_importances": feature_importances_str,
        "confusion_matrix": confusion_matrix(y_test, y_pred),
    }

    # Create a DataFrame from the run results
    run_results_df = pd.DataFrame([run_results])

    # Append to the external results DataFrame
    global results  # Ensure we're using the external 'results' DataFrame
    results = pd.concat([results, run_results_df], ignore_index=True)

    # Print results
    print(tabulate(run_results_df, headers="keys", tablefmt="psql"))

    return best_model


In [17]:
# Transform and save data
save_transformed_data(df)

# Load transformed data and preprocessor
X_transformed, y = load('../data/transformed_data.joblib')
preprocessor = load('../data/preprocessor.joblib')

X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.3, random_state=42)

Transformed data saved to ../data/transformed_data.joblib
Preprocessor saved to ../data/preprocessor.joblib


In [18]:
# Print snippet of data
print(X_train[:5])
print()
print(y_train[:5])
print()
print(preprocessor.transformers_)

[[135.  70.   8. ...   0.   0.   0.]
 [160.  66.  20. ...   0.   0.   0.]
 [180.  65.  35. ...   0.   0.   1.]
 [115.  67.   5. ...   0.   0.   0.]
 [125.  69.   8. ...   0.   0.   0.]]

37601     4
37009     4
165897    3
159616    4
110372    4
Name: rating, dtype: int64

[('num', SimpleImputer(), ['weight', 'height', 'size', 'age', 'review_length', 'band_size']), ('cat', OneHotEncoder(), ['fit', 'bust size', 'rented for', 'body type', 'category', 'cup_size']), ('remainder', 'drop', [1, 3, 6, 8, 13])]


In [19]:
lr_param_grid = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l1", "l2"],
    "solver": ["saga", "liblinear"],
    "class_weight": ["balanced"],
}

lr_best_params = {
    "C": 0.01,
    "class_weight": "balanced",
    "penalty": "l1",
    "solver": "liblinear",
}

best_lr_model = logistic_regression_model(
    X_train, y_train, X_test, y_test,
    preprocessor,
    LogisticRegression,
    full_grid_search=False,
    param_grid=lr_best_params
)


+----+--------------------+------------+-------------+----------+----------+-------+----------+----------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------+-----------------------------------+-----------------------------------+
|    | model              |   accuracy |   precision |   recall |       f1 |   mse |      ber |      mae | best_params                                                                     | best_estimator                                                           | feature_importances               | confusion_matrix                  |
|----+--------------------+------------+-------------+----------+----------+-------+----------+----------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------+-----------------------------------+-----------------------------

  if importances != 'N/A':


In [20]:
gbc_param_grid = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.1, 0.01, 0.001],
    "max_depth": [3, 5, 7],
    "random_state": [42],
}

gbc_best_params = {
    "n_estimators": 300,
    "learning_rate": 0.01,
    "max_depth": 7,
    "random_state": 42,
}

best_gbc_model = logistic_regression_model(
    X_train, y_train, X_test, y_test,
    preprocessor,
    GradientBoostingClassifier,
    full_grid_search=False,
    param_grid=gbc_best_params
)


In [None]:
rf_param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 5, 7],
    "random_state": [42],
}

rf_best_params = {
    "n_estimators": 300,
    "max_depth": 7,
    "random_state": 42,
}

best_rf_model = logistic_regression_model(
    X_train, y_train, X_test, y_test,
    preprocessor,
    RandomForestClassifier,
    full_grid_search=False,
    param_grid=rf_best_params
)


Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters: {'max_depth': 7, 'n_estimators': 300, 'random_state': 42}
Best cross-validation score: 0.6475954358398762
+------------------------+------------+-------------+----------+----------+----------+----------+----------+-----------------------------------------------------------+------------------------------------------------------------------------+--------------------------------------+-----------------------------------+
| model                  |   accuracy |   precision |   recall |       f1 |      mse |      ber |      mae | best_params                                               | best_estimator                                                         | feature_importances                  | confusion_matrix                  |
|------------------------+------------+-------------+----------+----------+----------+----------+----------+-----------------------------------------------------------+---------------

In [None]:
svc_param_grid = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf"],
    "gamma": ["scale", "auto"],
}

svc_best_params = {
    "C": 1,
    "kernel": "rbf",
    "gamma": "scale",
}

best_svc_model = logistic_regression_model(
    X_train, y_train, X_test, y_test,
    preprocessor,
    SVC,
    full_grid_search=False,
    param_grid=svc_best_params,
)


Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [None]:
knn_param_grid = {
    "n_neighbors": [3, 5, 7],
    "weights": ["uniform", "distance"],
    "algorithm": ["ball_tree", "kd_tree", "brute"],
}

knn_best_params = {
    "n_neighbors": 5,
    "weights": "uniform",
    "algorithm": "ball_tree",
}

best_knn_model = logistic_regression_model(
    X_train, y_train, X_test, y_test,
    preprocessor,
    KNeighborsClassifier,
    full_grid_search=False,
    param_grid=knn_best_params
)


In [None]:
dt_param_grid = {
    "max_depth": [3, 5, 7, None],
    "min_samples_split": [2, 4, 6],
    "min_samples_leaf": [1, 2, 3],
    "random_state": [42],
}

dt_best_params = {
    "max_depth": 3,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "random_state": 42,
}

best_dt_model = logistic_regression_model(
    X_train, y_train, X_test, y_test,
    preprocessor,
    DecisionTreeClassifier,
    full_grid_search=False,
    param_grid=dt_best_params
)


In [None]:
# Get time in format 20230101-010101
now = datetime.datetime.now()
time = now.strftime("%Y%m%d-%H%M%S")

# Save results df
results.to_csv(f"../results/sklearn_results_{time}.csv", index=False)
