In [43]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

# sklearn imports for preprocessing and model building
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.class_weight import compute_class_weight
from tabulate import tabulate

sys.path.insert(0, "../utils")
import utils


In [44]:
# Import csv to df
df = pd.read_csv("../data/renttherunway_cleaned_data.csv")


In [45]:
# Results dataframe
results = pd.DataFrame(
    columns=[
        "model",
        "accuracy",
        "precision",
        "recall",
        "f1",
        "mse",
        "ber",
        "mae",
        "best_params",
        "best_estimator",
        "feature_importances",
        "confusion_matrix",
    ]
)


In [46]:
def logistic_regression_model(
    data, model_class, full_grid_search=False, param_grid=None, **kwargs
):
    # Label Encoding for categorical variables
    label_encoders = {}
    categorical_cols = [
        "bust size",
        "rented for",
        "body type",
        "category",
        "cup_size",
        "fit",
    ]
    for col in categorical_cols:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col].astype(str))
        label_encoders[col] = le

    # Feature columns
    feature_cols = [
        "fit",
        "user_id",
        "bust size",
        "item_id",
        "weight",
        "rented for",
        "body type",
        "category",
        "height",
        "size",
        "age",
        "review_length",
        "band_size",
        "cup_size",
    ]

    # Impute missing values
    imputer = SimpleImputer(strategy="mean")
    data[feature_cols] = imputer.fit_transform(data[feature_cols])

    data = data.dropna(subset=["rating"])

    # Splitting the dataset
    X = data[feature_cols]

    def map_rating_to_label(rating):
        rating_mapping = {2.0: 0, 4.0: 1, 6.0: 2, 8.0: 3, 10.0: 4}
        return rating_mapping.get(rating, 5)  # 5 is the label for 'others'

    y = data["rating"].apply(map_rating_to_label)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Scaling features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Ignore sklearn warnings
    import warnings

    warnings.filterwarnings("ignore")

    # Grid Search with Cross-Validation
    if full_grid_search:
        grid_search = GridSearchCV(
            model_class(), param_grid, cv=5, scoring="accuracy", verbose=1
        )
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best cross-validation score: {grid_search.best_score_}")
    else:
        best_model = model_class(**kwargs)
        # best_model = LogisticRegression(C=0.01, class_weight='balanced', penalty='l1', solver='liblinear')
        best_model.fit(X_train, y_train)

    # Feature Importance Analysis
    # Get coefficients and calculate odds ratios
    try:
        coefficients = best_model.coef_[0]
        odds_ratios = np.exp(coefficients)
    except AttributeError:
        coefficients = best_model.feature_importances_
        odds_ratios = coefficients

    # Make feature importance string of format (feature, odds_ratio)
    feature_importances = ""
    for i in range(len(feature_cols)):
        feature_importances += (
            "(" + feature_cols[i] + ", " + str(odds_ratios[i]) + ")\n"
        )

    # Evaluate on test set
    test_score = best_model.score(X_test, y_test)
    y_pred = best_model.predict(X_test)

    # Save results
    mse = np.mean((y_test - y_pred) ** 2)
    ber = 1 - test_score
    mae = np.mean(np.abs(y_test - y_pred))
    f1 = f1_score(y_test, y_pred, average="weighted")
    precision = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    accuracy = accuracy_score(y_test, y_pred)
    if full_grid_search:
        best_params = grid_search.best_params_
        best_estimator = grid_search.best_estimator_
    else:
        best_params = kwargs
        best_estimator = None
    results.loc[len(results)] = [
        model_class.__name__,
        accuracy,
        precision,
        recall,
        f1,
        mse,
        ber,
        mae,
        best_params,
        best_estimator,
        feature_importances,
        confusion_matrix(y_test, y_pred),
    ]

    # Only print recently appended row of results
    print(tabulate(results.iloc[-1:], headers="keys", tablefmt="psql", showindex=False))

    return best_model


In [47]:
lr_param_grid = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l1", "l2"],
    "solver": ["saga", "liblinear"],
    "class_weight": ["balanced"],
}
# param_grid = {
#     'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
#     'penalty': ['l1', 'l2'],  # Penalty types
#     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],  # Solvers
#     'class_weight': [
#         {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1},  # Equal weights
#         {0: 2, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1},  # Higher weight for class 0
#         {0: 1, 1: 2, 2: 1, 3: 1, 4: 1, 5: 1},  # Higher weight for class 1
#         {0: 1, 1: 1, 2: 2, 3: 1, 4: 1, 5: 1},  # Higher weight for class 2
#         {0: 1, 1: 1, 2: 1, 3: 2, 4: 1, 5: 1},  # Higher weight for class 3
#         {0: 1, 1: 1, 2: 1, 3: 1, 4: 2, 5: 1},  # Higher weight for class 4
#         {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 2},  # Higher weight for class 5
#         {0: 3, 1: 2, 2: 1, 3: 1, 4: 1, 5: 1},  # Custom weights as an example
#         'balanced'  # Automatically compute weights based on class frequencies
#     ]
# }

best_lr_model = logistic_regression_model(
    df,
    LogisticRegression,
    full_grid_search=True,
    param_grid=lr_param_grid,
    C=0.01,
    class_weight="balanced",
    penalty="l1",
    solver="liblinear",
)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters: {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'}
Best cross-validation score: 0.5863364199374198
+--------------------+------------+-------------+----------+----------+---------+----------+----------+---------------------------------------------------------------------------------+-------------------------------------------------------------------+-------------------------------------+-----------------------------------+
| model              |   accuracy |   precision |   recall |       f1 |     mse |      ber |      mae | best_params                                                                     | best_estimator                                                    | feature_importances                 | confusion_matrix                  |
|--------------------+------------+-------------+----------+----------+---------+----------+----------+----------------------------------

In [48]:
gbc_param_grid = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.1, 0.01, 0.001],
    "max_depth": [3, 5, 7],
    "random_state": [42],
}

best_gbc_model = logistic_regression_model(
    df,
    GradientBoostingClassifier,
    full_grid_search=True,
    param_grid=gbc_param_grid,
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)


Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [None]:
rf_param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 5, 7],
    "random_state": [42],
}

best_rf_model = logistic_regression_model(
    df,
    RandomForestClassifier,
    full_grid_search=True,
    param_grid=rf_param_grid,
    n_estimators=100,
    max_depth=3,
    random_state=42,
)


+------------------------+------------+-------------+----------+----------+----------+----------+----------+-----------------------------------------------------------+------------------+--------------------------------------+-----------------------------------+
| model                  |   accuracy |   precision |   recall |       f1 |      mse |      ber |      mae | best_params                                               | best_estimator   | feature_importances                  | confusion_matrix                  |
|------------------------+------------+-------------+----------+----------+----------+----------+----------+-----------------------------------------------------------+------------------+--------------------------------------+-----------------------------------|
| RandomForestClassifier |   0.646443 |    0.417889 | 0.646443 | 0.507627 | 0.718267 | 0.353557 | 0.454251 | {'n_estimators': 100, 'max_depth': 3, 'random_state': 42} |                  | (fit, 0.635002172844580

In [None]:
svc_param_grid = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf"],
    "gamma": ["scale", "auto"],
}

best_svc_model = logistic_regression_model(
    df,
    SVC,
    full_grid_search=True,
    param_grid=svc_param_grid,
    C=1,
    kernel="rbf",
    gamma="scale",
)


In [None]:
knn_param_grid = {
    "n_neighbors": [3, 5, 7],
    "weights": ["uniform", "distance"],
    "algorithm": ["ball_tree", "kd_tree", "brute"],
}

best_knn_model = logistic_regression_model(
    df,
    KNeighborsClassifier,
    full_grid_search=True,
    param_grid=knn_param_grid,
    n_neighbors=5,
    weights="uniform",
    algorithm="ball_tree",
)


In [None]:
dt_param_grid = {
    "max_depth": [3, 5, 7, None],
    "min_samples_split": [2, 4, 6],
    "min_samples_leaf": [1, 2, 3],
    "random_state": [42],
}

best_dt_model = logistic_regression_model(
    df,
    DecisionTreeClassifier,
    full_grid_search=True,
    param_grid=dt_param_grid,
    max_depth=3,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)


In [None]:
# Save results df
results.to_csv("../results/sklearn_results.csv", index=False)
