In [48]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
from scipy.sparse import csr_matrix

In [49]:
# Read the cleaned dataset from its CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../data/renttherunway_cleaned_data.csv")

# Print each column's dtype; `object` columns are the ones that need encoding
# before they can be fed to a model.
print(df.dtypes)

fit                 object
user_id              int64
bust size           object
item_id              int64
weight             float64
rating               int64
rented for          object
review_text         object
body type           object
review_summary      object
category            object
height             float64
size                 int64
age                float64
review_date         object
review_length        int64
band_size          float64
cup_size            object
item_rent_count      int64
dtype: object


In [50]:

def train_and_predict_xgboost(df, test_size=0.2, random_state=42):
    """Fit a default XGBoost regressor on ``rating`` and score it on a hold-out split.

    Identifier, date and free-text columns are dropped, the categorical
    columns are one-hot encoded, and every remaining column except ``rating``
    is used as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Must contain ``rating`` together with the columns that are
        dropped or one-hot encoded below. The caller's frame is not modified:
        ``drop`` and ``get_dummies`` return new frames.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split`` as the size of the test set.
    random_state : int, default 42
        Seed used for both the train/test split and the regressor.

    Returns
    -------
    tuple
        ``(xgb_model, mse)`` - the fitted ``XGBRegressor`` and its mean squared
        error on the held-out test rows.

    Raises
    ------
    KeyError
        If any of the columns referenced below is missing from ``df``.
    """
    # Identifiers, the review date and raw review text are not used as features.
    df = df.drop(columns=['user_id', 'item_id', 'review_date', 'review_summary', 'review_text'])

    # One-hot encode the string-valued columns so every feature is numeric/boolean.
    categorical_features = ['fit', 'bust size', 'rented for', 'body type', 'category', 'cup_size']
    df = pd.get_dummies(df, columns=categorical_features)

    # 'rating' is the regression target; everything else is a feature.
    X = df.drop(columns='rating')
    y = df['rating']

    # Hold out part of the rows for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Default-hyperparameter regressor with a squared-error objective.
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=random_state)
    xgb_model.fit(X_train, y_train)

    # Evaluate on the rows the model has not seen.
    y_pred = xgb_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    return xgb_model, mse

# Train the baseline model on the DataFrame loaded above and report its
# mean squared error on the held-out test split.
(trained_model, mse_score) = train_and_predict_xgboost(df)
print("Mean Squared Error:", mse_score)


Mean Squared Error: 1.829081701297055


In [51]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

def train_and_predict_xgboost(df, test_size=0.2, random_state=42, param_grid=None, cv=3):
    """Grid-search an XGBoost regressor on ``rating`` and score the best model.

    Identifier, date and free-text columns are dropped, the categorical
    columns are one-hot encoded, and every remaining column except ``rating``
    is used as a feature. Hyperparameters are selected with ``GridSearchCV``
    (scored by negative mean squared error) on the training split only; the
    winning model is then evaluated on the held-out test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Must contain ``rating`` together with the columns that are
        dropped or one-hot encoded below. The caller's frame is not modified:
        ``drop`` and ``get_dummies`` return new frames.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split`` as the size of the test set.
    random_state : int, default 42
        Seed used for both the train/test split and the regressor.
    param_grid : dict or None, default None
        Grid passed to ``GridSearchCV``. ``None`` selects the built-in grid
        over ``n_estimators`` in {100, 500, 1000} and ``max_depth`` in {3, 5, 7}.
    cv : int, default 3
        Forwarded to ``GridSearchCV`` as its ``cv`` argument.

    Returns
    -------
    tuple
        ``(best_xgb_model, mse, r2)`` - the best estimator found by the search
        (already fitted on the full training split), its mean squared error
        and its R-squared on the held-out test rows.

    Raises
    ------
    KeyError
        If any of the columns referenced below is missing from ``df``.
    """
    # Identifiers, the review date and raw review text are not used as features.
    df = df.drop(columns=['user_id', 'item_id', 'review_date', 'review_summary', 'review_text'])

    # One-hot encode the string-valued columns so every feature is numeric/boolean.
    categorical_features = ['fit', 'bust size', 'rented for', 'body type', 'category', 'cup_size']
    df = pd.get_dummies(df, columns=categorical_features)

    # 'rating' is the regression target; everything else is a feature.
    X = df.drop(columns='rating')
    y = df['rating']

    # Hold out part of the rows for the final evaluation; the grid search
    # below only ever sees the training part.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Base estimator. n_estimators and max_depth here are only starting
    # values: the default grid below overrides both for every candidate.
    xgb_model = XGBRegressor(
        objective='reg:squarederror',
        random_state=random_state,
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,  # row subsampling per tree
        colsample_bytree=0.8,  # feature subsampling per tree
        reg_alpha=0.01,  # L1 regularization
        reg_lambda=0.1,  # L2 regularization
    )

    if param_grid is None:
        param_grid = {
            'n_estimators': [100, 500, 1000],
            'max_depth': [3, 5, 7],
        }

    # refit=True (the scikit-learn default, spelled out here) retrains the
    # best parameter combination on all of X_train once the search finishes,
    # so best_estimator_ is ready to use and needs no further fit() call.
    grid_search = GridSearchCV(
        estimator=xgb_model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=cv,
        refit=True,
    )
    grid_search.fit(X_train, y_train)
    best_xgb_model = grid_search.best_estimator_

    # Evaluate on the rows the model has not seen.
    y_pred = best_xgb_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return best_xgb_model, mse, r2

# Train the grid-searched model on the DataFrame loaded above and report its
# hold-out metrics.
# The R-squared value is bound to `r2_value`, not `r2_score`: reusing the name
# `r2_score` would replace the imported sklearn.metrics function with a float,
# and any later call to it would fail with "object is not callable".
trained_model, mse_score, r2_value = train_and_predict_xgboost(df)
print("Mean Squared Error:", mse_score)
print("R-squared:", r2_value)


Mean Squared Error: 1.8179605314858187
R-squared: 0.1040232495518475
