In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.api as sm
import pyarrow.parquet as pq
from functools import reduce

In [None]:
from sklearn.decomposition import PCA
def perform_pca(df, n_components=2):
    """Add one PCA-derived column per attribute group to a copy of ``df``.

    For each of the five attribute groups, the three columns
    ``0_<attr>``, ``<attr>_avg_3_months`` and ``<attr>_avg_6_months`` are
    projected with a freshly fitted PCA and the first principal component is
    stored in a single ``pca_*`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the fifteen source columns listed above.
    n_components : int, default 2
        Number of components the PCA is fitted with. Only the first component
        is kept, because each group is written to exactly one output column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``pca_sales``, ``pca_views``, ``pca_clicks``,
        ``pca_pdp`` and ``pca_cart`` added. ``df`` itself is not modified.
    """
    # Output column -> attribute whose three columns feed the projection.
    attribute_by_output = {
        "pca_sales": "sales",
        "pca_views": "PLPViewsPerDay",
        "pca_clicks": "PLPClicksPerDay",
        "pca_pdp": "PDPCountPerDay",
        "pca_cart": "TotalAddToCartPerDay",
    }
    modified_df = df.copy()
    for output_column, attribute in attribute_by_output.items():
        source_columns = [f"0_{attribute}", f"{attribute}_avg_3_months", f"{attribute}_avg_6_months"]
        projected = PCA(n_components=n_components).fit_transform(df[source_columns])
        # fit_transform returns shape (n_rows, n_components); assigning more
        # than one component to a single column raises, so keep the first.
        modified_df[output_column] = projected[:, 0]
    return modified_df

In [None]:
def load_train_test_actual_data(of, columns, base_dir="~/bhavesh-couture/Downloads/local"):
    """Load the train, test and actual-value tables for one category.

    Parameters
    ----------
    of : str
        Sub-directory name under ``base_dir`` (the notebook passes
        ``"menShirts"`` and ``"womenKurtas"``).
    columns : sequence of str
        Feature columns to read. ``productid`` is always read as well, and
        ``yQuantity`` is additionally read for the train split.
    base_dir : str, default "~/bhavesh-couture/Downloads/local"
        Root directory holding ``<of>/data/train``, ``<of>/data/test`` and
        ``actualData``. The default preserves the previously hard-coded path.

    Returns
    -------
    tuple
        ``(train_df, test_df, actual)``, where ``actual`` has its ``actual``
        column renamed to ``yQuantity``.
    """
    root = str(base_dir).rstrip("/")
    id_and_features = ["productid"] + list(columns)
    train_df = pq.ParquetDataset(f"{root}/{of}/data/train").read(columns=id_and_features + ["yQuantity"]).to_pandas()
    test_df = pq.ParquetDataset(f"{root}/{of}/data/test").read(columns=id_and_features).to_pandas()
    # Rename so the ground truth shares the target column name used in train.
    actual = pd.read_parquet(f"{root}/actualData").rename(columns={"actual": "yQuantity"})
    return train_df, test_df, actual

def train_model_and_predict(x_train, y_train, x_test):
    """Fit an OLS regression with an intercept and predict for ``x_test``.

    Prints the fitted model's summary as a side effect.

    Parameters
    ----------
    x_train, x_test :
        Feature matrices with the same columns.
    y_train :
        Target values for ``x_train``.

    Returns
    -------
    Predictions of the fitted model for ``x_test``.
    """
    # has_constant="add" forces the intercept column on both matrices.
    # The default ("skip") silently omits it when the data already contains a
    # constant column (e.g. a one-row test set), which leaves the test design
    # matrix with a different width than the one the model was fitted on.
    train_design = sm.add_constant(x_train, has_constant="add")
    model = sm.OLS(y_train, train_design).fit()
    print(model.summary())
    test_design = sm.add_constant(x_test, has_constant="add")
    return model.predict(test_design)

def normalize(df, return_min_max=False):
    """Min-max scale ``df`` using its own minimum and maximum.

    Works on a DataFrame (scaled column by column) or a Series (scaled as a
    whole). A constant column/series has a zero range; it is scaled to 0
    instead of dividing by zero and producing NaN/inf.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Values to scale.
    return_min_max : bool, default False
        When true, also return the minimum and maximum that were used so the
        scaling can be undone with ``inverse_normalize``.

    Returns
    -------
    The scaled data, or ``(scaled, min_value, max_value)`` when
    ``return_min_max`` is true.
    """
    min_value = df.min()
    max_value = df.max()
    value_range = max_value - min_value
    if isinstance(value_range, pd.Series):
        # Per-column ranges: swap exact zeros for 1 so constant columns map to 0.
        safe_range = value_range.replace(0, 1)
    else:
        safe_range = 1 if value_range == 0 else value_range
    normalized_df = (df - min_value) / safe_range
    if return_min_max:
        # The unmodified extrema are returned; inverse_normalize still maps a
        # constant input back to its original value (0 * 0 + min).
        return normalized_df, min_value, max_value
    return normalized_df

def inverse_normalize(df, min_value, max_value):
    """Undo min-max scaling: map values from the scaled range back to [min_value, max_value]."""
    span = max_value - min_value
    rescaled = df * span
    return rescaled + min_value

In [None]:
from sklearn.metrics import mean_squared_error as mse, mean_absolute_percentage_error as mape, r2_score
def metrics(actual, predicted):
    """Print RMSE, MAPE and R^2 of ``predicted`` against ``actual``, one per line."""
    scorers = (
        # RMSE is the square root of the mean squared error.
        ("rmse", lambda truth, estimate: mse(truth, estimate) ** (0.5)),
        ("mape", mape),
        ("r2_score", r2_score),
    )
    for label, scorer in scorers:
        print(f"{label}: {scorer(actual, predicted)}")

### Experiments

In [None]:
def perform_pca_experiment(of):
    """Train an OLS model on PCA-compressed features and predict the test set.

    For each attribute, the three columns ``0_<attr>``, ``<attr>_avg_3_months``
    and ``<attr>_avg_6_months`` are reduced to one principal component. The
    PCA and the min-max scaling are fitted on the training data only and then
    applied unchanged to the test data, so both sets live in the same feature
    space the model was trained in.

    Side effects: prints the feature description, the target min/max and the
    model summary, and writes ``predictions_pca.csv`` under the category's
    directory.

    Parameters
    ----------
    of : str
        Category directory name passed to ``load_train_test_actual_data``.

    Returns
    -------
    tuple
        ``(test, actual)`` where ``test`` carries the ``pca_*`` columns and
        ``predictedyQuantity``.
    """
    attributes = ["sales", "PLPViewsPerDay", "PLPClicksPerDay", "PDPCountPerDay", "TotalAddToCartPerDay"]
    pca_columns = ["pca_sales", "pca_views", "pca_clicks", "pca_pdp", "pca_cart"]
    groups = [
        [f"0_{attribute}", f"{attribute}_avg_3_months", f"{attribute}_avg_6_months"]
        for attribute in attributes
    ]
    columns = [column for group in groups for column in group]
    described = ", ".join(f"pca({first}, {second}, {third})" for first, second, third in groups)
    print(f"feature: {described}")
    train, test, actual = load_train_test_actual_data(of, columns)

    for pca_column, group in zip(pca_columns, groups):
        # Fit the projection on train only and reuse it for test. Fitting a
        # second PCA on test would give differently centred/oriented
        # components that the trained coefficients do not apply to.
        pca = PCA(n_components=1)
        train[pca_column] = pca.fit_transform(train[group])[:, 0]
        test[pca_column] = pca.transform(test[group])[:, 0]

    x_train, y_train = train[pca_columns].copy(), train["yQuantity"].copy()
    x_test = test[pca_columns].copy()

    # Scale test features with the training min/max, not their own.
    x_train, x_min, x_max = normalize(x_train, return_min_max=True)
    x_range = (x_max - x_min).replace(0, 1)  # avoid dividing by zero for constant columns
    x_test = (x_test - x_min) / x_range
    y_train, y_min, y_max = normalize(y_train, return_min_max=True)
    print(y_min, y_max)

    y_pred = train_model_and_predict(x_train, y_train, x_test)
    # Predictions are on the scaled target; map them back to original units.
    test["predictedyQuantity"] = inverse_normalize(y_pred, y_min, y_max)
    test[["productid", "predictedyQuantity"]].to_csv(f"~/bhavesh-couture/Downloads/local/{of}/predictions_pca.csv", index=False)
    return test, actual


In [None]:
def perform_n_months_experiment(of, n, attributes):
    """Train an OLS model on ``<i>_<attribute>`` columns for ``i`` in ``0..n-1``.

    Features are min-max scaled with statistics computed on the training data
    only; the same scaling is applied to the test data so that both sets are
    expressed in the units the model was fitted on.

    Side effects: prints the feature list, the target min/max and the model
    summary, and writes ``predictions_<n>_months.csv`` under the category's
    directory.

    Parameters
    ----------
    of : str
        Category directory name passed to ``load_train_test_actual_data``.
    n : int
        Number of index-prefixed columns to use per attribute.
    attributes : sequence of str
        Attribute names; each contributes columns ``0_<attr>`` .. ``<n-1>_<attr>``.

    Returns
    -------
    tuple
        ``(test, actual)`` where ``test`` carries ``predictedyQuantity``.
    """
    # Attribute-major order: all indices of the first attribute, then the next.
    columns = [f"{i}_{attribute}" for attribute in attributes for i in range(n)]
    print(f'features: {", ".join(columns)}')

    train, test, actual = load_train_test_actual_data(of, columns)
    x_train, y_train = train[columns].copy(), train["yQuantity"].copy()
    x_test = test[columns].copy()

    # Scale test features with the training min/max, not their own; otherwise
    # the fitted coefficients are applied to differently scaled inputs.
    x_train, x_min, x_max = normalize(x_train, return_min_max=True)
    x_range = (x_max - x_min).replace(0, 1)  # avoid dividing by zero for constant columns
    x_test = (x_test - x_min) / x_range
    y_train, y_min, y_max = normalize(y_train, return_min_max=True)
    print(y_min, y_max)

    y_pred = train_model_and_predict(x_train, y_train, x_test)
    # Predictions are on the scaled target; map them back to original units.
    test["predictedyQuantity"] = inverse_normalize(y_pred, y_min, y_max)
    test[["productid", "predictedyQuantity"]].to_csv(f"~/bhavesh-couture/Downloads/local/{of}/predictions_{n}_months.csv", index=False)
    return test, actual

### Using PCA MenShirts

In [None]:
# Run the PCA experiment for the menShirts category. `predictions` and
# `actual` are notebook globals consumed by the evaluation cells that follow.
predictions, actual = perform_pca_experiment("menShirts")
# predictions.head(10)

In [None]:
# Join predictions with the ground truth on productid (inner join, so only
# products present in both tables are scored) and report the error metrics.
print(predictions.shape)
metrics_df = pd.merge(predictions[["productid", "predictedyQuantity"]], actual, on="productid")
print(metrics_df.shape)
metrics(actual=metrics_df["yQuantity"], predicted=metrics_df["predictedyQuantity"])

In [None]:
# Show the ten rows with the highest actual yQuantity next to their predictions.
metrics_df.sort_values(by=["yQuantity"], ascending=False).head(10)

### Using PCA WomenKurtas

In [None]:
# Run the PCA experiment for the womenKurtas category; this overwrites the
# `predictions` and `actual` globals set by the menShirts run.
predictions, actual = perform_pca_experiment("womenKurtas")

In [None]:
# Join predictions with the ground truth on productid (inner join, so only
# products present in both tables are scored) and report the error metrics.
print(predictions.shape)
metrics_df = pd.merge(predictions[["productid", "predictedyQuantity"]], actual, on="productid")
print(metrics_df.shape)
metrics(actual=metrics_df["yQuantity"], predicted=metrics_df["predictedyQuantity"])

In [None]:
# Show the ten rows with the highest actual yQuantity next to their predictions.
metrics_df.sort_values(by=["yQuantity"], ascending=False).head(10)

### N Months features (MenShirts and WomenKurtas)

In [None]:
def run_experiment(of, experiment_function, **kwargs):
    """Run one experiment for a category and print its evaluation.

    ``experiment_function(of, **kwargs)`` must return ``(predictions, actual)``.
    The predictions are inner-joined with ``actual`` on ``productid``; the
    shapes before and after the join, the error metrics, and the ten rows with
    the highest ``yQuantity`` are printed. Nothing is returned.
    """
    predicted_df, actual_df = experiment_function(of, **kwargs)
    print(predicted_df.shape)

    prediction_columns = predicted_df[["productid", "predictedyQuantity"]]
    joined = pd.merge(prediction_columns, actual_df, on="productid")
    print(joined.shape)

    metrics(joined["yQuantity"], joined["predictedyQuantity"])

    top_by_actual = joined.sort_values(by=["yQuantity"], ascending=False).head(10)
    print(top_by_actual)

In [None]:
def do_task(n, attributes):
    """Run the n-months experiment for menShirts and then womenKurtas with the same settings."""
    for category in ("menShirts", "womenKurtas"):
        run_experiment(category, perform_n_months_experiment, n=n, attributes=attributes)

In [None]:
# Both categories, using columns 0_*, 1_*, 2_* of sales and wishlist
# (the index prefix is presumably a month offset; confirm against the data).
do_task(
    n = 3,
    attributes = ["sales", "wishlist"]
)

In [None]:
# Both categories, using columns 0_* .. 3_* of sales, wishlist and
# availableQuantity (the index prefix is presumably a month offset; confirm against the data).
do_task(
    n = 4,
    attributes = ["sales", "wishlist", "availableQuantity"]
)