In [None]:
import warnings
warnings.filterwarnings('ignore')

# data imports
import pandas as pd
import numpy as np
from plotnine import *

# modeling imports
from sklearn.linear_model import LinearRegression # linear regression
from sklearn.model_selection import train_test_split # train-test split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder # z-score variables, polynomial features, one-hot encoding
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score # model evaluation metrics

# pipeline imports
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
%matplotlib inline

# Homework 1

## Data

In [None]:
# Load the boutique data set from the course repository.
# NOTE: despite its name, `url` holds the loaded DataFrame (not a URL string);
# the name is kept because the plotting cells further down reference it.
url = pd.read_csv("https://raw.githubusercontent.com/ywen2021/CPSC392/main/Data/boutique.csv")

# drop rows with any missing value and renumber the index from 0;
# a single chained expression instead of two in-place mutations
url = url.dropna().reset_index(drop=True)

# set up X and y
X = url.drop(columns="amount_spent_annual") # predictors (features); all independent variables
y = url["amount_spent_annual"] # response (target); dependent variable

# preview the cleaned data; this has to be the cell's last expression,
# otherwise the notebook silently discards it and nothing is rendered
url.head()

## 1. Modeling

In [None]:
# train-test split (80-20 split); random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# define column types
numerical = ["age", "height_cm", "waist_size_cm", "inseam_cm", "salary_self_report_in_k", "months_active", "num_purchases"]
categorical = ["gender", "test_group"]

# create column transformer (preprocessor); z-score numerical variables, one-hot encode categorical variables
# drop="first" removes one dummy level per categorical column
preprocessor = make_column_transformer(
    (StandardScaler(), numerical),
    (OneHotEncoder(drop="first"), categorical),
    remainder="passthrough" # passthrough any other columns not specified (if any)
)

In [None]:
# LINEAR REGRESSION MODEL

# Pipeline: preprocessing step followed by a plain linear regression,
# fitted on the training split only.
linear_model = make_pipeline(preprocessor, LinearRegression())
linear_model.fit(X_train, y_train)

# Predictions for both splits so train and test error can be compared.
y_train_pred_linear = linear_model.predict(X_train)
y_test_pred_linear = linear_model.predict(X_test)

# Print the same four metrics for the training split, then the test split.
# NOTE: scikit-learn reports MAPE as a fraction (0.05 means 5%), not a percentage.
for heading, actual, predicted in (
    ("Training Metrics:", y_train, y_train_pred_linear),
    ("\nTesting Metrics:", y_test, y_test_pred_linear),
):
    print(heading)
    print("MSE: ", mean_squared_error(actual, predicted))
    print("MAE: ", mean_absolute_error(actual, predicted))
    print("MAPE: ", mean_absolute_percentage_error(actual, predicted))
    print("R^2: ", r2_score(actual, predicted))

In [None]:
# POLYNOMIAL REGRESSION MODEL (2nd degree)

# Pipeline: preprocessing -> degree-2 polynomial expansion of every
# preprocessed column (no bias column) -> linear regression.
# NOTE(review): `preprocessor` is the same object passed to the other
# pipelines in this notebook; fitting here presumably re-fits it in place.
# All fits use X_train, so results should match - confirm if that changes.
poly_model = make_pipeline(
    preprocessor,
    PolynomialFeatures(degree=2, include_bias=False),
    LinearRegression(),
)
poly_model.fit(X_train, y_train)

# Predictions for both splits so train and test error can be compared.
y_train_pred_poly = poly_model.predict(X_train)
y_test_pred_poly = poly_model.predict(X_test)

# Print the same four metrics for the training split, then the test split.
for heading, actual, predicted in (
    ("Training Metrics:", y_train, y_train_pred_poly),
    ("\nTesting Metrics:", y_test, y_test_pred_poly),
):
    print(heading)
    print("MSE: ", mean_squared_error(actual, predicted))
    print("MAE: ", mean_absolute_error(actual, predicted))
    print("MAPE: ", mean_absolute_percentage_error(actual, predicted))
    print("R^2: ", r2_score(actual, predicted))

In [None]:
# POLYNOMIAL REGRESSION MODEL (3rd degree, interaction only)

# Pipeline: preprocessing -> interaction terms up to degree 3
# (interaction_only=True, no bias column) -> linear regression.
poly3_model = make_pipeline(
    preprocessor,
    PolynomialFeatures(degree=3, interaction_only=True, include_bias=False),
    LinearRegression(),
)
poly3_model.fit(X_train, y_train)

# Predictions for both splits so train and test error can be compared.
y_train_pred_poly3 = poly3_model.predict(X_train)
y_test_pred_poly3 = poly3_model.predict(X_test)

# Print the same four metrics for the training split, then the test split.
for heading, actual, predicted in (
    ("Training Metrics:", y_train, y_train_pred_poly3),
    ("\nTesting Metrics:", y_test, y_test_pred_poly3),
):
    print(heading)
    print("MSE: ", mean_squared_error(actual, predicted))
    print("MAE: ", mean_absolute_error(actual, predicted))
    print("MAPE: ", mean_absolute_percentage_error(actual, predicted))
    print("R^2: ", r2_score(actual, predicted))

## 2. Graphs

In [None]:
# GRAPHS Q2.1: Salary vs Number of Purchases

# Title and axis text kept together so the wording is easy to adjust.
p1_labels = labs(
    title="Relationship between Salary and Number of Purchases",
    x="Self Reported Salary (in $1000s)",
    y="Number of Purchases",
)

# Scatter of the data with a linear fit (se=True requests the band around it).
p1 = (
    ggplot(data=url, mapping=aes(x="salary_self_report_in_k", y="num_purchases"))
    + geom_point(color="steelblue", alpha=0.5)
    + geom_smooth(method="lm", color="darkred", se=True)
    + p1_labels
    + theme_minimal()
)

p1

In [None]:
# GRAPHS Q2.2: Salary vs Amount Spent Annual

# Title and axis text kept together so the wording is easy to adjust.
p2_labels = labs(
    title="Relationship between Salary and Amount Spent Annually",
    x="Self Reported Salary (in $1000s)",
    y="Amount Spent Annually ($)",
)

# Scatter of the data with a linear fit (se=True requests the band around it).
p2 = (
    ggplot(data=url, mapping=aes(x="salary_self_report_in_k", y="amount_spent_annual"))
    + geom_point(color="seagreen", alpha=0.5)
    + geom_smooth(method="lm", color="darkred", se=True)
    + p2_labels
    + theme_minimal()
)

p2

In [None]:
# GRAPHS Q4.1: Inseam vs Amount Spent Annual

# Title and axis text kept together so the wording is easy to adjust.
p3_labels = labs(
    title="Relationship between Inseam and Amount Spent Annually",
    x="Inseam (cm)",
    y="Amount Spent Annually ($)",
)

# Scatter of the data with a linear fit (se=True requests the band around it).
p3 = (
    ggplot(data=url, mapping=aes(x="inseam_cm", y="amount_spent_annual"))
    + geom_point(color="purple", alpha=0.5)
    + geom_smooth(method="lm", color="darkred", se=True)
    + p3_labels
    + theme_minimal()
)

p3

In [None]:
# GRAPHS Q4.2: Height vs Amount Spent Annual

# Title and axis text kept together so the wording is easy to adjust.
p4_labels = labs(
    title="Relationship between Height and Amount Spent Annually",
    x="Height (cm)",
    y="Amount Spent Annually ($)",
)

# Scatter of the data with a linear fit (se=True requests the band around it).
p4 = (
    ggplot(data=url, mapping=aes(x="height_cm", y="amount_spent_annual"))
    + geom_point(color="orange", alpha=0.5)
    + geom_smooth(method="lm", color="darkred", se=True)
    + p4_labels
    + theme_minimal()
)

p4