In [1]:
import pandas as pd

In [2]:
train_data = pd.read_csv("../data/train.csv")
test_data = pd.read_csv("../data/test.csv")

In [3]:
# Function to fit a model on train data
def fit_model(train_data, model, columns_to_drop, column_to_train):
    # First split the data into features and target
    X = train_data.drop(columns=columns_to_drop, axis=1)
    y = train_data[column_to_train]
    model.fit(X, y)
    return model

In [4]:
from sklearn.model_selection import GridSearchCV


# Function to search for the best hyperparameters
def search_hyperparameters(
    train_data, model, param_grid, columns_to_drop, column_to_train, scoring
):
    # First split the data into features and target
    X = train_data.drop(columns=columns_to_drop, axis=1)
    y = train_data[column_to_train]

    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=7,
        scoring=scoring,
        return_train_score=True,
        n_jobs=-1,
    )
    grid_search.fit(X, y)
    return grid_search

In [5]:
from sklearn.metrics import accuracy_score, recall_score, f1_score


# Function to score a model on given data
def score_model(data, model, columns_to_drop, column_to_train):
    X = data.drop(columns=columns_to_drop, axis=1)
    y = data[column_to_train]
    predictions = model.predict(X)
    accuracy = accuracy_score(y, predictions)
    recall = recall_score(y, predictions)
    f1 = f1_score(y, predictions)
    return f"Accuracy: {accuracy}, Recall: {recall}, F1: {f1}"

In [6]:
from sklearn.metrics import classification_report


# Function to evaluate the model predictions
def evaluate_model(data, model, columns_to_drop, column_to_train):
    X = data.drop(columns=columns_to_drop, axis=1)
    y = data[column_to_train]
    predictions = model.predict(X)
    report = classification_report(y, predictions, digits=3)
    return report

In [7]:
train_data["Gender"].value_counts()

Gender
0    405
1    395
Name: count, dtype: int64

In [8]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(random_state=42, ccp_alpha=0.003)
model = fit_model(
    train_data, model, columns_to_drop=["Gender"], column_to_train="Gender"
)
print("DecisionTreeClassifier score on train data")
print(
    score_model(train_data, model, columns_to_drop=["Gender"], column_to_train="Gender")
)
print(
    evaluate_model(
        train_data, model, columns_to_drop=["Gender"], column_to_train="Gender"
    )
)

DecisionTreeClassifier score on train data
Accuracy: 0.745, Recall: 0.6962025316455697, F1: 0.7294429708222812
              precision    recall  f1-score   support

           0      0.728     0.793     0.759       405
           1      0.766     0.696     0.729       395

    accuracy                          0.745       800
   macro avg      0.747     0.744     0.744       800
weighted avg      0.747     0.745     0.744       800



In [9]:
from sklearn.metrics import confusion_matrix

print(
    confusion_matrix(
        train_data["Gender"], model.predict(train_data.drop(columns=["Gender"]))
    )
)

[[321  84]
 [120 275]]


In [10]:
print("DecisionTreeClassifier score on test data")
print(
    score_model(test_data, model, columns_to_drop=["Gender"], column_to_train="Gender")
)
print(
    evaluate_model(
        test_data, model, columns_to_drop=["Gender"], column_to_train="Gender"
    )
)

DecisionTreeClassifier score on test data
Accuracy: 0.515, Recall: 0.4807692307692308, F1: 0.5076142131979695
              precision    recall  f1-score   support

           0      0.495     0.552     0.522        96
           1      0.538     0.481     0.508       104

    accuracy                          0.515       200
   macro avg      0.516     0.516     0.515       200
weighted avg      0.517     0.515     0.515       200



In [20]:
# Grid Search for Decision Tree Classifier
param_grid = {
    "max_depth": [2, 5, 7, 10],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 5, 10],
    "max_features": [None, "sqrt", "log2"],
    "ccp_alpha": [0.001, 0.003, 0.005, 0.007, 0.009],
}
decision_tree_grid_search = search_hyperparameters(
    train_data,
    model,
    param_grid,
    columns_to_drop=["Gender"],
    column_to_train="Gender",
    scoring="accuracy",
)
print(f"Best parameters: {decision_tree_grid_search.best_params_}")

Best parameters: {'ccp_alpha': 0.001, 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2}


In [21]:
decision_tree_model_best = fit_model(
    train_data,
    DecisionTreeClassifier(**decision_tree_grid_search.best_params_, random_state=42),
    columns_to_drop=["Gender"],
    column_to_train="Gender",
)

In [22]:
# Score the model on train data
decision_tree_best = score_model(
    train_data,
    decision_tree_model_best,
    columns_to_drop=["Gender"],
    column_to_train="Gender",
)
print(f"Decision Tree accuracy on train data: {decision_tree_best}")
# Evaluate the model predictions
print(
    evaluate_model(
        train_data,
        decision_tree_model_best,
        columns_to_drop=["Gender"],
        column_to_train="Gender",
    )
)

Decision Tree accuracy on train data: Accuracy: 0.7975, Recall: 0.7215189873417721, F1: 0.7786885245901639
              precision    recall  f1-score   support

           0      0.762     0.872     0.813       405
           1      0.846     0.722     0.779       395

    accuracy                          0.797       800
   macro avg      0.804     0.797     0.796       800
weighted avg      0.804     0.797     0.796       800



In [24]:
# Score the model on test data
decision_tree_best = score_model(
    test_data,
    decision_tree_model_best,
    columns_to_drop=["Gender"],
    column_to_train="Gender",
)
print(f"Decision Tree accuracy on test data: {decision_tree_best}")
# Evaluate the model predictions
print(
    evaluate_model(
        test_data,
        decision_tree_model_best,
        columns_to_drop=["Gender"],
        column_to_train="Gender",
    )
)

Decision Tree accuracy on test data: Accuracy: 0.54, Recall: 0.4326923076923077, F1: 0.4945054945054945
              precision    recall  f1-score   support

           0      0.516     0.656     0.578        96
           1      0.577     0.433     0.495       104

    accuracy                          0.540       200
   macro avg      0.547     0.544     0.536       200
weighted avg      0.548     0.540     0.535       200



## Random Forest

In [365]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=0)
model = fit_model(
    train_data,
    model,
    columns_to_drop=["Gender"],
    column_to_train="Gender",
)
print(
    score_model(
        train_data,
        model,
        columns_to_drop=["Gender"],
        column_to_train="Gender",
    )
)
print(
    evaluate_model(
        train_data,
        model,
        columns_to_drop=["Gender"],
        column_to_train="Gender",
    )
)

Accuracy: 1.0, Recall: 1.0, F1: 1.0
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       405
           1      1.000     1.000     1.000       395

    accuracy                          1.000       800
   macro avg      1.000     1.000     1.000       800
weighted avg      1.000     1.000     1.000       800



In [366]:
print(
    score_model(test_data, model, columns_to_drop=["Gender"], column_to_train="Gender")
)
print(
    evaluate_model(
        test_data, model, columns_to_drop=["Gender"], column_to_train="Gender"
    )
)

Accuracy: 0.525, Recall: 0.4519230769230769, F1: 0.4973544973544973
              precision    recall  f1-score   support

           0      0.504     0.604     0.550        96
           1      0.553     0.452     0.497       104

    accuracy                          0.525       200
   macro avg      0.529     0.528     0.524       200
weighted avg      0.530     0.525     0.523       200



In [400]:
# Grid Search for Random Forest Classifier
params = {
    "n_estimators": [25, 50, 100, 200, 300, 500],
    "max_depth": [1, 2, 3],
    "min_samples_leaf": [2, 4, 6, 8, 10],
    "ccp_alpha": [0.0001, 0.0003, 0.0005, 0.0007],
}
random_forest_grid_search = search_hyperparameters(
    train_data,
    model,
    params,
    columns_to_drop=["Gender"],
    column_to_train="Gender",
    scoring="accuracy",
)
print(f"Best parameters: {random_forest_grid_search.best_params_}")

Best parameters: {'ccp_alpha': 0.0005, 'max_depth': 3, 'min_samples_leaf': 6, 'n_estimators': 100}


In [401]:
random_forest_model_best = fit_model(
    train_data,
    RandomForestClassifier(**random_forest_grid_search.best_params_, random_state=42),
    columns_to_drop=["Gender"],
    column_to_train="Gender",
)

In [402]:
# Score the model on train data
random_forest_best = score_model(
    train_data,
    random_forest_model_best,
    columns_to_drop=["Gender"],
    column_to_train="Gender",
)
print(f"Random Forest accuracy on train data: {random_forest_best}")
# Evaluate the model predictions
print(
    evaluate_model(
        train_data,
        random_forest_model_best,
        columns_to_drop=["Gender"],
        column_to_train="Gender",
    )
)

Random Forest accuracy on train data: Accuracy: 0.6425, Recall: 0.5670886075949367, F1: 0.6103542234332425
              precision    recall  f1-score   support

           0      0.629     0.716     0.670       405
           1      0.661     0.567     0.610       395

    accuracy                          0.642       800
   macro avg      0.645     0.642     0.640       800
weighted avg      0.645     0.642     0.640       800



In [404]:
# Score the model on test data
random_forest_best = score_model(
    test_data,
    random_forest_model_best,
    columns_to_drop=["Gender"],
    column_to_train="Gender",
)
print(f"Random Forest accuracy on test data: {random_forest_best}")
# Evaluate the model predictions
print(
    evaluate_model(
        test_data,
        random_forest_model_best,
        columns_to_drop=["Gender"],
        column_to_train="Gender",
    )
)

Random Forest accuracy on test data: Accuracy: 0.525, Recall: 0.4230769230769231, F1: 0.4808743169398907
              precision    recall  f1-score   support

           0      0.504     0.635     0.562        96
           1      0.557     0.423     0.481       104

    accuracy                          0.525       200
   macro avg      0.531     0.529     0.522       200
weighted avg      0.532     0.525     0.520       200



# Rating prediction

In [25]:
def predict(test_data, model, columns_to_drop):
    X = test_data.drop(columns=columns_to_drop, axis=1)
    return model.predict(X)

In [26]:
# MSE function
from sklearn.metrics import mean_squared_error


def mse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)

In [27]:
# RMSE function
import numpy as np


def rmse(y_true, y_pred):
    return np.sqrt(mse(y_true, y_pred))

In [28]:
# R2 function
from sklearn.metrics import r2_score


def r2(y_true, y_pred):
    return r2_score(y_true, y_pred)

In [29]:
def score_model_regression(data, model, columns_to_drop, column_to_train):
    X = data.drop(columns=columns_to_drop, axis=1)
    y = data[column_to_train]
    predictions = model.predict(X)
    mse_score = mse(y, predictions)
    rmse_score = rmse(y, predictions)
    r2_score_value = r2(y, predictions)
    return f"MSE: {mse_score}, RMSE: {rmse_score}, R2: {r2_score_value}"

In [30]:
# Scale the data
from sklearn.preprocessing import StandardScaler

scaled_train_data = train_data.copy()
scaler = StandardScaler()
scaled_train_data[["Total", "UnitPrice", "Tax"]] = scaler.fit_transform(
    scaled_train_data[["Total", "UnitPrice", "Tax"]]
)
scaled_test_data = test_data.copy()
scaled_test_data[["Total", "UnitPrice", "Tax"]] = scaler.transform(
    scaled_test_data[["Total", "UnitPrice", "Tax"]]
)

In [35]:
# Linear Regression Model trained on selected features
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model = fit_model(
    scaled_train_data, model, columns_to_drop=["Rating"], column_to_train="Rating"
)
print(
    "Score for Linear Regression Model on train data:\n",
    score_model_regression(
        scaled_train_data, model, columns_to_drop=["Rating"], column_to_train="Rating"
    ),
)

Score for Linear Regression Model on train data:
 MSE: 2.8797175901390606, RMSE: 1.6969730670046184, R2: 0.013497573928445306


In [36]:
print(
    "Score for Linear Regression Model on test data:\n",
    score_model_regression(
        scaled_test_data, model, columns_to_drop=["Rating"], column_to_train="Rating"
    ),
)

Score for Linear Regression Model on test data:
 MSE: 3.110187291026073, RMSE: 1.7635723095541256, R2: -0.013816836503707242


In [63]:
# Select K Best features
from sklearn.feature_selection import SelectKBest, f_regression

# Get the selected features
selected_features = scaled_train_data.drop(columns="Rating", axis=1).columns[
    SelectKBest(f_regression, k=3)
    .fit(scaled_train_data.drop(columns="Rating", axis=1), scaled_train_data["Rating"])
    .get_support()
]
selected_features

Index(['Total', 'Branch_A', 'Branch_B'], dtype='object')

In [64]:
# Train a Linear Regression model on the selected features
model = LinearRegression()
model = fit_model(
    scaled_train_data[selected_features.tolist() + ["Rating"]],
    model,
    columns_to_drop=["Rating"],
    column_to_train="Rating",
)
print(
    "Score for Linear Regression Model on train data:\n",
    score_model_regression(
        scaled_train_data[selected_features.tolist() + ["Rating"]],
        model,
        columns_to_drop=["Rating"],
        column_to_train="Rating",
    ),
)

Score for Linear Regression Model on train data:
 MSE: 2.900901625588092, RMSE: 1.7032033424075037, R2: 0.0062405767714963645


In [65]:
print(
    "Score for Linear Regression Model on test data:\n",
    score_model_regression(
        scaled_test_data[selected_features.tolist() + ["Rating"]],
        model,
        columns_to_drop=["Rating"],
        column_to_train="Rating",
    ),
)

Score for Linear Regression Model on test data:
 MSE: 3.0891882498404444, RMSE: 1.75760867369288, R2: -0.00697185274152301


In [80]:
# Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor(random_state=42, ccp_alpha=0.003)
model = fit_model(
    train_data, model, columns_to_drop=["Rating"], column_to_train="Rating"
)
print(
    "Score for Decision Tree Regressor Model on train data:\n",
    score_model_regression(
        train_data, model, columns_to_drop=["Rating"], column_to_train="Rating"
    ),
)

Score for Decision Tree Regressor Model on train data:
 MSE: 0.28866862731018983, RMSE: 0.537278910166954, R2: 0.9011110318083322


In [81]:
print(
    "Score for Decision Tree Regressor Model on test data:\n",
    score_model_regression(
        test_data, model, columns_to_drop=["Rating"], column_to_train="Rating"
    ),
)

Score for Decision Tree Regressor Model on test data:
 MSE: 6.041170552047959, RMSE: 2.4578792793886275, R2: -0.9692191642375512


In [82]:
# Grid Search for Decision Tree Regressor
param_grid = {
    "max_depth": [3, 5, 8, 10, 15, 20],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 5, 10],
    "max_features": ["sqrt", "log2"],
    "ccp_alpha": [0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
}
decision_tree_grid_search = search_hyperparameters(
    train_data,
    model,
    param_grid,
    columns_to_drop=["Rating"],
    column_to_train="Rating",
    scoring="r2",
)
print(f"Best parameters: {decision_tree_grid_search.best_params_}")

Best parameters: {'ccp_alpha': 0.0001, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 2}


In [83]:
# Fit the model with the best hyperparameters
decision_tree_model_best = fit_model(
    train_data,
    DecisionTreeRegressor(**decision_tree_grid_search.best_params_, random_state=42),
    columns_to_drop=["Rating"],
    column_to_train="Rating",
)

In [84]:
# Score the model on train data
decision_tree_best = score_model_regression(
    train_data,
    decision_tree_model_best,
    columns_to_drop=["Rating"],
    column_to_train="Rating",
)

In [85]:
# Evaluate the model predictions
print(
    "Score for Decision Tree Regressor Model on train data:\n",
    decision_tree_best,
)

Score for Decision Tree Regressor Model on train data:
 MSE: 2.7979244589968664, RMSE: 1.6726997515982558, R2: 0.04151737787868859


In [86]:
# Score the model on test data
decision_tree_best = score_model_regression(
    test_data,
    decision_tree_model_best,
    columns_to_drop=["Rating"],
    column_to_train="Rating",
)

In [87]:
# Evaluate the model predictions
print(
    "Score for Decision Tree Regressor Model on test data:\n",
    decision_tree_best,
)

Score for Decision Tree Regressor Model on test data:
 MSE: 3.066196484767474, RMSE: 1.7510558200033126, R2: 0.0005226922330419104
