In [4]:
import pandas as pd
import numpy as np
from scipy import stats
from datetime import datetime
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler 
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer, r2_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
# Seed constant intended for reproducible runs.
# NOTE(review): no cell visible in this notebook references it — confirm it is
# actually passed to the splitters/estimators that need it.
fixed_seed = 0

In [33]:
# Load the raw Kindle catalogue (read_csv already returns a DataFrame).
# NOTE(review): absolute, machine-specific path — consider a configurable data
# directory so the notebook can be re-run elsewhere.
data = pd.read_csv('/Users/ciciwxp/Desktop/DATA1030/project/kindle_data-v2.csv')

## drop identifier / free-text / URL columns that are not used as features
drop_columns = ['asin', 'author', 'title', 'imgUrl', 'productURL', 'category_id']
data = data.drop(columns=drop_columns)

## drop rows whose target is 0 stars (this analysis treats 0 as "no rating")
data = data[data['stars'] != 0]

## transform binary flags to 0/1
data = data.replace({True: 1, False: 0})

## add transformed date columns
date_format = "%Y-%m-%d"
# Vectorised parse; missing publication dates become NaT.
data['publishedDate_clean'] = pd.to_datetime(data['publishedDate'], format=date_format)
# Earliest publication date, kept available as a reference point.
origin = data['publishedDate_clean'].min()
data['published_month'] = data['publishedDate_clean'].dt.month
# Stays float (with NaN) whenever any publication date is missing; the
# downstream imputer is what fills those gaps.
data['published_year'] = data['publishedDate_clean'].dt.year
data['published_days'] = data['publishedDate_clean'].dt.day_name()

## separate the target variable from the features
Y = data['stars']
X = data.drop(columns=['stars', 'publishedDate', 'publishedDate_clean'])

# Sorted list of observed publication years, used as the ordinal category order.
# Missing years are removed explicitly: slicing off the last sorted element
# would discard the newest real year whenever no date is missing.
unique_year = np.sort(X['published_year'].dropna().unique())

In [37]:
# encoder
# Column groups, one per encoding strategy.
ordinal_ftrs = ['published_month', 'published_days', 'published_year']
ordinal_cats = [
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    unique_year,
]
onehot_ftrs = ['soldBy', 'category_name', 'isBestSeller', 'isEditorsPick', 'isGoodReadsChoice', 'isKindleUnlimited']
std_ftrs = ['reviews', 'price']
numeric_imputer = SimpleImputer(strategy='median')  # or 'mean'
categorical_imputer = SimpleImputer(strategy='most_frequent')  # or 'constant', fill_value='missing'

# Variant 1: every branch imputes before it scales/encodes.
# (the numeric imputer is not strictly needed: those columns have no missing values)
num_branch_imputed = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])
ord_branch_imputed = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OrdinalEncoder(categories=ordinal_cats)),
])
onehot_branch_imputed = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', num_branch_imputed, std_ftrs),
    ('ord', ord_branch_imputed, ordinal_ftrs),
    ('onehot', onehot_branch_imputed, onehot_ftrs),
])
clf = Pipeline(steps=[('preprocessor', preprocessor)])

# Variant 2: numeric and ordinal branches pass missing values through
# un-imputed; only the one-hot branch still imputes.
num_branch_raw = Pipeline(steps=[('scale', StandardScaler())])
ord_branch_raw = Pipeline(steps=[('encode', OrdinalEncoder(categories=ordinal_cats))])
onehot_branch_raw = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])
preprocessor2 = ColumnTransformer(transformers=[
    ('num', num_branch_raw, std_ftrs),
    ('ord', ord_branch_raw, ordinal_ftrs),
    ('onehot', onehot_branch_raw, onehot_ftrs),
])
clf2 = Pipeline(steps=[('preprocessor', preprocessor2)])


In [8]:
# Bucket the star ratings into three bands; the banded target is what the
# train/test and K-fold splits are stratified on.
#   [0, 4.0] -> 'Low',  (4.0, 4.5] -> 'Medium',  (4.5, 5.1] -> 'High'
bins = [0, 4.0, 4.5, 5.1]
labels = ['Low', 'Medium', 'High']
Y_binned = pd.cut(
    x=Y,
    bins=bins,
    labels=labels,
    include_lowest=True,  # keep a rating equal to the lowest edge in the first band
)

In [21]:
from itertools import product
# Candidate hyper-parameter values, keyed by bare estimator argument names
# (no pipeline-step prefix).
param_grid = dict(
    max_depth=[2, 3],
    learning_rate=[0.01, 0.1],
    n_estimators=[10000],
    reg_alpha=[0.0, 0.01, 0.1],
    missing=[np.nan],
)

In [42]:


def MLpipe_KFold_RMSE_Stratified(X, y, y_binned, preprocessor, ML_algo, param_grid):
    """Tune and evaluate a regressor over three stratified train/test splits.

    For each random state in ``range(3)``:

    1. 20% of the rows are held out as a test set, stratified on ``y_binned``.
    2. The remaining rows are divided into 4 stratified, shuffled folds.
    3. For every fold, a ``GridSearchCV`` over ``param_grid`` is fitted on the
       fold's training part (no ``cv`` is passed, so it uses its default
       internal cross-validation) and its refitted best estimator is scored by
       RMSE on the fold's validation part.
    4. The estimator with the lowest validation RMSE is scored on the test set.

    NOTE(review): step 3 repeats the complete grid search once per fold and the
    retained model has only seen 3/4 of the non-test rows. If a single search
    over the stratified folds was intended, pass
    ``cv=list(skf.split(X_other, y_binned_other))`` to one ``GridSearchCV``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table (rows are selected with ``.iloc``).
    y : pandas.Series
        Continuous target, row-aligned with ``X``.
    y_binned : array-like
        Discretised version of ``y`` used only for stratification; must have
        one entry per row of ``X``.
    preprocessor : transformer
        Used as the ``'preprocessor'`` step of the pipeline.
    ML_algo : estimator
        Used as the ``'ML_algo'`` step of the pipeline.
    param_grid : dict
        Grid for ``GridSearchCV``; keys address pipeline steps
        (e.g. ``'ML_algo__max_depth'``).

    Returns
    -------
    test_scores : list
        Test-set RMSE, one entry per random state.
    r2_scores : list
        Test-set R^2, one entry per random state.
    best_models : list
        The selected fitted pipeline, one entry per random state.
    """
    test_scores = []
    best_models = []
    r2_scores = []

    # GridSearchCV clones its estimator before fitting, so one unfitted
    # pipeline and one scorer can be shared by every search below.
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('ML_algo', ML_algo)])
    neg_mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

    for state in range(3):
        print(f"\nRandom State: {state}")

        # Splitting the data. y_binned is split together with X and y so the
        # strata for the inner folds come from the caller's binning instead of
        # notebook-level globals; the row partition itself is unchanged.
        X_other, X_test, Y_other, Y_test, y_binned_other, _ = train_test_split(
            X, y, y_binned, test_size=0.2, random_state=state, stratify=y_binned
        )
        print("Length of X_other:", len(X_other))
        print("Length of Y_other:", len(Y_other))

        skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=state)

        # Best validation RMSE / model seen so far for this random state.
        best_score = np.inf
        best_model = None

        for train_index, val_index in skf.split(X_other, y_binned_other):
            X_train, X_val = X_other.iloc[train_index], X_other.iloc[val_index]
            Y_train, Y_val = Y_other.iloc[train_index], Y_other.iloc[val_index]

            grid = GridSearchCV(pipe, param_grid, scoring=neg_mse_scorer)
            grid.fit(X_train, Y_train)

            model = grid.best_estimator_
            Y_pred = model.predict(X_val)
            # RMSE as sqrt(MSE): avoids the `squared=False` keyword, which
            # newer scikit-learn releases deprecate and then remove.
            rmse = np.sqrt(mean_squared_error(Y_val, Y_pred))

            if rmse < best_score:
                best_score = rmse
                best_model = model

        # Predicting and scoring on the test set
        Y_pred_test = best_model.predict(X_test)
        test_rmse = np.sqrt(mean_squared_error(Y_test, Y_pred_test))
        test_r2 = r2_score(Y_test, Y_pred_test)

        test_scores.append(test_rmse)
        best_models.append(best_model)
        r2_scores.append(test_r2)

        print(f"Model Name: {ML_algo.__class__.__name__}")
        print(f"Best Model: {best_model}")
        print(f"Test RMSE: {test_rmse}")
        print(f"Test R2: {test_r2}")

    return test_scores, r2_scores, best_models


In [44]:
xgb_model = XGBRegressor()

# Search grid for the XGBoost run; the ``ML_algo__`` prefix addresses the
# pipeline step that MLpipe_KFold_RMSE_Stratified names 'ML_algo'.
xgb_grid_stratified2 = {
    'ML_algo__max_depth': [5, 6],
    'ML_algo__learning_rate': [0.1, 0.3],
    'ML_algo__n_estimators': [300],
    'ML_algo__reg_alpha': [0e0, 1e-2, 1e-1],
    'ML_algo__colsample_bytree': [0.9],
    'ML_algo__subsample': [0.66],
}

(
    test_score_xgb_stratified2,
    r2_score_xgb_stratified2,
    best_models_xgb_stratified2,
) = MLpipe_KFold_RMSE_Stratified(X, Y, Y_binned, clf, xgb_model, xgb_grid_stratified2)

# best from previous results: learning rate: 0.1


Random State: 0
Length of X_other: 103936
Length of Y_other: 103936




Model Name: XGBRegressor
Best Model: Pipeline(steps=[('preprocessor',
                 Pipeline(steps=[('preprocessor',
                                  ColumnTransformer(transformers=[('num',
                                                                   Pipeline(steps=[('impute',
                                                                                    SimpleImputer(strategy='median')),
                                                                                   ('scale',
                                                                                    StandardScaler())]),
                                                                   ['reviews',
                                                                    'price']),
                                                                  ('ord',
                                                                   Pipeline(steps=[('impute',
                                                                     



Model Name: XGBRegressor
Best Model: Pipeline(steps=[('preprocessor',
                 Pipeline(steps=[('preprocessor',
                                  ColumnTransformer(transformers=[('num',
                                                                   Pipeline(steps=[('impute',
                                                                                    SimpleImputer(strategy='median')),
                                                                                   ('scale',
                                                                                    StandardScaler())]),
                                                                   ['reviews',
                                                                    'price']),
                                                                  ('ord',
                                                                   Pipeline(steps=[('impute',
                                                                     



Model Name: XGBRegressor
Best Model: Pipeline(steps=[('preprocessor',
                 Pipeline(steps=[('preprocessor',
                                  ColumnTransformer(transformers=[('num',
                                                                   Pipeline(steps=[('impute',
                                                                                    SimpleImputer(strategy='median')),
                                                                                   ('scale',
                                                                                    StandardScaler())]),
                                                                   ['reviews',
                                                                    'price']),
                                                                  ('ord',
                                                                   Pipeline(steps=[('impute',
                                                                     

In [45]:
# Average the per-random-state test RMSEs into one summary figure and show it.
xgb_test_rmse_by_state = pd.Series(test_score_xgb_stratified2)
test_score_xgb_mean2 = xgb_test_rmse_by_state.mean()
test_score_xgb_mean2

0.2721579124377502

In [48]:
# Show the full parameter set of the pipeline selected for the first random state.
first_state_best_model = best_models_xgb_stratified2[0]
first_state_best_model.get_params()

{'memory': None,
 'steps': [('preprocessor', Pipeline(steps=[('preprocessor',
                    ColumnTransformer(transformers=[('num',
                                                     Pipeline(steps=[('impute',
                                                                      SimpleImputer(strategy='median')),
                                                                     ('scale',
                                                                      StandardScaler())]),
                                                     ['reviews', 'price']),
                                                    ('ord',
                                                     Pipeline(steps=[('impute',
                                                                      SimpleImputer(strategy='most_frequent')),
                                                                     ('encode',
                                                                      OrdinalEncoder(categories=[[1

In [50]:
from joblib import load

# Restore a previously saved model from disk.
# NOTE: joblib files are pickle-based, so only load artifacts you trust.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
saved_model_path = '/Users/ciciwxp/Desktop/xgb_model_1206.joblib'
model = load(saved_model_path)

  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.

