# XGBRegressor

### Import Libraries and Data

In [39]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from scipy import stats
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split,cross_val_predict, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from xgboost import XGBRegressor

In [40]:
# Read the Excel workbook into a DataFrame and report its (rows, columns) shape
dataset_df = pd.read_excel('data_2.xlsx')
n_rows_cols = dataset_df.shape
print("Full train dataset shape is {}".format(n_rows_cols))

Full train dataset shape is (150, 6)


In [41]:
# Preview the first rows to sanity-check the loaded columns and values
dataset_df.head()

Unnamed: 0,Date,Sales ($),TV_ads ($),online_ads ($),Print_ads ($),Price ($)
0,Jan-2008,6237000,181222.548442,12152.540107,24986.91276,135.0
1,Feb-2008,6115500,179104.353164,57353.376363,17574.38536,135.0
2,Mar-2008,10577250,222756.223235,146794.245435,16016.022899,135.0
3,Apr-2008,13473000,188266.818315,376459.468169,29689.69416,135.0
4,May-2008,6258525,210038.289741,21297.573231,17606.99144,136.5


In [42]:
# Summarise column dtypes and non-null counts
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            150 non-null    object 
 1   Sales ($)       150 non-null    int64  
 2   TV_ads ($)      150 non-null    float64
 3   online_ads ($)  150 non-null    float64
 4   Print_ads ($)   150 non-null    float64
 5   Price ($)       150 non-null    float64
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


### Feature engineering

In [44]:
# Translate a three-letter month abbreviation into its meteorological season
def map_to_season(month):
    """Return the season name for a month abbreviation such as 'Jan'.

    Winter, Spring and Summer are matched explicitly; every other value
    (Sep, Oct, Nov, and also any unrecognised input) is reported as 'Fall'.
    """
    explicit_seasons = (
        ('Winter', ('Dec', 'Jan', 'Feb')),
        ('Spring', ('Mar', 'Apr', 'May')),
        ('Summer', ('Jun', 'Jul', 'Aug')),
    )
    for season_name, month_abbrevs in explicit_seasons:
        if month in month_abbrevs:
            return season_name
    # Anything not matched above falls through to Fall
    return 'Fall'

In [45]:
############ New Features ############
# Dates are month-year strings such as 'Jan-2008'; parse them with an explicit
# format rather than relying on pandas' format inference. Re-running the cell
# is safe: an already-converted datetime column passes through unchanged.
dataset_df['Date'] = pd.to_datetime(dataset_df['Date'], format='%b-%Y')

# Derive the season from the month abbreviation of each date
dataset_df['season'] = dataset_df['Date'].dt.strftime('%b').map(map_to_season)

# One-hot encode the season (first category dropped to avoid a redundant column).
# Columns are assigned by name instead of concatenated so that re-running this
# cell overwrites them in place rather than appending duplicate columns.
season_dummies = pd.get_dummies(dataset_df['season'], prefix='season', drop_first=True)
for dummy_col in season_dummies.columns:
    dataset_df[dummy_col] = season_dummies[dummy_col]

# NOTE(review): the dates carry no day component, so every parsed date is the
# 1st of its month; this flag only records whether that 1st fell on a
# Saturday/Sunday. Confirm it is a meaningful feature before relying on it.
dataset_df['is_weekend'] = dataset_df['Date'].dt.dayofweek.isin([5, 6]).astype(int)

# Pairwise interaction terms: new column name -> the two columns multiplied.
# Insertion order is kept so the resulting column order is unchanged.
pairwise_interactions = {
    'TV_online_interaction': ('TV_ads ($)', 'online_ads ($)'),
    'TV_print_interaction': ('TV_ads ($)', 'Print_ads ($)'),
    'TV_price_interaction': ('TV_ads ($)', 'Price ($)'),
    'Online_print_interaction': ('online_ads ($)', 'Print_ads ($)'),
    'Online_price_interaction': ('online_ads ($)', 'Price ($)'),
    'Print_price_interaction': ('Print_ads ($)', 'Price ($)'),
}
for interaction_col, (left_col, right_col) in pairwise_interactions.items():
    dataset_df[interaction_col] = dataset_df[left_col] * dataset_df[right_col]

# Four-way interaction across all ad channels and price
dataset_df['TV_online_Print_price_interaction'] = (
    dataset_df['TV_ads ($)']
    * dataset_df['online_ads ($)']
    * dataset_df['Print_ads ($)']
    * dataset_df['Price ($)']
)

# Feature matrix: everything except the target, the raw date and the
# un-encoded season label. Target: monthly sales.
x = dataset_df.drop(columns=['Sales ($)','Date','season'])
y = dataset_df['Sales ($)']

In [46]:
############ Filter Outliers ############

# Absolute z-score of each target value, used to flag extreme observations
z_scores = np.abs(stats.zscore(y))

# Rows whose |z| exceeds this threshold are treated as outliers
threshold = 3  # Adjust this threshold as needed

# np.where yields *positional* indices, so both frames must be sliced with
# .iloc; label-based y[...] only happens to work on a default RangeIndex.
filtered_indices = np.where(z_scores <= threshold)[0]
x_filtered = x.iloc[filtered_indices]
y_filtered = y.iloc[filtered_indices]


# Report how many data points survive the filter
print(f"Number of remaining data points: {len(filtered_indices)}")

Number of remaining data points: 150


In [47]:
############ Scale Data ############
# Standardise the filtered feature matrix with a fitted StandardScaler.
# NOTE(review): scaled_x is not consumed by any cell visible here (the
# train/test split uses the unscaled x_filtered), and the scaler is fitted on
# all rows before the split. If scaled features are ever used for modelling,
# fit the scaler on the training rows only.
scaler = StandardScaler()
scaler.fit(x_filtered)
scaled_x = scaler.transform(x_filtered)

### Split Data

In [48]:
# Hold out 20% of the filtered rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    x_filtered,
    y_filtered,
    test_size=0.2,
    random_state=42,
)

### XGBoost

In [49]:
# Default-parameter regressor; it is neither fitted nor used in this cell
model = XGBRegressor()

# Baseline XGBoost model. The step size is given as `learning_rate`, the
# scikit-learn wrapper's own parameter name, rather than the `eta` alias, so
# that a later search over `learning_rate` on this estimator does not leave
# two names for the same setting.
xgb_model = XGBRegressor(
    n_estimators=1000,
    max_depth=7,
    learning_rate=0.1,
    subsample=0.7,
    colsample_bytree=0.8,
    random_state=42,
)

# Train on the training split and predict the held-out rows
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

# Compute MSE once and derive RMSE from it
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse}")
print(f"MSE: {mse}")

# Coefficient of determination on the held-out rows
r2 = r2_score(y_test, y_pred)
print(f"R2 Score: {r2}")


RMSE: 747121.1584091386
MSE: 558190025342.613
R2 Score: 0.9324039303492666


In [50]:
# Hyperparameter grid: 3 values for each of 5 settings (243 combinations)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

# Search from a fresh estimator that fixes only the seed. Every other setting
# of the baseline model is overridden by the grid anyway, and starting clean
# avoids carrying an `eta` value alongside the `learning_rate` being tuned.
grid_search = GridSearchCV(
    estimator=xgb.XGBRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
)

# 5-fold cross-validated search on the training split only
grid_search.fit(X_train, y_train)

# Best hyperparameter combination found by the search
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Estimator refitted on the full training split with the best parameters
best_xgb_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test split
y_pred = best_xgb_model.predict(X_test)

# Root mean squared error on the test split
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

# Coefficient of determination on the test split
r2 = r2_score(y_test, y_pred)
print(f"R2 Score: {r2}")

Best Hyperparameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.8}
RMSE: 645520.4290659091
R2 Score: 0.9495385930178161
