# Walmart Sales Prediction

## Importing Libraries

In [62]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor

## Data Preparation

### Process Macroeconomic Data

In [63]:
macro_economic_df = pd.read_excel('../data//macro_economic.xlsx')

In [64]:
macro_economic_df.drop(columns=['PartyInPower'], inplace=True)

macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].replace('?', np.nan, inplace=True)
macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'] = macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].astype(float)
macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].fillna(macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].median(), inplace=True)
macro_economic_df['Year'] = macro_economic_df['Year-Month'].str.split('-', expand=True)[0]
macro_economic_df['Month'] = macro_economic_df['Year-Month'].str.split('-', expand=True)[1]
macro_economic_df = macro_economic_df.drop(['Year-Month'], axis=1)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].replace('?', np.nan, inplace=True)
  macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].replace('?', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  macro_economic_df['AdvertisingExpe

In [65]:
macro_economic_df['Month'] = macro_economic_df['Month'].str.lstrip()
month_to_number = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12 }
macro_economic_df['Month'] = macro_economic_df['Month'].map(month_to_number)
macro_economic_df = macro_economic_df.sort_values(by=['Year','Month'])
macro_economic_df = macro_economic_df.reset_index(drop=True)
macro_economic_df['Month'] = macro_economic_df['Month'].astype(str)

In [66]:
macro_economic_df.to_csv('../data/clean_data/macro_economic.csv')

### Process Weather Data

In [67]:
def load_all_sheets():
    path = '../data/WeatherData.xlsx'
    all_sheets_df = pd.read_excel(path, sheet_name=None)

    wth_df = pd.DataFrame()

    for year, temp_df in all_sheets_df.items():
        temp_df['Year-Month'] = year + " - " + temp_df['Month']
        wth_df = pd.concat([wth_df, temp_df])

    return wth_df

weather_df = load_all_sheets()

In [68]:
weather_df.replace('-', np.nan, inplace=True)
weather_df['Precip.\xa0(mm) sum'].replace('T', 0, inplace=True)
weather_df.fillna(method='bfill', inplace=True)

  weather_df.replace('-', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  weather_df['Precip.\xa0(mm) sum'].replace('T', 0, inplace=True)
  weather_df['Precip.\xa0(mm) sum'].replace('T', 0, inplace=True)
  weather_df.fillna(method='bfill', inplace=True)


In [69]:
weather_df['Wind\xa0(km/h) low'] = weather_df['Wind\xa0(km/h) low'].astype(float)
weather_df['Wind\xa0(km/h) high'] = weather_df['Wind\xa0(km/h) high'].astype(float)
weather_df['Wind\xa0(km/h) avg'] = weather_df['Wind\xa0(km/h) avg'].astype(float)
weather_df['Precip.\xa0(mm) sum'] = weather_df['Precip.\xa0(mm) sum'].astype(float)

In [70]:
weather_df = weather_df.drop(columns=['Day'])
weather_df['WeatherEvent'] = weather_df['WeatherEvent'].astype(str)

mean_columns = [col for col in weather_df.columns if col not in ['Year', 'Month', 'Year-Month', 'WeatherEvent']]
mean_df = weather_df.groupby('Year-Month')[mean_columns].mean()
mean_df.reset_index(inplace=True)
mean_df[['Year', 'Month']] = mean_df['Year-Month'].str.split('-', expand=True)

mode_df = weather_df.groupby('Year-Month')['WeatherEvent'].agg(lambda x: ', '.join(x.unique())).reset_index()
weather_df = pd.merge(mean_df, mode_df, on='Year-Month')
weather_df = weather_df.drop(columns=['Year-Month'])

In [71]:
mode_df.head()

Unnamed: 0,Year-Month,WeatherEvent
0,2009 - Apr,"Rain, Fog , Rain"
1,2009 - Aug,"Fog , Rain, Rain"
2,2009 - Dec,"Rain, Rain , Snow, Fog , Snow"
3,2009 - Feb,"Fog , Snow, Rain, Rain , Snow"
4,2009 - Jan,"Snow, Rain, Fog , Rain , Snow, Fog , Snow"


In [72]:
weather_df['Month'] = weather_df['Month'].str.lstrip()
month_to_number = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12 }
weather_df['Month'] = weather_df['Month'].map(month_to_number)
weather_df = weather_df.sort_values(by=['Year','Month'])
weather_df = weather_df.reset_index(drop=True)
weather_df['Month'] = weather_df['Month'].astype(str)

In [73]:
weather_df.to_csv('../data/clean_data/weather_df.csv')

### Process Events and Holidays Data

In [74]:
events_holidays_df = pd.read_excel('../data/Events_HolidaysData.xlsx')

In [75]:
events_holidays_df['Year'] = events_holidays_df['Year'].astype(str)
events_holidays_df.rename(columns={'MonthDate': 'Month'}, inplace=True)
events_holidays_df['Month'] = events_holidays_df['Month'].astype(str)
events_holidays_df['Month'] = events_holidays_df['Month'].str.split('-', expand=True)[1]
events_holidays_df['Month'] = events_holidays_df['Month'].str.lstrip('0')

In [76]:
events_holidays_df['HolidayCount'] = events_holidays_df.groupby(['Year', 'Month'])['Event'].transform('count')
events_holidays_df = events_holidays_df.drop(['Event', 'DayCategory'], axis=1)
events_holidays_df = events_holidays_df.drop_duplicates()

In [77]:
years = events_holidays_df['Year'].unique()
missing_months_df = pd.DataFrame({'Year': [], 'Month': [], 'HolidayCount': []})
for year in years:
    for month in [3, 4, 8]:
        if not ((events_holidays_df['Year'] == year) & (events_holidays_df['Month'] == month)).any():
            missing_data = pd.DataFrame({'Year': [year], 'Month': [month], 'HolidayCount': [0]})
            missing_months_df = pd.concat([missing_months_df, missing_data])

events_holidays_df = pd.concat([events_holidays_df, missing_months_df])
events_holidays_df['Month'] = events_holidays_df['Month'].astype(int)
events_holidays_df['HolidayCount'] = events_holidays_df['HolidayCount'].astype(int)
events_holidays_df = events_holidays_df.groupby(['Year', 'Month'], as_index=False)['HolidayCount'].sum()
events_holidays_df = events_holidays_df.sort_values(by=['Year','Month'])
events_holidays_df = events_holidays_df.reset_index(drop=True)
events_holidays_df['Month'] = events_holidays_df['Month'].astype(str)

In [78]:
events_holidays_df.to_csv('../data/clean_data/events_holidays.csv')

### Merge Macro economic, weather and events holidays datasets

In [79]:
macro_economic_df['Month'] = macro_economic_df['Month'].str.lstrip()
macro_economic_df['Year'] = macro_economic_df['Year'].str.lstrip()
events_holidays_df['Month'] = events_holidays_df['Month'].str.lstrip()
events_holidays_df['Year'] = events_holidays_df['Year'].str.lstrip()
macro_economic_df['Month'] = macro_economic_df['Month'].str.rstrip()
macro_economic_df['Year'] = macro_economic_df['Year'].str.rstrip()
events_holidays_df['Month'] = events_holidays_df['Month'].str.rstrip()
events_holidays_df['Year'] = events_holidays_df['Year'].str.rstrip()
weather_df['Month'] = weather_df['Month'].str.rstrip()
weather_df['Year'] = weather_df['Year'].str.rstrip()
weather_df['Month'] = weather_df['Month'].str.lstrip()
weather_df['Year'] = weather_df['Year'].str.lstrip()

In [80]:
macro_events_df = pd.merge(macro_economic_df, events_holidays_df, on=['Year', 'Month'], how='left')
macro_events_df.to_csv('../data/clean_data/macro_events.csv')

In [81]:
macro_events_weather_df = pd.merge(macro_events_df, weather_df, on=['Year', 'Month'], how='left')
macro_events_weather_df.to_csv('../data/clean_data/macro_events_weather.csv')

### Process Train Data

In [82]:
train_df = pd.read_csv('../data/train.csv')

In [83]:
train_df['Month'] = train_df['Month'].astype(str)
train_df['Year'] = train_df['Year'].astype(str)

In [84]:
mean_sales = train_df['Sales(In ThousandDollars)'].mean()
train_df['Sales(In ThousandDollars)'].fillna(mean_sales, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Sales(In ThousandDollars)'].fillna(mean_sales, inplace=True)


In [85]:
merged_df = pd.merge(macro_events_weather_df, train_df, on=['Year', 'Month'], how='left')
merged_df.to_csv('../data/clean_data/merged_df.csv')

### Prepare Training Data

In [86]:
le = LabelEncoder()
columns_to_encode = ['WeatherEvent']
for column in columns_to_encode:
    merged_df[column] = le.fit_transform(merged_df[column])

In [87]:
scaler = MinMaxScaler()

columns_to_scale = ['Monthly Nominal GDP Index (inMillion$)', 'Monthly Real GDP Index (inMillion$)', 'CPI', 'unemployment rate', 'CommercialBankInterestRateonCreditCardPlans', 'Finance Rate on Personal Loans at Commercial Banks, 24 Month Loan', 'Earnings or wages  in dollars per hour',     'AdvertisingExpenses (in Thousand Dollars)', 'Cotton Monthly Price - US cents per Pound(lbs)', 'Change(in%)', 'Average upland planted(million acres)', 'Average upland harvested(million acres)', 'yieldperharvested acre', 'Production (in  480-lb netweright in million bales)', 'Mill use  (in  480-lb netweright in million bales)', 'Exports', 'HolidayCount', 'Temp high (°C)', 'Temp avg (°C)', 'Temp low (°C)', 'Month',   'Dew Point high (°C)', 'Dew Point avg (°C)', 'Dew Point low (°C)', 'Humidity (%) high', 'Humidity (%) avg', 'Humidity (%) low', 'Sea Level Press. (hPa) high', 'Sea Level Press. (hPa) avg', 'Sea Level Press. (hPa) low', 'Visibility (km) high', 'Visibility (km) avg', 'Visibility (km) low', 'Wind (km/h) low', 'Wind (km/h) avg', 'Wind (km/h) high', 'Precip. (mm) sum', 'WeatherEvent']

merged_df[columns_to_scale] = scaler.fit_transform(merged_df[columns_to_scale])

In [88]:
women_clothing = merged_df[merged_df['ProductCategory'] == 'WomenClothing']
men_clothing = merged_df[merged_df['ProductCategory'] == 'MenClothing']
other_clothing = merged_df[merged_df['ProductCategory'] == 'OtherClothing']
women_clothing.drop(columns=['ProductCategory'], inplace=True)
men_clothing.drop(columns=['ProductCategory'], inplace=True)
other_clothing.drop(columns=['ProductCategory'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  women_clothing.drop(columns=['ProductCategory'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  men_clothing.drop(columns=['ProductCategory'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  other_clothing.drop(columns=['ProductCategory'], inplace=True)


In [89]:
women_clothing.to_csv('../data/clean_data/women_clothing.csv')
men_clothing.to_csv('../data/clean_data/men_clothing.csv')
other_clothing.to_csv('../data/clean_data/other_clothing.csv')

In [90]:
target_column = 'Sales(In ThousandDollars)'
feature_columns = merged_df.columns.drop(['Sales(In ThousandDollars)', 'ProductCategory'])

In [91]:
women_clothing_X = women_clothing[feature_columns]
women_clothing_y = women_clothing[target_column]

women_clothing_X_train, women_clothing_X_test, women_clothing_y_train, women_clothing_y_test = train_test_split(women_clothing_X, women_clothing_y, test_size=0.2, random_state=42)

In [92]:
women_clothing_X_test.head(20)

Unnamed: 0,Monthly Nominal GDP Index (inMillion$),Monthly Real GDP Index (inMillion$),CPI,unemployment rate,CommercialBankInterestRateonCreditCardPlans,"Finance Rate on Personal Loans at Commercial Banks, 24 Month Loan",Earnings or wages in dollars per hour,AdvertisingExpenses (in Thousand Dollars),Cotton Monthly Price - US cents per Pound(lbs),Change(in%),...,Sea Level Press. (hPa) avg,Sea Level Press. (hPa) low,Visibility (km) high,Visibility (km) avg,Visibility (km) low,Wind (km/h) low,Wind (km/h) avg,Wind (km/h) high,Precip. (mm) sum,WeatherEvent
0,0.022218,0.02386,0.0,0.592593,0.086066,1.0,0.022059,0.159091,0.034798,0.594531,...,0.553143,0.465849,1.0,0.615267,0.735771,0.737981,0.819277,0.680549,0.140957,0.97561
15,0.006632,0.000445,0.117743,0.907407,0.614754,0.904523,0.0,0.636364,0.055509,0.488587,...,0.221326,0.342452,0.896667,0.0,0.0,0.269914,0.246586,0.135135,0.539786,0.512195
108,0.329315,0.342302,0.497205,0.685185,0.221311,0.457286,0.392157,0.636364,0.278442,0.635659,...,0.584356,0.546229,1.0,0.651908,0.740072,0.786331,0.907631,0.715649,0.154426,0.756098
135,0.402498,0.368016,0.651957,0.592593,0.053279,0.462312,0.382353,0.636364,0.170904,0.451766,...,0.430523,0.47478,0.633333,0.459542,0.537921,0.310461,0.309237,0.399749,0.137717,0.073171
39,0.071415,0.091627,0.170524,0.962963,1.0,0.693467,0.14951,0.636364,0.160184,0.581395,...,0.0,0.0,0.92619,0.331298,0.513651,0.774607,0.934882,0.717181,0.379023,0.902439
162,0.504121,0.489949,0.747181,0.5,0.053279,0.447236,0.453431,0.636364,0.230791,0.497416,...,0.60888,0.736013,0.633333,0.367939,0.41319,0.216305,0.196787,0.294449,0.133376,0.097561
99,0.324182,0.35146,0.519973,0.777778,0.188525,0.678392,0.328431,0.636364,0.331762,0.396425,...,0.506325,0.568556,1.0,0.624427,0.658351,0.246842,0.39759,0.2769,0.308217,0.585366
144,0.465418,0.469897,0.668509,0.62963,0.02459,0.59799,0.470588,0.636364,0.190885,0.563307,...,0.818448,0.765039,0.733333,0.40458,0.563728,0.572571,0.610442,0.672749,0.12914,0.682927
36,0.063137,0.079828,0.173897,0.962963,0.729508,0.879397,0.134804,1.0,0.145367,0.525409,...,0.432753,0.45022,1.0,0.825954,0.795986,0.68963,0.807229,0.565499,0.092546,1.0
171,0.546503,0.538134,0.739155,0.481481,0.02459,0.341709,0.509804,0.636364,0.212438,0.49031,...,0.706976,0.762806,0.7,0.835115,0.882007,0.188312,0.333333,0.25545,0.0,0.390244


In [93]:
men_clothing_X = men_clothing[feature_columns]
men_clothing_y = men_clothing[target_column]

men_clothing_X_train, men_clothing_X_test, men_clothing_y_train, men_clothing_y_test = train_test_split(men_clothing_X, men_clothing_y, test_size=0.2, random_state=42)

In [94]:
other_clothing_X = other_clothing[feature_columns]
other_clothing_y = other_clothing[target_column]

other_clothing_X_train, other_clothing_X_test, other_clothing_y_train, other_clothing_y_test = train_test_split(other_clothing_X, other_clothing_y, test_size=0.2, random_state=42)

## Model Training

### Linear Regression

In [95]:
women_clothing_lr_model = LinearRegression()
women_clothing_lr_model.fit(women_clothing_X_train, women_clothing_y_train)

men_clothing_lr_model = LinearRegression()
men_clothing_lr_model.fit(men_clothing_X_train, men_clothing_y_train)

other_clothing_lr_model = LinearRegression()
other_clothing_lr_model.fit(other_clothing_X_train, other_clothing_y_train)

#### Generate Predictions and Submission

In [96]:
men_clothing_lr_model_predictions = men_clothing_lr_model.predict(men_clothing_X_test)
women_clothing_lr_model_predictions = women_clothing_lr_model.predict(women_clothing_X_test)
other_clothing_lr_model_predictions = other_clothing_lr_model.predict(other_clothing_X_test)

In [97]:
men_clothing_df = pd.DataFrame({
    'Month': men_clothing_X_test['Month'],
    'ProductCategory': 'MenClothing',
    'Sales(In ThousandDollars)': men_clothing_lr_model_predictions
})

women_clothing_df = pd.DataFrame({
    'Month': women_clothing_X_test['Month'],
    'ProductCategory': 'WomenClothing',
    'Sales(In ThousandDollars)': women_clothing_lr_model_predictions
})

other_clothing_df = pd.DataFrame({
    'Month': other_clothing_X_test['Month'],
    'ProductCategory': 'OtherClothing',
    'Sales(In ThousandDollars)': other_clothing_lr_model_predictions
})

In [98]:
lr_model_submission = pd.concat([men_clothing_df, women_clothing_df, other_clothing_df])
lr_model_submission.sort_values(by=['Month', 'ProductCategory'], ascending=[True, False], inplace=True)
lr_model_submission.reset_index(drop=True, inplace=True)
lr_model_submission.index = range(1, len(lr_model_submission) + 1)
lr_model_submission = lr_model_submission.rename_axis('Year')

lr_model_submission.drop(columns=['ProductCategory', 'Month'], inplace=True)

lr_model_submission.to_csv('../data/submission/LinearRegression/submission.csv')

### Decision Tree Regression

In [99]:
women_clothing_dt_model = DecisionTreeRegressor()
women_clothing_dt_model.fit(women_clothing_X_train, women_clothing_y_train)

men_clothing_dt_model = DecisionTreeRegressor()
men_clothing_dt_model.fit(men_clothing_X_train, men_clothing_y_train)

other_clothing_dt_model = DecisionTreeRegressor()
other_clothing_dt_model.fit(other_clothing_X_train, other_clothing_y_train)

#### Generate Predictions and Submission

In [100]:
men_clothing_dt_model_predictions = men_clothing_dt_model.predict(men_clothing_X_test)
women_clothing_dt_model_predictions = women_clothing_dt_model.predict(women_clothing_X_test)
other_clothing_dt_model_predictions = other_clothing_dt_model.predict(other_clothing_X_test)

In [101]:
men_clothing_df = pd.DataFrame({
    'Month': men_clothing_X_test['Month'],
    'ProductCategory': 'MenClothing',
    'Sales(In ThousandDollars)': men_clothing_dt_model_predictions
})

women_clothing_df = pd.DataFrame({
    'Month': women_clothing_X_test['Month'],
    'ProductCategory': 'WomenClothing',
    'Sales(In ThousandDollars)': women_clothing_dt_model_predictions
})

other_clothing_df = pd.DataFrame({
    'Month': other_clothing_X_test['Month'],
    'ProductCategory': 'OtherClothing',
    'Sales(In ThousandDollars)': other_clothing_dt_model_predictions
})

In [102]:
dt_model_submission = pd.concat([men_clothing_df, women_clothing_df, other_clothing_df])
dt_model_submission.sort_values(by=['Month', 'ProductCategory'], ascending=[True, False], inplace=True)
dt_model_submission.reset_index(drop=True, inplace=True)
dt_model_submission.index = range(1, len(dt_model_submission) + 1)
dt_model_submission = dt_model_submission.rename_axis('Year')

dt_model_submission.drop(columns=['ProductCategory', 'Month'], inplace=True)

dt_model_submission.to_csv('../data/submission/DecisionTreeRegressor/submission.csv')

### Random Forest Regression

In [103]:
women_clothing_rf_model = RandomForestRegressor()
women_clothing_rf_model.fit(women_clothing_X_train, women_clothing_y_train)

men_clothing_rf_model = RandomForestRegressor()
men_clothing_rf_model.fit(men_clothing_X_train, men_clothing_y_train)

other_clothing_rf_model = RandomForestRegressor()
other_clothing_rf_model.fit(other_clothing_X_train, other_clothing_y_train)

#### Generate Predictions and Submission

In [104]:
men_clothing_rf_model_predictions = men_clothing_rf_model.predict(men_clothing_X_test)
women_clothing_rf_model_predictions = women_clothing_rf_model.predict(women_clothing_X_test)
other_clothing_rf_model_predictions = other_clothing_rf_model.predict(other_clothing_X_test)

In [105]:
men_clothing_df = pd.DataFrame({
    'Month': men_clothing_X_test['Month'],
    'ProductCategory': 'MenClothing',
    'Sales(In ThousandDollars)': men_clothing_rf_model_predictions
})

women_clothing_df = pd.DataFrame({
    'Month': women_clothing_X_test['Month'],
    'ProductCategory': 'WomenClothing',
    'Sales(In ThousandDollars)': women_clothing_rf_model_predictions
})

other_clothing_df = pd.DataFrame({
    'Month': other_clothing_X_test['Month'],
    'ProductCategory': 'OtherClothing',
    'Sales(In ThousandDollars)': other_clothing_rf_model_predictions
})

In [106]:
rf_model_submission = pd.concat([men_clothing_df, women_clothing_df, other_clothing_df])
rf_model_submission.sort_values(by=['Month', 'ProductCategory'], ascending=[True, False], inplace=True)
rf_model_submission.reset_index(drop=True, inplace=True)
rf_model_submission.index = range(1, len(rf_model_submission) + 1)
rf_model_submission = rf_model_submission.rename_axis('Year')

rf_model_submission.drop(columns=['ProductCategory', 'Month'], inplace=True)

rf_model_submission.to_csv('../data/submission/RandomForestRegressor/submission.csv')

### Support Vector Regression

In [107]:
women_clothing_svr_model = SVR()
women_clothing_svr_model.fit(women_clothing_X_train, women_clothing_y_train)

men_clothing_svr_model = SVR()
men_clothing_svr_model.fit(men_clothing_X_train, men_clothing_y_train)

other_clothing_svr_model = SVR()
other_clothing_svr_model.fit(other_clothing_X_train, other_clothing_y_train)

#### Generate Predictions and Submission

In [108]:
men_clothing_svr_model_predictions = men_clothing_svr_model.predict(men_clothing_X_test)
women_clothing_svr_model_predictions = women_clothing_svr_model.predict(women_clothing_X_test)
other_clothing_svr_model_predictions = other_clothing_svr_model.predict(other_clothing_X_test)

In [109]:
men_clothing_df = pd.DataFrame({
    'Month': men_clothing_X_test['Month'],
    'ProductCategory': 'MenClothing',
    'Sales(In ThousandDollars)': men_clothing_svr_model_predictions
})

women_clothing_df = pd.DataFrame({
    'Month': women_clothing_X_test['Month'],
    'ProductCategory': 'WomenClothing',
    'Sales(In ThousandDollars)': women_clothing_svr_model_predictions
})

other_clothing_df = pd.DataFrame({
    'Month': other_clothing_X_test['Month'],
    'ProductCategory': 'OtherClothing',
    'Sales(In ThousandDollars)': other_clothing_svr_model_predictions
})

In [110]:
svr_model_submission = pd.concat([men_clothing_df, women_clothing_df, other_clothing_df])
svr_model_submission.sort_values(by=['Month', 'ProductCategory'], ascending=[True, False], inplace=True)
svr_model_submission.reset_index(drop=True, inplace=True)
svr_model_submission.index = range(1, len(svr_model_submission) + 1)
svr_model_submission = svr_model_submission.rename_axis('Year')

svr_model_submission.drop(columns=['ProductCategory', 'Month'], inplace=True)

svr_model_submission.to_csv('../data/submission/SVR/submission.csv')

### Gradient Boosting Regression

In [111]:
women_clothing_gb_model = GradientBoostingRegressor()
women_clothing_gb_model.fit(women_clothing_X_train, women_clothing_y_train)

men_clothing_gb_model = GradientBoostingRegressor()
men_clothing_gb_model.fit(men_clothing_X_train, men_clothing_y_train)

other_clothing_gb_model = GradientBoostingRegressor()
other_clothing_gb_model.fit(other_clothing_X_train, other_clothing_y_train)

#### Generate Predictions and Submission

In [112]:
men_clothing_gb_model_predictions = men_clothing_gb_model.predict(men_clothing_X_test)
women_clothing_gb_model_predictions = women_clothing_gb_model.predict(women_clothing_X_test)
other_clothing_gb_model_predictions = other_clothing_gb_model.predict(other_clothing_X_test)

In [113]:
men_clothing_df = pd.DataFrame({
    'Month': men_clothing_X_test['Month'],
    'ProductCategory': 'MenClothing',
    'Sales(In ThousandDollars)': men_clothing_gb_model_predictions
})

women_clothing_df = pd.DataFrame({
    'Month': women_clothing_X_test['Month'],
    'ProductCategory': 'WomenClothing',
    'Sales(In ThousandDollars)': women_clothing_gb_model_predictions
})

other_clothing_df = pd.DataFrame({
    'Month': other_clothing_X_test['Month'],
    'ProductCategory': 'OtherClothing',
    'Sales(In ThousandDollars)': other_clothing_gb_model_predictions
})

In [114]:
gb_model_submission = pd.concat([men_clothing_df, women_clothing_df, other_clothing_df])
gb_model_submission.sort_values(by=['Month', 'ProductCategory'], ascending=[True, False], inplace=True)
gb_model_submission.reset_index(drop=True, inplace=True)
gb_model_submission.index = range(1, len(gb_model_submission) + 1)
gb_model_submission = gb_model_submission.rename_axis('Year')

gb_model_submission.drop(columns=['ProductCategory', 'Month'], inplace=True)

gb_model_submission.to_csv('../data/submission/GradientBoostingRegressor/submission.csv')