# Walmart Sales Prediction

## Importing Libraries

In [1]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from src.utils import check_missing_data, evaluate_model, generate_predictions_and_submissions

## Data Preparation

### Process Macroeconomic Data

In [2]:
macro_economic_df = pd.read_excel('../data//macro_economic.xlsx')

In [3]:
check_missing_data(macro_economic_df)

'Success: No missing values.'

In [4]:
print(macro_economic_df.columns)

Index(['Year-Month', 'Monthly Nominal GDP Index (inMillion$)',
       'Monthly Real GDP Index (inMillion$)', 'CPI', 'PartyInPower',
       'unemployment rate', 'CommercialBankInterestRateonCreditCardPlans',
       'Finance Rate on Personal Loans at Commercial Banks, 24 Month Loan',
       'Earnings or wages  in dollars per hour',
       'AdvertisingExpenses (in Thousand Dollars)',
       'Cotton Monthly Price - US cents per Pound(lbs)', 'Change(in%)',
       'Average upland planted(million acres)',
       'Average upland harvested(million acres)', 'yieldperharvested acre',
       'Production (in  480-lb netweright in million bales)',
       'Mill use  (in  480-lb netweright in million bales)', 'Exports'],
      dtype='object')


In [5]:
macro_economic_df.drop(columns=['PartyInPower'], inplace=True)

macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].replace('?', np.nan, inplace=True)
macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'] = macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].astype(float)
macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].fillna(macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].median(), inplace=True)
macro_economic_df['Year'] = macro_economic_df['Year-Month'].str.split('-', expand=True)[0]
macro_economic_df['Month'] = macro_economic_df['Year-Month'].str.split('-', expand=True)[1]
macro_economic_df = macro_economic_df.drop(['Year-Month'], axis=1)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].replace('?', np.nan, inplace=True)
  macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].replace('?', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  macro_economic_df['AdvertisingExpe

In [6]:
macro_economic_df['Month'] = macro_economic_df['Month'].str.lstrip()
month_to_number = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12 }
macro_economic_df['Month'] = macro_economic_df['Month'].map(month_to_number)
macro_economic_df = macro_economic_df.sort_values(by=['Year','Month'])
macro_economic_df = macro_economic_df.reset_index(drop=True)
macro_economic_df['Month'] = macro_economic_df['Month'].astype(str)

In [7]:
check_missing_data(macro_economic_df)

'Success: No missing values.'

In [8]:
macro_economic_df.to_csv('../data/clean_data/macro_economic.csv')

### Process Weather Data

In [9]:
def load_all_sheets():
    path = '../data/WeatherData.xlsx'
    all_sheets_df = pd.read_excel(path, sheet_name=None)

    wth_df = pd.DataFrame()

    for year, temp_df in all_sheets_df.items():
        temp_df['Year-Month'] = year + " - " + temp_df['Month']
        wth_df = pd.concat([wth_df, temp_df])

    return wth_df

weather_df = load_all_sheets()

In [10]:
check_missing_data(weather_df)

Unnamed: 0,Column Name,Missing Values,Percentage Missing
22,WeatherEvent,1907,65.285861


In [11]:
print(len(weather_df))

2921


In [12]:
print(weather_df.columns)

Index(['Year', 'Month', 'Day', 'Temp high (°C)', 'Temp avg (°C)',
       'Temp low (°C)', 'Dew Point high (°C)', 'Dew Point avg (°C)',
       'Dew Point low (°C)', 'Humidity (%) high', 'Humidity (%) avg',
       'Humidity (%) low', 'Sea Level Press. (hPa) high',
       'Sea Level Press. (hPa) avg', 'Sea Level Press. (hPa) low',
       'Visibility (km) high', 'Visibility (km) avg', 'Visibility (km) low',
       'Wind (km/h) low', 'Wind (km/h) avg', 'Wind (km/h) high',
       'Precip. (mm) sum', 'WeatherEvent', 'Year-Month'],
      dtype='object')


In [13]:
weather_df.replace('-', np.nan, inplace=True)
weather_df['Precip.\xa0(mm) sum'].replace('T', 0, inplace=True)
weather_df.fillna(method='bfill', inplace=True)

  weather_df.replace('-', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  weather_df['Precip.\xa0(mm) sum'].replace('T', 0, inplace=True)
  weather_df['Precip.\xa0(mm) sum'].replace('T', 0, inplace=True)
  weather_df.fillna(method='bfill', inplace=True)


In [14]:
weather_df['Wind\xa0(km/h) low'] = weather_df['Wind\xa0(km/h) low'].astype(float)
weather_df['Wind\xa0(km/h) high'] = weather_df['Wind\xa0(km/h) high'].astype(float)
weather_df['Wind\xa0(km/h) avg'] = weather_df['Wind\xa0(km/h) avg'].astype(float)
weather_df['Precip.\xa0(mm) sum'] = weather_df['Precip.\xa0(mm) sum'].astype(float)

In [15]:
weather_df = weather_df.drop(columns=['Day'])
weather_df['WeatherEvent'] = weather_df['WeatherEvent'].astype(str)

mean_columns = [col for col in weather_df.columns if col not in ['Year', 'Month', 'Year-Month', 'WeatherEvent']]
mean_df = weather_df.groupby('Year-Month')[mean_columns].mean()
mean_df.reset_index(inplace=True)
mean_df[['Year', 'Month']] = mean_df['Year-Month'].str.split('-', expand=True)

mode_df = weather_df.groupby('Year-Month')['WeatherEvent'].agg(lambda x: ', '.join(x.unique())).reset_index()
weather_df = pd.merge(mean_df, mode_df, on='Year-Month')
weather_df = weather_df.drop(columns=['Year-Month'])

In [16]:
mode_df.head()

Unnamed: 0,Year-Month,WeatherEvent
0,2009 - Apr,"Rain, Fog , Rain"
1,2009 - Aug,"Fog , Rain, Rain"
2,2009 - Dec,"Rain, Rain , Snow, Fog , Snow"
3,2009 - Feb,"Fog , Snow, Rain, Rain , Snow"
4,2009 - Jan,"Snow, Rain, Fog , Rain , Snow, Fog , Snow"


In [17]:
weather_df['Month'] = weather_df['Month'].str.lstrip()
month_to_number = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12 }
weather_df['Month'] = weather_df['Month'].map(month_to_number)
weather_df = weather_df.sort_values(by=['Year','Month'])
weather_df = weather_df.reset_index(drop=True)
weather_df['Month'] = weather_df['Month'].astype(str)

In [18]:
check_missing_data(weather_df)

'Success: No missing values.'

In [19]:
weather_df.to_csv('../data/clean_data/weather_df.csv')

### Process Events and Holidays Data

In [20]:
events_holidays_df = pd.read_excel('../data/Events_HolidaysData.xlsx')

In [21]:
check_missing_data(events_holidays_df)

'Success: No missing values.'

In [22]:
events_holidays_df['Year'] = events_holidays_df['Year'].astype(str)
events_holidays_df.rename(columns={'MonthDate': 'Month'}, inplace=True)
events_holidays_df['Month'] = events_holidays_df['Month'].astype(str)
events_holidays_df['Month'] = events_holidays_df['Month'].str.split('-', expand=True)[1]
events_holidays_df['Month'] = events_holidays_df['Month'].str.lstrip('0')

In [23]:
events_holidays_df['HolidayCount'] = events_holidays_df.groupby(['Year', 'Month'])['Event'].transform('count')
events_holidays_df = events_holidays_df.drop(['Event', 'DayCategory'], axis=1)
events_holidays_df = events_holidays_df.drop_duplicates()

In [24]:
years = events_holidays_df['Year'].unique()
missing_months_df = pd.DataFrame({'Year': [], 'Month': [], 'HolidayCount': []})
for year in years:
    for month in [3, 4, 8]:
        if not ((events_holidays_df['Year'] == year) & (events_holidays_df['Month'] == month)).any():
            missing_data = pd.DataFrame({'Year': [year], 'Month': [month], 'HolidayCount': [0]})
            missing_months_df = pd.concat([missing_months_df, missing_data])

events_holidays_df = pd.concat([events_holidays_df, missing_months_df])
events_holidays_df['Month'] = events_holidays_df['Month'].astype(int)
events_holidays_df['HolidayCount'] = events_holidays_df['HolidayCount'].astype(int)
events_holidays_df = events_holidays_df.groupby(['Year', 'Month'], as_index=False)['HolidayCount'].sum()
events_holidays_df = events_holidays_df.sort_values(by=['Year','Month'])
events_holidays_df = events_holidays_df.reset_index(drop=True)
events_holidays_df['Month'] = events_holidays_df['Month'].astype(str)

In [25]:
check_missing_data(events_holidays_df)

'Success: No missing values.'

In [26]:
events_holidays_df.to_csv('../data/clean_data/events_holidays.csv')

### Merge Macro economic, weather and events holidays datasets

In [27]:
macro_economic_df['Month'] = macro_economic_df['Month'].str.lstrip()
macro_economic_df['Year'] = macro_economic_df['Year'].str.lstrip()
events_holidays_df['Month'] = events_holidays_df['Month'].str.lstrip()
events_holidays_df['Year'] = events_holidays_df['Year'].str.lstrip()
macro_economic_df['Month'] = macro_economic_df['Month'].str.rstrip()
macro_economic_df['Year'] = macro_economic_df['Year'].str.rstrip()
events_holidays_df['Month'] = events_holidays_df['Month'].str.rstrip()
events_holidays_df['Year'] = events_holidays_df['Year'].str.rstrip()
weather_df['Month'] = weather_df['Month'].str.rstrip()
weather_df['Year'] = weather_df['Year'].str.rstrip()
weather_df['Month'] = weather_df['Month'].str.lstrip()
weather_df['Year'] = weather_df['Year'].str.lstrip()

In [28]:
macro_events_df = pd.merge(macro_economic_df, events_holidays_df, on=['Year', 'Month'], how='left')
macro_events_df.to_csv('../data/clean_data/macro_events.csv')

In [29]:
macro_events_weather_df = pd.merge(macro_events_df, weather_df, on=['Year', 'Month'], how='left')
macro_events_weather_df.to_csv('../data/clean_data/macro_events_weather.csv')

### Process Train Data

In [30]:
train_df = pd.read_csv('../data/train.csv')

In [31]:
print(len(train_df))

180


In [32]:
check_missing_data(train_df)

Unnamed: 0,Column Name,Missing Values,Percentage Missing
3,Sales(In ThousandDollars),10,5.555556


In [33]:
train_df['Month'] = train_df['Month'].astype(str)
train_df['Year'] = train_df['Year'].astype(str)

In [34]:
mean_sales = train_df['Sales(In ThousandDollars)'].mean()
train_df['Sales(In ThousandDollars)'].fillna(mean_sales, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Sales(In ThousandDollars)'].fillna(mean_sales, inplace=True)


In [35]:
check_missing_data(train_df)

'Success: No missing values.'

In [36]:
merged_df = pd.merge(macro_events_weather_df, train_df, on=['Year', 'Month'], how='left')

In [37]:
merged_df.to_csv('../data/clean_data/merged_df.csv')

### Prepare Training Data

In [38]:
le = LabelEncoder()
columns_to_encode = ['WeatherEvent']
for column in columns_to_encode:
    merged_df[column] = le.fit_transform(merged_df[column])

In [39]:
scaler = MinMaxScaler()

columns_to_scale = ['Monthly Nominal GDP Index (inMillion$)', 'Monthly Real GDP Index (inMillion$)', 'CPI', 'unemployment rate', 'CommercialBankInterestRateonCreditCardPlans', 'Finance Rate on Personal Loans at Commercial Banks, 24 Month Loan', 'Earnings or wages  in dollars per hour',     'AdvertisingExpenses (in Thousand Dollars)', 'Cotton Monthly Price - US cents per Pound(lbs)', 'Change(in%)', 'Average upland planted(million acres)', 'Average upland harvested(million acres)', 'yieldperharvested acre', 'Production (in  480-lb netweright in million bales)', 'Mill use  (in  480-lb netweright in million bales)', 'Exports', 'HolidayCount', 'Temp high (°C)', 'Temp avg (°C)', 'Temp low (°C)', 'Month',   'Dew Point high (°C)', 'Dew Point avg (°C)', 'Dew Point low (°C)', 'Humidity (%) high', 'Humidity (%) avg', 'Humidity (%) low', 'Sea Level Press. (hPa) high', 'Sea Level Press. (hPa) avg', 'Sea Level Press. (hPa) low', 'Visibility (km) high', 'Visibility (km) avg', 'Visibility (km) low', 'Wind (km/h) low', 'Wind (km/h) avg', 'Wind (km/h) high', 'Precip. (mm) sum', 'WeatherEvent']

merged_df[columns_to_scale] = scaler.fit_transform(merged_df[columns_to_scale])

In [40]:
merged_df['Year'] = merged_df['Year'].astype(int)

In [41]:
target_column = 'Sales(In ThousandDollars)'
feature_columns = merged_df.columns.drop(['Sales(In ThousandDollars)', 'ProductCategory'])

In [42]:
women_clothing = merged_df[merged_df['ProductCategory'] == 'WomenClothing']
men_clothing = merged_df[merged_df['ProductCategory'] == 'MenClothing']
other_clothing = merged_df[merged_df['ProductCategory'] == 'OtherClothing']

In [43]:
women_clothing.drop(columns=['ProductCategory'], inplace=True)
men_clothing.drop(columns=['ProductCategory'], inplace=True)
other_clothing.drop(columns=['ProductCategory'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  women_clothing.drop(columns=['ProductCategory'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  men_clothing.drop(columns=['ProductCategory'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  other_clothing.drop(columns=['ProductCategory'], inplace=True)


In [44]:
women_clothing.to_csv('../data/clean_data/women_clothing_df.csv')
men_clothing.to_csv('../data/clean_data/men_clothing_df.csv')
other_clothing.to_csv('../data/clean_data/other_clothing_df.csv')

In [45]:
women_clothing_train = women_clothing[(women_clothing['Year'] >= 2009) & (women_clothing['Year'] <= 2013)]
women_clothing_test = merged_df[merged_df['Year'] == 2014]

men_clothing_train = men_clothing[(men_clothing['Year'] >= 2009) & (men_clothing['Year'] <= 2013)]
men_clothing_test = merged_df[merged_df['Year'] == 2014]

other_clothing_train = other_clothing[(other_clothing['Year'] >= 2009) & (other_clothing['Year'] <= 2013)]
other_clothing_test = merged_df[merged_df['Year'] == 2014]

In [46]:
women_clothing_X_train = women_clothing_train[feature_columns]
women_clothing_y_train = women_clothing_train[target_column]
women_clothing_X_test = women_clothing_test[feature_columns]
women_clothing_y_test = women_clothing_test[target_column]

men_clothing_X_train = men_clothing_train[feature_columns]
men_clothing_y_train = men_clothing_train[target_column]
men_clothing_X_test = men_clothing_test[feature_columns]
men_clothing_y_test = men_clothing_test[target_column]

other_clothing_X_train = other_clothing_train[feature_columns]
other_clothing_y_train = other_clothing_train[target_column]
other_clothing_X_test = other_clothing_test[feature_columns]
other_clothing_y_test = other_clothing_test[target_column]

In [47]:
women_clothing_X_test.head()

Unnamed: 0,Monthly Nominal GDP Index (inMillion$),Monthly Real GDP Index (inMillion$),CPI,unemployment rate,CommercialBankInterestRateonCreditCardPlans,"Finance Rate on Personal Loans at Commercial Banks, 24 Month Loan",Earnings or wages in dollars per hour,AdvertisingExpenses (in Thousand Dollars),Cotton Monthly Price - US cents per Pound(lbs),Change(in%),...,Sea Level Press. (hPa) avg,Sea Level Press. (hPa) low,Visibility (km) high,Visibility (km) avg,Visibility (km) low,Wind (km/h) low,Wind (km/h) avg,Wind (km/h) high,Precip. (mm) sum,WeatherEvent
180,0.561563,0.52881,0.818077,0.37037,0.012295,0.386935,0.585784,0.636364,0.221474,0.593454,...,0.324996,0.243243,0.896667,0.123053,0.057778,0.482911,0.698929,0.854488,0.111835,0.341463
181,0.577097,0.550823,0.800056,0.388889,0.004098,0.321608,0.642157,0.636364,0.238817,0.58118,...,0.622017,0.566084,0.741667,0.148746,0.346984,0.594292,0.601406,0.81865,0.306972,0.0
182,0.594379,0.567119,0.829789,0.388889,0.004098,0.321608,0.622549,0.636364,0.255093,0.574289,...,0.586585,0.561858,1.0,0.835115,0.81319,0.712533,0.746988,0.840448,0.169462,0.95122
183,0.61547,0.592644,0.830226,0.296296,0.004098,0.321608,0.598039,0.636364,0.239659,0.446813,...,0.608359,0.59855,1.0,0.814046,0.795556,0.55391,0.586881,0.657019,0.149605,0.414634
184,0.644518,0.622408,0.868953,0.314815,0.004098,0.060302,0.573529,0.636364,0.231296,0.473945,...,0.575438,0.653401,0.733333,0.248855,0.421792,0.348632,0.232932,0.423149,0.480368,0.512195


## Model Training

### Linear Regression

In [48]:
women_clothing_lr_model = LinearRegression()
women_clothing_lr_model.fit(women_clothing_X_train, women_clothing_y_train)

men_clothing_lr_model = LinearRegression()
men_clothing_lr_model.fit(men_clothing_X_train, men_clothing_y_train)

other_clothing_lr_model = LinearRegression()
other_clothing_lr_model.fit(other_clothing_X_train, other_clothing_y_train)

In [49]:
generate_predictions_and_submissions(
    [men_clothing_lr_model, women_clothing_lr_model, other_clothing_lr_model],
    [men_clothing_X_test, women_clothing_X_test, other_clothing_X_test],
    ['MenClothing', 'WomenClothing', 'OtherClothing'],
    '../data/submission/LinearRegression/submission.csv'
)

### Decision Tree Regression

In [50]:
women_clothing_dt_model = DecisionTreeRegressor()
women_clothing_dt_model.fit(women_clothing_X_train, women_clothing_y_train)

men_clothing_dt_model = DecisionTreeRegressor()
men_clothing_dt_model.fit(men_clothing_X_train, men_clothing_y_train)

other_clothing_dt_model = DecisionTreeRegressor()
other_clothing_dt_model.fit(other_clothing_X_train, other_clothing_y_train)

In [51]:
generate_predictions_and_submissions(
    [men_clothing_dt_model, women_clothing_dt_model, other_clothing_dt_model],
    [men_clothing_X_test, women_clothing_X_test, other_clothing_X_test],
    ['MenClothing', 'WomenClothing', 'OtherClothing'],
    '../data/submission/DecisionTreeRegressor/submission.csv'
)

### Random Forest Regression

In [52]:
women_clothing_rf_model = RandomForestRegressor()
women_clothing_rf_model.fit(women_clothing_X_train, women_clothing_y_train)

men_clothing_rf_model = RandomForestRegressor()
men_clothing_rf_model.fit(men_clothing_X_train, men_clothing_y_train)

other_clothing_rf_model = RandomForestRegressor()
other_clothing_rf_model.fit(other_clothing_X_train, other_clothing_y_train)

In [53]:
generate_predictions_and_submissions(
    [men_clothing_rf_model, women_clothing_rf_model, other_clothing_rf_model],
    [men_clothing_X_test, women_clothing_X_test, other_clothing_X_test],
    ['MenClothing', 'WomenClothing', 'OtherClothing'],
    '../data/submission/RandomForestRegressor/submission.csv'
)

### Support Vector Regression

In [54]:
women_clothing_svr_model = SVR()
women_clothing_svr_model.fit(women_clothing_X_train, women_clothing_y_train)

men_clothing_svr_model = SVR()
men_clothing_svr_model.fit(men_clothing_X_train, men_clothing_y_train)

other_clothing_svr_model = SVR()
other_clothing_svr_model.fit(other_clothing_X_train, other_clothing_y_train)

In [55]:
generate_predictions_and_submissions(
    [men_clothing_svr_model, women_clothing_svr_model, other_clothing_svr_model],
    [men_clothing_X_test, women_clothing_X_test, other_clothing_X_test],
    ['MenClothing', 'WomenClothing', 'OtherClothing'],
    '../data/submission/SVR/submission.csv'
)

### Gradient Boosting Regression

In [56]:
women_clothing_gb_model = GradientBoostingRegressor()
women_clothing_gb_model.fit(women_clothing_X_train, women_clothing_y_train)

men_clothing_gb_model = GradientBoostingRegressor()
men_clothing_gb_model.fit(men_clothing_X_train, men_clothing_y_train)

other_clothing_gb_model = GradientBoostingRegressor()
other_clothing_gb_model.fit(other_clothing_X_train, other_clothing_y_train)

In [57]:
generate_predictions_and_submissions(
    [men_clothing_gb_model, women_clothing_gb_model, other_clothing_gb_model],
    [men_clothing_X_test, women_clothing_X_test, other_clothing_X_test],
    ['MenClothing', 'WomenClothing', 'OtherClothing'],
    '../data/submission/GradientBoostingRegressor/submission.csv'
)