# Walmart Sales Prediction

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt

## Data Preparation

### Process Macroeconomic Data

In [2]:
macro_economic_df = pd.read_excel('../data//macro_economic.xlsx')

In [3]:
macro_economic_df.drop(columns=['PartyInPower'], inplace=True)

macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].replace('?', np.nan, inplace=True)
macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'] = macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].astype(float)
macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].fillna(macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].median(), inplace=True)
macro_economic_df['Year'] = macro_economic_df['Year-Month'].str.split('-', expand=True)[0]
macro_economic_df['Month'] = macro_economic_df['Year-Month'].str.split('-', expand=True)[1]
macro_economic_df = macro_economic_df.drop(['Year-Month'], axis=1)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].replace('?', np.nan, inplace=True)
  macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].replace('?', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  macro_economic_df['AdvertisingExpe

In [4]:
macro_economic_df['Month'] = macro_economic_df['Month'].str.lstrip()
month_to_number = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12 }
macro_economic_df['Month'] = macro_economic_df['Month'].map(month_to_number)
macro_economic_df = macro_economic_df.sort_values(by=['Year','Month'])
macro_economic_df = macro_economic_df.reset_index(drop=True)
macro_economic_df['Month'] = macro_economic_df['Month'].astype(str)

In [5]:
macro_economic_df.to_csv('../data/clean_data/macro_economic.csv')

### Process Weather Data

In [6]:
def load_all_sheets():
    path = '../data/WeatherData.xlsx'
    all_sheets_df = pd.read_excel(path, sheet_name=None)

    wth_df = pd.DataFrame()

    for year, temp_df in all_sheets_df.items():
        temp_df['Year-Month'] = year + " - " + temp_df['Month']
        wth_df = pd.concat([wth_df, temp_df])

    return wth_df

weather_df = load_all_sheets()

In [7]:
weather_df.replace('-', np.nan, inplace=True)
weather_df['Precip.\xa0(mm) sum'].replace('T', 0, inplace=True)
weather_df.fillna(method='bfill', inplace=True)

  weather_df.replace('-', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  weather_df['Precip.\xa0(mm) sum'].replace('T', 0, inplace=True)
  weather_df['Precip.\xa0(mm) sum'].replace('T', 0, inplace=True)
  weather_df.fillna(method='bfill', inplace=True)


In [8]:
weather_df['Wind\xa0(km/h) low'] = weather_df['Wind\xa0(km/h) low'].astype(float)
weather_df['Wind\xa0(km/h) high'] = weather_df['Wind\xa0(km/h) high'].astype(float)
weather_df['Wind\xa0(km/h) avg'] = weather_df['Wind\xa0(km/h) avg'].astype(float)
weather_df['Precip.\xa0(mm) sum'] = weather_df['Precip.\xa0(mm) sum'].astype(float)

In [9]:
weather_df = weather_df.drop(columns=['Day'])
weather_df['WeatherEvent'] = weather_df['WeatherEvent'].astype(str)

mean_columns = [col for col in weather_df.columns if col not in ['Year', 'Month', 'Year-Month', 'WeatherEvent']]
mean_df = weather_df.groupby('Year-Month')[mean_columns].mean()
mean_df.reset_index(inplace=True)
mean_df[['Year', 'Month']] = mean_df['Year-Month'].str.split('-', expand=True)

mode_df = weather_df.groupby('Year-Month')['WeatherEvent'].agg(lambda x: ', '.join(x.unique())).reset_index()
weather_df = pd.merge(mean_df, mode_df, on='Year-Month')
weather_df = weather_df.drop(columns=['Year-Month'])

In [10]:
mode_df.head()

Unnamed: 0,Year-Month,WeatherEvent
0,2009 - Apr,"Rain, Fog , Rain"
1,2009 - Aug,"Fog , Rain, Rain"
2,2009 - Dec,"Rain, Rain , Snow, Fog , Snow"
3,2009 - Feb,"Fog , Snow, Rain, Rain , Snow"
4,2009 - Jan,"Snow, Rain, Fog , Rain , Snow, Fog , Snow"


In [11]:
weather_df['Month'] = weather_df['Month'].str.lstrip()
month_to_number = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12 }
weather_df['Month'] = weather_df['Month'].map(month_to_number)
weather_df = weather_df.sort_values(by=['Year','Month'])
weather_df = weather_df.reset_index(drop=True)
weather_df['Month'] = weather_df['Month'].astype(str)

In [12]:
weather_df.to_csv('../data/clean_data/weather_df.csv')

### Process Events and Holidays Data

In [13]:
events_holidays_df = pd.read_excel('../data/Events_HolidaysData.xlsx')

In [14]:
events_holidays_df['Year'] = events_holidays_df['Year'].astype(str)
events_holidays_df.rename(columns={'MonthDate': 'Month'}, inplace=True)
events_holidays_df['Month'] = events_holidays_df['Month'].astype(str)
events_holidays_df['Month'] = events_holidays_df['Month'].str.split('-', expand=True)[1]
events_holidays_df['Month'] = events_holidays_df['Month'].str.lstrip('0')

In [15]:
events_holidays_df['HolidayCount'] = events_holidays_df.groupby(['Year', 'Month'])['Event'].transform('count')
events_holidays_df = events_holidays_df.drop(['Event', 'DayCategory'], axis=1)
events_holidays_df = events_holidays_df.drop_duplicates()

In [16]:
years = events_holidays_df['Year'].unique()
missing_months_df = pd.DataFrame({'Year': [], 'Month': [], 'HolidayCount': []})
for year in years:
    for month in [3, 4, 8]:
        if not ((events_holidays_df['Year'] == year) & (events_holidays_df['Month'] == month)).any():
            missing_data = pd.DataFrame({'Year': [year], 'Month': [month], 'HolidayCount': [0]})
            missing_months_df = pd.concat([missing_months_df, missing_data])

events_holidays_df = pd.concat([events_holidays_df, missing_months_df])
events_holidays_df['Month'] = events_holidays_df['Month'].astype(int)
events_holidays_df['HolidayCount'] = events_holidays_df['HolidayCount'].astype(int)
events_holidays_df = events_holidays_df.groupby(['Year', 'Month'], as_index=False)['HolidayCount'].sum()
events_holidays_df = events_holidays_df.sort_values(by=['Year','Month'])
events_holidays_df = events_holidays_df.reset_index(drop=True)
events_holidays_df['Month'] = events_holidays_df['Month'].astype(str)

In [17]:
events_holidays_df.to_csv('../data/clean_data/events_holidays.csv')

### Merge Macro economic, weather and events holidays datasets

In [56]:
macro_economic_df['Month'] = macro_economic_df['Month'].str.lstrip()
macro_economic_df['Year'] = macro_economic_df['Year'].str.lstrip()
events_holidays_df['Month'] = events_holidays_df['Month'].str.lstrip()
events_holidays_df['Year'] = events_holidays_df['Year'].str.lstrip()
macro_economic_df['Month'] = macro_economic_df['Month'].str.rstrip()
macro_economic_df['Year'] = macro_economic_df['Year'].str.rstrip()
events_holidays_df['Month'] = events_holidays_df['Month'].str.rstrip()
events_holidays_df['Year'] = events_holidays_df['Year'].str.rstrip()
weather_df['Month'] = weather_df['Month'].str.rstrip()
weather_df['Year'] = weather_df['Year'].str.rstrip()
weather_df['Month'] = weather_df['Month'].str.lstrip()
weather_df['Year'] = weather_df['Year'].str.lstrip()

In [57]:
macro_events_df = pd.merge(macro_economic_df, events_holidays_df, on=['Year', 'Month'], how='left')
macro_events_df.to_csv('../data/clean_data/macro_events.csv')

In [59]:
macro_events_weather_df = pd.merge(macro_events_df, weather_df, on=['Year', 'Month'], how='left')
macro_events_weather_df.to_csv('../data/clean_data/macro_events_weather.csv')

### Process Train Data

In [60]:
train_df = pd.read_csv('../data/train.csv')

In [66]:
train_df['Month'] = train_df['Month'].astype(str)
train_df['Year'] = train_df['Year'].astype(str)

In [67]:
mean_sales = train_df['Sales(In ThousandDollars)'].mean()
train_df['Sales(In ThousandDollars)'].fillna(mean_sales, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Sales(In ThousandDollars)'].fillna(mean_sales, inplace=True)


In [71]:
merged_df = pd.merge(macro_events_weather_df, train_df, on=['Year', 'Month'], how='left')
merged_df.to_csv('../data/clean_data/merged_df.csv')

### Prepare Training Data

In [41]:
le = LabelEncoder()
columns_to_encode = ['WeatherEvent', 'Month']
for column in columns_to_encode:
    merged_df[column] = le.fit_transform(merged_df[column])

In [43]:
scaler = MinMaxScaler()
columns_to_scale = ['Monthly Nominal GDP Index (inMillion$)', 'Monthly Real GDP Index (inMillion$)', 'CPI', 'unemployment rate', 'CommercialBankInterestRateonCreditCardPlans', 'Finance Rate on Personal Loans at Commercial Banks, 24 Month Loan', 'Earnings or wages  in dollars per hour',     'AdvertisingExpenses (in Thousand Dollars)', 'Cotton Monthly Price - US cents per Pound(lbs)', 'Change(in%)', 'Average upland planted(million acres)', 'Average upland harvested(million acres)', 'yieldperharvested acre', 'Production (in  480-lb netweright in million bales)', 'Mill use  (in  480-lb netweright in million bales)', 'Exports', 'HolidayCount', 'Temp high (°C)', 'Temp avg (°C)', 'Temp low (°C)', 'Month',
       'Dew Point high (°C)', 'Dew Point avg (°C)', 'Dew Point low (°C)', 'Humidity (%) high', 'Humidity (%) avg', 'Humidity (%) low', 'Sea Level Press. (hPa) high', 'Sea Level Press. (hPa) avg', 'Sea Level Press. (hPa) low', 'Visibility (km) high',
       'Visibility (km) avg', 'Visibility (km) low', 'Wind (km/h) low', 'Wind (km/h) avg', 'Wind (km/h) high', 'Precip. (mm) sum',
       'WeatherEvent']
merged_df[columns_to_scale] = scaler.fit_transform(merged_df[columns_to_scale])

In [44]:
women_clothing = merged_df[merged_df['ProductCategory'] == 'WomenClothing']
men_clothing = merged_df[merged_df['ProductCategory'] == 'MenClothing']
other_clothing = merged_df[merged_df['ProductCategory'] == 'OtherClothing']
women_clothing.drop(columns=['ProductCategory'], inplace=True)
men_clothing.drop(columns=['ProductCategory'], inplace=True)
other_clothing.drop(columns=['ProductCategory'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  women_clothing.drop(columns=['ProductCategory'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  men_clothing.drop(columns=['ProductCategory'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  other_clothing.drop(columns=['ProductCategory'], inplace=True)


In [45]:
women_clothing.to_csv('../data/clean_data/women_clothing.csv')
men_clothing.to_csv('../data/clean_data/men_clothing.csv')
other_clothing.to_csv('../data/clean_data/other_clothing.csv')

In [46]:
target_column = 'Sales(In ThousandDollars)'
feature_columns = merged_df.columns.drop(['Sales(In ThousandDollars)', 'ProductCategory'])

In [47]:
women_clothing_X = women_clothing[feature_columns]
women_clothing_y = women_clothing[target_column]

women_clothing_X_train, women_clothing_X_test, women_clothing_y_train, women_clothing_y_test = train_test_split(women_clothing_X, women_clothing_y, test_size=0.2, random_state=42)

In [48]:
men_clothing_X = men_clothing[feature_columns]
men_clothing_y = men_clothing[target_column]

men_clothing_X_train, men_clothing_X_test, men_clothing_y_train, men_clothing_y_test = train_test_split(men_clothing_X, men_clothing_y, test_size=0.2, random_state=42)

In [49]:
other_clothing_X = other_clothing[feature_columns]
other_clothing_y = other_clothing[target_column]

other_clothing_X_train, other_clothing_X_test, other_clothing_y_train, other_clothing_y_test = train_test_split(other_clothing_X, other_clothing_y, test_size=0.2, random_state=42)

### Model Training

In [50]:
women_clothing_lr_model = LinearRegression()
women_clothing_lr_model.fit(women_clothing_X_train, women_clothing_y_train)

In [51]:
men_clothing_lr_model = LinearRegression()
men_clothing_lr_model.fit(men_clothing_X_train, men_clothing_y_train)

In [52]:
other_clothing_lr_model = LinearRegression()
other_clothing_lr_model.fit(other_clothing_X_train, other_clothing_y_train)

### Generate Predictions and Submission

In [53]:
men_clothing_predictions = men_clothing_lr_model.predict(men_clothing_X_test)
women_clothing_predictions = women_clothing_lr_model.predict(women_clothing_X_test)
other_clothing_predictions = other_clothing_lr_model.predict(other_clothing_X_test)

In [54]:
men_clothing_df = pd.DataFrame({
    'Month': men_clothing_X_test['Month'],
    'ProductCategory': 'MenClothing',
    'Sales(In ThousandDollars)': men_clothing_predictions
})

women_clothing_df = pd.DataFrame({
    'Month': women_clothing_X_test['Month'],
    'ProductCategory': 'WomenClothing',
    'Sales(In ThousandDollars)': women_clothing_predictions
})

other_clothing_df = pd.DataFrame({
    'Month': other_clothing_X_test['Month'],
    'ProductCategory': 'OtherClothing',
    'Sales(In ThousandDollars)': other_clothing_predictions
})

In [55]:
submission = pd.concat([men_clothing_df, women_clothing_df, other_clothing_df])
submission.sort_values(by=['Month', 'ProductCategory'], ascending=[True, False], inplace=True)
submission.reset_index(drop=True, inplace=True)
submission.index = range(1, len(submission) + 1)
submission = submission.rename_axis('Year')

submission.drop(columns=['ProductCategory', 'Month'], inplace=True)

submission.to_csv('../data/submission/submission.csv')