# Walmart Sales Prediction

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt

## Data Preparation

### Process Macroeconomic Data

In [2]:
macro_economic_df = pd.read_excel('../data//macro_economic.xlsx')
print(macro_economic_df.isnull().sum())

Year-Month                                                           0
Monthly Nominal GDP Index (inMillion$)                               0
Monthly Real GDP Index (inMillion$)                                  0
CPI                                                                  0
PartyInPower                                                         0
unemployment rate                                                    0
CommercialBankInterestRateonCreditCardPlans                          0
Finance Rate on Personal Loans at Commercial Banks, 24 Month Loan    0
Earnings or wages  in dollars per hour                               0
AdvertisingExpenses (in Thousand Dollars)                            0
Cotton Monthly Price - US cents per Pound(lbs)                       0
Change(in%)                                                          0
Average upland planted(million acres)                                0
Average upland harvested(million acres)                              0
yieldp

In [3]:
macro_economic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 18 columns):
 #   Column                                                             Non-Null Count  Dtype  
---  ------                                                             --------------  -----  
 0   Year-Month                                                         96 non-null     object 
 1   Monthly Nominal GDP Index (inMillion$)                             96 non-null     float64
 2   Monthly Real GDP Index (inMillion$)                                96 non-null     float64
 3   CPI                                                                96 non-null     float64
 4   PartyInPower                                                       96 non-null     object 
 5   unemployment rate                                                  96 non-null     float64
 6   CommercialBankInterestRateonCreditCardPlans                        96 non-null     float64
 7   Finance Rate on Personal Loa

In [4]:
print(macro_economic_df[pd.to_numeric(macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'], errors='coerce').isnull()])

    Year-Month  Monthly Nominal GDP Index (inMillion$)  \
2   2009 - Mar                            14340.701639   
4   2009 - May                            14345.904809   
5   2009 - Jun                            14348.530666   
6   2009 - Jul                            14317.372922   
7   2009 - Aug                            14424.284901   
..         ...                                     ...   
91  2016 - Aug                            18741.599947   
92  2016 - Sep                            18840.309646   
93  2016 - Oct                            18740.780023   
94  2016 - Nov                            18960.461568   
95  2016 - Dec                            19015.393408   

    Monthly Real GDP Index (inMillion$)      CPI PartyInPower  \
2                          14351.786822  235.067    Democrats   
4                          14368.123959  235.975    Democrats   
5                          14346.820106  237.172    Democrats   
6                          14345.676097  23

In [5]:
print(macro_economic_df['PartyInPower'].value_counts())

PartyInPower
Democrats    96
Name: count, dtype: int64


In [6]:
macro_economic_df.drop(columns=['PartyInPower'], inplace=True)

In [7]:
macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].replace('?', np.nan, inplace=True)
macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'] = macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].astype(float)
macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].fillna(macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].median(), inplace=True)
macro_economic_df['Year'] = macro_economic_df['Year-Month'].str.split('-', expand=True)[0]
macro_economic_df['Month'] = macro_economic_df['Year-Month'].str.split('-', expand=True)[1]
macro_economic_df = macro_economic_df.drop(['Year-Month'], axis=1)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].replace('?', np.nan, inplace=True)
  macro_economic_df['AdvertisingExpenses (in Thousand Dollars)'].replace('?', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  macro_economic_df['AdvertisingExpe

### Process Weather Data

In [8]:
def load_all_sheets():
    path = '../data/WeatherData.xlsx'
    all_sheets_df = pd.read_excel(path, sheet_name=None)

    wth_df = pd.DataFrame()

    for year, temp_df in all_sheets_df.items():
        temp_df['Year-Month'] = year + " - " + temp_df['Month']
        wth_df = pd.concat([wth_df, temp_df])

    return wth_df

weather_df = load_all_sheets()

In [9]:
weather_df.replace('-', np.nan, inplace=True)
weather_df['Precip.\xa0(mm) sum'].replace('T', 0, inplace=True)
weather_df.fillna(method='bfill', inplace=True)

  weather_df.replace('-', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  weather_df['Precip.\xa0(mm) sum'].replace('T', 0, inplace=True)
  weather_df['Precip.\xa0(mm) sum'].replace('T', 0, inplace=True)
  weather_df.fillna(method='bfill', inplace=True)


In [10]:
weather_df['Wind\xa0(km/h) low'] = weather_df['Wind\xa0(km/h) low'].astype(float)
weather_df['Wind\xa0(km/h) high'] = weather_df['Wind\xa0(km/h) high'].astype(float)
weather_df['Wind\xa0(km/h) avg'] = weather_df['Wind\xa0(km/h) avg'].astype(float)
weather_df['Precip.\xa0(mm) sum'] = weather_df['Precip.\xa0(mm) sum'].astype(float)

In [11]:
weather_df = weather_df.drop(columns=['Day'])

In [12]:
weather_df['WeatherEvent'] = weather_df['WeatherEvent'].astype(str)

In [13]:
mean_columns = [col for col in weather_df.columns if col not in ['Year', 'Month', 'Year-Month', 'WeatherEvent']]
mean_df = weather_df.groupby('Year-Month')[mean_columns].mean()
mean_df.reset_index(inplace=True)
mean_df[['Year', 'Month']] = mean_df['Year-Month'].str.split('-', expand=True)
mode_df = weather_df.groupby('Year-Month')['WeatherEvent'].agg(lambda x: ', '.join(x.unique()))
weather_df = pd.concat([mean_df, mode_df], axis=1)

In [14]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 192 entries, 0 to 2016 - Sep
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Year-Month                   96 non-null     object 
 1   Temp high (°C)               96 non-null     float64
 2   Temp avg (°C)                96 non-null     float64
 3   Temp low (°C)                96 non-null     float64
 4   Dew Point high (°C)          96 non-null     float64
 5   Dew Point avg (°C)           96 non-null     float64
 6   Dew Point low (°C)           96 non-null     float64
 7   Humidity (%) high            96 non-null     float64
 8   Humidity (%) avg             96 non-null     float64
 9   Humidity (%) low             96 non-null     float64
 10  Sea Level Press. (hPa) high  96 non-null     float64
 11  Sea Level Press. (hPa) avg   96 non-null     float64
 12  Sea Level Press. (hPa) low   96 non-null     float64
 13  Visibility (km) hi

In [15]:
weather_df.to_csv('../data/clean_data/weather_df.csv')

### Process Events and Holidays Data

In [16]:
events_holidays_df = pd.read_excel('../data/Events_HolidaysData.xlsx')
events_holidays_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Year         150 non-null    int64         
 1   MonthDate    150 non-null    datetime64[ns]
 2   Event        150 non-null    object        
 3   DayCategory  150 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 4.8+ KB


In [17]:
events_holidays_df.rename(columns={'MonthDate': 'Month'}, inplace=True)
events_holidays_df['Month'] = events_holidays_df['Month'].astype(str)
events_holidays_df['Month'] = events_holidays_df['Month'].str.split('-', expand=True)[1]

In [18]:
events_holidays_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Year         150 non-null    int64 
 1   Month        150 non-null    object
 2   Event        150 non-null    object
 3   DayCategory  150 non-null    object
dtypes: int64(1), object(3)
memory usage: 4.8+ KB


In [19]:
month_dict = {
        '01': 'Jan', '02': 'Feb', '03': 'Mar', '04': 'Apr', '05': 'May',
        '06': 'Jun', '07': 'Jul', '08': 'Aug', '09': 'Sep', '10': 'Oct',
        '11': 'Nov', '12': 'Dec'
    }

events_holidays_df.replace({'Month': month_dict}, inplace=True)

In [20]:
events_holidays_df['HolidayCount'] = events_holidays_df.groupby(['Year', 'Month'])['Event'].transform('count')
events_holidays_df = events_holidays_df.drop(['Event', 'DayCategory'], axis=1)
events_holidays_df = events_holidays_df.drop_duplicates()

In [21]:
events_holidays_df.to_csv('../data/clean_data/events_holidays.csv')

In [22]:
events_holidays_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 81 entries, 0 to 146
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Year          81 non-null     int64 
 1   Month         81 non-null     object
 2   HolidayCount  81 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 2.5+ KB


### Process Train Data

In [23]:
train_df = pd.read_csv('../data/train.csv')

In [24]:
month_dict2 = {
        '1': 'Jan', '2': 'Feb', '3': 'Mar', '4': 'Apr', '5': 'May',
        '6': 'Jun', '7': 'Jul', '8': 'Aug', '9': 'Sep', '10': 'Oct',
        '11': 'Nov', '12': 'Dec'
    }

train_df['Month'] = train_df['Month'].astype(str)

train_df.replace({'Month': month_dict2}, inplace=True)

In [25]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Year                       180 non-null    int64  
 1   Month                      180 non-null    object 
 2   ProductCategory            180 non-null    object 
 3   Sales(In ThousandDollars)  170 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 5.8+ KB


In [26]:
mean_sales = train_df['Sales(In ThousandDollars)'].mean()
train_df['Sales(In ThousandDollars)'].fillna(mean_sales, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Sales(In ThousandDollars)'].fillna(mean_sales, inplace=True)


In [27]:
print(train_df.isnull().sum())

Year                         0
Month                        0
ProductCategory              0
Sales(In ThousandDollars)    0
dtype: int64


### Merge Data

In [28]:
macro_economic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 18 columns):
 #   Column                                                             Non-Null Count  Dtype  
---  ------                                                             --------------  -----  
 0   Monthly Nominal GDP Index (inMillion$)                             96 non-null     float64
 1   Monthly Real GDP Index (inMillion$)                                96 non-null     float64
 2   CPI                                                                96 non-null     float64
 3   unemployment rate                                                  96 non-null     float64
 4   CommercialBankInterestRateonCreditCardPlans                        96 non-null     float64
 5   Finance Rate on Personal Loans at Commercial Banks, 24 Month Loan  96 non-null     float64
 6   Earnings or wages  in dollars per hour                             96 non-null     float64
 7   AdvertisingExpenses (in Thou

In [29]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 192 entries, 0 to 2016 - Sep
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Year-Month                   96 non-null     object 
 1   Temp high (°C)               96 non-null     float64
 2   Temp avg (°C)                96 non-null     float64
 3   Temp low (°C)                96 non-null     float64
 4   Dew Point high (°C)          96 non-null     float64
 5   Dew Point avg (°C)           96 non-null     float64
 6   Dew Point low (°C)           96 non-null     float64
 7   Humidity (%) high            96 non-null     float64
 8   Humidity (%) avg             96 non-null     float64
 9   Humidity (%) low             96 non-null     float64
 10  Sea Level Press. (hPa) high  96 non-null     float64
 11  Sea Level Press. (hPa) avg   96 non-null     float64
 12  Sea Level Press. (hPa) low   96 non-null     float64
 13  Visibility (km) hi

In [30]:
events_holidays_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 81 entries, 0 to 146
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Year          81 non-null     int64 
 1   Month         81 non-null     object
 2   HolidayCount  81 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 2.5+ KB


In [31]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 192 entries, 0 to 2016 - Sep
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Year-Month                   96 non-null     object 
 1   Temp high (°C)               96 non-null     float64
 2   Temp avg (°C)                96 non-null     float64
 3   Temp low (°C)                96 non-null     float64
 4   Dew Point high (°C)          96 non-null     float64
 5   Dew Point avg (°C)           96 non-null     float64
 6   Dew Point low (°C)           96 non-null     float64
 7   Humidity (%) high            96 non-null     float64
 8   Humidity (%) avg             96 non-null     float64
 9   Humidity (%) low             96 non-null     float64
 10  Sea Level Press. (hPa) high  96 non-null     float64
 11  Sea Level Press. (hPa) avg   96 non-null     float64
 12  Sea Level Press. (hPa) low   96 non-null     float64
 13  Visibility (km) hi

In [32]:
macro_economic_df['Year'] = macro_economic_df['Year'].astype(str)
macro_economic_df['Month'] = macro_economic_df['Month'].astype(str)
weather_df['Year'] = weather_df['Year'].astype(str)
weather_df['Month'] = weather_df['Month'].astype(str)
events_holidays_df['Year'] = events_holidays_df['Year'].astype(str)
events_holidays_df['Month'] = events_holidays_df['Month'].astype(str)
train_df['Year'] = train_df['Year'].astype(str)
train_df['Month'] = train_df['Month'].astype(str)

In [33]:
print(len(weather_df))

192


In [34]:
print(weather_df.isnull().sum())

Year-Month                     96
Temp high (°C)                 96
Temp avg (°C)                  96
Temp low (°C)                  96
Dew Point high (°C)            96
Dew Point avg (°C)             96
Dew Point low (°C)             96
Humidity (%) high              96
Humidity (%) avg               96
Humidity (%) low               96
Sea Level Press. (hPa) high    96
Sea Level Press. (hPa) avg     96
Sea Level Press. (hPa) low     96
Visibility (km) high           96
Visibility (km) avg            96
Visibility (km) low            96
Wind (km/h) low                96
Wind (km/h) avg                96
Wind (km/h) high               96
Precip. (mm) sum               96
Year                            0
Month                           0
WeatherEvent                   96
dtype: int64


In [35]:
merged_df1 = pd.merge(macro_economic_df, events_holidays_df, on=['Year', 'Month'], how='outer')
merged_df2 = pd.merge(merged_df1, weather_df, on=['Year', 'Month'], how='outer')
merged_df = pd.merge(merged_df2, train_df, on=['Year', 'Month'], how='outer')

In [36]:
merged_df.fillna(method='bfill', inplace=True)

  merged_df.fillna(method='bfill', inplace=True)


In [37]:
merged_df.fillna(method='ffill', inplace=True)

  merged_df.fillna(method='ffill', inplace=True)


In [38]:
print(merged_df.isnull().sum())

Monthly Nominal GDP Index (inMillion$)                               0
Monthly Real GDP Index (inMillion$)                                  0
CPI                                                                  0
unemployment rate                                                    0
CommercialBankInterestRateonCreditCardPlans                          0
Finance Rate on Personal Loans at Commercial Banks, 24 Month Loan    0
Earnings or wages  in dollars per hour                               0
AdvertisingExpenses (in Thousand Dollars)                            0
Cotton Monthly Price - US cents per Pound(lbs)                       0
Change(in%)                                                          0
Average upland planted(million acres)                                0
Average upland harvested(million acres)                              0
yieldperharvested acre                                               0
Production (in  480-lb netweright in million bales)                  0
Mill u

In [39]:
merged_df.replace("nan", float("NaN"), inplace=True)
merged_df.dropna(inplace=True)

In [40]:
placeholder_in_df = merged_df.eq("nan") 
print(placeholder_in_df)


     Monthly Nominal GDP Index (inMillion$)  \
0                                     False   
1                                     False   
2                                     False   
3                                     False   
4                                     False   
..                                      ...   
302                                   False   
303                                   False   
304                                   False   
305                                   False   
306                                   False   

     Monthly Real GDP Index (inMillion$)    CPI  unemployment rate  \
0                                  False  False              False   
1                                  False  False              False   
2                                  False  False              False   
3                                  False  False              False   
4                                  False  False              False   
..             

In [41]:
merged_df.to_csv('../data/clean_data/merged_df.csv')

### Prepare Training Data

In [42]:
merged_df['Year-Month'] = merged_df['Year'].astype(str) + ' - ' + merged_df['Month']
merged_df.drop(columns=['Year'], inplace=True)
merged_df.set_index('Year-Month', inplace=True)

In [43]:
le = LabelEncoder()
columns_to_encode = ['WeatherEvent', 'Month']
for column in columns_to_encode:
    merged_df[column] = le.fit_transform(merged_df[column])

In [44]:
print(merged_df.columns)

Index(['Monthly Nominal GDP Index (inMillion$)',
       'Monthly Real GDP Index (inMillion$)', 'CPI', 'unemployment rate',
       'CommercialBankInterestRateonCreditCardPlans',
       'Finance Rate on Personal Loans at Commercial Banks, 24 Month Loan',
       'Earnings or wages  in dollars per hour',
       'AdvertisingExpenses (in Thousand Dollars)',
       'Cotton Monthly Price - US cents per Pound(lbs)', 'Change(in%)',
       'Average upland planted(million acres)',
       'Average upland harvested(million acres)', 'yieldperharvested acre',
       'Production (in  480-lb netweright in million bales)',
       'Mill use  (in  480-lb netweright in million bales)', 'Exports',
       'Month', 'HolidayCount', 'Temp high (°C)', 'Temp avg (°C)',
       'Temp low (°C)', 'Dew Point high (°C)', 'Dew Point avg (°C)',
       'Dew Point low (°C)', 'Humidity (%) high', 'Humidity (%) avg',
       'Humidity (%) low', 'Sea Level Press. (hPa) high',
       'Sea Level Press. (hPa) avg', 'Sea Level Pres

In [45]:
scaler = MinMaxScaler()
columns_to_scale = ['Monthly Nominal GDP Index (inMillion$)', 'Monthly Real GDP Index (inMillion$)', 'CPI', 'unemployment rate', 'CommercialBankInterestRateonCreditCardPlans', 'Finance Rate on Personal Loans at Commercial Banks, 24 Month Loan', 'Earnings or wages  in dollars per hour',     'AdvertisingExpenses (in Thousand Dollars)', 'Cotton Monthly Price - US cents per Pound(lbs)', 'Change(in%)', 'Average upland planted(million acres)', 'Average upland harvested(million acres)', 'yieldperharvested acre', 'Production (in  480-lb netweright in million bales)', 'Mill use  (in  480-lb netweright in million bales)', 'Exports', 'HolidayCount', 'Temp high (°C)', 'Temp avg (°C)', 'Temp low (°C)', 'Month',
       'Dew Point high (°C)', 'Dew Point avg (°C)', 'Dew Point low (°C)', 'Humidity (%) high', 'Humidity (%) avg', 'Humidity (%) low', 'Sea Level Press. (hPa) high', 'Sea Level Press. (hPa) avg', 'Sea Level Press. (hPa) low', 'Visibility (km) high',
       'Visibility (km) avg', 'Visibility (km) low', 'Wind (km/h) low', 'Wind (km/h) avg', 'Wind (km/h) high', 'Precip. (mm) sum',
       'WeatherEvent']
merged_df[columns_to_scale] = scaler.fit_transform(merged_df[columns_to_scale])

In [46]:
women_clothing = merged_df[merged_df['ProductCategory'] == 'WomenClothing']
men_clothing = merged_df[merged_df['ProductCategory'] == 'MenClothing']
other_clothing = merged_df[merged_df['ProductCategory'] == 'OtherClothing']
women_clothing.drop(columns=['ProductCategory'], inplace=True)
men_clothing.drop(columns=['ProductCategory'], inplace=True)
other_clothing.drop(columns=['ProductCategory'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  women_clothing.drop(columns=['ProductCategory'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  men_clothing.drop(columns=['ProductCategory'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  other_clothing.drop(columns=['ProductCategory'], inplace=True)


In [47]:
women_clothing.to_csv('../data/clean_data/women_clothing.csv')
men_clothing.to_csv('../data/clean_data/men_clothing.csv')
other_clothing.to_csv('../data/clean_data/other_clothing.csv')

In [48]:
target_column = 'Sales(In ThousandDollars)'
feature_columns = merged_df.columns.drop(['Sales(In ThousandDollars)', 'ProductCategory'])

In [49]:
women_clothing_X = women_clothing[feature_columns]
women_clothing_y = women_clothing[target_column]

women_clothing_X_train, women_clothing_X_test, women_clothing_y_train, women_clothing_y_test = train_test_split(women_clothing_X, women_clothing_y, test_size=0.2, random_state=42)

In [50]:
men_clothing_X = men_clothing[feature_columns]
men_clothing_y = men_clothing[target_column]

men_clothing_X_train, men_clothing_X_test, men_clothing_y_train, men_clothing_y_test = train_test_split(men_clothing_X, men_clothing_y, test_size=0.2, random_state=42)

In [51]:
other_clothing_X = other_clothing[feature_columns]
other_clothing_y = other_clothing[target_column]

other_clothing_X_train, other_clothing_X_test, other_clothing_y_train, other_clothing_y_test = train_test_split(other_clothing_X, other_clothing_y, test_size=0.2, random_state=42)

### Model Training

In [52]:
women_clothing_lr_model = LinearRegression()
women_clothing_lr_model.fit(women_clothing_X_train, women_clothing_y_train)

In [53]:
men_clothing_lr_model = LinearRegression()
men_clothing_lr_model.fit(men_clothing_X_train, men_clothing_y_train)

In [54]:
other_clothing_lr_model = LinearRegression()
other_clothing_lr_model.fit(other_clothing_X_train, other_clothing_y_train)

### Generate Predictions and Submission

In [55]:
men_clothing_predictions = men_clothing_lr_model.predict(men_clothing_X_test)
women_clothing_predictions = women_clothing_lr_model.predict(women_clothing_X_test)
other_clothing_predictions = other_clothing_lr_model.predict(other_clothing_X_test)

In [56]:
men_clothing_df = pd.DataFrame({
    'Month': men_clothing_X_test['Month'],
    'ProductCategory': 'MenClothing',
    'Sales(In ThousandDollars)': men_clothing_predictions
})

women_clothing_df = pd.DataFrame({
    'Month': women_clothing_X_test['Month'],
    'ProductCategory': 'WomenClothing',
    'Sales(In ThousandDollars)': women_clothing_predictions
})

other_clothing_df = pd.DataFrame({
    'Month': other_clothing_X_test['Month'],
    'ProductCategory': 'OtherClothing',
    'Sales(In ThousandDollars)': other_clothing_predictions
})

In [57]:
submission = pd.concat([men_clothing_df, women_clothing_df, other_clothing_df])
submission.sort_values(by=['Month', 'ProductCategory'], ascending=[True, False], inplace=True)
submission.reset_index(drop=True, inplace=True)
submission.index = range(1, len(submission) + 1)
submission = submission.rename_axis('Year')

submission.drop(columns=['ProductCategory', 'Month'], inplace=True)

submission.to_csv('../data/submission/submission.csv')

: 