In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV 
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, SGDRegressor, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.neural_network import MLPRegressor

In [3]:
train_df = pd.read_csv('train.csv')

In [4]:
train_df.shape

(230130, 6)

In [5]:
train_df.sample(10)

Unnamed: 0,id,date,country,store,product,num_sold
57599,57599,2011-10-02,Singapore,Premium Sticker Mart,Kerneler Dark Mode,830.0
163914,163914,2014-12-27,Finland,Stickers for Less,Kerneler Dark Mode,977.0
106262,106262,2013-03-26,Norway,Discount Stickers,Kaggle Tiers,1099.0
57527,57527,2011-10-02,Finland,Discount Stickers,Kaggle Tiers,732.0
28062,28062,2010-11-08,Norway,Premium Sticker Mart,Kaggle Tiers,2299.0
152185,152185,2014-08-18,Singapore,Premium Sticker Mart,Holographic Goose,237.0
154465,154465,2014-09-13,Finland,Premium Sticker Mart,Holographic Goose,244.0
105459,105459,2013-03-17,Norway,Stickers for Less,Kerneler Dark Mode,2303.0
229948,229948,2016-12-29,Singapore,Premium Sticker Mart,Kerneler,1133.0
206311,206311,2016-04-11,Italy,Discount Stickers,Kaggle,457.0


In [6]:
train_df.drop(columns=['id'], inplace=True)

In [7]:
train_df.isna().sum()

date           0
country        0
store          0
product        0
num_sold    8871
dtype: int64

Missing data is only in the target col, and only 3.8%. So, removing it entirely.

In [8]:
8871/230130

0.038547777343240774

In [9]:
train_df.dropna(inplace=True)

In [10]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 221259 entries, 1 to 230129
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   date      221259 non-null  object 
 1   country   221259 non-null  object 
 2   store     221259 non-null  object 
 3   product   221259 non-null  object 
 4   num_sold  221259 non-null  float64
dtypes: float64(1), object(4)
memory usage: 10.1+ MB


In [11]:
train_df['date'] = pd.to_datetime(train_df['date'])

In [12]:
train_df['year'] = train_df['date'].dt.year
train_df['month'] = train_df['date'].dt.month
train_df['day'] = train_df['date'].dt.day
train_df['month_name'] = train_df['date'].dt.month_name()
train_df['day_of_week'] = train_df['date'].dt.dayofweek
train_df['day_name'] = train_df['date'].dt.day_name()
train_df['is_weekend'] = np.where(train_df['day_name'].isin(['Sunday', 'Saturday']), 1, 0)
train_df['week'] = train_df['date'].dt.isocalendar().week
train_df['quarter'] = train_df['date'].dt.quarter

In [13]:
train_df.head()

Unnamed: 0,date,country,store,product,num_sold,year,month,day,month_name,day_of_week,day_name,is_weekend,week,quarter
1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0,2010,1,1,January,4,Friday,0,53,1
2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0,2010,1,1,January,4,Friday,0,53,1
3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0,2010,1,1,January,4,Friday,0,53,1
4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0,2010,1,1,January,4,Friday,0,53,1
5,2010-01-01,Canada,Stickers for Less,Holographic Goose,300.0,2010,1,1,January,4,Friday,0,53,1


In [14]:
train_df.drop(columns=['date'], inplace=True)

In [15]:
num_cols = train_df.select_dtypes(include=['int', 'float']).columns

In [16]:
cat_cols = train_df.select_dtypes(include=['object']).columns

In [17]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 221259 entries, 1 to 230129
Data columns (total 13 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   country      221259 non-null  object 
 1   store        221259 non-null  object 
 2   product      221259 non-null  object 
 3   num_sold     221259 non-null  float64
 4   year         221259 non-null  int32  
 5   month        221259 non-null  int32  
 6   day          221259 non-null  int32  
 7   month_name   221259 non-null  object 
 8   day_of_week  221259 non-null  int32  
 9   day_name     221259 non-null  object 
 10  is_weekend   221259 non-null  int32  
 11  week         221259 non-null  UInt32 
 12  quarter      221259 non-null  int32  
dtypes: UInt32(1), float64(1), int32(6), object(5)
memory usage: 17.9+ MB


In [18]:
train_df['week'] = train_df['week'].astype('int32')

In [19]:
def show_hist(col):
    plt.figure()
    sns.histplot(train_df, x=train_df[col], kde=True)
    plt.title(f'Histogram of {col}')
    plt.plot()

In [20]:
# for col in num_cols:
#     show_hist(col)

In [21]:
X = train_df.drop(columns=['num_sold'])
y = train_df['num_sold']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [23]:
X_train.head()

Unnamed: 0,country,store,product,year,month,day,month_name,day_of_week,day_name,is_weekend,week,quarter
135972,Norway,Premium Sticker Mart,Kaggle Tiers,2014,2,19,February,2,Wednesday,0,8,1
120684,Singapore,Stickers for Less,Kerneler Dark Mode,2013,9,2,September,0,Monday,0,36,3
198622,Singapore,Stickers for Less,Kaggle Tiers,2016,1,16,January,5,Saturday,1,2,1
193430,Finland,Stickers for Less,Holographic Goose,2015,11,20,November,4,Friday,0,47,4
91907,Finland,Discount Stickers,Kaggle Tiers,2012,10,18,October,3,Thursday,0,42,4


In [24]:
num_cols_index = [3, 4, 5, 7, 9, 10, 11]
cat_cols_index = [0, 1, 2, 6, 8]

In [25]:
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'SGDRegressor': SGDRegressor(),
    'DecisionTree': DecisionTreeRegressor(),
    'KNN': KNeighborsRegressor(),
#     'SupportVectorRegressor': SVR(),
#     'LinearSVR': LinearSVR(),
    'AdaBoost': AdaBoostRegressor(),
    'RandomForest': RandomForestRegressor(),
    'GradientBoosting': GradientBoostingRegressor(),
    'MultiLayerPerceptronRegressor': MLPRegressor()
}

In [26]:
num_pre = Pipeline(
    [
        ('scaling', MinMaxScaler())
    ]
)

cat_pre = Pipeline(
    [
        ('encoding', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ]
)

In [27]:
preprocessor = ColumnTransformer(
    [
        ('num', num_pre, num_cols_index),
        ('cat', cat_pre, cat_cols_index)
    ]
)

In [28]:
def model_building(X_train, y_train, X_test, y_test):
    for name, model in models.items():
        pipe = Pipeline(
            [
                ('preprocessor', preprocessor),
                ('model', model)
            ]
        )
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        print(f'{name} prediction: {mean_absolute_percentage_error(y_test, y_pred)}')

In [29]:
model_building(X_train, y_train, X_test, y_test)

LinearRegression prediction: 4.499125941581784
Lasso prediction: 4.362887734917057
Ridge prediction: 4.4984481945492885
SGDRegressor prediction: 4.467197347184031
DecisionTree prediction: 0.06640756502070069
KNN prediction: 0.08927837911156841
AdaBoost prediction: 4.66698007580507
RandomForest prediction: 0.050967926729802104
GradientBoosting prediction: 1.3663770585760535




MultiLayerPerceptronRegressor prediction: 0.7232030556963742


From above, we can see that Decision Tree Regressor, KNeighbors Regressor and Random Forest are the best models out of all.

**I have not trained Support Vector Machines, because they take a lot of time, and at this moment I didnt have that, so. Anyone seeing this should train those as well to see how they compare to others.**

In [31]:
test = pd.read_csv('test.csv')

In [32]:
test.head()

Unnamed: 0,id,date,country,store,product
0,230130,2017-01-01,Canada,Discount Stickers,Holographic Goose
1,230131,2017-01-01,Canada,Discount Stickers,Kaggle
2,230132,2017-01-01,Canada,Discount Stickers,Kaggle Tiers
3,230133,2017-01-01,Canada,Discount Stickers,Kerneler
4,230134,2017-01-01,Canada,Discount Stickers,Kerneler Dark Mode


In [34]:
ids = test['id']
ids

0        230130
1        230131
2        230132
3        230133
4        230134
          ...  
98545    328675
98546    328676
98547    328677
98548    328678
98549    328679
Name: id, Length: 98550, dtype: int64

In [38]:
test['date'] = pd.to_datetime(test['date'])

test['year'] = test['date'].dt.year
test['month'] = test['date'].dt.month
test['day'] = test['date'].dt.day
test['month_name'] = test['date'].dt.month_name()
test['day_of_week'] = test['date'].dt.dayofweek
test['day_name'] = test['date'].dt.day_name()
test['is_weekend'] = np.where(test['day_name'].isin(['Sunday', 'Saturday']), 1, 0)
test['week'] = test['date'].dt.isocalendar().week
test['quarter'] = test['date'].dt.quarter

In [41]:
test.drop(columns=['id', 'date'], inplace=True)

In [30]:
decision_tree_pipe = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', DecisionTreeRegressor())
    ]
)

decision_tree_pipe.fit(X_train, y_train)

y_pred_deci_tree = decision_tree_pipe.predict(X_test)
mean_absolute_percentage_error(y_test, y_pred_deci_tree)

0.06634254878442868

In [44]:
test_pred_deci_tree = decision_tree_pipe.predict(test)

result_deci_tree = pd.DataFrame({'id': ids, 'num_sold': test_pred_deci_tree})
result_deci_tree.to_csv('Decision_Tree.csv', index=False)

In [45]:
KNeighbors_pipe = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', KNeighborsRegressor())
    ]
)

KNeighbors_pipe.fit(X_train, y_train)

y_pred_KNeighbors = KNeighbors_pipe.predict(X_test)
mean_absolute_percentage_error(y_test, y_pred_KNeighbors)

0.08927837911156841

In [46]:
test_pred_KNeighbors = KNeighbors_pipe.predict(test)

result_KNeighbors = pd.DataFrame({'id': ids, 'num_sold': test_pred_KNeighbors})
result_KNeighbors.to_csv('KNeighbors_Regressor.csv', index=False)

In [47]:
rf_pipe = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor())
    ]
)

rf_pipe.fit(X_train, y_train)

y_pred_rf = rf_pipe.predict(X_test)
mean_absolute_percentage_error(y_test, y_pred_rf)

0.05090205416796682

In [48]:
test_pred_rf = rf_pipe.predict(test)

result_rf = pd.DataFrame({'id': ids, 'num_sold': test_pred_rf})
result_rf.to_csv('Random_Forest_Simple.csv', index=False)

***Training Random Forest on the whole dataset!!!***

In [49]:
rf_pipe.fit(X, y)

In [51]:
test_pred_rf_whole = rf_pipe.predict(test)

result_rf_whole = pd.DataFrame({'id': ids, 'num_sold': test_pred_rf})
result_rf_whole.to_csv('Random_Forest_On_whole_training_set.csv', index=False)