## Soft Drink Sales Prediction

Given *data about soft drinks*, let's try to predict the **quantity sold** of a given drink.

We will use a variety of regression models to make our predictions. 

### Importing Libraries

In [22]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [23]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [24]:
train_df

Unnamed: 0,id,date,city,lat,long,pop,shop,brand,container,capacity,price,quantity
0,0,31/01/12,Athens,37.97945,23.71622,672130,shop_1,kinder-cola,glass,500ml,0.96,13280
1,1,31/01/12,Athens,37.97945,23.71622,672130,shop_1,kinder-cola,plastic,1.5lt,2.86,6727
2,2,31/01/12,Athens,37.97945,23.71622,672130,shop_1,kinder-cola,can,330ml,0.87,9848
3,3,31/01/12,Athens,37.97945,23.71622,672130,shop_1,adult-cola,glass,500ml,1.00,20050
4,4,31/01/12,Athens,37.97945,23.71622,672130,shop_1,adult-cola,can,330ml,0.39,25696
...,...,...,...,...,...,...,...,...,...,...,...,...
6475,6475,31/12/17,Athens,37.96245,23.68708,665871,shop_3,orange-power,plastic,1.5lt,1.02,33201
6476,6476,31/12/17,Larisa,39.63689,22.41761,144302,shop_5,orange-power,can,330ml,0.47,46971
6477,6477,31/12/17,Patra,38.24444,21.73444,168501,shop_6,adult-cola,glass,500ml,1.02,47708
6478,6478,31/12/17,Thessaloniki,40.64361,22.93086,353001,shop_4,gazoza,plastic,1.5lt,1.34,27115


In [25]:
test_df

Unnamed: 0,id,date,city,lat,long,pop,shop,brand,container,capacity,price,quantity
0,6480,31/01/18,Athens,37.97945,23.71622,664046,shop_1,kinder-cola,plastic,1.5lt,3.10,7056
1,6481,31/01/18,Athens,37.97945,23.71622,664046,shop_1,kinder-cola,can,330ml,0.85,12490
2,6482,31/01/18,Athens,37.97945,23.71622,664046,shop_1,adult-cola,glass,500ml,0.83,26640
3,6483,31/01/18,Athens,37.97945,23.71622,664046,shop_1,orange-power,glass,500ml,0.54,41892
4,6484,31/01/18,Athens,37.97945,23.71622,664046,shop_1,orange-power,plastic,1.5lt,0.83,22923
...,...,...,...,...,...,...,...,...,...,...,...,...
1075,7555,31/12/18,Athens,37.97945,23.71622,664046,shop_1,kinder-cola,plastic,1.5lt,2.52,13760
1076,7556,31/12/18,Athens,37.97945,23.71622,664046,shop_1,orange-power,plastic,1.5lt,2.18,16309
1077,7557,31/12/18,Patra,38.24444,21.73444,168034,shop_6,kinder-cola,can,330ml,0.85,24378
1078,7558,31/12/18,Thessaloniki,40.64361,22.93086,354290,shop_4,adult-cola,plastic,1.5lt,2.17,20691


In [26]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6480 entries, 0 to 6479
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         6480 non-null   int64  
 1   date       6480 non-null   object 
 2   city       6480 non-null   object 
 3   lat        6429 non-null   float64
 4   long       6434 non-null   float64
 5   pop        6480 non-null   int64  
 6   shop       6480 non-null   object 
 7   brand      6480 non-null   object 
 8   container  6464 non-null   object 
 9   capacity   6465 non-null   object 
 10  price      6480 non-null   float64
 11  quantity   6480 non-null   int64  
dtypes: float64(3), int64(3), object(6)
memory usage: 607.6+ KB


### Preprocessing

In [27]:
X_train = train_df.copy()
X_test = test_df.copy()

In [28]:
# Drop id column
X_train = X_train.drop('id', axis=1)
X_test = X_test.drop('id', axis=1)

In [34]:
X_train.isna().all(axis=1).sum()

0

In [35]:
X_test.isna().all(axis=1).sum()

0

In [36]:
# Missing values
X_train.isna().sum()

date          0
city          0
lat          51
long         46
pop           0
shop          0
brand         0
container    16
capacity     15
price         0
quantity      0
dtype: int64

In [37]:
X_test.isna().sum()

date          0
city          0
lat           8
long         13
pop           0
shop          0
brand         0
container     3
capacity      4
price         0
quantity      0
dtype: int64

In [38]:
{column: X_train[column].unique() for column in X_train.select_dtypes('object').columns}

{'date': array(['31/01/12', '29/02/12', '31/03/12', '30/04/12', '31/05/12',
        '30/06/12', '31/07/12', '31/08/12', '30/09/12', '31/10/12',
        '30/11/12', '31/12/12', '31/01/13', '28/02/13', '31/03/13',
        '30/04/13', '31/05/13', '30/06/13', '31/07/13', '31/08/13',
        '30/09/13', '31/10/13', '30/11/13', '31/12/13', '31/01/14',
        '28/02/14', '31/03/14', '30/04/14', '31/05/14', '30/06/14',
        '31/07/14', '31/08/14', '30/09/14', '31/10/14', '30/11/14',
        '31/12/14', '31/01/15', '28/02/15', '31/03/15', '30/04/15',
        '31/05/15', '30/06/15', '31/07/15', '31/08/15', '30/09/15',
        '31/10/15', '30/11/15', '31/12/15', '31/01/16', '29/02/16',
        '31/03/16', '30/04/16', '31/05/16', '30/06/16', '31/07/16',
        '31/08/16', '30/09/16', '31/10/16', '30/11/16', '31/12/16',
        '31/01/17', '28/02/17', '31/03/17', '30/04/17', '31/05/17',
        '30/06/17', '31/07/17', '31/08/17', '30/09/17', '31/10/17',
        '30/11/17', '31/12/17'], dtype=o

In [39]:
# Fill numeric missing values with mean
for column in ['lat', 'long']:
    X_train[column] = X_train[column].fillna(X_train[column].mean())
    X_test[column] = X_test[column].fillna(X_test[column].mean())

In [40]:
X_train.isna().sum()

date          0
city          0
lat           0
long          0
pop           0
shop          0
brand         0
container    16
capacity     15
price         0
quantity      0
dtype: int64

In [41]:
X_test.isna().sum()

date         0
city         0
lat          0
long         0
pop          0
shop         0
brand        0
container    3
capacity     4
price        0
quantity     0
dtype: int64

In [42]:
# Fill ordinal missing values with mode
X_train['capacity'] = X_train['capacity'].fillna(X_train['capacity'].mode()[0])

In [43]:
X_test['capacity'] = X_test['capacity'].fillna(X_test['capacity'].mode()[0])

In [44]:
X_test.isna().sum()

date         0
city         0
lat          0
long         0
pop          0
shop         0
brand        0
container    3
capacity     0
price        0
quantity     0
dtype: int64

In [45]:
ordering_capacity = ['330ml', '500ml', '1.5lt']

In [46]:
# Ordinal encode capacity column
X_train['capacity'] = X_train['capacity'].apply(lambda x: ordering_capacity.index(x))

In [48]:
X_test['capacity'] = X_test['capacity'].apply(lambda x: ordering_capacity.index(x))

In [49]:
X_train

Unnamed: 0,date,city,lat,long,pop,shop,brand,container,capacity,price,quantity
0,31/01/12,Athens,37.97945,23.71622,672130,shop_1,kinder-cola,glass,1,0.96,13280
1,31/01/12,Athens,37.97945,23.71622,672130,shop_1,kinder-cola,plastic,2,2.86,6727
2,31/01/12,Athens,37.97945,23.71622,672130,shop_1,kinder-cola,can,0,0.87,9848
3,31/01/12,Athens,37.97945,23.71622,672130,shop_1,adult-cola,glass,1,1.00,20050
4,31/01/12,Athens,37.97945,23.71622,672130,shop_1,adult-cola,can,0,0.39,25696
...,...,...,...,...,...,...,...,...,...,...,...
6475,31/12/17,Athens,37.96245,23.68708,665871,shop_3,orange-power,plastic,2,1.02,33201
6476,31/12/17,Larisa,39.63689,22.41761,144302,shop_5,orange-power,can,0,0.47,46971
6477,31/12/17,Patra,38.24444,21.73444,168501,shop_6,adult-cola,glass,1,1.02,47708
6478,31/12/17,Thessaloniki,40.64361,22.93086,353001,shop_4,gazoza,plastic,2,1.34,27115


In [51]:
# One hot encoding container column
def onehot_encode(df, column, prefix):
    df = df.copy()
    dummies = pd.get_dummies(df[column], prefix=prefix)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)
    return df

In [52]:
categorical_columns = ['city', 'shop', 'brand', 'container']

for column in categorical_columns:
    X_train = onehot_encode(X_train, column, column)
    X_test = onehot_encode(X_test, column, column)

In [53]:
X_train

Unnamed: 0,date,lat,long,pop,capacity,price,quantity,city_Athens,city_Irakleion,city_Larisa,...,shop_shop_5,shop_shop_6,brand_adult-cola,brand_gazoza,brand_kinder-cola,brand_lemon-boost,brand_orange-power,container_can,container_glass,container_plastic
0,31/01/12,37.97945,23.71622,672130,1,0.96,13280,True,False,False,...,False,False,False,False,True,False,False,False,True,False
1,31/01/12,37.97945,23.71622,672130,2,2.86,6727,True,False,False,...,False,False,False,False,True,False,False,False,False,True
2,31/01/12,37.97945,23.71622,672130,0,0.87,9848,True,False,False,...,False,False,False,False,True,False,False,True,False,False
3,31/01/12,37.97945,23.71622,672130,1,1.00,20050,True,False,False,...,False,False,True,False,False,False,False,False,True,False
4,31/01/12,37.97945,23.71622,672130,0,0.39,25696,True,False,False,...,False,False,True,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6475,31/12/17,37.96245,23.68708,665871,2,1.02,33201,True,False,False,...,False,False,False,False,False,False,True,False,False,True
6476,31/12/17,39.63689,22.41761,144302,0,0.47,46971,False,False,True,...,True,False,False,False,False,False,True,True,False,False
6477,31/12/17,38.24444,21.73444,168501,1,1.02,47708,False,False,False,...,False,True,True,False,False,False,False,False,True,False
6478,31/12/17,40.64361,22.93086,353001,2,1.34,27115,False,False,False,...,False,False,False,True,False,False,False,False,False,True


In [58]:
# Extract Date features
def encode_dates(df, column):
    df = df.copy()
    df[column] = pd.to_datetime(df[column])
    df[column + '_year'] = df[column].apply(lambda x: x.year)
    df[column + '_month'] = df[column].apply(lambda x: x.month)
    df[column + '_day'] = df[column].apply(lambda x: x.day)
    df = df.drop(column, axis=1)
    return df

In [59]:
X_train = encode_dates(X_train, column='date')
X_test = encode_dates(X_test, column='date')

In [60]:
X_train

Unnamed: 0,lat,long,pop,capacity,price,quantity,city_Athens,city_Irakleion,city_Larisa,city_Patra,...,brand_gazoza,brand_kinder-cola,brand_lemon-boost,brand_orange-power,container_can,container_glass,container_plastic,date_year,date_month,date_day
0,37.97945,23.71622,672130,1,0.96,13280,True,False,False,False,...,False,True,False,False,False,True,False,2012,1,31
1,37.97945,23.71622,672130,2,2.86,6727,True,False,False,False,...,False,True,False,False,False,False,True,2012,1,31
2,37.97945,23.71622,672130,0,0.87,9848,True,False,False,False,...,False,True,False,False,True,False,False,2012,1,31
3,37.97945,23.71622,672130,1,1.00,20050,True,False,False,False,...,False,False,False,False,False,True,False,2012,1,31
4,37.97945,23.71622,672130,0,0.39,25696,True,False,False,False,...,False,False,False,False,True,False,False,2012,1,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6475,37.96245,23.68708,665871,2,1.02,33201,True,False,False,False,...,False,False,False,True,False,False,True,2017,12,31
6476,39.63689,22.41761,144302,0,0.47,46971,False,False,True,False,...,False,False,False,True,True,False,False,2017,12,31
6477,38.24444,21.73444,168501,1,1.02,47708,False,False,False,True,...,False,False,False,False,False,True,False,2017,12,31
6478,40.64361,22.93086,353001,2,1.34,27115,False,False,False,False,...,True,False,False,False,False,False,True,2017,12,31


In [62]:
# Split df into X and y
y_train = X_train['quantity']
X_train = X_train.drop('quantity', axis=1)

In [63]:
y_test = X_test['quantity']
X_test = X_test.drop('quantity', axis=1)

In [65]:
# Scale X_train
scaler = StandardScaler()
scaler.fit(X_train)

X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [66]:
X_train

Unnamed: 0,lat,long,pop,capacity,price,city_Athens,city_Irakleion,city_Larisa,city_Patra,city_Thessaloniki,...,brand_gazoza,brand_kinder-cola,brand_lemon-boost,brand_orange-power,container_can,container_glass,container_plastic,date_year,date_month,date_day
0,-0.195837,0.411791,1.361571,-0.003021,-0.281130,1.414214,-0.447214,-0.447214,-0.447214,-0.447214,...,-0.5,2.0,-0.5,-0.5,-0.705389,1.417165,-0.706370,-1.46385,-1.593255,0.696733
1,-0.195837,0.411791,1.361571,1.220597,2.051577,1.414214,-0.447214,-0.447214,-0.447214,-0.447214,...,-0.5,2.0,-0.5,-0.5,-0.705389,-0.705634,1.415688,-1.46385,-1.593255,0.696733
2,-0.195837,0.411791,1.361571,-1.226639,-0.391627,1.414214,-0.447214,-0.447214,-0.447214,-0.447214,...,-0.5,2.0,-0.5,-0.5,1.417658,-0.705634,-0.706370,-1.46385,-1.593255,0.696733
3,-0.195837,0.411791,1.361571,-0.003021,-0.232020,1.414214,-0.447214,-0.447214,-0.447214,-0.447214,...,-0.5,-0.5,-0.5,-0.5,-0.705389,1.417165,-0.706370,-1.46385,-1.593255,0.696733
4,-0.195837,0.411791,1.361571,-1.226639,-0.980942,1.414214,-0.447214,-0.447214,-0.447214,-0.447214,...,-0.5,-0.5,-0.5,-0.5,1.417658,-0.705634,-0.706370,-1.46385,-1.593255,0.696733
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6475,-0.206183,0.384885,1.334687,1.220597,-0.207465,1.414214,-0.447214,-0.447214,-0.447214,-0.447214,...,-0.5,-0.5,-0.5,2.0,-0.705389,-0.705634,1.415688,1.46385,1.593255,0.696733
6476,0.812867,-0.787282,-0.905625,-1.226639,-0.882723,-0.707107,-0.447214,2.236068,-0.447214,-0.447214,...,-0.5,-0.5,-0.5,2.0,1.417658,-0.705634,-0.706370,1.46385,1.593255,0.696733
6477,-0.034567,-1.418088,-0.801683,-0.003021,-0.207465,-0.707107,-0.447214,-0.447214,2.236068,-0.447214,...,-0.5,-0.5,-0.5,-0.5,-0.705389,1.417165,-0.706370,1.46385,1.593255,0.696733
6478,1.425548,-0.313372,-0.009194,1.220597,0.185412,-0.707107,-0.447214,-0.447214,-0.447214,2.236068,...,2.0,-0.5,-0.5,-0.5,-0.705389,-0.705634,1.415688,1.46385,1.593255,0.696733


### Training

In [67]:
models = {
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                     Linear Regression": LinearRegression(),
    "                 Ridge (L2) Regression": Ridge(),
    "Support Vector Machine (Linear Kernel)": LinearSVR(),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    "                         Decision Tree": DecisionTreeRegressor(),
    "                        Neural Network": MLPRegressor(),
    "                         Random Forest": RandomForestRegressor(),
    "                     Gradient Boosting": GradientBoostingRegressor()
}

In [68]:
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

                   K-Nearest Neighbors trained.
                     Linear Regression trained.
                 Ridge (L2) Regression trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                         Decision Tree trained.




                        Neural Network trained.
                         Random Forest trained.
                     Gradient Boosting trained.


### Results

In [70]:
for name, model in models.items():
    print(name + " R^2 Score: {:.5f}".format(model.score(X_test, y_test)))

                   K-Nearest Neighbors R^2 Score: 0.68272
                     Linear Regression R^2 Score: 0.57229
                 Ridge (L2) Regression R^2 Score: 0.57243
Support Vector Machine (Linear Kernel) R^2 Score: -1.98558
   Support Vector Machine (RBF Kernel) R^2 Score: -0.05487
                         Decision Tree R^2 Score: 0.82255
                        Neural Network R^2 Score: -0.19149
                         Random Forest R^2 Score: 0.92162
                     Gradient Boosting R^2 Score: 0.88483
