# Linear Regression

## Preparation

In [1]:
# Notebook switches: most code cells below check one of these flags before doing any work.
# Set to `True` to run the linear regression evaluations (the `if run_lr:` cells)
run_lr = True
# Set to `True` to run the Ridge hyperparameter grid search (takes quite some time)
run_cv = False
# Set to `True` to output the submission file to upload to Kaggle
lr_submission = True

### Imports

In [2]:
from itertools import product

import numpy as np
import pandas as pd

from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

### Add data

In [3]:
# `train`: daily sales records (aggregated to monthly counts further down).
# `test`: the rows to predict; its index is used as the submission ID at the end.
train = pd.read_csv('data/technical/sales_train.csv')
test = pd.read_csv('data/technical/test.csv')

### Evaluation metric

In [4]:
def rmse(y_te, y_p):
    """Return the root mean squared error between true and predicted values.

    y_te: ground-truth target values.
    y_p:  predicted values, aligned position-by-position with ``y_te``.
    """
    mse = mean_squared_error(y_te, y_p)
    return np.sqrt(mse)

## 1st attempt: apply LR on the raw dataset

### Sum the daily sales into monthly sales

In [5]:
if run_lr:
    # key columns identifying one (month, shop, item) combination; reused by later cells
    group_on = ['date_block_num', 'shop_id', 'item_id']

    # Aggregate only while the raw daily column is still present, so re-running
    # this cell is a no-op instead of a KeyError on the already-aggregated frame.
    if 'item_cnt_day' in train.columns:
        # total units sold per key; groupby sorts by the keys, which fixes the row order
        monthly_cnt = (
            train.groupby(group_on, as_index=False)['item_cnt_day']
            .sum()
            .rename(columns={'item_cnt_day': 'item_cnt_month'})
        )
        # the remaining columns are taken from the first daily row of each key
        # (de-duplicating before the merge avoids joining onto every daily row)
        first_rows = (
            train.drop(columns=['item_cnt_day', 'date'])
            .drop_duplicates(group_on)
        )
        # both sides hold exactly one row per key -> one row per key in the result;
        # the left join keeps the sorted key order of `monthly_cnt`
        train = monthly_cnt.merge(first_rows, on=group_on, how='left')
        del monthly_cnt, first_rows

### Split the features and label

In [6]:
if run_lr:
    # the monthly count is the label, every other column is a feature
    Y = train['item_cnt_month']
    X = train.drop(columns='item_cnt_month')

In [7]:
if run_lr:
    # hold out 20% of the rows for evaluation (fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, train_size=0.8, random_state=42
    )

    # fit() returns the estimator itself, so construction and fitting can be chained
    LR = LinearRegression().fit(X_train, y_train)
    prediction = LR.predict(X_test)

    print(f"RMSE: {rmse(y_test, prediction)}")

RMSE: 8.105058747327444


## 2nd attempt: clip the data

In [8]:
if run_lr:
    # limit the label to the range [0, 20]
    Y = train['item_cnt_month'].clip(lower=0, upper=20)

In [9]:
if run_lr:
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, train_size=0.8, random_state=42
    )

    LR = LinearRegression().fit(X_train, y_train)
    # predictions are limited to the same [0, 20] range as the label
    prediction = np.clip(LR.predict(X_test), 0, 20)

    print(f"RMSE: {rmse(y_test, prediction)}")

RMSE: 2.56221523899121


## 3rd attempt: add zero sales

In [10]:
if run_lr:
    # For each of the 34 month blocks, enumerate every (shop, item) pair built from
    # the shops and items that occur in that block.
    matrix = []
    for block in range(34):
        in_block = train.loc[train['date_block_num'] == block]
        grid = product(
            [block], in_block['shop_id'].unique(), in_block['item_id'].unique()
        )
        matrix.append(np.array(list(grid)))

    # stack the per-month grids row-wise and attach the known monthly rows
    matrix = pd.DataFrame(np.vstack(matrix), columns=group_on).merge(
        train, how='left', on=group_on
    )

    # pairs without a matching row in `train` get a count of 0; known counts are clipped to [0, 20]
    matrix['item_cnt_month'] = matrix['item_cnt_month'].clip(0, 20).fillna(0)
    matrix = matrix.drop(columns='item_price')

In [11]:
if run_lr:
    # label first, then everything else as features
    Y = matrix['item_cnt_month']
    X = matrix.drop(columns='item_cnt_month')

In [12]:
if run_lr:
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, train_size=0.8, random_state=42
    )

    LR = LinearRegression().fit(X_train, y_train)
    # keep predictions inside the label range [0, 20]
    prediction = np.clip(LR.predict(X_test), 0, 20)

    print(f"RMSE: {rmse(y_test, prediction)}")

RMSE: 1.2278876883086605


## 4th attempt: add feature engineering

In [13]:
if run_lr or run_cv:
    # Feature table used by all following attempts; the cells below read its
    # `date_block_num`, `shop_id` and `item_cnt_month` columns.
    # NOTE(review): presumably produced by a separate feature-engineering step - confirm.
    # NOTE(review): unpickling can execute arbitrary code; only load a trusted file.
    data = pd.read_pickle('data/out/data.pkl')

In [14]:
if run_lr:
    # rows before block 34 are used for fitting and validation
    labelled = data.loc[data['date_block_num'] < 34]
    X = labelled.drop(columns=['item_cnt_month'])
    Y = labelled['item_cnt_month']
    # rows of block 34 are the ones predicted for the submission
    x = data.loc[data['date_block_num'] == 34].drop(columns=['item_cnt_month'])

In [15]:
if run_lr:
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, train_size=0.8, random_state=42
    )
    lf = LinearRegression().fit(X_train, y_train)
    # predictions are limited to [0, 20] before scoring
    prediction = np.clip(lf.predict(X_test), 0, 20)

    print(f"RMSE: {rmse(y_test, prediction)}")

RMSE: 0.7893883917525537


## 5th attempt: apply cross_val_score

In [16]:
if run_lr:
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, train_size=0.72, random_state=42
    )

    lr = LinearRegression()
    # the scorer reports the negated RMSE, so the sign is flipped back for printing
    score = cross_val_score(
        lr, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error'
    )
    print('RMSE %2f' % (-score.mean()))

RMSE 0.829568


## 6th attempt: drop superfluous shops

In [17]:
 if run_lr:
    shops_in_test_data = set()

    for i, row in test.iterrows():
        shops_in_test_data.add(row['shop_id'])

    all_shops = set([*range(0, 60)])
    shops_not_present = all_shops-shops_in_test_data
    shops_not_present

    for missing_shop in shops_not_present:
        index_names = data[ data['shop_id'] == missing_shop ].index
        data.drop(index_names, inplace=True)

In [18]:
if run_lr:
    # rebuild features/label from the shop-filtered table
    labelled = data.loc[data['date_block_num'] < 34]
    X = labelled.drop(columns=['item_cnt_month'])
    Y = labelled['item_cnt_month']
    # block 34 holds the rows predicted for the submission
    x = data.loc[data['date_block_num'] == 34].drop(columns=['item_cnt_month'])

In [19]:
if run_lr:
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, train_size=0.72, random_state=42
    )

    lr = LinearRegression()
    # scorer yields negated RMSE; negate the mean to print a positive error
    score = cross_val_score(
        lr, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error'
    )
    print('RMSE %2f' % (-score.mean()))

RMSE 0.811479


## 7th attempt: try out LASSO regression

In [20]:
if run_lr:
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, train_size=0.72, random_state=42
    )

    # Lasso with its default settings
    model = Lasso()
    score = cross_val_score(
        model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error'
    )
    print('RMSE %2f' % (-score.mean()))

RMSE 0.985678


## 8th attempt: try out Ridge regression

In [21]:
if run_lr:
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, train_size=0.72, random_state=42
    )

    # Ridge with its default settings
    model = Ridge()
    score = cross_val_score(
        model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error'
    )
    print('RMSE %2f' % (-score.mean()))

RMSE 0.811479


## 9th attempt: cross validation on ridge hyperparameters

In [22]:
if run_cv:
    # Build the training rows straight from `data` so the search also works when
    # run_lr is False (X / Y are only created inside `if run_lr:` cells).
    labelled = data[data.date_block_num < 34]
    X_train, X_test, y_train, y_test = train_test_split(
        labelled.drop(['item_cnt_month'], axis=1),
        labelled['item_cnt_month'],
        train_size=0.72,
        random_state=42,
    )
    parameter_candidates = [{'alpha': [0.8, 1, 1.2, 1.5],
                             'solver': ['auto', 'svd', 'sag']
                             }]
    # 'sag' is a stochastic solver: seed the estimator so repeated searches agree
    model = Ridge(random_state=42)

    # score with the same metric and fold count as the cross_val_score cells,
    # instead of the estimator's default score
    cv_model = GridSearchCV(estimator=model,
                            param_grid=parameter_candidates,
                            scoring='neg_root_mean_squared_error',
                            cv=5,
                            n_jobs=3)

    cv_model.fit(X_train, y_train)
    print(cv_model.best_params_)

In [23]:
if run_lr or run_cv:
    # X_train / y_train come from the most recent split in an earlier cell, which
    # only exists when one of these flags is set; unguarded, this cell raised a
    # NameError with both flags off.
    # NOTE(review): alpha=1.5 is presumably the grid-search winner - confirm.
    model = Ridge(alpha=1.5)
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
    print('RMSE %2f' %(-1 * score.mean()))

RMSE 0.811479


## 10th attempt: scaling the data

In [24]:
if run_lr:
    X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.72, random_state=42)

    # Scale inside a pipeline: cross_val_score then fits the scaler on the training
    # part of each fold only. Fitting it on all of X_train beforehand would let the
    # validation folds influence the mean/std used for scaling.
    scaler = StandardScaler()
    lr = make_pipeline(scaler, LinearRegression())
    score = cross_val_score(lr, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
    print('RMSE %2f' %(-1 * score.mean()))

RMSE 0.811479


## Test set prediction

In [25]:
if lr_submission:
    # X, Y (training rows) and x (rows of the month to predict) are only created
    # inside `if run_lr:` cells; fail with a clear message instead of a NameError.
    if not run_lr:
        raise RuntimeError(
            "lr_submission=True requires run_lr=True: X, Y and x are built by the run_lr cells"
        )

    lr = LinearRegression()
    model = lr.fit(X, Y)

    # predictions are limited to the range [0, 20]
    y_test = model.predict(x).clip(0, 20)

    # NOTE(review): assumes the rows of x are in the same order as the rows of test - confirm
    if len(y_test) != len(test):
        raise ValueError(
            f"got {len(y_test)} predictions for {len(test)} test rows"
        )

    submission = pd.DataFrame({
        "ID": test.index,
        "item_cnt_month": y_test
    })
    submission.to_csv('data/submissions/lr_submission.csv', index=False)