# statistical-machine-learning-models

In [74]:
import numpy as np 
import pandas as pd

from sklearn.model_selection import train_test_split,  KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.preprocessing import MinMaxScaler

from copy import copy

### Structure of the prediction task:
X 
    - ['date', 'SKU', 'Category', 'geoCluster', 'Group', 'price']

Y
    - ['qty']


In [13]:
# Load the processed sales table; the 'date' column is parsed into datetimes on read.
sales = pd.read_csv(
    filepath_or_buffer="../data/processed/nts_v1.csv",
    parse_dates=["date"],
)

#### Convert the date column into numeric features that ML models can process

In [18]:
# Derive numeric calendar features from the timestamp column using the vectorised
# `.dt` accessor (no per-row Python lambda), then drop the raw timestamp.
dates = sales["date"]
iso_week = dates.dt.isocalendar().week  # ISO week number, returned as nullable UInt32

sales["year"] = dates.dt.year
sales["month"] = dates.dt.month
# Cast away the nullable dtype so the column converts cleanly to a NumPy array.
# Fall back to float (NaN) only when some dates are missing, mirroring what the
# other `.dt` fields do for NaT.
sales["week"] = iso_week.astype("float64" if iso_week.isna().any() else "int64")
sales["day_of_week"] = dates.dt.dayofweek  # Monday=0 ... Sunday=6
sales = sales.drop(columns="date")

In [78]:
sales.head()

Unnamed: 0,SKU,Category,geoCluster,Group,price,qty,year,month,week,day_of_week
0,534443,"Water, import, sparkling",2065,Mineral water,40.59,1.0,2020,1,5,0
1,233272,"Water, import, sparkling",2065,Mineral water,32.79,2.0,2020,2,9,2
2,233272,"Water, import, sparkling",2065,Mineral water,32.79,2.0,2020,2,9,3
3,233272,"Water, import, sparkling",2065,Mineral water,36.89,2.0,2020,3,10,1
4,787133,Avocado,2016,Tropical fruits,79.59,4.0,2020,4,14,2


In [27]:
# Target is the quantity sold; every other column is a candidate feature.
y = sales['qty']
X = sales.drop(columns='qty')

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.20,
)

In [58]:
def minimum_preprocessing(X, y):
    """Reduce a feature frame to complete, non-categorical rows and align the target.

    Columns with ``object`` or ``category`` dtype are dropped, then every row that
    still contains a missing value is removed. The target is restricted to the
    surviving row labels so both outputs stay aligned. The inputs are not modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table.
    y : pandas.Series
        Target values, indexed by the same labels as ``X``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The filtered features and the matching target values.

    Raises
    ------
    KeyError
        If a surviving row label of ``X`` is not present in ``y``'s index.
    """
    print('Original shape:{}'.format(X.shape))
    # Both plain string columns (object) and pandas categoricals are non-numeric
    # and cannot be fed to a linear model as they are.
    categorical_columns = X.select_dtypes(include=['object', 'category']).columns.values
    # We kill categorical columns
    X = X.drop(columns=categorical_columns)
    print('Droped: {}'.format(categorical_columns))
    # We remove missing values
    X = X.dropna()
    # Explicit label-based lookup keeps y aligned with the surviving rows of X.
    y = y.loc[X.index]
    print('New shape:{}'.format(X.shape))
    return X, y

# Run the same preprocessing on each partition: train first, then test.
(X_train, y_train), (X_test, y_test) = (
    minimum_preprocessing(X_train, y_train),
    minimum_preprocessing(X_test, y_test),
)

Original shape:(622168, 9)
Droped: ['Category' 'Group']
New shape:(622168, 7)
Original shape:(155543, 9)
Droped: ['Category' 'Group']
New shape:(155543, 7)


## Basic Prediction

In [77]:
# Fit an ordinary least-squares baseline on the whole training split.
# `normalize` is intentionally not passed: False was its default, and the argument
# was deprecated in scikit-learn 1.0 and removed in 1.2, so passing it either
# emits a warning or raises a TypeError depending on the installed version.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions on the training data itself, so the error reported below is an
# in-sample (training) error, not an estimate of generalisation performance.
y_pred = lr.predict(X_train)

# Fitted parameters of the model.
weights = lr.coef_
intercept = lr.intercept_
print('Coefficients: \n', weights[:10])
print('Intercept: \n', intercept)

mean_square_error_sk = mean_squared_error(y_train, y_pred)
print( f"Train Mean Squared Error (MSE): {mean_square_error_sk}" )

# K-fold cross-validation of the linear baseline on the training split.
# The fold count is defined once so the splitter and the progress message agree.
n_splits = 5
kf = KFold(n_splits=n_splits)

# Convert to arrays once, outside the loop, instead of on every iteration.
X_train_values = X_train.values
y_train_values = y_train.values

fold_labels = []
fold_rows = []
for i, (train_index, test_index) in enumerate(kf.split(X_train), start=1):
    print('Split {}: \n\tTest Folds: [{}] \n\tTrain Folds {}'.format(i, i, [j for j in range(1, n_splits + 1) if j != i]))

    x_train_fold = X_train_values[train_index]
    y_train_fold = y_train_values[train_index]
    x_test_fold = X_train_values[test_index, :]
    y_test_fold = y_train_values[test_index]

    # `normalize` is omitted: False was the default and the argument was removed
    # in scikit-learn 1.2.
    # NOTE(review): this rebinds `lr`, so after the loop `lr` is the model fitted on
    # the last fold's training part. Kept as in the original so later cells see the
    # same state; confirm that is what they expect.
    lr = LinearRegression()
    lr.fit(x_train_fold, y_train_fold)
    y_pred_fold = lr.predict(x_test_fold)

    fold_mse = mean_squared_error(y_test_fold, y_pred_fold)
    fold_r2 = r2_score(y_test_fold, y_pred_fold)
    # Normalised MSE is reported as 1 - R2, reusing the score computed above.
    fold_nmse = 1 - fold_r2
    print('\tMSE: {} NMSE: {} R2: {}'.format(fold_mse,fold_nmse, fold_r2) )

    fold_labels.append('Fold {}'.format(i))
    fold_rows.append([fold_mse, fold_nmse, fold_r2])

# Build the metrics table in one go (numeric dtype) rather than growing it row by row.
cross_val_metrics = pd.DataFrame(fold_rows, index=fold_labels, columns=['MSE', 'norm_MSE', 'R2'])
cross_val_metrics.loc['Mean', :] = cross_val_metrics.mean()
cross_val_metrics



Coefficients: 
 [-1.70308483e-06 -1.87186318e-04 -1.33317737e-02  6.12101141e-01
  8.36417766e-02 -4.53097213e-03  2.40979676e-02]
Intercept: 
 -1231.837800882124
Train Mean Squared Error (MSE): 18.83765205126188
Split 1: 
	Test Folds: [1] 
	Train Folds [2, 3, 4, 5]




	MSE: 20.10730344147186 NMSE: 0.9271118386422821 R2: 0.07288816135771792
Split 2: 
	Test Folds: [2] 
	Train Folds [1, 3, 4, 5]




	MSE: 19.0985915572344 NMSE: 0.9219591101505626 R2: 0.07804088984943736
Split 3: 
	Test Folds: [3] 
	Train Folds [1, 2, 4, 5]




	MSE: 17.789157995735295 NMSE: 0.9159069753891894 R2: 0.08409302461081058
Split 4: 
	Test Folds: [4] 
	Train Folds [1, 2, 3, 5]
	MSE: 18.471516507743246 NMSE: 0.9175820616332909 R2: 0.08241793836670908
Split 5: 
	Test Folds: [5] 
	Train Folds [1, 2, 3, 4]




	MSE: 18.726104731776452 NMSE: 0.9160392333037006 R2: 0.08396076669629937


Unnamed: 0,MSE,norm_MSE,R2
Fold 1,20.107303,0.927112,0.072888
Fold 2,19.098592,0.921959,0.078041
Fold 3,17.789158,0.915907,0.084093
Fold 4,18.471517,0.917582,0.082418
Fold 5,18.726105,0.916039,0.083961
Mean,18.838535,0.91972,0.08028
