# Store Sales Time Series Forecasting  
Group members: Spencer Potter, Ben Smith, Carter Watson

[Kaggle Data](https://www.kaggle.com/competitions/store-sales-time-series-forecasting)

**Imports**

In [15]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

**Introducing the DataSet**

In [16]:
train_df=pd.read_csv("needed_data/train.csv")
train_df.shape
test_df=pd.read_csv("needed_data/test.csv")
test_df.shape
print(train_df)

              id        date  store_nbr                      family     sales  \
0              0  2013-01-01          1                  AUTOMOTIVE     0.000   
1              1  2013-01-01          1                   BABY CARE     0.000   
2              2  2013-01-01          1                      BEAUTY     0.000   
3              3  2013-01-01          1                   BEVERAGES     0.000   
4              4  2013-01-01          1                       BOOKS     0.000   
...          ...         ...        ...                         ...       ...   
3000883  3000883  2017-08-15          9                     POULTRY   438.133   
3000884  3000884  2017-08-15          9              PREPARED FOODS   154.553   
3000885  3000885  2017-08-15          9                     PRODUCE  2419.729   
3000886  3000886  2017-08-15          9  SCHOOL AND OFFICE SUPPLIES   121.000   
3000887  3000887  2017-08-15          9                     SEAFOOD    16.000   

         onpromotion  
0   

In [17]:
train_df.sort_index()
train_df.set_index('id')
test_df.sort_index()
test_df.set_index('id')

Unnamed: 0_level_0,date,store_nbr,family,onpromotion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3000888,2017-08-16,1,AUTOMOTIVE,0
3000889,2017-08-16,1,BABY CARE,0
3000890,2017-08-16,1,BEAUTY,2
3000891,2017-08-16,1,BEVERAGES,20
3000892,2017-08-16,1,BOOKS,0
...,...,...,...,...
3029395,2017-08-31,9,POULTRY,1
3029396,2017-08-31,9,PREPARED FOODS,0
3029397,2017-08-31,9,PRODUCE,1
3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9


**Data Splitting**

In [18]:
x_train = train_df.loc[:, train_df.columns != 'sales']
y_train = train_df['sales']
x_test = test_df.loc[:, test_df.columns != 'sales']

**Preprocessing Data**

In [19]:
numeric_data=["onpromotion", "store_nbr"]
categorical_features = ["family"]
passthrough_data=["date"]
drop_data = ["id", "date"]

**Pipeline Creation**

In [20]:
preprocessor = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"),categorical_features ),
    (StandardScaler(), numeric_data),
    ("drop", drop_data),
)

In [21]:
x_train_transformed = preprocessor.fit_transform(x_train)
x_train_transformed.shape

(3000888, 35)

## Dummy Model

In [22]:
baseline_model = make_pipeline(preprocessor, DummyRegressor(strategy='mean'))

In [24]:
results_dict = {}
scores = cross_val_score(baseline_model, x_train, y_train, cv=10)
results_dict['cross_val_scores'] = -scores
results_df = pd.DataFrame(results_dict)
print(results_df)

#Because we're using the dummy regressor, smaller values mean better accuracy

   cross_val_scores
0          0.062360
1          0.045408
2          0.006066
3          0.002078
4          0.001458
5          0.001344
6          0.005386
7          0.004660
8          0.008070
9          0.010618


In [25]:
baseline_model.fit(x_train, y_train)
prediction = baseline_model.predict(x_test)
prediction_df = pd.DataFrame({'id': x_test.id, 'sales': prediction})
prediction_df.to_csv("dummy_test_predictions.csv", index=False)

## Linear Regression Model

In [26]:
from sklearn.linear_model import LinearRegression
lRModel = make_pipeline(preprocessor, LinearRegression())
results_lR_dict = {}
scores = cross_val_score(lRModel, x_train, y_train, cv=10)
results_lR_dict['cross_val_scores'] = scores
results_lR_df = pd.DataFrame(results_lR_dict)
print(results_lR_df)

   cross_val_scores
0          0.301366
1          0.387207
2          0.458411
3          0.494414
4          0.506138
5          0.544965
6          0.547082
7          0.529239
8          0.574934
9          0.587695


In [27]:
lRModel.fit(x_train, y_train)
lR_prediciton=lRModel.predict(x_test)
lR_prediciton = [max(0, pred) for pred in lR_prediciton]
prediction_df = pd.DataFrame({'id': x_test.id, 'sales': lR_prediciton})
prediction_df.to_csv("linear_regression_test_predictions.csv", index=False)

## 