## Model Exploration - ARIMA


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import multiprocessing as mp
import gc
import datetime
from sklearn.preprocessing import LabelEncoder
import calendar
from scipy.sparse import csr_matrix,hstack
import tensorflow as tf
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from tqdm import tqdm
import pickle
import sys
from sklearn.model_selection import TimeSeriesSplit
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import plotly.graph_objects as go
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller

In [4]:
# !pip install pmdarima
# import pmdarima as arima

In [5]:
# Mount Google Drive inside the Colab runtime at /content/gdrive so the data and
# utility files referenced by later cells are reachable as ordinary paths.
# force_remount=True asks Colab for a fresh mount even if one already exists.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [6]:
# Copy the shared submission helper script from Drive into the current working
# directory so that it can be imported as a regular module.
# NOTE(review): the recorded output of this cell ("Mounted at ...", data shapes)
# suggests the helper module mounts Drive and loads data at import time - confirm.
!cp /content/gdrive/MyDrive/PredictiveAnalyticsData/utility/generate_submission_file.py .
from generate_submission_file import generate_submission

Mounted at /content/gdrive
Training Data Shape: (58327370, 42)
Testing Data Shape: (853720, 42)
Prediction Data Shape: (853720, 42)


In [7]:
def _load_features_and_target(path, label):
  """Read one pickled frame, drop ``id`` and split it into features and target.

  Args:
    path: Location of the pickle file to read.
    label: Human-readable split name used in the printed shape message.

  Returns:
    ``(X, y)`` where ``y`` is the ``sales`` column and ``X`` is every other
    column except ``id``.
  """
  # NOTE: unpickling can execute arbitrary code; only load files you trust.
  frame = pd.read_pickle(path).drop(columns=['id'])
  # The reported shape still includes the ``sales`` column (only ``id`` is gone).
  print(f"{label} Data Shape: {frame.shape}")
  return frame.drop(columns=['sales']), frame['sales']


def separate_features_and_parameters(data_dir="/content/gdrive/MyDrive/PredictiveAnalyticsData/output_data"):
  """Load the train, test and prediction pickles and split features from target.

  Each of ``final_train.pkl``, ``final_test.pkl`` and
  ``final_prediction_data.pkl`` is read from ``data_dir``; the ``id`` column is
  dropped and the ``sales`` column is separated out as the target. The shape
  of every split (after dropping ``id``) is printed.

  Args:
    data_dir: Directory containing the three pickle files. Defaults to the
      project's Google Drive output folder.

  Returns:
    Tuple ``(X_train, y_train, X_test, y_test, X_prediction, y_prediction)``.
  """
  X_train, y_train = _load_features_and_target(f"{data_dir}/final_train.pkl", "Training")
  X_test, y_test = _load_features_and_target(f"{data_dir}/final_test.pkl", "Testing")
  X_prediction, y_prediction = _load_features_and_target(f"{data_dir}/final_prediction_data.pkl", "Prediction")

  return X_train, y_train, X_test, y_test, X_prediction, y_prediction

In [8]:
# Load the three pickled splits and separate the `sales` target (y_*) from the
# remaining feature columns (X_*).
X_train, y_train, X_test, y_test, X_prediction, y_prediction = separate_features_and_parameters()

Training Data Shape: (58327370, 42)
Testing Data Shape: (853720, 42)
Prediction Data Shape: (853720, 42)


In [9]:
# Replace missing values in the lag and rolling-mean feature columns of X_train
# with 0.0, one column at a time.
for col in (
    "sold_lag_7", "sold_lag_14", "sold_lag_21", "sold_lag_28",
    "sold_lag_35", "sold_lag_42", "sold_lag_49", "sold_lag_56",
    "roll_mean_7", "roll_mean_14", "roll_mean_28",
):
  X_train[col] = X_train[col].fillna(0.0)

In [10]:
def _smallest_fitting_dtype(series, candidates, limits_of, fallback):
    """Return the first dtype in ``candidates`` whose range holds all of ``series``.

    ``limits_of`` is ``np.iinfo`` or ``np.finfo``; ``fallback`` is returned when
    no candidate fits (including when min/max are NaN, since every comparison
    against NaN is False).
    """
    # Scan the column once instead of once per candidate dtype.
    lowest, highest = series.min(), series.max()
    for candidate in candidates:
        limits = limits_of(candidate)
        # Inclusive bounds: a value exactly equal to the type's min/max still fits.
        if lowest >= limits.min and highest <= limits.max:
            return candidate
    return fallback


def downcast(df):
    """Shrink a DataFrame's memory footprint by narrowing its column dtypes.

    Integer columns are cast to the smallest of int8/int16/int32/int64 whose
    range covers the column's min and max; float columns likewise to
    float16/float32/float64. Object columns become ``category``, except a
    column named ``date`` which is parsed as ``%Y-%m-%d`` datetimes. Memory
    usage is printed before and after.

    Args:
        df: DataFrame to convert. It is modified in place.

    Returns:
        The same DataFrame object, with converted columns.
    """
    def _memory_mb():
        return round(df.memory_usage(index=False, deep=True).sum() / (1024 * 1024), 2)

    print(f"Memory occupied before downcast: {_memory_mb()} MB")
    # df.dtypes is a snapshot, so reassigning columns inside the loop is safe.
    for col, dtype in df.dtypes.items():
        dtype_name = str(dtype)
        if 'int' in dtype_name:
            target = _smallest_fitting_dtype(
                df[col], (np.int8, np.int16, np.int32), np.iinfo, np.int64)
            df[col] = df[col].astype(target)
        elif 'float' in dtype_name:
            # NOTE(review): only the value range is checked. float16 carries about
            # 3 significant decimal digits, so values are rounded on conversion -
            # confirm that precision is acceptable for these features.
            target = _smallest_fitting_dtype(
                df[col], (np.float16, np.float32), np.finfo, np.float64)
            df[col] = df[col].astype(target)
        elif dtype == 'object':
            if col == 'date':
                df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
            else:
                df[col] = df[col].astype('category')
    print(f"Memory occupied after downcast: {_memory_mb()} MB\n")
    return df

In [11]:
# Narrow the training feature dtypes to reduce memory; downcast() modifies the
# frame in place and returns it.
X_train = downcast(X_train)
# Left disabled: y_train is a single column (a Series), whereas downcast()
# iterates over per-column dtypes of a DataFrame.
# y_train = downcast(y_train)

Memory occupied before downcast: 3893.77 MB
Memory occupied after downcast: 3782.52 MB



In [12]:
# from itertools import product

# >>> amin = 0
# >>> amax = 2
# >>> list(product(range(amin, amax), repeat=3))
# [(0, 0, 0), (0, 0, 1), (0, 1, 0),  (0, 1, 1),  (1, 0, 0), (1, 0, 1), (1, 1, 0), (1, 1, 1)]

In [None]:
# Example using statsmodels: build a SARIMAX model on the training target with
# non-seasonal order (p, d, q) = (1, 0, 1) and no exogenous regressors, then fit it.
# NOTE(review): y_train is the entire `sales` column (the load step reported
# 58,327,370 training rows) and this cell has no recorded execution count.
# Presumably the column stacks many separate series; if so, a single model over
# the concatenation is not a meaningful time series fit - confirm and consider
# fitting per series.
model = SARIMAX(endog=y_train, order=(1, 0, 1))
model_fit = model.fit()