# Time series - Forecasting with ARIMA
Adapted from: https://www.machinelearningplus.com/time-series/arima-model-time-series-forecasting-python/

## Import libraries & load data

In [3]:
#Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import os
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime as dt
%matplotlib inline

In [4]:
# Show the current working directory to check where the notebook is running from
os.getcwd()

'C:\\Users\\Casper Damen\\DiP\\Python klasje\\notebooks'

In [5]:
# Set working directory to the folder that holds the CSV data.
# The default is the original author's local path; set the NOTEBOOK_DATA_DIR
# environment variable to run the notebook on another machine without editing this cell.
os.chdir(os.environ.get('NOTEBOOK_DATA_DIR', 'C:\\Users\\Casper Damen\\DiP\\Python klasje\\data'))

In [6]:
# Load the search and session counts per date; the relative file name is
# resolved against the current working directory.
df = pd.read_csv('zoekopdracten_jan2019_july2020.csv')

In [7]:
# Preview the first rows of the raw frame (rows are not in date order yet)
df.head()

Unnamed: 0,timestamp__to_date,searches,sessions
0,2019-11-26,2554,18659
1,2020-07-20,2509,36260
2,2020-07-01,2376,44005
3,2020-07-27,2359,38313
4,2020-07-06,2333,38421


In [8]:
# Inspect column dtypes and non-null counts of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 578 entries, 0 to 577
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   timestamp__to_date  578 non-null    object
 1   searches            578 non-null    int64 
 2   sessions            578 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 13.7+ KB


## Data pre-processing

In [10]:
#Preprocessing functions
def change_dtype(dataf):
    """Return a copy of ``dataf`` with an extra datetime ``date`` column.

    The ``date`` column is parsed from ``timestamp__to_date`` with
    ``pd.to_datetime``; the input frame itself is not modified.
    """
    parsed_dates = pd.to_datetime(dataf['timestamp__to_date'])
    return dataf.assign(date=parsed_dates)

def add_ratio(dataf):
    """Return a copy of ``dataf`` with a ``searches_per_session`` column.

    The new column is ``searches`` divided by ``sessions``, rounded to
    three decimals; the input frame itself is not modified.
    """
    ratio = dataf['searches'].div(dataf['sessions']).round(3)
    return dataf.assign(searches_per_session=ratio)

def add_date_parts(dataf):
    """Return a copy of ``dataf`` extended with calendar fields taken from ``date``.

    Added columns, in this order: ``year``, ``month_number``, ``month_name``,
    ``day_number``, ``day_name`` and ``week``.
    """
    stamp = dataf['date'].dt
    calendar_fields = {
        'year': stamp.year,
        'month_number': stamp.month,
        'month_name': stamp.strftime('%B'),
        'day_number': stamp.dayofweek,      # Monday=0 ... Sunday=6
        'day_name': stamp.strftime('%A'),
        'week': stamp.strftime('%W'),       # week of the year as a string, weeks start on Monday
    }
    return dataf.assign(**calendar_fields)
    

def keep_columns(dataf):
    """Return only the date, metric and calendar columns of ``dataf``, in a fixed order."""
    wanted = [
        'date', 'searches', 'sessions', 'searches_per_session',
        'year', 'month_number', 'month_name',
        'day_number', 'day_name', 'week',
    ]
    return dataf[wanted]

def filter_months(dataf, before_month=8):
    """Keep the rows whose ``month_number`` is strictly below ``before_month``.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame with a numeric ``month_number`` column.
    before_month : int, default 8
        Exclusive upper bound on ``month_number``. The default keeps
        month numbers 1 through 7, as the original hard-coded filter did.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``dataf``, with their original index and order.
    """
    in_range = dataf['month_number'] < before_month
    return dataf.loc[in_range]

def filter_years(dataf, year=2020):
    """Keep the rows whose ``year`` column equals ``year``.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame with a ``year`` column.
    year : int, default 2020
        The year to keep. The default matches the original hard-coded filter.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``dataf``, with their original index and order.
    """
    is_selected_year = dataf['year'] == year
    return dataf.loc[is_selected_year]

def date_index(dataf):
    """Return ``dataf`` indexed by its ``date`` column, with rows sorted by that index."""
    by_date = dataf.set_index('date')
    return by_date.sort_index()

In [11]:
# Build the analysis frame from the raw data. Every step returns a new frame,
# so `df` itself is left untouched. The order matters: later steps rely on
# columns that earlier steps create.
clean_df = (df
            .pipe(change_dtype)    # adds the datetime `date` column
            .pipe(add_ratio)       # adds `searches_per_session`
            .pipe(add_date_parts)  # adds year / month / weekday / week columns from `date`
            .pipe(keep_columns)    # keeps only the date, metric and calendar columns
            .pipe(filter_months)   # keeps rows with month_number < 8
            .pipe(filter_years)    # keeps rows with year == 2020
            .pipe(date_index)      # sets `date` as the index and sorts by it
           )

In [12]:
# Preview the first rows of the cleaned, date-indexed frame
clean_df.head()

Unnamed: 0_level_0,searches,sessions,searches_per_session,year,month_number,month_name,day_number,day_name,week
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-01-01,602,9218,0.065,2020,1,January,2,Wednesday,0
2020-01-02,1284,17588,0.073,2020,1,January,3,Thursday,0
2020-01-03,1246,16371,0.076,2020,1,January,4,Friday,0
2020-01-04,908,11299,0.08,2020,1,January,5,Saturday,0
2020-01-05,869,12477,0.07,2020,1,January,6,Sunday,0
