In [3]:
# Data Manipulation Libraries
import pandas as pd
import numpy as np
import datetime
import os

import matplotlib.pyplot as plt
!pip install plotly
import plotly.express as px
!pip install lightgbm
from lightgbm import LGBMRegressor
import joblib

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings("ignore")



In [4]:
retail = pd.read_csv('Train.csv')
retail_test = pd.read_csv('Test.csv')
retail_data = retail.copy()
retail_test_data = retail_test.copy()

#Drop Duplicate rows
retail_data.drop_duplicates(subset=None, keep='first', inplace=True)
#Only dropped one outlier
retail_data.drop(retail_data.loc[retail_data['UnitPrice']>35000,:].index,inplace=True)

# #No missing values
# import missingno as msno
# msno.matrix(retail_test)
# print(retail_test.isna().sum())
#Seperate Categorical and Numerical Columns
cat_cols = retail_data.select_dtypes(include=['object','category']).columns.tolist()
print(cat_cols)

num_cols = retail_data.select_dtypes(include=['int64','float64']).columns.tolist()
print(num_cols)

retail_data.head()

['InvoiceDate']
['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'UnitPrice', 'CustomerID', 'Country']


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,6141,1583,144,3,2011-05-06 16:54:00,3.75,14056.0,35
1,6349,1300,3682,6,2011-05-11 07:35:00,1.95,13098.0,35
2,16783,2178,1939,4,2011-11-20 13:20:00,5.95,15044.0,35
3,16971,2115,2983,1,2011-11-22 12:07:00,0.83,15525.0,35
4,6080,1210,2886,12,2011-05-06 09:00:00,1.65,13952.0,35


In [5]:
def sum_of_rolling_mean():
    retail_data.groupby('StockCode')['UnitPrice'].rolling(3,center=True,min_periods=1).mean().reset_index(drop=True).sum()
    
def drop_irrelavant_columns(df):
    df.drop(columns=['InvoiceNo','Description'],inplace=True)
    
def sample_by_hour_set_index(df):
    df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
#     df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'].strftime('%Y-%m-%d %H:%M:%S'))
    df.sort_values(by='InvoiceDate',inplace=True)

def get_uniques(df_train,df_test):
    unique_SC_test_values = df_test['StockCode'].unique()
    unique_SC_train_values = df_train['StockCode'].unique()
    unique_unseen_SC_test_values = []
    unique_common_SC_values = []
    for i in unique_SC_test_values:
        if i in unique_SC_train_values:
            unique_common_SC_values.append(i)
        else:
            unique_unseen_SC_test_values.append(i)

    unique = {'SC_test':unique_SC_test_values,'SC_train':unique_SC_train_values,'SC_common_values':unique_common_SC_values,'unseen_SC':unique_unseen_SC_test_values}
    return unique

def perform_ops_for_both_train_and_test(data):
    drop_irrelavant_columns(data)
    sample_by_hour_set_index(data)
    

In [6]:
perform_ops_for_both_train_and_test(retail_data)
perform_ops_for_both_train_and_test(retail_test_data)
unique = get_uniques(retail_data,retail_test_data)

In [7]:
unique['SC_test']

array([2649, 1489, 1480, ..., 3667, 3654, 2275])

In [8]:
unique_country = np.sort(retail_test_data['Country'].unique())

In [9]:
retail_test_data['UnitPrice'] = 0

In [10]:
unique['SC_common_values'][1]

1489

In [11]:
retail_data['Country'].value_counts()

35    250667
14      6648
13      5939
10      5238
30      1773
23      1680
3       1435
32      1293
26      1044
0        883
24       766
18       561
6        536
12       489
7        419
31       324
1        303
9        273
25       244
19       238
33       210
17       184
36       169
29       152
16       132
5        106
15       101
22        89
34        52
27        47
11        40
20        34
8         26
21        25
4         20
2         10
28         8
Name: Country, dtype: int64

In [12]:
sd = retail_test_data['Country'].unique()
cur_choosing = np.setdiff1d(sd,[28,2,4])

In [13]:
# country 14, 13 has less values can use for test - 35 largest values
from fbprophet import Prophet
pro_df = retail_data.loc[retail_data['StockCode']==1489 & (retail_data['Country']==35),['InvoiceDate','UnitPrice']]
pro_df['InvoiceDate'] = pd.to_datetime(pro_df['InvoiceDate'])
pro_df.rename(columns={'InvoiceDate':'ds','UnitPrice':'y'},inplace=True)
model = Prophet()
# fit the model
model.fit(pro_df)
future = pro_df.drop(columns=['y'])
# use the model to make a forecast
forecast = model.predict(future)
# summarize the forecast
print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].head())

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:n_changepoints greater than number of observations. Using 16.


                   ds      yhat  yhat_lower  yhat_upper
0 2010-12-01 08:45:00  0.880740    0.820890    0.939560
1 2010-12-10 10:56:00  0.951576    0.893550    1.017661
2 2011-01-05 14:48:00  0.853427    0.792489    0.917311
3 2011-01-17 11:46:00  0.792443    0.727877    0.852782
4 2011-01-18 14:04:00  0.795384    0.731752    0.859464


In [14]:
oosp = retail_test_data.loc[retail_test_data['StockCode']==1489 & (retail_test_data['Country']==35),['InvoiceDate']]
oosp.rename(columns={'InvoiceDate':'ds'},inplace=True)

In [15]:
forecast = model.predict(oosp)
# summarize the forecast
print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].head())

                   ds      yhat  yhat_lower  yhat_upper
0 2010-12-09 14:49:00  0.934882    0.877287    1.000204
1 2010-12-10 12:33:00  0.895259    0.829487    0.954197
2 2011-01-20 10:48:00  0.825299    0.765013    0.890765
3 2011-01-31 09:57:00  0.807909    0.743712    0.868667
4 2011-02-04 10:31:00  0.870506    0.804942    0.939302


In [16]:
retail_data.head()

Unnamed: 0,StockCode,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
59748,2802,6,2010-12-01 08:26:00,3.39,17850.0,35
124355,2855,8,2010-12-01 08:26:00,2.75,17850.0,35
281503,2803,6,2010-12-01 08:26:00,3.39,17850.0,35
53973,1605,2,2010-12-01 08:26:00,7.65,17850.0,35
33757,755,6,2010-12-01 08:26:00,4.25,17850.0,35


In [17]:
import logging
logging.getLogger('fbprophet').setLevel(logging.WARNING)

In [18]:
exception = []
for cur_country in cur_choosing:
    
    retail_data_cc = retail_data.loc[retail_data['Country']==cur_country]
    retail_test_data_cc = retail_test_data.loc[retail_test_data['Country']==cur_country]
    print('Processing for country : {}'.format(cur_country))
    number_of_stock_codes_processed = 0
    for usct in unique['SC_common_values']:
            
            stockuniq = retail_data_cc.loc[retail_data_cc['StockCode']==usct,['InvoiceDate','UnitPrice']]
            stockuniqt = retail_test_data_cc.loc[retail_test_data_cc['StockCode']==usct,['InvoiceDate']]
            stockuniqt['second'] = 'sc'
            
            try:
                pro_df = stockuniq.rename(columns={'InvoiceDate':'ds','UnitPrice':'y'})
                model = Prophet()
                # fit the model
                model.fit(pro_df)
                future = stockuniqt.rename(columns={'InvoiceDate':'ds'})
                # use the model to make a forecast
                forecast = model.predict(future)
                retail_test_data.loc[(retail_test_data['Country']==cur_country) & (retail_test_data['StockCode']==usct),['UnitPrice']] = forecast['yhat'].values
                number_of_stock_codes_processed += 1
            except Exception:
                exception.append([cur_country,usct])
                continue
    print('Number of countries processed : {}'.format(number_of_stock_codes_processed))
    retail_test_data.loc[(retail_test_data['Country']==cur_country),['UnitPrice']].to_csv('prophet_country_'+str(cur_country))


Processing for country : 0
Number of countries processed : 104
Processing for country : 1
Number of countries processed : 8
Processing for country : 3
Number of countries processed : 157
Processing for country : 5
Number of countries processed : 0
Processing for country : 6
Number of countries processed : 49
Processing for country : 7
Number of countries processed : 14
Processing for country : 8
Number of countries processed : 1
Processing for country : 9
Number of countries processed : 14
Processing for country : 10
Number of countries processed : 656
Processing for country : 11
Number of countries processed : 0
Processing for country : 12




Number of countries processed : 26
Processing for country : 13
Number of countries processed : 574
Processing for country : 14
Number of countries processed : 695
Processing for country : 15
Number of countries processed : 0
Processing for country : 16
Number of countries processed : 14
Processing for country : 17
Number of countries processed : 3
Processing for country : 18
Number of countries processed : 40
Processing for country : 19
Number of countries processed : 19
Processing for country : 20
Number of countries processed : 0
Processing for country : 21
Number of countries processed : 0
Processing for country : 22
Number of countries processed : 6
Processing for country : 23
Number of countries processed : 200
Processing for country : 24
Number of countries processed : 83
Processing for country : 25
Number of countries processed : 25
Processing for country : 26
Number of countries processed : 110
Processing for country : 27
Number of countries processed : 0
Processing for country

In [None]:
len(exception)

In [1]:
retail_test_data[retail_test_data['UnitPrice']==0]

NameError: name 'retail_test_data' is not defined

In [None]:
retail_test_data.to_csv('prophet01.csv')

In [2]:
str(4)

'4'

Processing for country : 0
Number of countries processed : 104
Processing for country : 1
Number of countries processed : 8
Processing for country : 3
Number of countries processed : 157
Processing for country : 5
Number of countries processed : 0
Processing for country : 6
Number of countries processed : 49
Processing for country : 7
Number of countries processed : 14
Processing for country : 8
Number of countries processed : 1
Processing for country : 9
Number of countries processed : 14
Processing for country : 10
Number of countries processed : 656
Processing for country : 11
Number of countries processed : 0
Processing for country : 12
WARNING:fbprophet.models:Optimization terminated abnormally. Falling back to Newton.
WARNING:fbprophet.models:Optimization terminated abnormally. Falling back to Newton.
Number of countries processed : 26
Processing for country : 13
Number of countries processed : 574
Processing for country : 14
Number of countries processed : 695
Processing for country : 15
Number of countries processed : 0
Processing for country : 16
Number of countries processed : 14
Processing for country : 17
Number of countries processed : 3
Processing for country : 18
Number of countries processed : 40
Processing for country : 19
Number of countries processed : 19
Processing for country : 20
Number of countries processed : 0
Processing for country : 21
Number of countries processed : 0
Processing for country : 22
Number of countries processed : 6
Processing for country : 23
Number of countries processed : 200
Processing for country : 24
Number of countries processed : 83
Processing for country : 25
Number of countries processed : 25
Processing for country : 26
Number of countries processed : 110
Processing for country : 27
Number of countries processed : 0
Processing for country : 29
Number of countries processed : 5
Processing for country : 30
Number of countries processed : 204
Processing for country : 31
Number of countries processed : 25
Processing for country : 32
Number of countries processed : 125
Processing for country : 33
Number of countries processed : 5
Processing for country : 34
Number of countries processed : 0
Processing for country : 35
Number of countries processed : 3229
Processing for country : 36
Number of countries processed : 1