In [84]:
# Data Manipulation Libraries
import pandas as pd
import numpy as np
import datetime
import os

import warnings
warnings.filterwarnings("ignore")

In [85]:
# Load the raw train/test CSVs and keep working copies, so the frames as read
# from disk (`retail`, `retail_test`) stay unmodified.
retail = pd.read_csv('Train.csv')
retail_test = pd.read_csv('Test.csv')
retail_data, retail_test_data = retail.copy(), retail_test.copy()

# Drop fully duplicated rows, keeping the first occurrence of each.
retail_data.drop_duplicates(inplace=True)
# Drop rows whose UnitPrice exceeds 35000 (only one outlier row was dropped).
retail_data.drop(index=retail_data[retail_data['UnitPrice'] > 35000].index, inplace=True)

# Missing-value check (disabled): no missing values were found.
# import missingno as msno
# msno.matrix(retail_test)
# print(retail_test.isna().sum())

# Separate categorical and numerical columns by dtype.
# Per the printed output, only InvoiceDate is non-numeric in this data set.
cat_cols = list(retail_data.select_dtypes(include=['object', 'category']).columns)
print(cat_cols)

num_cols = list(retail_data.select_dtypes(include=['int64', 'float64']).columns)
print(num_cols)

retail_data.head()

['InvoiceDate']
['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'UnitPrice', 'CustomerID', 'Country']


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,6141,1583,144,3,2011-05-06 16:54:00,3.75,14056.0,35
1,6349,1300,3682,6,2011-05-11 07:35:00,1.95,13098.0,35
2,16783,2178,1939,4,2011-11-20 13:20:00,5.95,15044.0,35
3,16971,2115,2983,1,2011-11-22 12:07:00,0.83,15525.0,35
4,6080,1210,2886,12,2011-05-06 09:00:00,1.65,13952.0,35


In [86]:
def sum_of_rolling_mean(df=None):
    """Return the sum of the per-StockCode centered rolling mean of UnitPrice.

    For every StockCode group, a centered rolling mean of UnitPrice is taken
    over a window of 3 rows (``min_periods=1``, so edge rows average whatever
    neighbours exist); the resulting values are then summed.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with ``StockCode`` and ``UnitPrice`` columns. When omitted, the
        notebook-level ``retail_data`` frame is used, which matches the
        original zero-argument call.

    Returns
    -------
    float
        Sum of all rolling-mean values. The original computed this value but
        discarded it, so the function always returned ``None``.
    """
    if df is None:
        df = retail_data
    rolling_means = (
        df.groupby('StockCode')['UnitPrice']
        .rolling(3, center=True, min_periods=1)
        .mean()
    )
    return rolling_means.reset_index(drop=True).sum()
    
def drop_irrelavant_columns(df):
    """Remove the ``InvoiceNo`` and ``Description`` columns from ``df`` in place.

    The frame is mutated and nothing is returned. A ``KeyError`` is raised if
    either column is absent (for example when the function is run twice on the
    same frame).
    """
    df.drop(['InvoiceNo', 'Description'], axis=1, inplace=True)
    
def sample_by_hour_set_index(df):
    """Parse ``InvoiceDate`` to datetimes and sort ``df`` chronologically, in place.

    Note: despite its name, this function neither resamples by hour nor sets
    an index. The existing index labels are kept and simply reordered by the
    sort. Nothing is returned.
    """
    # Convert the column to datetime64 so the sort is chronological.
    df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
    df.sort_values('InvoiceDate', inplace=True)

def get_uniques(df_train,df_test):
    """Compare the distinct StockCode values of the train and test frames.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames that each contain a ``StockCode`` column.

    Returns
    -------
    dict
        ``'SC_test'`` / ``'SC_train'``: distinct StockCodes of each frame, as
        returned by ``Series.unique()`` (order of first appearance).
        ``'SC_common_values'``: list of test StockCodes that also occur in train.
        ``'unseen_SC'``: list of test StockCodes that never occur in train.
        Both lists keep the order of ``'SC_test'``.
    """
    unique_SC_test_values = df_test['StockCode'].unique()
    unique_SC_train_values = df_train['StockCode'].unique()

    # Set membership is O(1); the previous `i in ndarray` test rescanned the
    # whole train array for every test value.
    train_codes = set(unique_SC_train_values)
    unique_common_SC_values = [code for code in unique_SC_test_values if code in train_codes]
    unique_unseen_SC_test_values = [code for code in unique_SC_test_values if code not in train_codes]

    return {
        'SC_test': unique_SC_test_values,
        'SC_train': unique_SC_train_values,
        'SC_common_values': unique_common_SC_values,
        'unseen_SC': unique_unseen_SC_test_values,
    }

def perform_ops_for_both_train_and_test(data):
    """Apply the shared in-place preprocessing steps to ``data``.

    Runs, in order, ``drop_irrelavant_columns`` and then
    ``sample_by_hour_set_index`` on the same frame. Both mutate ``data`` in
    place; nothing is returned.
    """
    for step in (drop_irrelavant_columns, sample_by_hour_set_index):
        step(data)
    

In [87]:
# Apply the shared in-place preprocessing to both frames, then collect the
# StockCode overlap between train and test.
# Note: this cell is not re-runnable on the same frames, because the dropped
# columns no longer exist on a second run.
perform_ops_for_both_train_and_test(data=retail_data)
perform_ops_for_both_train_and_test(data=retail_test_data)
unique = get_uniques(df_train=retail_data, df_test=retail_test_data)

In [88]:
# Preview the training frame after preprocessing: InvoiceNo/Description are
# gone and rows are ordered by InvoiceDate (index labels are the original ones).
retail_data.head()

Unnamed: 0,StockCode,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
59748,2802,6,2010-12-01 08:26:00,3.39,17850.0,35
124355,2855,8,2010-12-01 08:26:00,2.75,17850.0,35
281503,2803,6,2010-12-01 08:26:00,3.39,17850.0,35
53973,1605,2,2010-12-01 08:26:00,7.65,17850.0,35
33757,755,6,2010-12-01 08:26:00,4.25,17850.0,35


In [89]:
# # One-time Prophet fit on the full training set (disabled; `Prophet` is not imported in the cells shown here)
# train = retail_data.loc[:,['InvoiceDate','UnitPrice']]
# pro_df = train.rename(columns={'InvoiceDate':'ds','UnitPrice':'y'})
# model = Prophet()
# # fit the model
# model.fit(pro_df)
# future = retail_test_data.loc[:,['InvoiceDate']].rename(columns={'InvoiceDate':'ds'})
# # use the model to make a forecast
# forecast = model.predict(future)

In [90]:
# Distinct country codes that occur in the test set.
sd = pd.unique(retail_test_data['Country'])
# Exclude country codes 28, 2 and 4 (presumably no per-country prediction file
# exists for them; TODO confirm). np.setdiff1d returns the remaining codes
# sorted and de-duplicated.
cur_choosing = np.setdiff1d(sd, [28, 2, 4])

In [91]:
# Start with an all-NaN UnitPrice column on the test frame; the cells below
# fill it from the prediction files, so NaN marks rows that are still unfilled.
retail_test_data['UnitPrice'] = np.nan

In [92]:

# For every selected country, read its prediction file and copy the predicted
# UnitPrice values into the test frame. The file's 'Unnamed: 0' column becomes
# the index (presumably the original test-row labels; TODO confirm), so the
# assignment lines up rows by label.
for cur_country in cur_choosing:
    df = (
        pd.read_csv('prophet_country_' + str(cur_country))
        .rename(columns={'Unnamed: 0': 'idx'})
        .set_index('idx')
    )
    retail_test_data.loc[df.index, 'UnitPrice'] = df['UnitPrice']

In [93]:
# StockCodes of the test rows that still have no predicted price.
unfilled = retail_test_data.loc[retail_test_data['UnitPrice'].isna(), 'StockCode'].unique()
# Print those unfilled StockCodes that never appear in the training data.
for s in unfilled:
    if s not in unique['unseen_SC']:
        continue
    print(s)

In [94]:
# Fallback prices read from 'kkffbf02.csv' (presumably an earlier prediction
# file whose row labels match Test.csv; TODO confirm).
fg = pd.read_csv('kkffbf02.csv')
# Index labels of the test rows whose predicted price is exactly 0.
zero_index = retail_test_data[retail_test_data['UnitPrice'].eq(0)].index
# Overwrite those zero prices with the fallback file's value for the same label.
retail_test_data.loc[retail_test_data['UnitPrice'].eq(0), 'UnitPrice'] = fg.loc[zero_index, 'UnitPrice']

In [95]:
# Undo the chronological sort: order the rows by their index labels again
# (in place), then preview the result.
retail_test_data.sort_index(inplace=True)
retail_test_data.head()

Unnamed: 0,StockCode,Quantity,InvoiceDate,CustomerID,Country,UnitPrice
0,1709,3,2011-02-22 15:22:00,16161.0,35,1.65
1,510,1,2010-12-08 12:46:00,17341.0,35,1.145726
2,604,36,2011-10-25 13:53:00,15158.0,35,4.1049
3,1478,2,2011-06-27 12:38:00,16033.0,35,1.863983
4,3216,1,2011-11-06 16:14:00,15351.0,35,12.857854


In [96]:
# Rows that still have no predicted price fall back to 0.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on `retail_test_data['UnitPrice']` mutates a column selection, which under
# pandas Copy-on-Write does not update the parent frame. The warning pandas
# raises about this is hidden by warnings.filterwarnings("ignore").
# NOTE(review): these rows get 0 *after* the step that replaces zero prices
# from 'kkffbf02.csv', so they stay 0 in the output. Confirm that is intended.
retail_test_data['UnitPrice'] = retail_test_data['UnitPrice'].fillna(0)

In [97]:
# Sanity check: count of test rows still missing a UnitPrice (expected 0).
retail_test_data['UnitPrice'].isna().sum()

0

In [98]:
# Round every predicted price to 2 decimals with Python's built-in round().
up = retail_test_data['UnitPrice'].map(lambda price: round(price, 2))
# Build a single-column frame with a fresh 0..n-1 index from the raw values.
UnitPrice = pd.DataFrame({'UnitPrice': up.astype('float').to_numpy()})
# Write the result to disk as one 'UnitPrice' column, without the index.
UnitPrice.to_csv('prophet04.csv', index=False)

In [None]:
# The loop was run with a separate prediction per country + StockCode subdivision.
# The moving-average approach performed better.