In [1]:
import pandas as pd
import numpy as np

from mylibs import *

from sklearn.svm import SVR
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, classification_report

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

import pickle, random

random.seed(73)

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Data loading
df_sales = pd.read_csv('./dataset/Sales_Detail.csv')
df_sales_n = pd.read_csv('./dataset/wide_qtrx.csv')
df_sales_usd = pd.read_csv('./dataset/wide_mtrx.csv')

In [3]:
print(len(df_sales))
df_sales.head(2)

541909


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 8:26,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/10 8:26,3.39,17850,United Kingdom


In [4]:
df_sales_n.head(2)

Unnamed: 0,CustomerID,201012,201101,201102,201103,201104,201105,201106,201107,201108,201109,201110,201111,201112
0,12346,0,2,0,0,0,0,0,0,0,0,0,0,0
1,12347,1,1,0,0,1,0,1,0,1,0,1,0,1


In [5]:
df_sales_usd.head(2)

Unnamed: 0,CustomerID,201012,201101,201102,201103,201104,201105,201106,201107,201108,201109,201110,201111,201112
0,12346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12347,711.79,475.39,0.0,0.0,636.25,0.0,382.52,0.0,584.91,0.0,1294.32,0.0,224.82


In [6]:
df_sales = df_sales[df_sales['CustomerID'].isin(df_sales_n['CustomerID'].to_list())]
df_sales['tot_invoice'] = df_sales['Quantity']*df_sales['UnitPrice']
df_sales['Dates'] = pd.to_datetime(df_sales['InvoiceDate']).dt.date
df_sales[["Dates"]] = df_sales[["Dates"]].apply(pd.to_datetime)

df_filt = df_sales[['CustomerID', 'Country', 'Description', 'Quantity', 'tot_invoice', 'Dates']]
df_filt

Unnamed: 0,CustomerID,Country,Description,Quantity,tot_invoice,Dates
0,17850,United Kingdom,WHITE HANGING HEART T-LIGHT HOLDER,6,15.30,2010-12-01
1,17850,United Kingdom,WHITE METAL LANTERN,6,20.34,2010-12-01
2,17850,United Kingdom,CREAM CUPID HEARTS COAT HANGER,8,22.00,2010-12-01
3,17850,United Kingdom,KNITTED UNION FLAG HOT WATER BOTTLE,6,20.34,2010-12-01
4,17850,United Kingdom,RED WOOLLY HOTTIE WHITE HEART.,6,20.34,2010-12-01
...,...,...,...,...,...,...
541904,14397,United Kingdom,ZINC T-LIGHT HOLDER STARS SMALL,-11,-9.13,2011-12-09
541905,15498,United Kingdom,Manual,-1,-224.69,2011-12-09
541906,15311,United Kingdom,VICTORIAN SEWING BOX LARGE,-5,-54.75,2011-12-09
541907,17315,United Kingdom,HANGING HEART JAR T-LIGHT HOLDER,-1,-1.25,2011-12-09


# Modeling case 1

# R

In [7]:
#a_df = df_sales[(df_sales['Dates'] < '2011-10-01') and (df_sales['Dates'] < '2011-04-01')]
a_df = df_sales[(df_sales['Dates'] <= '2011-10-01')]
a_df = a_df[(a_df['Dates'] >= '2011-04-01')]

df_daily_sort = a_df.sort_values(['CustomerID', 'Dates'], ascending = [False, True])
ct = a_df.Dates.max().date()

print(ct)

df_last_transaction = df_daily_sort.drop_duplicates(['CustomerID'],keep= 'last')
df_last_transaction['days'] = df_last_transaction['Dates'].dt.date
df_last_transaction['days'] = ct - df_last_transaction['days']
df_last_transaction['days'] = df_last_transaction['days'].dt.days.astype('int16')

df_r = df_last_transaction[['CustomerID', 'days']]
df_r

2011-09-30


Unnamed: 0,CustomerID,days
195164,18287,131
320740,18283,25
538007,18282,52
218939,18281,110
355985,18278,3
...,...,...
163261,12354,162
193835,12353,134
358613,12352,2
352596,12348,5


# M

In [8]:
df_daily_sort = a_df.sort_values(['CustomerID', 'Dates'], ascending = [False, True])
df_monetary = df_daily_sort.groupby(['CustomerID'])['tot_invoice'].sum()
df_m = df_monetary.to_frame().reset_index()
df_m.columns = ['CustomerID', 'monetary']
df_m

Unnamed: 0,CustomerID,monetary
0,12347,1603.68
1,12348,677.00
2,12352,632.50
3,12353,89.00
4,12354,1079.40
...,...,...
2969,18278,173.90
2970,18281,80.82
2971,18282,98.76
2972,18283,802.77


In [9]:
# inner join
df_rm = pd.merge(df_r, df_m, on='CustomerID', how='inner')
df_rm

Unnamed: 0,CustomerID,days,monetary
0,18287,131,765.28
1,18283,25,802.77
2,18282,52,98.76
3,18281,110,80.82
4,18278,3,173.90
...,...,...,...
2969,12354,162,1079.40
2970,12353,134,89.00
2971,12352,2,632.50
2972,12348,5,677.00


In [10]:
df_rm.describe()

Unnamed: 0,CustomerID,days,monetary
count,2974.0,2974.0,2974.0
mean,15274.802959,56.813046,1442.949561
std,1723.471292,51.991332,10212.059599
min,12347.0,0.0,-4287.63
25%,13780.25,11.0,257.9625
50%,15246.5,39.0,517.8
75%,16755.75,94.0,1145.28
max,18287.0,182.0,487314.03


In [11]:
df_rm['r'] = [0 if x >= 57 else 1 for x in df_rm['days']]
df_rm['m'] = [0 if x <= 12000 else 1 for x in df_rm['monetary']]

y_client = []

for x in range(len(df_rm)):
    #print(df_rm['r'].iloc[0])
    if (df_rm['r'].iloc[x] == 0) and (df_rm['m'].iloc[x] == 0):
        y_client.append('C')
    elif (df_rm['r'].iloc[x] == 0) and (df_rm['m'].iloc[x] == 1):
        y_client.append('B')
    elif (df_rm['r'].iloc[x] == 1) and (df_rm['m'].iloc[x] == 0):
        y_client.append('B')
    elif (df_rm['r'].iloc[x] == 1) and (df_rm['m'].iloc[x] == 1):
        y_client.append('A')
        
df_rm['class'] = y_client
df_rm

Unnamed: 0,CustomerID,days,monetary,r,m,class
0,18287,131,765.28,0,0,C
1,18283,25,802.77,1,0,B
2,18282,52,98.76,1,0,B
3,18281,110,80.82,0,0,C
4,18278,3,173.90,1,0,B
...,...,...,...,...,...,...
2969,12354,162,1079.40,0,0,C
2970,12353,134,89.00,0,0,C
2971,12352,2,632.50,1,0,B
2972,12348,5,677.00,1,0,B


# Building the data

In [12]:
x = df_sales_n[['201104',
       '201105', '201106', '201107', '201108', '201109']]
y = df_sales_n['201110'].to_list()

In [13]:
y_true = []

for i in range(len(y)):
    b_units = x['201108'].iloc[i] + x['201109'].iloc[i]   
    #if y[i]>0 or b_units>0: y_true.append(1)
    if y[i]>0: y_true.append(1)
    else: y_true.append(0)

In [14]:
print(len(y))
print(len(y_true))

4372
4372


In [15]:
print(y[:10])
print(y_true[:10])

[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]


In [16]:
c = 0
for i in range(len(y)):
    if (y[i] == 0 and y_true[i]==0) or (y[i]==y_true[i]):
        c+=1
print(c)

3926


In [17]:
X_train, X_test, y_train, y_test = train_test_split(x, y_true, test_size=0.33, random_state=73, stratify = y_true)

In [18]:
print(X_train.shape)
print(len(y_train))
print(X_test.shape)
print(len(y_test))

(2929, 6)
2929
(1443, 6)
1443


In [19]:
svr_model = SVR(kernel='rbf')
svr_model.fit(X_train, y_train)
y_pred = svr_model.predict(X_test)
y_pred

array([0.13518017, 0.09985061, 0.11127934, ..., 0.1002525 , 0.11847965,
       0.13316591])

In [20]:
y_pred_th = []

for i in range(len(y_pred)):
    if y_pred[i]>=0.5: y_pred_th.append(1)
    else: y_pred_th.append(0)
        
acc_val = accuracy_score(y_test, y_pred_th)

print(f'Accuracy: {acc_val}')

Accuracy: 0.6964656964656964


In [21]:
pkl_model_name = "./models/rbf/best_rbf_model.pkl"  

with open(pkl_model_name, 'wb') as file:  
    pickle.dump(svr_model, file)

### Using mean zero - std 1

In [22]:
scaler_trn = StandardScaler()
scaler_tst = StandardScaler()

X_train_s = scaler_trn.fit_transform(X_train)
X_test_s = scaler_tst.fit_transform(X_test)

In [23]:
svr_model = SVR(kernel='rbf')
svr_model.fit(X_train_s, y_train)
y_pred = svr_model.predict(X_test_s)
y_pred

array([ 0.38734259,  0.16717465, -0.04873708, ...,  0.5867872 ,
        0.30985903,  0.48934132])

In [24]:
y_pred_th = []

for i in range(len(y_pred)):
    if y_pred[i]>=0.5: y_pred_th.append(1)
    else: y_pred_th.append(0)
        
acc_val = accuracy_score(y_test, y_pred_th)

print(f'Accuracy: {acc_val}')

Accuracy: 0.5821205821205822


## Using DT CLF

In [23]:
from sklearn.tree import DecisionTreeClassifier

In [24]:
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [26]:
acc_val = accuracy_score(y_test, y_pred)

print(f'Accuracy: {acc_val}')

Accuracy: 0.6874566874566874


In [28]:
y_pred_prob = clf.predict_proba(X_test)
y_pred_prob

array([[0.8       , 0.2       ],
       [0.79130435, 0.20869565],
       [0.75      , 0.25      ],
       ...,
       [0.69601677, 0.30398323],
       [0.625     , 0.375     ],
       [0.        , 1.        ]])

# Using logistic model

In [29]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression()

In [30]:
model.score(X_test, y_test)*100

70.47817047817048

In [31]:
model_name = './models/logistic/best_logic_mod.sav'

pickle.dump(model, open(model_name, 'wb'))

In [33]:
X_test.head(3)

Unnamed: 0,201104,201105,201106,201107,201108,201109
2465,0,1,0,1,0,1
930,0,0,0,1,0,0
381,0,0,2,0,0,1


In [36]:
best_model = pickle.load(open(model_name, 'rb'))

print(best_model.predict([[0, 1, 1, 1, 1, 1]]))
print(best_model.predict_proba([[0, 1, 1, 1, 1, 1]]))

[0]
[[0.53966477 0.46033523]]
