#Supervised ML model for sales data

In [14]:
#Packages related to general operating system & warnings
import os 
import warnings
warnings.filterwarnings('ignore')
#Packages related to data importing, manipulation, exploratory data #analysis, data understanding
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from termcolor import colored as cl # text customization
#Packages related to data visualizaiton
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
#Setting plot sizes and type of plot
plt.rc("font", size=14)
plt.rcParams['axes.grid'] = True
plt.figure(figsize=(6,3))
plt.gray()
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.preprocessing import  PolynomialFeatures, KBinsDiscretizer, FunctionTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer, OrdinalEncoder
import statsmodels.formula.api as smf
import statsmodels.tsa as tsa
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz, export_text
from sklearn.ensemble import BaggingClassifier, BaggingRegressor,RandomForestClassifier,RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor 
from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

<Figure size 432x216 with 0 Axes>

In [15]:
#import data, replace NA values
df=pd.read_csv("sales.csv")
df.Quant =pd.to_numeric(df.Quant, errors ='coerce').fillna(0).astype('int')
df.Val =pd.to_numeric(df.Val, errors ='coerce').fillna(0).astype('int')
df.head


<bound method NDFrame.head of         Unnamed: 0     ID   Prod  Quant    Val  Insp
0                1     v1     p1    182   1665  unkn
1                2     v2     p1   3072   8780  unkn
2                3     v3     p1  20393  76990  unkn
3                4     v4     p1    112   1100  unkn
4                5     v3     p1   6164  20260  unkn
...            ...    ...    ...    ...    ...   ...
401141      411814  v5918  p4126    113   9330  unkn
401142      411815  v2540  p4126    118   3355  unkn
401143      411816  v5896  p4127    143   1835  unkn
401144      411817  v1796  p4127    102   1445  unkn
401145      411818  v2540  p4127    106   1275  unkn

[401146 rows x 6 columns]>

In [16]:
#split data into known and unknown
data = df[df.Insp != 'unkn']
unknown = df[df.Insp == 'unkn']
data.head

<bound method NDFrame.head of         Unnamed: 0     ID   Prod  Quant     Val Insp
48              53    v42    p11  51097  310780   ok
51              56    v45    p11    260    1925   ok
63              68    v42    p11  51282  278770   ok
72              77    v50    p11  46903  281485   ok
77              82    v46    p12    475    2600   ok
...            ...    ...    ...    ...     ...  ...
400795      411458  v3854  p4096   1567   21180   ok
400797      411460  v3179  p4096    907   17350   ok
400798      411461  v4561  p4096   1212   31970   ok
400814      411477  v3179  p4096    650   18290   ok
400815      411478   v739  p4096   2646   59720   ok

[15732 rows x 6 columns]>

In [17]:
#descriptive statistics of known data
Total_transactions = len(data)
normal = len(data[data.Insp == 'ok'])
fraudulent = len(data[data.Insp == 'fraud'])
fraud_percentage = round(fraudulent/normal*100, 2)
print(cl('Total number of Transactions are {}'.format(Total_transactions), attrs = ['bold']))
print(cl('Number of Normal Transactions are {}'.format(normal), attrs = ['bold']))
print(cl('Number of fraudulent Transactions are {}'.format(fraudulent), attrs = ['bold']))
print(cl('Percentage of fraud Transactions is {}'.format(fraud_percentage), attrs = ['bold']))

[1mTotal number of Transactions are 15732[0m
[1mNumber of Normal Transactions are 14462[0m
[1mNumber of fraudulent Transactions are 1270[0m
[1mPercentage of fraud Transactions is 8.78[0m


In [18]:
#change fraud and ok to numerical values
data['Insp'] = data['Insp'].replace(['ok','fraud'],[0,1])

#standardize value and quantity columns
sc = StandardScaler()
amount = data['Quant'].values
data['Amount'] = sc.fit_transform(amount.reshape(-1, 1))
value = data['Val'].values
data['Value'] = sc.fit_transform(value.reshape(-1, 1))

data.shape

(15732, 8)

##Train Data

In [19]:
#defining independent and dependent variables
X = data.drop(['ID', 'Prod', 'Insp'], axis = 1)
y = data.Insp

#splitting training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 50)

X = pd.concat([X_train, y_train], axis=1)

# separate minority and majority classes (oversampling)
not_fraud = X[X.Insp==0]
fraud = X[X.Insp==1]

# upsample minority
from sklearn.utils import resample
fraud_upsampled = resample(fraud,
                          replace=True, # sample with replacement
                          n_samples=len(not_fraud), # match number in majority class
                          random_state=50) # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([not_fraud, fraud_upsampled])

# check new class counts
upsampled.Insp.value_counts()

1    10847
0    10847
Name: Insp, dtype: int64

###Modeling

In [20]:
#Decision Tree
DT = DecisionTreeClassifier(max_depth = 4, criterion = 'entropy')
DT.fit(X_train, y_train)
dt_yhat = DT.predict(X_test)

print('Accuracy score of the Decision Tree model is {}'.format(accuracy_score(y_test, dt_yhat)))
print('F1 score of the Decision Tree model is {}'.format(f1_score(y_test, dt_yhat)))

#check confusion matrix
confusion_matrix(y_test, dt_yhat, labels = [0,1])

Accuracy score of the Decision Tree model is 0.9224510551741673
F1 score of the Decision Tree model is 0.37371663244353187


array([[3537,   78],
       [ 227,   91]])

In [21]:
#K Nearest Neighbors
n = 3
KNN = KNeighborsClassifier(n_neighbors = n)
KNN.fit(X_train, y_train)
knn_yhat = KNN.predict(X_test)

print('Accuracy score of the K-Nearest Neighbors model is {}'.format(accuracy_score(y_test, knn_yhat)))
print('F1 score of the K-Nearest Neighbors model is {}'.format(f1_score(y_test, knn_yhat)))

#check confusion matrix
confusion_matrix(y_test, knn_yhat, labels = [0,1])

#I ran this with several cluster values and 3 clusters allowed for the largest F1 score

Accuracy score of the K-Nearest Neighbors model is 0.9310958555809814
F1 score of the K-Nearest Neighbors model is 0.43186582809224316


array([[3559,   56],
       [ 215,  103]])

In [22]:
#Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_yhat = lr.predict(X_test)

print('Accuracy score of the Logistic Regression model is {}'.format(accuracy_score(y_test, lr_yhat)))
print('F1 score of the Logistic Regression model is {}'.format(f1_score(y_test, lr_yhat)))

#check confusion matrix
confusion_matrix(y_test, lr_yhat, labels = [0,1])

Accuracy score of the Logistic Regression model is 0.9196542079837274
F1 score of the Logistic Regression model is 0.018633540372670808


array([[3614,    1],
       [ 315,    3]])

In [23]:
#Random Forest
rf = RandomForestClassifier(max_depth = 4)
rf.fit(X_train, y_train)
rf_yhat = rf.predict(X_test)

print('Accuracy score of the Random Forest model is {}'.format(accuracy_score(y_test, rf_yhat)))
print('F1 score of the Random Forest model is {}'.format(f1_score(y_test, rf_yhat)))

#check confusion matrix
confusion_matrix(y_test, rf_yhat, labels = [0,1])

Accuracy score of the Random Forest model is 0.9206712433257056
F1 score of the Random Forest model is 0.04294478527607362


array([[3614,    1],
       [ 311,    7]])

In [24]:
#support vector machines
svm = SVC()
svm.fit(X_train, y_train)
svm_yhat = svm.predict(X_test)

print('Accuracy score of the Support Vector Machines model is {}'.format(accuracy_score(y_test, svm_yhat)))
print('F1 score of the Support Vector Machines model is {}'.format(f1_score(y_test, svm_yhat)))

#check confusion matrix
confusion_matrix(y_test, svm_yhat, labels = [0,1])

Accuracy score of the Support Vector Machines model is 0.9196542079837274
F1 score of the Support Vector Machines model is 0.0125


array([[3615,    0],
       [ 316,    2]])

###Running Best Model on unknown data

In [25]:
#standardize value and quantity columns
sc = StandardScaler()
amount1 = unknown['Quant'].values
unknown['Amount'] = sc.fit_transform(amount1.reshape(-1, 1))
value1 = unknown['Val'].values
unknown['Value'] = sc.fit_transform(value1.reshape(-1, 1))

unknown.shape

(385414, 8)

In [26]:
#Use KNN model to make predictions about the data
Xnew = unknown.drop(['ID', 'Prod', 'Insp'], axis = 1)
ynew = KNN.predict(Xnew)

print(ynew)

[0 0 0 ... 1 0 0]


###Running Best Model on All Data

In [27]:
#standardize value and quantity columns
sc = StandardScaler()
amount1 = df['Quant'].values
df['Amount'] = sc.fit_transform(amount1.reshape(-1, 1))
value1 = df['Val'].values
df['Value'] = sc.fit_transform(value1.reshape(-1, 1))

df.shape

(401146, 8)

In [28]:
#Use KNN model to make predictions about the data
Xall = df.drop(['ID', 'Prod', 'Insp'], axis = 1)
yall = KNN.predict(Xall)

print(yall)

dpre = pd.DataFrame(yall.T)
dpre.to_csv('myfile.csv')

[0 0 0 ... 1 0 0]
