# Invoice Punctuality Prediction using Several Machine learning Models

## Import Libraries

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split

## Read the dataset CSV

In [2]:
# Read the csv file
df = pd.read_csv('account_receivable.csv')

# Show the csv file
df.head()

Unnamed: 0,countryCode,customerID,PaperlessDate,invoiceNumber,InvoiceDate,DueDate,InvoiceAmount,Disputed,SettledDate,PaperlessBill,DaysToSettle,DaysLate
0,391,0379-NEVHP,4/6/2013,611365,1/2/2013,2/1/2013,55.94,No,1/15/2013,Paper,13,0
1,406,8976-AMJEO,3/3/2012,7900770,1/26/2013,2/25/2013,61.74,Yes,3/3/2013,Electronic,36,6
2,391,2820-XGXSB,1/26/2012,9231909,7/3/2013,8/2/2013,65.88,No,7/8/2013,Electronic,5,0
3,406,9322-YCTQO,4/6/2012,9888306,2/10/2013,3/12/2013,105.92,No,3/17/2013,Electronic,35,5
4,818,6627-ELFBK,11/26/2012,15752855,10/25/2012,11/24/2012,72.27,Yes,11/28/2012,Paper,34,4


In [3]:
# Check the shape of the file
df.shape

(2466, 12)

## Data Cleaning

In [4]:
# Check missing value
df.isna().sum()


countryCode      0
customerID       0
PaperlessDate    0
invoiceNumber    0
InvoiceDate      0
DueDate          0
InvoiceAmount    0
Disputed         0
SettledDate      0
PaperlessBill    0
DaysToSettle     0
DaysLate         0
dtype: int64

Note: there's no missing value

In [5]:
# Check duplicate
df.duplicated().sum()

0

Note: there's no duplicate

## Feature Engineering

### Rename Columns to Increase Readability

In [6]:
# Define columns name map
columns_map = {'countryCode':'country_code',
               'customerID':'customer_id',
               'PaperlessDate':'paperless_date',
               'invoiceNumber':'invoice_number',
               'InvoiceDate':'invoice_date',
               'DueDate':'due_date',
               'InvoiceAmount':'invoice_amount',
               'Disputed':'is_disputed',
               'SettledDate':'settled_date',
               'PaperlessBill':'is_paperless',
               'DaysToSettle':'days_to_settle',
               'DaysLate':'days_late'
               }

# Set the new columns name based on the map
df.rename(columns = columns_map, inplace = True)
df.head()

Unnamed: 0,country_code,customer_id,paperless_date,invoice_number,invoice_date,due_date,invoice_amount,is_disputed,settled_date,is_paperless,days_to_settle,days_late
0,391,0379-NEVHP,4/6/2013,611365,1/2/2013,2/1/2013,55.94,No,1/15/2013,Paper,13,0
1,406,8976-AMJEO,3/3/2012,7900770,1/26/2013,2/25/2013,61.74,Yes,3/3/2013,Electronic,36,6
2,391,2820-XGXSB,1/26/2012,9231909,7/3/2013,8/2/2013,65.88,No,7/8/2013,Electronic,5,0
3,406,9322-YCTQO,4/6/2012,9888306,2/10/2013,3/12/2013,105.92,No,3/17/2013,Electronic,35,5
4,818,6627-ELFBK,11/26/2012,15752855,10/25/2012,11/24/2012,72.27,Yes,11/28/2012,Paper,34,4


### Convert days_late (target) into Categorical Data

In [7]:
# Define a function to get a dummy variable based on wheter the data is on-time (1) or late (0)
def label_is_on_time(data, column):
    if data[column] > 0:
        return 0
    else:
        return 1 

In [8]:
# Apply the function to create a new column
df['is_on_time'] = df.apply(label_is_on_time, args=('days_late', ), axis=1)

# Drop the columns
df = df.drop('days_late', axis=1)
df.head()

Unnamed: 0,country_code,customer_id,paperless_date,invoice_number,invoice_date,due_date,invoice_amount,is_disputed,settled_date,is_paperless,days_to_settle,is_on_time
0,391,0379-NEVHP,4/6/2013,611365,1/2/2013,2/1/2013,55.94,No,1/15/2013,Paper,13,1
1,406,8976-AMJEO,3/3/2012,7900770,1/26/2013,2/25/2013,61.74,Yes,3/3/2013,Electronic,36,0
2,391,2820-XGXSB,1/26/2012,9231909,7/3/2013,8/2/2013,65.88,No,7/8/2013,Electronic,5,1
3,406,9322-YCTQO,4/6/2012,9888306,2/10/2013,3/12/2013,105.92,No,3/17/2013,Electronic,35,0
4,818,6627-ELFBK,11/26/2012,15752855,10/25/2012,11/24/2012,72.27,Yes,11/28/2012,Paper,34,0


### Convert is_disputed into Categorical Data

In [9]:
# Define a function to set a dummy variable
def label_is_disputed(data, column):
    if data[column] == 'Yes':
        return 1
    else:
        return 0

In [10]:
# Apply the function to the column
df['is_disputed'] = df.apply(label_is_disputed, args=('is_disputed', ), axis=1)
df.head()

Unnamed: 0,country_code,customer_id,paperless_date,invoice_number,invoice_date,due_date,invoice_amount,is_disputed,settled_date,is_paperless,days_to_settle,is_on_time
0,391,0379-NEVHP,4/6/2013,611365,1/2/2013,2/1/2013,55.94,0,1/15/2013,Paper,13,1
1,406,8976-AMJEO,3/3/2012,7900770,1/26/2013,2/25/2013,61.74,1,3/3/2013,Electronic,36,0
2,391,2820-XGXSB,1/26/2012,9231909,7/3/2013,8/2/2013,65.88,0,7/8/2013,Electronic,5,1
3,406,9322-YCTQO,4/6/2012,9888306,2/10/2013,3/12/2013,105.92,0,3/17/2013,Electronic,35,0
4,818,6627-ELFBK,11/26/2012,15752855,10/25/2012,11/24/2012,72.27,1,11/28/2012,Paper,34,0


### Get Dummies out of Categorical Data

### country_code

In [11]:
# Define a function to get the dummy data
def get_dummies(data, columns):
    data = pd.get_dummies(data, columns=columns, drop_first=True, dtype=int)
    
    return data

In [12]:
# Get the dummies variables out of country code
df = get_dummies(df, ['country_code'])
df.head()

Unnamed: 0,customer_id,paperless_date,invoice_number,invoice_date,due_date,invoice_amount,is_disputed,settled_date,is_paperless,days_to_settle,is_on_time,country_code_406,country_code_770,country_code_818,country_code_897
0,0379-NEVHP,4/6/2013,611365,1/2/2013,2/1/2013,55.94,0,1/15/2013,Paper,13,1,0,0,0,0
1,8976-AMJEO,3/3/2012,7900770,1/26/2013,2/25/2013,61.74,1,3/3/2013,Electronic,36,0,1,0,0,0
2,2820-XGXSB,1/26/2012,9231909,7/3/2013,8/2/2013,65.88,0,7/8/2013,Electronic,5,1,0,0,0,0
3,9322-YCTQO,4/6/2012,9888306,2/10/2013,3/12/2013,105.92,0,3/17/2013,Electronic,35,0,1,0,0,0
4,6627-ELFBK,11/26/2012,15752855,10/25/2012,11/24/2012,72.27,1,11/28/2012,Paper,34,0,0,0,1,0


### is_paperless

In [13]:
# Define the function
def label_is_paperless(data, column):
    if data[column] == 'Electronic':
        return 1
    else:
        return 0 

In [14]:
# Apply the function to the column
df['is_paperless'] = df.apply(label_is_paperless, args=('is_paperless', ), axis=1)
df.head()

Unnamed: 0,customer_id,paperless_date,invoice_number,invoice_date,due_date,invoice_amount,is_disputed,settled_date,is_paperless,days_to_settle,is_on_time,country_code_406,country_code_770,country_code_818,country_code_897
0,0379-NEVHP,4/6/2013,611365,1/2/2013,2/1/2013,55.94,0,1/15/2013,0,13,1,0,0,0,0
1,8976-AMJEO,3/3/2012,7900770,1/26/2013,2/25/2013,61.74,1,3/3/2013,1,36,0,1,0,0,0
2,2820-XGXSB,1/26/2012,9231909,7/3/2013,8/2/2013,65.88,0,7/8/2013,1,5,1,0,0,0,0
3,9322-YCTQO,4/6/2012,9888306,2/10/2013,3/12/2013,105.92,0,3/17/2013,1,35,0,1,0,0,0
4,6627-ELFBK,11/26/2012,15752855,10/25/2012,11/24/2012,72.27,1,11/28/2012,0,34,0,0,0,1,0


### Get a label to indicate if the paperless is issued before or after the invoice date

In [15]:
# Define the function
def label_is_date_later(data, column_1, column_2):
    if data[column_1] > data[column_2]:
        return 1
    else:
        return 0 

# Run the function
df['is_paperless_later'] = df.apply(label_is_date_later, args=('paperless_date', 'invoice_date'), axis=1)
df.head()

Unnamed: 0,customer_id,paperless_date,invoice_number,invoice_date,due_date,invoice_amount,is_disputed,settled_date,is_paperless,days_to_settle,is_on_time,country_code_406,country_code_770,country_code_818,country_code_897,is_paperless_later
0,0379-NEVHP,4/6/2013,611365,1/2/2013,2/1/2013,55.94,0,1/15/2013,0,13,1,0,0,0,0,1
1,8976-AMJEO,3/3/2012,7900770,1/26/2013,2/25/2013,61.74,1,3/3/2013,1,36,0,1,0,0,0,1
2,2820-XGXSB,1/26/2012,9231909,7/3/2013,8/2/2013,65.88,0,7/8/2013,1,5,1,0,0,0,0,0
3,9322-YCTQO,4/6/2012,9888306,2/10/2013,3/12/2013,105.92,0,3/17/2013,1,35,0,1,0,0,0,1
4,6627-ELFBK,11/26/2012,15752855,10/25/2012,11/24/2012,72.27,1,11/28/2012,0,34,0,0,0,1,0,1


## Remove Unnecessary and Potential-to-Leak Data

In [16]:
# Remove data that will hinder the prediction of future data and/or will leak the data
columns = ['customer_id', 'invoice_number', 'settled_date', 'days_to_settle']
df = df.drop(columns, axis=1)
df.head()

Unnamed: 0,paperless_date,invoice_date,due_date,invoice_amount,is_disputed,is_paperless,is_on_time,country_code_406,country_code_770,country_code_818,country_code_897,is_paperless_later
0,4/6/2013,1/2/2013,2/1/2013,55.94,0,0,1,0,0,0,0,1
1,3/3/2012,1/26/2013,2/25/2013,61.74,1,1,0,1,0,0,0,1
2,1/26/2012,7/3/2013,8/2/2013,65.88,0,1,1,0,0,0,0,0
3,4/6/2012,2/10/2013,3/12/2013,105.92,0,1,0,1,0,0,0,1
4,11/26/2012,10/25/2012,11/24/2012,72.27,1,0,0,0,0,1,0,1


## Get Cyclical Variable out of Date

### Convert Data Type to Date

In [17]:
# Define a function to conver
def convert_to_date(data, *columns):
    for col in columns:
        data[col] = pd.to_datetime(data[col].stack()).unstack()
    
    return data

# Convert the data type
df = convert_to_date(df,['paperless_date','invoice_date', 'due_date'])

# Check the data types
df.dtypes

paperless_date        datetime64[ns]
invoice_date          datetime64[ns]
due_date              datetime64[ns]
invoice_amount               float64
is_disputed                    int64
is_paperless                   int64
is_on_time                     int64
country_code_406               int64
country_code_770               int64
country_code_818               int64
country_code_897               int64
is_paperless_later             int64
dtype: object

### Get the Month and Day of Week of the Date

In [18]:
def get_m_dow(data, column):
    month = data[column].dt.strftime('%m').astype(int)
    day = data[column].dt.strftime('%w').astype(int)
    data = data.drop(column, axis=1)
    
    return data, month, day
    

In [19]:
# Get the Year, Month, and Day of Weekk of Paperless Date
df, df['paperless_month'], df['paperless_dow'] = get_m_dow(df, 'paperless_date')
df.head()

Unnamed: 0,invoice_date,due_date,invoice_amount,is_disputed,is_paperless,is_on_time,country_code_406,country_code_770,country_code_818,country_code_897,is_paperless_later,paperless_month,paperless_dow
0,2013-01-02,2013-02-01,55.94,0,0,1,0,0,0,0,1,4,6
1,2013-01-26,2013-02-25,61.74,1,1,0,1,0,0,0,1,3,6
2,2013-07-03,2013-08-02,65.88,0,1,1,0,0,0,0,0,1,4
3,2013-02-10,2013-03-12,105.92,0,1,0,1,0,0,0,1,4,5
4,2012-10-25,2012-11-24,72.27,1,0,0,0,0,1,0,1,11,1


In [20]:
# Get the Year, Month, and Day of Weekk of Invoice Date
df, df['invoice_month'], df['invoice_dow'] = get_m_dow(df, 'invoice_date')
df.head()

Unnamed: 0,due_date,invoice_amount,is_disputed,is_paperless,is_on_time,country_code_406,country_code_770,country_code_818,country_code_897,is_paperless_later,paperless_month,paperless_dow,invoice_month,invoice_dow
0,2013-02-01,55.94,0,0,1,0,0,0,0,1,4,6,1,3
1,2013-02-25,61.74,1,1,0,1,0,0,0,1,3,6,1,6
2,2013-08-02,65.88,0,1,1,0,0,0,0,0,1,4,7,3
3,2013-03-12,105.92,0,1,0,1,0,0,0,1,4,5,2,0
4,2012-11-24,72.27,1,0,0,0,0,1,0,1,11,1,10,4


In [21]:
# Get the Year, Month, and Day of Weekk of Due Date
df, df['due_month'], df['due_dow'] = get_m_dow(df, 'due_date')
df.head()

Unnamed: 0,invoice_amount,is_disputed,is_paperless,is_on_time,country_code_406,country_code_770,country_code_818,country_code_897,is_paperless_later,paperless_month,paperless_dow,invoice_month,invoice_dow,due_month,due_dow
0,55.94,0,0,1,0,0,0,0,1,4,6,1,3,2,5
1,61.74,1,1,0,1,0,0,0,1,3,6,1,6,2,1
2,65.88,0,1,1,0,0,0,0,0,1,4,7,3,8,5
3,105.92,0,1,0,1,0,0,0,1,4,5,2,0,3,2
4,72.27,1,0,0,0,0,1,0,1,11,1,10,4,11,6


### Convert Cyclical Data (Month & Day of Week) into a Sine and Cosine Value

Reference: </br>
- https://www.ideadrops.info/post/how-to-encode-cyclic-time
- https://webcache.googleusercontent.com/search?q=cache:https://towardsdatascience.com/stop-one-hot-encoding-your-time-based-features-24c699face2f



In [22]:
# Define a function to get the sin and cos value of the month value
def get_sin_cos_month(data, column):
    sin_col_name = column + '_sin'
    cos_col_name = column + '_cos'
    data[sin_col_name] = np.sin(data[column]*(2.*np.pi/12))
    data[cos_col_name] = np.cos(data[column]*(2.*np.pi/12))
    
    data = data.drop(column, axis=1)
    
    return data

In [23]:
# Define a function to get the sin and cos value of the day of week value
def get_sin_cos_dow(data, column):
    sin_col_name = column + '_sin'
    cos_col_name = column + '_cos'
    data[sin_col_name] = np.sin(data[column]*(2.*np.pi/7))
    data[cos_col_name] = np.cos(data[column]*(2.*np.pi/7))
    
    data = data.drop(column, axis=1)
    
    return data

In [24]:
cyclical_month_columns = [
    'paperless_month',
    'invoice_month',
    'due_month'
]

cyclical_dow_columns = [
    'paperless_dow',
    'invoice_dow',
    'due_dow'
]

In [25]:
for i in cyclical_month_columns:
    df = get_sin_cos_month(df, i)

In [26]:
for i in cyclical_dow_columns:
    df = get_sin_cos_dow(df, i)

In [27]:
df.head()

Unnamed: 0,invoice_amount,is_disputed,is_paperless,is_on_time,country_code_406,country_code_770,country_code_818,country_code_897,is_paperless_later,paperless_month_sin,...,invoice_month_sin,invoice_month_cos,due_month_sin,due_month_cos,paperless_dow_sin,paperless_dow_cos,invoice_dow_sin,invoice_dow_cos,due_dow_sin,due_dow_cos
0,55.94,0,0,1,0,0,0,0,1,0.866025,...,0.5,0.866025,0.866025,0.5,-0.781831,0.62349,0.433884,-0.900969,-0.974928,-0.222521
1,61.74,1,1,0,1,0,0,0,1,1.0,...,0.5,0.866025,0.866025,0.5,-0.781831,0.62349,-0.781831,0.62349,0.781831,0.62349
2,65.88,0,1,1,0,0,0,0,0,0.5,...,-0.5,-0.866025,-0.866025,-0.5,-0.433884,-0.900969,0.433884,-0.900969,-0.974928,-0.222521
3,105.92,0,1,0,1,0,0,0,1,0.866025,...,0.866025,0.5,1.0,6.123234000000001e-17,-0.974928,-0.222521,0.0,1.0,0.974928,-0.222521
4,72.27,1,0,0,0,0,1,0,1,-0.5,...,-0.866025,0.5,-0.5,0.8660254,0.781831,0.62349,-0.433884,-0.900969,-0.781831,0.62349


## Split Input-Output

In [28]:
def extract_input_output(data,
                       output_column_name):
    """
    Fungsi untuk memisahkan data input dan output
    :param data: <pandas dataframe> data seluruh sample
    :param output_column_name: <string> nama kolom output
    :return input_data: <pandas dataframe> data input
    :return output_data: <pandas series> data output
    """
    output_data = data[output_column_name]
    input_data = data.drop(output_column_name,
                           axis = 1)
    
    return input_data, output_data

In [29]:
X, y = extract_input_output(data = df,
                          output_column_name = "is_on_time")

In [30]:
X.head()

Unnamed: 0,invoice_amount,is_disputed,is_paperless,country_code_406,country_code_770,country_code_818,country_code_897,is_paperless_later,paperless_month_sin,paperless_month_cos,invoice_month_sin,invoice_month_cos,due_month_sin,due_month_cos,paperless_dow_sin,paperless_dow_cos,invoice_dow_sin,invoice_dow_cos,due_dow_sin,due_dow_cos
0,55.94,0,0,0,0,0,0,1,0.866025,-0.5,0.5,0.866025,0.866025,0.5,-0.781831,0.62349,0.433884,-0.900969,-0.974928,-0.222521
1,61.74,1,1,1,0,0,0,1,1.0,6.123234000000001e-17,0.5,0.866025,0.866025,0.5,-0.781831,0.62349,-0.781831,0.62349,0.781831,0.62349
2,65.88,0,1,0,0,0,0,0,0.5,0.8660254,-0.5,-0.866025,-0.866025,-0.5,-0.433884,-0.900969,0.433884,-0.900969,-0.974928,-0.222521
3,105.92,0,1,1,0,0,0,1,0.866025,-0.5,0.866025,0.5,1.0,6.123234000000001e-17,-0.974928,-0.222521,0.0,1.0,0.974928,-0.222521
4,72.27,1,0,0,0,1,0,1,-0.5,0.8660254,-0.866025,0.5,-0.5,0.8660254,0.781831,0.62349,-0.433884,-0.900969,-0.781831,0.62349


In [31]:
y.head()

0    1
1    0
2    1
3    0
4    0
Name: is_on_time, dtype: int64

## Split Train-Test

In [32]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.25,
                                                    random_state = 12)

In [33]:
# Sanity check the split result
print(X_train.shape)
print(X_test.shape)

(1849, 20)
(617, 20)


In [34]:
# Ratio
X_test.shape[0] / X.shape[0]

# Hasil 0.25 - sesuai dengan test_size kita

0.2502027575020276

## EDA di Train Dataset

## Standardize the Data

In [35]:
# Buat fungsi
def data_standardizer(data):
    """
    Fungsi untuk melakukan standarisasi data
    :param data: <pandas dataframe> sampel data
    :return standardized_data: <pandas dataframe> sampel data standard
    :return standardizer: method untuk standardisasi data
    """
    data_columns = data.columns  # agar nama kolom tidak hilang
    data_index = data.index  # agar index tidak hilang

    # buat (fit) standardizer
    standardizer = StandardScaler()
    standardizer.fit(data)

    # transform data
    standardized_data_raw = standardizer.transform(data)
    standardized_data = pd.DataFrame(standardized_data_raw)
    standardized_data.columns = data_columns
    standardized_data.index = data_index

    return standardized_data, standardizer

In [36]:
X_train_std, standardizer = data_standardizer(data = X_train)

In [37]:
X_train_std.head()

Unnamed: 0,invoice_amount,is_disputed,is_paperless,country_code_406,country_code_770,country_code_818,country_code_897,is_paperless_later,paperless_month_sin,paperless_month_cos,invoice_month_sin,invoice_month_cos,due_month_sin,due_month_cos,paperless_dow_sin,paperless_dow_cos,invoice_dow_sin,invoice_dow_cos,due_dow_sin,due_dow_cos
685,0.692876,-0.547979,-0.980185,-0.537124,-0.510289,-0.430414,-0.446197,0.956075,1.210587,-0.626385,0.700082,1.321681,1.256347,0.760291,-0.111428,1.510835,-1.097538,0.895812,1.11623,0.875892
61,0.293405,-0.547979,-0.980185,-0.537124,-0.510289,-0.430414,-0.446197,0.956075,1.394469,0.108043,-1.181916,0.78856,-0.666233,1.281432,1.240915,-0.267949,-1.097538,0.895812,1.11623,0.875892
360,-1.85136,-0.547979,-0.980185,-0.537124,1.959672,-0.430414,-0.446197,0.956075,1.210587,-0.626385,-1.181916,0.78856,-0.666233,1.281432,1.240915,-0.267949,0.612604,-1.271842,-1.37284,-0.318381
1339,-0.849487,1.824887,-0.980185,-0.537124,-0.510289,-0.430414,2.241161,0.956075,-0.664305,-1.164023,1.204363,-0.667954,0.741193,-1.184634,0.973067,0.963008,-1.097538,0.895812,1.11623,0.875892
1007,-0.066758,-0.547979,-0.980185,-0.537124,-0.510289,2.323343,-0.446197,0.956075,-1.350563,0.108043,0.700082,1.321681,1.256347,0.760291,0.490422,-1.255099,-1.369166,-0.307145,0.00849,1.407393


In [38]:
def standardize(data):
    data_columns = data.columns  # agar nama kolom tidak hilang
    data_index = data.index  # agar index tidak hilang

    data_std = pd.DataFrame(standardizer.transform(data))

    data_std.columns = data_columns
    data_std.index = data_index
    
    return data_std

X_test_std = standardize(X_test)

## Predict using various ML Models

### Baseline

In [39]:
# Predict the output using test dataset
n = y_test.shape[0]

y_pred = pd.Series(1, index=range(n))


In [40]:
# Get the confusion matrix performance of the dataset
base_accuracy = accuracy_score(y_test, y_pred)
base_precision = precision_score(y_test, y_pred)
base_recall = recall_score(y_test, y_pred)
base_f1 = 2 * (base_precision * base_recall) / (base_precision + base_recall)

print("Accuracy:", base_accuracy)
print("Precision:", base_precision)
print("Recall:", base_recall)
print("F1 Score:", base_f1)

Accuracy: 0.6564019448946515
Precision: 0.6564019448946515
Recall: 1.0
F1 Score: 0.7925636007827788


### Logistic Regression

In [41]:
# Define the model
lr = LogisticRegression()

# Fit the model to the train dataset
lr.fit(X_train_std, y_train)

# Predict the output using test dataset
y_pred = lr.predict(X_test_std)

In [42]:
# Get the confusion matrix performance of the dataset
lr_accuracy = accuracy_score(y_test, y_pred)
lr_precision = precision_score(y_test, y_pred)
lr_recall = recall_score(y_test, y_pred)
lr_f1 = 2 * (lr_precision * lr_recall) / (lr_precision + lr_recall)

print("Accuracy:", lr_accuracy)
print("Precision:", lr_precision)
print("Recall:", lr_recall)
print("F1 Score:", lr_f1)

Accuracy: 0.7504051863857374
Precision: 0.7545638945233266
Recall: 0.9185185185185185
F1 Score: 0.8285077951002228


### Decision Tree

In [43]:
# Define the model
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()

# Fit the model to the train dataset
dtc.fit(X_train_std, y_train)

# Predict the output using test dataset
y_pred = dtc.predict(X_test_std)

In [44]:
# Get the confusion matrix performance of the dataset
dtc_accuracy = accuracy_score(y_test, y_pred)
dtc_precision = precision_score(y_test, y_pred)
dtc_recall = recall_score(y_test, y_pred)
dtc_f1 = 2 * (dtc_precision * dtc_recall) / (dtc_precision + dtc_recall)


print("Accuracy:", dtc_accuracy)
print("Precision:", dtc_precision)
print("Recall:", dtc_recall)
print("F1 Score:", dtc_f1)

Accuracy: 0.7617504051863857
Precision: 0.8241206030150754
Recall: 0.8098765432098766
F1 Score: 0.8169364881693649


### Random Forest

In [45]:
# Define the model
rfc = RandomForestClassifier()

# Fit the model to the train dataset
rfc.fit(X_train_std, y_train)

# Predict the output using test dataset
y_pred = rfc.predict(X_test_std)

In [46]:
# Get the confusion matrix performance of the dataset
rfc_accuracy = accuracy_score(y_test, y_pred)
rfc_precision = precision_score(y_test, y_pred)
rfc_recall = recall_score(y_test, y_pred)
rfc_f1 = 2 * (rfc_precision * rfc_recall) / (rfc_precision + rfc_recall)

print("Accuracy:", rfc_accuracy)
print("Precision:", rfc_precision)
print("Recall:", rfc_recall)
print("F1 Score:", rfc_f1)

Accuracy: 0.7860615883306321
Precision: 0.8109339407744874
Recall: 0.8790123456790123
F1 Score: 0.8436018957345971


### Gradient Boosting Classifier

In [47]:
# Define the model
gbc = GradientBoostingClassifier()

# Fit the model to the train dataset
gbc.fit(X_train_std, y_train)

#Predict the output of test dataset
y_pred = gbc.predict(X_test_std)


In [48]:
# Get the confusion matrix performance of the dataset
gbc_accuracy = accuracy_score(y_test, y_pred)
gbc_precision = precision_score(y_test, y_pred)
gbc_recall = recall_score(y_test, y_pred)
gbc_f1 = 2 * (gbc_precision * gbc_recall) / (gbc_precision + gbc_recall)

print("Accuracy:", gbc_accuracy)
print("Precision:", gbc_precision)
print("Recall:", gbc_recall)
print("F1 Score:", gbc_f1)

Accuracy: 0.7909238249594813
Precision: 0.7974137931034483
Recall: 0.9135802469135802
F1 Score: 0.8515535097813578


### SVM

In [49]:
#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
clf.fit(X_train_std, y_train)

#Predict the output of test dataset
y_pred = clf.predict(X_test_std)

In [50]:
# Get the confusion matrix performance of the dataset
svm_accuracy = accuracy_score(y_test, y_pred)
svm_precision = precision_score(y_test, y_pred)
svm_recall = recall_score(y_test, y_pred)
svm_f1 = 2 * (svm_precision * svm_recall) / (svm_precision + svm_recall)

print("Accuracy:", svm_accuracy)
print("Precision:", svm_precision)
print("Recall:", svm_recall)
print("F1 Score:", svm_f1)

Accuracy: 0.7374392220421394
Precision: 0.7515527950310559
Recall: 0.8962962962962963
F1 Score: 0.8175675675675674


## Compare the Performance of Each Model

In [51]:
base_perf = []

base_perf.append(base_accuracy)
base_perf.append(base_precision)
base_perf.append(base_recall)
base_perf.append(base_f1)

In [52]:
lr_perf = []

lr_perf.append(lr_accuracy)
lr_perf.append(lr_precision)
lr_perf.append(lr_recall)
lr_perf.append(lr_f1)

In [53]:
dtc_perf = []

dtc_perf.append(dtc_accuracy)
dtc_perf.append(dtc_precision)
dtc_perf.append(dtc_recall)
dtc_perf.append(dtc_f1)

In [54]:
rfc_perf = []

rfc_perf.append(rfc_accuracy)
rfc_perf.append(rfc_precision)
rfc_perf.append(rfc_recall)
rfc_perf.append(rfc_f1)

In [55]:
gbc_perf = []

gbc_perf.append(gbc_accuracy)
gbc_perf.append(gbc_precision)
gbc_perf.append(gbc_recall)
gbc_perf.append(gbc_f1)

In [56]:
svm_perf = []

svm_perf.append(svm_accuracy)
svm_perf.append(svm_precision)
svm_perf.append(svm_recall)
svm_perf.append(svm_f1)

In [57]:
models_perf = [base_perf, lr_perf, dtc_perf, rfc_perf, gbc_perf, svm_perf]

df_comparison = pd.DataFrame(models_perf).transpose()
df_comparison.columns = ['baseline', 'log_reg', 'decis_tree', 'rand_forest', 'grad_boost', 'svm']
df_comparison.index = ['accuracy', 'precision', 'recall', 'f1_score']
df_comparison

Unnamed: 0,baseline,log_reg,decis_tree,rand_forest,grad_boost,svm
accuracy,0.656402,0.750405,0.76175,0.786062,0.790924,0.737439
precision,0.656402,0.754564,0.824121,0.810934,0.797414,0.751553
recall,1.0,0.918519,0.809877,0.879012,0.91358,0.896296
f1_score,0.792564,0.828508,0.816936,0.843602,0.851554,0.817568


Insights:
1. Baseline method (by predictiong that all the invoice GONNA BE ON TIME) can find all the on time invoice, hence the perfect recall score of 1.
2. But the PRECISION is only 65% thus the 35% of them, the late invoice, gonna be an inefficiency.
3. On the other hand, LOGISTIC, RANDOM FOREST, GRADIENT BOOST, and SVM could find more than 85% of the on time invoice, with the PRECISION of XX to XX %.
4. YANG TERBAIK ADALAH XXX -->

Reference: 
- https://builtin.com/data-science/precision-and-recall
- https://www.v7labs.com/blog/f1-score-guide

Decision:
- We'll be using **Gradient Boosting Classifier** model to predict the invoice punctuality and proceed to deploy the model as a web service.

In [82]:
params = {
    'max_depth': np.arange(3, 11),
    'subsample': np.arange(0.5, 1.0, 0.1),
    'max_features': ['sqrt', 'log2', None]
    }

search = RandomizedSearchCV(GradientBoostingClassifier(random_state=0, n_estimators=500, n_iter_no_change=5), 
                            params, n_iter=50, cv=3, n_jobs=-1)

search.fit(X_train_std, y_train)

print(search.best_params_)

clf = search.best_estimator_
print(f'R2 score (train): {clf.score(X_train_std, y_train):.4f}')
print(f'R2 score (test): {clf.score(X_test_std, y_test):.4f}')

{'subsample': 0.7, 'max_features': None, 'max_depth': 9}
R2 score (train): 0.9686
R2 score (test): 0.8071


In [83]:
clf.fit(X_train_std, y_train)
print(clf.score(X_train_std, y_train))

y_pred = clf.predict(X_test_std)

# Get the confusion matrix performance of the dataset
clf_accuracy = accuracy_score(y_test, y_pred)
clf_precision = precision_score(y_test, y_pred)
clf_recall = recall_score(y_test, y_pred)
clf_f1 = 2 * (clf_precision * clf_recall) / (clf_precision + clf_recall)

print("Accuracy:", clf_accuracy)
print("Precision:", clf_precision)
print("Recall:", clf_recall)
print("F1 Score:", clf_f1)

0.9686316928069226
Accuracy: 0.807131280388979
Precision: 0.8235294117647058
Recall: 0.8987654320987655
F1 Score: 0.859504132231405
