## 📝 Fraudulent Firm Prediction (Model Selection)

Given *data about audits of firms*, let's try to predict whether a given firm will be **fraudulent** or not. 

We will use a variety of classification models to make our predictions.

Data Source: https://www.kaggle.com/datasets/sid321axn/audit-data

### Importing Libraries

In [3]:
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", None)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import warnings
warnings.filterwarnings(action="ignore")

In [4]:
# Load the raw audit dataset; the CSV is expected next to the notebook.
data = pd.read_csv("audit_data.csv")
# Bare last expression so the frame is rendered as the cell output.
data

Unnamed: 0,Sector_score,LOCATION_ID,PARA_A,Score_A,Risk_A,PARA_B,Score_B,Risk_B,TOTAL,numbers,Score_B.1,Risk_C,Money_Value,Score_MV,Risk_D,District_Loss,PROB,RiSk_E,History,Prob,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Detection_Risk,Audit_Risk,Risk
0,3.89,23,4.18,0.6,2.508,2.50,0.2,0.500,6.68,5.0,0.2,1.0,3.38,0.2,0.676,2,0.2,0.4,0,0.2,0.0,2.4,8.574,0.4,0.5,1.7148,1
1,3.89,6,0.00,0.2,0.000,4.83,0.2,0.966,4.83,5.0,0.2,1.0,0.94,0.2,0.188,2,0.2,0.4,0,0.2,0.0,2.0,2.554,0.4,0.5,0.5108,0
2,3.89,6,0.51,0.2,0.102,0.23,0.2,0.046,0.74,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.548,0.4,0.5,0.3096,0
3,3.89,6,0.00,0.2,0.000,10.80,0.6,6.480,10.80,6.0,0.6,3.6,11.75,0.6,7.050,2,0.2,0.4,0,0.2,0.0,4.4,17.530,0.4,0.5,3.5060,1
4,3.89,6,0.00,0.2,0.000,0.08,0.2,0.016,0.08,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.416,0.4,0.5,0.2832,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
771,55.57,9,0.49,0.2,0.098,0.40,0.2,0.080,0.89,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.578,0.4,0.5,0.3156,0
772,55.57,16,0.47,0.2,0.094,0.37,0.2,0.074,0.84,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.568,0.4,0.5,0.3136,0
773,55.57,14,0.24,0.2,0.048,0.04,0.2,0.008,0.28,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.456,0.4,0.5,0.2912,0
774,55.57,18,0.20,0.2,0.040,0.00,0.2,0.000,0.20,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.440,0.4,0.5,0.2880,0


In [5]:
# Summarise column dtypes and non-null counts of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 776 entries, 0 to 775
Data columns (total 27 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sector_score    776 non-null    float64
 1   LOCATION_ID     776 non-null    object 
 2   PARA_A          776 non-null    float64
 3   Score_A         776 non-null    float64
 4   Risk_A          776 non-null    float64
 5   PARA_B          776 non-null    float64
 6   Score_B         776 non-null    float64
 7   Risk_B          776 non-null    float64
 8   TOTAL           776 non-null    float64
 9   numbers         776 non-null    float64
 10  Score_B.1       776 non-null    float64
 11  Risk_C          776 non-null    float64
 12  Money_Value     775 non-null    float64
 13  Score_MV        776 non-null    float64
 14  Risk_D          776 non-null    float64
 15  District_Loss   776 non-null    int64  
 16  PROB            776 non-null    float64
 17  RiSk_E          776 non-null    flo

### Preprocessing

In [6]:
# Preprocess a copy so the raw `data` frame stays available unchanged.
df = data.copy()

In [7]:
# Count missing values per column.
df.isna().sum()

Sector_score      0
LOCATION_ID       0
PARA_A            0
Score_A           0
Risk_A            0
PARA_B            0
Score_B           0
Risk_B            0
TOTAL             0
numbers           0
Score_B.1         0
Risk_C            0
Money_Value       1
Score_MV          0
Risk_D            0
District_Loss     0
PROB              0
RiSk_E            0
History           0
Prob              0
Risk_F            0
Score             0
Inherent_Risk     0
CONTROL_RISK      0
Detection_Risk    0
Audit_Risk        0
Risk              0
dtype: int64

In [8]:
# Impute the gap in Money_Value with that column's mean.
# NOTE(review): the mean is computed over the whole frame, i.e. before the
# train/test split further down.
df = df.fillna({'Money_Value': df['Money_Value'].mean()})

In [10]:
# Total number of missing values across every column; expected to be 0 after the fill.
df.isna().sum().sum()

0

In [14]:
# One-hot encode LOCATION_ID: build one 0/1 indicator column per distinct
# location value, then swap the original column for the indicators.
location_dummies = pd.get_dummies(df['LOCATION_ID'], prefix='location', dtype=int)
# Both frames share the same index, so the join simply appends the
# indicator columns after the remaining features.
df = df.drop(columns='LOCATION_ID').join(location_dummies)

In [15]:
# Show the frame after encoding (LOCATION_ID replaced by location_* columns).
df

Unnamed: 0,Sector_score,PARA_A,Score_A,Risk_A,PARA_B,Score_B,Risk_B,TOTAL,numbers,Score_B.1,Risk_C,Money_Value,Score_MV,Risk_D,District_Loss,PROB,RiSk_E,History,Prob,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Detection_Risk,Audit_Risk,Risk,location_1,location_11,location_12,location_13,location_14,location_15,location_16,location_17,location_18,location_19,location_2,location_20,location_21,location_22,location_23,location_24,location_25,location_27,location_28,location_29,location_3,location_30,location_31,location_32,location_33,location_34,location_35,location_36,location_37,location_38,location_39,location_4,location_40,location_41,location_42,location_43,location_44,location_5,location_6,location_7,location_8,location_9,location_LOHARU,location_NUH,location_SAFIDON
0,3.89,4.18,0.6,2.508,2.50,0.2,0.500,6.68,5.0,0.2,1.0,3.38,0.2,0.676,2,0.2,0.4,0,0.2,0.0,2.4,8.574,0.4,0.5,1.7148,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,3.89,0.00,0.2,0.000,4.83,0.2,0.966,4.83,5.0,0.2,1.0,0.94,0.2,0.188,2,0.2,0.4,0,0.2,0.0,2.0,2.554,0.4,0.5,0.5108,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,3.89,0.51,0.2,0.102,0.23,0.2,0.046,0.74,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.548,0.4,0.5,0.3096,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,3.89,0.00,0.2,0.000,10.80,0.6,6.480,10.80,6.0,0.6,3.6,11.75,0.6,7.050,2,0.2,0.4,0,0.2,0.0,4.4,17.530,0.4,0.5,3.5060,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,3.89,0.00,0.2,0.000,0.08,0.2,0.016,0.08,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.416,0.4,0.5,0.2832,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
771,55.57,0.49,0.2,0.098,0.40,0.2,0.080,0.89,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.578,0.4,0.5,0.3156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
772,55.57,0.47,0.2,0.094,0.37,0.2,0.074,0.84,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.568,0.4,0.5,0.3136,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
773,55.57,0.24,0.2,0.048,0.04,0.2,0.008,0.28,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.456,0.4,0.5,0.2912,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
774,55.57,0.20,0.2,0.040,0.00,0.2,0.000,0.20,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.440,0.4,0.5,0.2880,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
# Separate the feature columns (X) from the target column (y).
# NOTE(review): in the rows displayed in this notebook, Audit_Risk equals
# Inherent_Risk * CONTROL_RISK * Detection_Risk, and every displayed row with
# Audit_Risk above 1 has Risk == 1. If Risk is derived from these columns the
# features leak the target - TODO confirm against the dataset documentation.
X = df.drop(columns='Risk')
y = df['Risk']

In [17]:
# Keep 70% of the rows for training and the rest for evaluation;
# the fixed random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    shuffle=True,
    random_state=1,
)
X_train

Unnamed: 0,Sector_score,PARA_A,Score_A,Risk_A,PARA_B,Score_B,Risk_B,TOTAL,numbers,Score_B.1,Risk_C,Money_Value,Score_MV,Risk_D,District_Loss,PROB,RiSk_E,History,Prob,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Detection_Risk,Audit_Risk,location_1,location_11,location_12,location_13,location_14,location_15,location_16,location_17,location_18,location_19,location_2,location_20,location_21,location_22,location_23,location_24,location_25,location_27,location_28,location_29,location_3,location_30,location_31,location_32,location_33,location_34,location_35,location_36,location_37,location_38,location_39,location_4,location_40,location_41,location_42,location_43,location_44,location_5,location_6,location_7,location_8,location_9,location_LOHARU,location_NUH,location_SAFIDON
11,3.89,15.38,0.6,9.228,40.14,0.6,24.084,55.52,5.0,0.2,1.0,0.96,0.2,0.192,2,0.4,0.8,1,0.4,0.4,4.0,35.704,1.2,0.5,21.4224,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
214,2.72,5.65,0.6,3.390,67.16,0.6,40.296,72.81,6.0,0.6,3.6,27.23,0.6,16.338,2,0.2,0.4,0,0.2,0.0,4.8,64.024,0.4,0.5,12.8048,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
88,3.89,6.85,0.6,4.110,65.93,0.6,39.558,72.78,5.5,0.4,2.2,6.02,0.4,2.408,4,0.2,0.8,0,0.2,0.0,4.4,49.076,0.8,0.5,19.6304,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
479,1.85,0.21,0.2,0.042,0.00,0.2,0.000,0.21,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.442,0.4,0.5,0.2884,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
602,55.57,0.60,0.2,0.120,0.46,0.2,0.092,1.06,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.612,0.4,0.5,0.3224,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,55.57,0.00,0.2,0.000,0.00,0.2,0.000,0.00,5.0,0.2,1.0,0.00,0.2,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.400,0.4,0.5,0.2800,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
767,55.57,0.36,0.2,0.072,0.54,0.2,0.108,0.90,5.0,0.2,1.0,0.21,0.2,0.042,2,0.2,0.4,0,0.2,0.0,2.0,1.622,0.4,0.5,0.3244,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
72,3.89,4.24,0.6,2.544,8.06,0.4,3.224,12.30,5.0,0.2,1.0,10.69,0.6,6.414,2,0.2,0.4,0,0.2,0.0,3.4,13.582,0.4,0.5,2.7164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
235,2.72,4.90,0.6,2.940,98.71,0.6,59.226,103.61,5.5,0.4,2.2,82.99,0.6,49.794,2,0.2,0.4,1,0.4,0.4,4.6,114.960,0.8,0.5,45.9840,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [18]:
# Show the training labels; their index lines up with the X_train rows above.
y_train

11     1
214    1
88     1
479    0
602    0
      ..
715    0
767    0
72     1
235    1
37     0
Name: Risk, Length: 543, dtype: int64

In [19]:
# Scale X
# The scaler is fitted on the training split only; the test split is
# transformed with these same statistics in the next cell.
scaler = StandardScaler()
scaler.fit(X_train)

In [20]:
# Apply the fitted scaler to both splits, rewrapping each transformed array
# in a DataFrame that keeps the split's original row index and column names.
# NOTE(review): this overwrites X_train / X_test, so running the cell a
# second time without re-running the split would scale already-scaled data.
X_train, X_test = (
    pd.DataFrame(scaler.transform(part), index=part.index, columns=part.columns)
    for part in (X_train, X_test)
)

In [21]:
# Show the scaled training features.
X_train

Unnamed: 0,Sector_score,PARA_A,Score_A,Risk_A,PARA_B,Score_B,Risk_B,TOTAL,numbers,Score_B.1,Risk_C,Money_Value,Score_MV,Risk_D,District_Loss,PROB,RiSk_E,History,Prob,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Detection_Risk,Audit_Risk,location_1,location_11,location_12,location_13,location_14,location_15,location_16,location_17,location_18,location_19,location_2,location_20,location_21,location_22,location_23,location_24,location_25,location_27,location_28,location_29,location_3,location_30,location_31,location_32,location_33,location_34,location_35,location_36,location_37,location_38,location_39,location_4,location_40,location_41,location_42,location_43,location_44,location_5,location_6,location_7,location_8,location_9,location_LOHARU,location_NUH,location_SAFIDON
11,-0.666896,2.392461,1.368968,2.402906,1.371985,1.657938,1.378710,1.753517,-0.232477,-0.278621,-0.266910,-0.202404,-0.582748,-0.206065,-0.399153,5.252651,1.070359,1.558508,2.660368,1.044634,1.510376,0.429590,1.461936,0.0,0.835180,-0.096404,-0.150329,-0.237529,-0.233171,-0.174243,-0.210311,-0.285785,-0.042954,-0.129823,-0.300602,-0.233171,-0.096404,-0.105703,-0.185164,0.0,-0.042954,-0.074536,-0.105703,-0.105703,-0.162681,-0.060802,-0.042954,-0.129823,-0.205491,-0.042954,0.0,-0.042954,-0.060802,-0.105703,-0.074536,-0.096404,-0.233171,-0.074536,-0.042954,0.0,-0.096404,0.0,-0.254374,-0.224255,-0.060802,2.810985,-0.258452,-0.042954,0.0,-0.042954
214,-0.715284,0.588386,1.368968,0.618393,2.582421,1.657938,2.585408,2.449293,3.426516,4.938314,4.710943,0.222161,1.918693,0.228428,-0.399153,-0.169762,-0.412307,-0.197239,-0.257975,-0.173919,2.450359,1.062614,-0.389565,0.0,0.367093,-0.096404,-0.150329,-0.237529,-0.233171,-0.174243,-0.210311,-0.285785,-0.042954,-0.129823,-0.300602,-0.233171,-0.096404,-0.105703,-0.185164,0.0,-0.042954,-0.074536,-0.105703,-0.105703,-0.162681,-0.060802,-0.042954,-0.129823,-0.205491,-0.042954,0.0,-0.042954,-0.060802,-0.105703,-0.074536,-0.096404,4.288689,-0.074536,-0.042954,0.0,-0.096404,0.0,-0.254374,-0.224255,-0.060802,-0.355747,-0.258452,-0.042954,0.0,-0.042954
88,-0.666896,0.810883,1.368968,0.838477,2.527320,1.657938,2.530477,2.448086,1.597019,2.329847,2.030561,-0.120626,0.667972,-0.146432,1.281003,-0.169762,1.070359,-0.197239,-0.257975,-0.173919,1.980368,0.728488,0.536185,0.0,0.737843,-0.096404,-0.150329,-0.237529,-0.233171,-0.174243,-0.210311,-0.285785,-0.042954,-0.129823,-0.300602,-0.233171,-0.096404,-0.105703,-0.185164,0.0,-0.042954,-0.074536,-0.105703,-0.105703,-0.162681,-0.060802,-0.042954,-0.129823,4.866397,-0.042954,0.0,-0.042954,-0.060802,-0.105703,-0.074536,-0.096404,-0.233171,-0.074536,-0.042954,0.0,-0.096404,0.0,-0.254374,-0.224255,-0.060802,-0.355747,-0.258452,-0.042954,0.0,-0.042954
479,-0.751265,-0.420264,-0.904272,-0.404997,-0.426199,-0.677368,-0.413920,-0.472241,-0.232477,-0.278621,-0.266910,-0.217919,-0.582748,-0.211232,-0.399153,-0.169762,-0.412307,-0.197239,-0.257975,-0.173919,-0.839579,-0.336254,-0.389565,0.0,-0.312767,-0.096404,-0.150329,-0.237529,-0.233171,-0.174243,-0.210311,-0.285785,-0.042954,-0.129823,-0.300602,-0.233171,-0.096404,-0.105703,-0.185164,0.0,-0.042954,-0.074536,-0.105703,-0.105703,-0.162681,-0.060802,-0.042954,-0.129823,-0.205491,-0.042954,0.0,-0.042954,-0.060802,-0.105703,-0.074536,-0.096404,-0.233171,-0.074536,-0.042954,0.0,-0.096404,0.0,-0.254374,-0.224255,-0.060802,2.810985,-0.258452,-0.042954,0.0,-0.042954
602,1.470447,-0.347953,-0.904272,-0.381154,-0.405592,-0.677368,-0.407072,-0.438036,-0.232477,-0.278621,-0.266910,-0.217919,-0.582748,-0.211232,-0.399153,-0.169762,-0.412307,-0.197239,-0.257975,-0.173919,-0.839579,-0.332454,-0.389565,0.0,-0.310920,-0.096404,-0.150329,-0.237529,-0.233171,-0.174243,-0.210311,-0.285785,-0.042954,-0.129823,-0.300602,-0.233171,-0.096404,-0.105703,-0.185164,0.0,-0.042954,-0.074536,-0.105703,-0.105703,6.147009,-0.060802,-0.042954,-0.129823,-0.205491,-0.042954,0.0,-0.042954,-0.060802,-0.105703,-0.074536,-0.096404,-0.233171,-0.074536,-0.042954,0.0,-0.096404,0.0,-0.254374,-0.224255,-0.060802,-0.355747,-0.258452,-0.042954,0.0,-0.042954
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,1.470447,-0.459201,-0.904272,-0.417835,-0.426199,-0.677368,-0.413920,-0.480692,-0.232477,-0.278621,-0.266910,-0.217919,-0.582748,-0.211232,-0.399153,-0.169762,-0.412307,-0.197239,-0.257975,-0.173919,-0.839579,-0.337192,-0.389565,0.0,-0.313224,-0.096404,-0.150329,-0.237529,-0.233171,-0.174243,-0.210311,-0.285785,-0.042954,-0.129823,-0.300602,-0.233171,-0.096404,-0.105703,-0.185164,0.0,-0.042954,-0.074536,9.460444,-0.105703,-0.162681,-0.060802,-0.042954,-0.129823,-0.205491,-0.042954,0.0,-0.042954,-0.060802,-0.105703,-0.074536,-0.096404,-0.233171,-0.074536,-0.042954,0.0,-0.096404,0.0,-0.254374,-0.224255,-0.060802,-0.355747,-0.258452,-0.042954,0.0,-0.042954
767,1.470447,-0.392452,-0.904272,-0.395827,-0.402008,-0.677368,-0.405881,-0.444474,-0.232477,-0.278621,-0.266910,-0.214525,-0.582748,-0.210101,-0.399153,-0.169762,-0.412307,-0.197239,-0.257975,-0.173919,-0.839579,-0.332230,-0.389565,0.0,-0.310812,-0.096404,-0.150329,-0.237529,-0.233171,-0.174243,-0.210311,-0.285785,-0.042954,7.702813,-0.300602,-0.233171,-0.096404,-0.105703,-0.185164,0.0,-0.042954,-0.074536,-0.105703,-0.105703,-0.162681,-0.060802,-0.042954,-0.129823,-0.205491,-0.042954,0.0,-0.042954,-0.060802,-0.105703,-0.074536,-0.096404,-0.233171,-0.074536,-0.042954,0.0,-0.096404,0.0,-0.254374,-0.224255,-0.060802,-0.355747,-0.258452,-0.042954,0.0,-0.042954
72,-0.666896,0.326953,1.368968,0.359794,-0.065129,0.490285,-0.173950,0.014279,-0.232477,-0.278621,-0.266910,-0.045152,1.918693,-0.038629,-0.399153,-0.169762,-0.412307,-0.197239,-0.257975,-0.173919,0.805390,-0.064893,-0.389565,0.0,-0.180884,-0.096404,-0.150329,-0.237529,-0.233171,-0.174243,-0.210311,-0.285785,-0.042954,-0.129823,-0.300602,-0.233171,-0.096404,-0.105703,-0.185164,0.0,-0.042954,-0.074536,-0.105703,-0.105703,6.147009,-0.060802,-0.042954,-0.129823,-0.205491,-0.042954,0.0,-0.042954,-0.060802,-0.105703,-0.074536,-0.096404,-0.233171,-0.074536,-0.042954,0.0,-0.096404,0.0,-0.254374,-0.224255,-0.060802,-0.355747,-0.258452,-0.042954,0.0,-0.042954
235,-0.715284,0.449326,1.368968,0.480841,3.995792,1.657938,3.994414,3.688731,1.597019,2.329847,2.030561,1.123332,1.918693,1.128736,-0.399153,-0.169762,-0.412307,1.558508,2.660368,1.044634,2.215363,2.201164,0.536185,0.0,2.169307,-0.096404,-0.150329,-0.237529,-0.233171,-0.174243,-0.210311,-0.285785,-0.042954,-0.129823,-0.300602,-0.233171,-0.096404,-0.105703,-0.185164,0.0,-0.042954,-0.074536,-0.105703,-0.105703,6.147009,-0.060802,-0.042954,-0.129823,-0.205491,-0.042954,0.0,-0.042954,-0.060802,-0.105703,-0.074536,-0.096404,-0.233171,-0.074536,-0.042954,0.0,-0.096404,0.0,-0.254374,-0.224255,-0.060802,-0.355747,-0.258452,-0.042954,0.0,-0.042954


In [23]:
# Sanity-check the scaling: per-column mean and variance of the scaled training features.
# pandas' var() defaults to the sample variance (ddof=1) while StandardScaler
# divides by the population standard deviation, so with 543 training rows the
# reported variance is 543/542 ~= 1.001845 rather than exactly 1.
# A column reporting variance 0 is constant in the training split
# (presumably a location that only occurs in the test rows - TODO confirm).
X_train.mean(), X_train.var()

(Sector_score        4.579926e-17
 PARA_A              2.289963e-17
 Score_A            -9.814126e-17
 Risk_A             -3.598513e-17
 PARA_B             -1.308550e-17
                         ...     
 location_8         -7.197026e-17
 location_9         -3.434944e-17
 location_LOHARU     6.542751e-18
 location_NUH        0.000000e+00
 location_SAFIDON   -1.635688e-17
 Length: 70, dtype: float64,
 Sector_score        1.001845
 PARA_A              1.001845
 Score_A             1.001845
 Risk_A              1.001845
 PARA_B              1.001845
                       ...   
 location_8          1.001845
 location_9          1.001845
 location_LOHARU     1.001845
 location_NUH        0.000000
 location_SAFIDON    1.001845
 Length: 70, dtype: float64)

### Training

In [24]:
# Candidate classifiers, keyed by a display name that is left-padded so the
# printed training/result lines align in a column.
#
# Estimators that draw random numbers in their default configuration get a
# fixed random_state so that Restart & Run All reproduces the same scores.
# The value matches the seed already used for the train/test split.
# LogisticRegression, KNeighborsClassifier and SVC are left at their defaults.
SEED = 1

models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(random_state=SEED),
    "Support Vector Machine (Linear Kernel)": LinearSVC(random_state=SEED),
    "   Support Vector Machine (RBF Kernel)": SVC(),
    "                        Neural Network": MLPClassifier(random_state=SEED),
    "                         Random Forest": RandomForestClassifier(random_state=SEED),
    "                     Gradient Boosting": GradientBoostingClassifier(random_state=SEED)
}

In [25]:
# Fit every candidate on the scaled training split, reporting each one as it finishes.
# Warnings are silenced globally in the import cell, so any convergence
# warnings raised during fitting will not be shown here.
for label, estimator in models.items():
    estimator.fit(X_train, y_train)
    print(f"{label} trained.")

                   Logistic Regression trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                        Neural Network trained.
                         Random Forest trained.
                     Gradient Boosting trained.


### Results

In [26]:
# Report each fitted model's test-set accuracy (the classifiers' default
# score) as a percentage with two decimals.
for label, estimator in models.items():
    accuracy_pct = estimator.score(X_test, y_test) * 100
    print(f"{label}: {accuracy_pct:.2f}%")

                   Logistic Regression: 98.71%
                   K-Nearest Neighbors: 90.13%
                         Decision Tree: 100.00%
Support Vector Machine (Linear Kernel): 99.14%
   Support Vector Machine (RBF Kernel): 97.00%
                        Neural Network: 99.14%
                         Random Forest: 100.00%
                     Gradient Boosting: 100.00%
