# Random forest classification
## Importing the libraries
```



In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Read the transactions file; every column but the last is a feature and the
# last column is the label.
dataset = pd.read_csv('card_transdata.csv')
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

In [3]:
# Show the feature matrix (all columns of `dataset` except the last).
print(X)

[[57.87785658  0.31114001  1.94593998 ...  1.          0.
   0.        ]
 [10.8299427   0.1755915   1.29421881 ...  0.          0.
   0.        ]
 [ 5.09107949  0.80515259  0.42771456 ...  0.          0.
   1.        ]
 ...
 [ 2.91485699  1.47268669  0.21807549 ...  1.          0.
   1.        ]
 [ 4.25872939  0.24202337  0.47582206 ...  0.          0.
   1.        ]
 [58.10812496  0.31811012  0.38691985 ...  1.          0.
   1.        ]]


In [4]:
# Show the label vector (the last column of `dataset`).
print(y)


[0. 0. 0. ... 0. 0. 0.]


## Describing the Data

In [5]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
dataset.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,26.628792,5.036519,1.824182,0.881536,0.350399,0.100608,0.650552,0.087403
std,65.390784,25.843093,2.799589,0.323157,0.477095,0.300809,0.476796,0.282425
min,0.004874,0.000118,0.004399,0.0,0.0,0.0,0.0,0.0
25%,3.878008,0.296671,0.475673,1.0,0.0,0.0,0.0,0.0
50%,9.96776,0.99865,0.997717,1.0,0.0,0.0,1.0,0.0
75%,25.743985,3.355748,2.09637,1.0,1.0,0.0,1.0,0.0
max,10632.723672,11851.104565,267.802942,1.0,1.0,1.0,1.0,1.0


In [6]:
# Column dtypes, non-null counts and memory footprint of the frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   distance_from_home              1000000 non-null  float64
 1   distance_from_last_transaction  1000000 non-null  float64
 2   ratio_to_median_purchase_price  1000000 non-null  float64
 3   repeat_retailer                 1000000 non-null  float64
 4   used_chip                       1000000 non-null  float64
 5   used_pin_number                 1000000 non-null  float64
 6   online_order                    1000000 non-null  float64
 7   fraud                           1000000 non-null  float64
dtypes: float64(8)
memory usage: 61.0 MB


## Feature Engineering


## Handle Missing Values (rows containing them are dropped)


In [7]:
# Marker substituted for the missing-value sentinel so that dropna() can
# remove the affected rows.
new_value = np.nan
# NOTE(review): the notebook author states that missing values are encoded as
# 9 in this dataset. The replacement is applied to every column, so a
# legitimate value of exactly 9 in a continuous column would be discarded as
# well - confirm the sentinel against the dataset documentation.
#
# Build a new frame rather than mutating `dataset` in place: objects derived
# from the raw frame in earlier cells (e.g. X and y) are then never modified
# behind the reader's back.
dataset = dataset.replace(to_replace=9, value=new_value).dropna()
# Shape after dropping rows with missing values.
dataset.shape

(1000000, 8)

## Find and Remove Duplicate rows


In [8]:
# Count fully duplicated rows. Only the last expression of a cell is displayed
# automatically, so the count is printed explicitly.
n_duplicates = dataset.duplicated().sum()
print(f"Duplicate rows found: {n_duplicates}")
# Rebind to a de-duplicated frame instead of mutating in place.
dataset = dataset.drop_duplicates()
# Size after removing duplicates.
dataset.shape

(1000000, 8)

## Splitting the dataset into the Training set and Test set

In [9]:
from sklearn.model_selection import train_test_split

# X and y were first extracted right after loading the CSV, i.e. before the
# missing-value and duplicate filtering. Re-derive them from the current
# `dataset` so that the rows removed by those steps do not reach the model.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [10]:
# Inspect the training features returned by train_test_split.
print(X_train)

[[2.58733764e+00 4.25548683e-01 2.11492301e-01 ... 0.00000000e+00
  1.00000000e+00 1.00000000e+00]
 [1.87274416e+01 1.51142880e-02 1.36290579e+00 ... 0.00000000e+00
  1.00000000e+00 1.00000000e+00]
 [2.83393468e+01 3.99962434e-01 4.91858862e-01 ... 0.00000000e+00
  0.00000000e+00 1.00000000e+00]
 ...
 [2.68070835e+01 4.40967803e+00 4.84272835e-01 ... 0.00000000e+00
  1.00000000e+00 0.00000000e+00]
 [1.21443426e+01 4.08443337e-02 1.79287662e+00 ... 0.00000000e+00
  0.00000000e+00 1.00000000e+00]
 [4.93375070e+01 5.25678996e-01 3.22232714e+00 ... 1.00000000e+00
  0.00000000e+00 0.00000000e+00]]


In [11]:
# Inspect the held-out test features returned by train_test_split.
print(X_test)

[[ 3.66060786  0.10902654  4.30416756 ...  0.          0.
   1.        ]
 [ 0.79312666  1.55401696  0.73468389 ...  0.          0.
   1.        ]
 [ 1.44662928  1.06029302  0.62118212 ...  0.          0.
   1.        ]
 ...
 [36.8792349   1.02906813  0.09171728 ...  1.          0.
   0.        ]
 [ 3.73082529  0.14468908  0.86922081 ...  0.          0.
   1.        ]
 [ 0.64275956  0.05048547  2.215808   ...  0.          0.
   1.        ]]


In [12]:
# Inspect the training labels returned by train_test_split.
print(y_train)

[0. 0. 0. ... 0. 0. 0.]


In [13]:
# Inspect the held-out test labels returned by train_test_split.
print(y_test)

[1. 0. 0. ... 0. 0. 0.]


## Feature Scaling

In [14]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits so no test-set statistics leak into training.
# NOTE: X_train / X_test are overwritten here, so re-running this cell without
# re-running the split would scale already-scaled data.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [15]:
# Training features after the StandardScaler transformation.
print(X_train)


[[-3.69867284e-01 -1.69775265e-01 -5.75465065e-01 ... -7.34373613e-01
   2.98724857e+00  7.32342041e-01]
 [-1.21861004e-01 -1.84870184e-01 -1.64475032e-01 ... -7.34373613e-01
   2.98724857e+00  7.32342041e-01]
 [ 2.58340077e-02 -1.70716274e-01 -4.75389929e-01 ... -7.34373613e-01
  -3.34756207e-01  7.32342041e-01]
 ...
 [ 2.28949297e-03 -2.32473189e-02 -4.78097715e-01 ... -7.34373613e-01
   2.98724857e+00 -1.36548217e+00]
 [-2.23015860e-01 -1.83923887e-01 -1.09995663e-02 ... -7.34373613e-01
  -3.34756207e-01  7.32342041e-01]
 [ 3.48488410e-01 -1.66092682e-01  4.99234090e-01 ...  1.36170470e+00
  -3.34756207e-01 -1.36548217e+00]]


In [16]:
# Test features after applying the scaler fitted on the training split.
print(X_test)

[[-0.35337558 -0.18141629  0.88539044 ... -0.73437361 -0.33475621
   0.73234204]
 [-0.39743685 -0.12827256 -0.38871501 ... -0.73437361 -0.33475621
   0.73234204]
 [-0.38739523 -0.1464307  -0.42922878 ... -0.73437361 -0.33475621
   0.73234204]
 ...
 [ 0.15705658 -0.14757908 -0.61821803 ...  1.3617047  -0.33475621
  -1.36548217]
 [-0.35229663 -0.18010469 -0.34069288 ... -0.73437361 -0.33475621
   0.73234204]
 [-0.39974736 -0.18356931  0.13996321 ... -0.73437361 -0.33475621
   0.73234204]]


## Training the Random forest classification model on the Training set

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 10 trees split on entropy; the fixed seed keeps training
# repeatable between runs.
classifier = RandomForestClassifier(
    criterion='entropy',
    n_estimators=10,
    random_state=0,
)
# fit() is the last expression, so the fitted estimator's repr is displayed.
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

## Predicting the Test set results

In [18]:
# Predict a label for every row of the (scaled) test set.
y_pred = classifier.predict(X_test)
# Bare expression so the notebook displays the predicted labels.
y_pred

array([1., 0., 0., ..., 0., 0., 0.])

## Making the Confusion Matrix

In [19]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Rows of the matrix are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Fraction of test rows classified correctly; shown as the cell's result.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[228273      0]
 [     3  21724]]


0.999988