# Data Preprocessing Tools

## Importing the libraries

In [11]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [12]:
# Load the model-ready CSV into a DataFrame.
dataset = pd.read_csv('synthetic_model_ready_data.csv')

# Positional split: every column except the last is a candidate feature,
# and the last column is the target.
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

## Checking for null values

In [13]:
# Count the missing values in each column.
dataset.isna().sum()

entryId                             0
customerEmail                       0
transactionId                       0
orderId                             0
paymentMethodId                     0
paymentMethodRegistrationFailure    0
paymentMethodType                   0
paymentMethodProvider               0
transactionAmount                   0
transactionFailed                   0
orderState                          0
customerPhone                       0
customerDevice                      0
customerIPAddress                   0
customerBillingAddress              0
No_Transactions                     0
No_Orders                           0
No_Payments                         0
Fraud                               0
dtype: int64

In [14]:
# Preview the raw feature matrix; at this point it still mixes string columns with numeric ones.
print(X)

[[1 'dana09@yahoo.com' '3cryzmi3' ... 6 3 2]
 [2 'uchen@malone.com' 'yd80pfko' ... 7 7 6]
 [3 'meganberry@clark.biz' 'o4z2x2e9' ... 5 4 2]
 ...
 [11998 'johnlowery@gmail.com' 'i8ish28k' ... 6 4 3]
 [11999 'ybrown@gmail.com' '6p2l2jxb' ... 2 2 3]
 [12000 'johnlowery@gmail.com' 'iw7bakk3' ... 6 4 2]]


In [15]:
# Preview the target vector (the last column of the dataset, still boolean here).
print(y)

[False  True False ...  True False False]


## Taking care of missing data

In [9]:
# Imputation is intentionally skipped: the null check above reports zero missing
# values in every column, so there is nothing to fill in for this dataset.
# The template is kept for reference only:
#
# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# imputer.fit(X[:, 1:3])
# X[:, 1:3] = imputer.transform(X[:, 1:3])
#
# Open question: the template imputes the 2nd and 3rd columns, which are adjacent.
# What if the dataset has a mix of categorical and numerical columns with missing
# values that are not adjacent?
# Pointer: select the columns with an explicit list of positions instead of a slice
# (e.g. X[:, [1, 4, 7]]) and fit one SimpleImputer per kind of column --
# strategy='mean' for numerical columns, strategy='most_frequent' for categorical ones.
#
# These notes are written as comments rather than a bare triple-quoted string so the
# cell does not echo the text back as its output.

In [16]:
# Display X again for reference; the imputation step above is commented out, so X is unchanged.
print(X)

[[1 'dana09@yahoo.com' '3cryzmi3' ... 6 3 2]
 [2 'uchen@malone.com' 'yd80pfko' ... 7 7 6]
 [3 'meganberry@clark.biz' 'o4z2x2e9' ... 5 4 2]
 ...
 [11998 'johnlowery@gmail.com' 'i8ish28k' ... 6 4 3]
 [11999 'ybrown@gmail.com' '6p2l2jxb' ... 2 2 3]
 [12000 'johnlowery@gmail.com' 'iw7bakk3' ... 6 4 2]]


## Encoding categorical data

### Encoding the Independent Variable

In [17]:
# One-hot encoding is intentionally skipped: the categorical columns are excluded
# from the model, which works with the numerical columns only.
# The template is kept for reference only:
#
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
# ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
# X = np.array(ct.fit_transform(X))
#
# This note is written as comments rather than a bare triple-quoted string so the
# cell does not echo the text back as its output.

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [20]:
# Partition the column names by dtype: object-typed columns are treated as
# categorical, numeric-typed columns as numerical. Columns of any other dtype
# end up in neither list.
categorical_cols = list(dataset.select_dtypes(include='object').columns)
numerical_cols = list(dataset.select_dtypes(include='number').columns)

In [21]:
# Show both column groups so the dtype-based split can be checked by eye.
print(f"Categorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")

Categorical columns: ['customerEmail', 'transactionId', 'orderId', 'paymentMethodId', 'paymentMethodType', 'paymentMethodProvider', 'orderState', 'customerPhone', 'customerDevice', 'customerIPAddress', 'customerBillingAddress']
Numerical columns: ['entryId', 'paymentMethodRegistrationFailure', 'transactionAmount', 'transactionFailed', 'No_Transactions', 'No_Orders', 'No_Payments']


In [22]:
# Derive the positional indices of the numerical columns from their names instead of
# hard-coding them, so the list cannot drift out of sync with the CSV layout.
# For the current file this yields [0, 5, 8, 9, 15, 16, 17].
# NOTE(review): index 0 is entryId, which looks like a row identifier rather than a
# predictive feature -- confirm whether it should really be fed to the model.
numerical_indices = dataset.columns.get_indexer(numerical_cols).tolist()

In [23]:
# Column selector: forwards the columns listed in numerical_indices untouched.
# No remainder is configured, so only the listed columns appear in the result.
ct = ColumnTransformer(
    transformers=[('passthrough', 'passthrough', numerical_indices)],
)

In [25]:
# Fit and transform the numerical features
# The transformer is applied to the full DataFrame; the result keeps only the
# columns selected by numerical_indices.
X_numerical = ct.fit_transform(dataset)

In [26]:
# Inspect the extracted numeric-only matrix.
print(X_numerical)

[[    1     0    12 ...     6     3     2]
 [    2     0    59 ...     7     7     6]
 [    3     0    42 ...     5     4     2]
 ...
 [11998     0    65 ...     6     4     3]
 [11999     0    34 ...     2     2     3]
 [12000     1    33 ...     6     4     2]]


In [27]:
# From here on X refers to the numeric-only feature matrix; the earlier mixed-type X is replaced.
X = X_numerical

In [28]:
# Confirm that X now holds only the numeric columns.
print(X)

[[    1     0    12 ...     6     3     2]
 [    2     0    59 ...     7     7     6]
 [    3     0    42 ...     5     4     2]
 ...
 [11998     0    65 ...     6     4     3]
 [11999     0    34 ...     2     2     3]
 [12000     1    33 ...     6     4     2]]


### Encoding the Dependent Variable

In [31]:
from sklearn.preprocessing import LabelEncoder

# Map the boolean target to integer class labels (False -> 0, True -> 1).
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [32]:
# Inspect the encoded labels.
print(y)

[0 1 0 ... 1 0 0]


## Splitting the dataset into the Training set and Test set

In [33]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): if the Fraud classes are imbalanced, consider stratify=y -- confirm
# against the class distribution before changing the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [34]:
# Inspect the training features produced by the split.
print(X_train)

[[7591    0   68 ...    3    3    1]
 [2902    0   47 ...    8    4    6]
 [3462    0   14 ...    4    4    4]
 ...
 [ 906    0   15 ...    8    5    2]
 [5193    0   13 ...    1    5    1]
 [ 236    1   74 ...    6    6    1]]


In [35]:
# Inspect the held-out test features produced by the split.
print(X_test)

[[ 7164     1    42 ...     4     5     3]
 [10386     0    19 ...     4     3     1]
 [ 1903     0    16 ...     5     4     2]
 ...
 [ 5216     0    21 ...     6     0     2]
 [ 1006     0    29 ...     6     5     2]
 [ 8107     0    43 ...     2     3     1]]


In [36]:
# Inspect the training labels.
print(y_train)

[1 1 1 ... 0 1 1]


In [37]:
# Inspect the held-out test labels.
print(y_test)

[0 0 0 ... 1 1 1]


## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the same
# fitted scaler to both splits so no information from the test set is used in fitting.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Inspect the standardized training features.
print(X_train)

In [None]:
# Inspect the test features after scaling with the scaler fitted on the training set.
print(X_test)