In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier

In [2]:
#Read the Dataset

In [4]:
# Read the undersampled transaction table from disk and show it.
source_csv = 'undersampled_data.csv'
df = pd.read_csv(source_csv)
df

Unnamed: 0,User,Card,Year,Month,Day,Amount,UseChip,MerchantName,MerchantCity,MerchantState,Zip,MCC,target
0,1750.0,0.0,2015.0,7.0,16.0,$21.42,Swipe Transaction,Lukass Theaters,Tiffin,OH,44883.000000,7832.0,No
1,1959.0,1.0,2016.0,5.0,5.0,$76.99,Chip Transaction,Jadens Wholesale,Lancaster,CA,93535.000000,5300.0,No
2,182.0,2.0,2012.0,11.0,23.0,$2.19,Swipe Transaction,Supermarket Chain 3,Houston,TX,77096.000000,5411.0,No
3,458.0,2.0,2019.0,5.0,15.0,$45.73,Chip Transaction,Supermarket Chain 3,Flint,MI,48532.000000,5411.0,No
4,1949.0,0.0,2018.0,1.0,4.0,$1.25,Chip Transaction,Convenience Store Chain 1,Brooklyn,NY,11213.000000,5499.0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1739,1998.0,2.0,2013.0,1.0,26.0,$193.24,Swipe Transaction,Neufelder Tegucigalpa Wine and Liquor,Tegucigalpa,Honduras,51694.676895,5921.0,Yes
1740,1999.0,3.0,2020.0,1.0,26.0,$221.96,Swipe Transaction,Abrils Wholesale,Saint Louis,MO,63146.000000,5300.0,Yes
1741,1999.0,3.0,2020.0,1.0,26.0,$26.69,Swipe Transaction,Abrils Wholesale,Saint Louis,MO,63146.000000,5300.0,Yes
1742,1999.0,3.0,2020.0,1.0,26.0,$103.95,Chip Transaction,Cox Saint Louis Restaurant,Saint Louis,MO,63146.000000,5812.0,Yes


In [5]:
# Column glossary
# User          - user id
# Card          - card identifier (small numbers such as 0-3 in the sample rows;
#                 presumably a per-user card index rather than a full card number - confirm)
# Year          - year of the transaction
# Month         - month of the transaction
# Day           - day of the transaction
# Amount        - transaction amount, stored as a '$'-prefixed string (e.g. '$21.42')
# UseChip       - how the card was used: 'Swipe Transaction', 'Chip Transaction'
#                 or 'Online Transaction'
# MerchantName  - name of the merchant in the transaction
# MerchantCity  - merchant city (the literal 'ONLINE' appears for an online transaction)
# MerchantState - merchant state; the sample also contains a country name ('Honduras')
# Zip           - postal code of the merchant area
# MCC           - merchant category code: a four-digit code for the merchant's line of
#                 business (e.g. 5411 on the supermarket rows, 5812 on a restaurant row);
#                 it is not a card PIN
# target        - class label, 'Yes' / 'No' (presumably the fraud flag - confirm with the data source)
df.columns

Index(['User', 'Card', 'Year', 'Month', 'Day', 'Amount', 'UseChip',
       'MerchantName', 'MerchantCity', 'MerchantState', 'Zip', 'MCC',
       'target'],
      dtype='object')

In [6]:
#Check Null values

In [7]:
# Number of missing entries in each column (isna is the same check as isnull).
df.isna().sum()

User             0
Card             0
Year             0
Month            0
Day              0
Amount           0
UseChip          0
MerchantName     0
MerchantCity     0
MerchantState    0
Zip              0
MCC              0
target           0
dtype: int64

In [8]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,User,Card,Year,Month,Day,Amount,UseChip,MerchantName,MerchantCity,MerchantState,Zip,MCC,target
0,1750.0,0.0,2015.0,7.0,16.0,$21.42,Swipe Transaction,Lukass Theaters,Tiffin,OH,44883.0,7832.0,No
1,1959.0,1.0,2016.0,5.0,5.0,$76.99,Chip Transaction,Jadens Wholesale,Lancaster,CA,93535.0,5300.0,No
2,182.0,2.0,2012.0,11.0,23.0,$2.19,Swipe Transaction,Supermarket Chain 3,Houston,TX,77096.0,5411.0,No
3,458.0,2.0,2019.0,5.0,15.0,$45.73,Chip Transaction,Supermarket Chain 3,Flint,MI,48532.0,5411.0,No
4,1949.0,0.0,2018.0,1.0,4.0,$1.25,Chip Transaction,Convenience Store Chain 1,Brooklyn,NY,11213.0,5499.0,No


In [9]:
#Check balanced data or imbalanced data

In [11]:
# Class balance: how many rows carry each label in 'target'.
label_counts = df['target'].value_counts()
label_counts


target
No     872
Yes    872
Name: count, dtype: int64

In [12]:
#Split the input and output values

In [14]:
# Split the frame into model inputs (independent) and the label (dependent).
# NOTE(review): 'Amount' is not in the feature list - confirm this is intentional.
feature_columns = [
    'User', 'Card', 'Year', 'Month', 'Day', 'UseChip',
    'MerchantName', 'MerchantCity', 'MerchantState', 'Zip', 'MCC',
]
label_columns = ['target']

independent = df[feature_columns]
dependent = df[label_columns]  # list selector keeps this a one-column DataFrame

print(independent)
print(dependent)


        User  Card    Year  Month   Day             UseChip  \
0     1750.0   0.0  2015.0    7.0  16.0   Swipe Transaction   
1     1959.0   1.0  2016.0    5.0   5.0    Chip Transaction   
2      182.0   2.0  2012.0   11.0  23.0   Swipe Transaction   
3      458.0   2.0  2019.0    5.0  15.0    Chip Transaction   
4     1949.0   0.0  2018.0    1.0   4.0    Chip Transaction   
...      ...   ...     ...    ...   ...                 ...   
1739  1998.0   2.0  2013.0    1.0  26.0   Swipe Transaction   
1740  1999.0   3.0  2020.0    1.0  26.0   Swipe Transaction   
1741  1999.0   3.0  2020.0    1.0  26.0   Swipe Transaction   
1742  1999.0   3.0  2020.0    1.0  26.0    Chip Transaction   
1743  1999.0   3.0  2020.0    1.0  26.0  Online Transaction   

                               MerchantName MerchantCity MerchantState  \
0                           Lukass Theaters       Tiffin            OH   
1                          Jadens Wholesale    Lancaster            CA   
2                    

In [15]:
#Data preprocessing

In [16]:
#split the numerical and categorical values

In [17]:
def quanQual(df):
    """Split the column labels of ``df`` into quantitative and qualitative.

    A column is quantitative when pandas reports a numeric dtype for it
    (integers, floats, booleans, complex numbers and their nullable
    variants). Every other column is qualitative: plain ``object``
    columns, the dedicated ``string`` dtype, ``category`` columns and
    date/time columns. Testing for "numeric" instead of comparing the dtype
    with ``'O'`` keeps text columns out of the numeric group even when
    pandas does not store them as ``object``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be classified. It is not modified.

    Returns
    -------
    tuple of (list, list)
        ``(quan, qual)``: the quantitative and the qualitative column
        labels, each in the original column order.
    """
    quan = []
    qual = []
    # Walk df.dtypes rather than indexing df[column]: with duplicated column
    # labels df[column] is a DataFrame and has no single dtype to test.
    for columnName, columnDtype in df.dtypes.items():
        if pd.api.types.is_numeric_dtype(columnDtype):
            quan.append(columnName)
        else:
            qual.append(columnName)
    return quan, qual

In [18]:
# Classify every column of the loaded frame; this runs on the full frame,
# so the label column 'target' and the '$'-prefixed 'Amount' strings both
# end up in `qual`.
quan, qual = quanQual(df)

In [19]:
# Column names classified as quantitative (numeric).
quan

['User', 'Card', 'Year', 'Month', 'Day', 'Zip', 'MCC']

In [20]:
# Column names classified as qualitative; note that 'Amount' and the label
# 'target' are in this list.
qual

['Amount',
 'UseChip',
 'MerchantName',
 'MerchantCity',
 'MerchantState',
 'target']

In [21]:
import numpy as np
from sklearn.impute import SimpleImputer

# Fill missing numeric values with the column mean. The null check earlier in
# the notebook reports zero missing values, so this leaves the data unchanged
# here but keeps the step in place for other inputs.
imp = SimpleImputer(missing_values=np.nan, strategy="mean", copy=True)
datan = imp.fit_transform(df[quan])  # fit and transform in one call; returns a NumPy array

In [22]:
# Imputed numeric values as a NumPy array (column labels are not carried over).
datan

array([[1.75000000e+03, 0.00000000e+00, 2.01500000e+03, ...,
        1.60000000e+01, 4.48830000e+04, 7.83200000e+03],
       [1.95900000e+03, 1.00000000e+00, 2.01600000e+03, ...,
        5.00000000e+00, 9.35350000e+04, 5.30000000e+03],
       [1.82000000e+02, 2.00000000e+00, 2.01200000e+03, ...,
        2.30000000e+01, 7.70960000e+04, 5.41100000e+03],
       ...,
       [1.99900000e+03, 3.00000000e+00, 2.02000000e+03, ...,
        2.60000000e+01, 6.31460000e+04, 5.30000000e+03],
       [1.99900000e+03, 3.00000000e+00, 2.02000000e+03, ...,
        2.60000000e+01, 6.31460000e+04, 5.81200000e+03],
       [1.99900000e+03, 3.00000000e+00, 2.02000000e+03, ...,
        2.60000000e+01, 5.16946769e+04, 5.81500000e+03]])

In [23]:
# Put the imputed numeric array back into a DataFrame labelled with the
# quantitative column names.
datan = pd.DataFrame(data=datan, columns=quan)

In [24]:
import numpy as np
from sklearn.impute import SimpleImputer

# Fill missing categorical values with each column's most frequent value.
# NOTE(review): `qual` includes the label column 'target' and the string
# 'Amount' column, so both pass through this imputer as well.
imp = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
datal = imp.fit_transform(df[qual])  # fit and transform in one call; returns an object array

In [25]:
# Imputed categorical values as an object-dtype NumPy array.
datal

array([['$21.42', 'Swipe Transaction', 'Lukass Theaters', 'Tiffin', 'OH',
        'No'],
       ['$76.99', 'Chip Transaction', 'Jadens Wholesale', 'Lancaster',
        'CA', 'No'],
       ['$2.19', 'Swipe Transaction', 'Supermarket Chain 3', 'Houston',
        'TX', 'No'],
       ...,
       ['$26.69', 'Swipe Transaction', 'Abrils Wholesale', 'Saint Louis',
        'MO', 'Yes'],
       ['$103.95', 'Chip Transaction', 'Cox Saint Louis Restaurant',
        'Saint Louis', 'MO', 'Yes'],
       ['$0.24', 'Online Transaction', 'Digital Content Company 2',
        'ONLINE', 'CA', 'Yes']], dtype=object)

In [26]:
# Put the imputed categorical array back into a DataFrame labelled with the
# qualitative column names.
datal = pd.DataFrame(data=datal, columns=qual)

In [27]:
# Join the two imputed frames side by side. This rebinds `df`, and the columns
# are now ordered numeric first, then categorical.
df = pd.concat([datan, datal], axis="columns")

In [30]:
# Write the preprocessed frame to disk without the index column.
# to_csv returns None when it is given a file path, so `csv` is None here.
csv = df.to_csv(path_or_buf="Preprocessed_undersampled_data.csv", index=False)

In [29]:
# `csv` holds the return value of to_csv, which is None when a file path is
# given, so this cell displays nothing.
csv