In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import time

**Loading the dataset from json to DataFrame**

In [3]:
df = pd.read_json("transactions.txt", lines=True)
print("Time required to parse data :",time.process_time())

Time required to parse data : 75.697185991


**Looking at Information on Dataset**

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 786363 entries, 0 to 786362
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   accountNumber             786363 non-null  int64  
 1   customerId                786363 non-null  int64  
 2   creditLimit               786363 non-null  int64  
 3   availableMoney            786363 non-null  float64
 4   transactionDateTime       786363 non-null  object 
 5   transactionAmount         786363 non-null  float64
 6   merchantName              786363 non-null  object 
 7   acqCountry                786363 non-null  object 
 8   merchantCountryCode       786363 non-null  object 
 9   posEntryMode              786363 non-null  object 
 10  posConditionCode          786363 non-null  object 
 11  merchantCategoryCode      786363 non-null  object 
 12  currentExpDate            786363 non-null  object 
 13  accountOpenDate           786363 non-null  o

**Checking for Number of Invalid transaction and Valid transactions**

*   Here we are checking for number of fraud transactions and Valid transactions.
*  Here we have 2 target variables :
    1.   0 is Valid Transaction
    2.   1 is Fraud Transaction

*   More than 98% of the data is inclined towards 1 class (Valid Transactions), and rest 2% percent is inclined towards other.

*   We cannot use this data to as input to our Machine learning model as we have very less data points for the second case.



In [5]:
#counting the value
df['isFraud'].value_counts()

False    773946
True      12417
Name: isFraud, dtype: int64

**Dropping unnecessary columns** 

In [6]:
df.drop(["posEntryMode","posConditionCode","transactionDateTime","acqCountry","merchantCountryCode","merchantName","merchantCategoryCode","currentExpDate","accountOpenDate","dateOfLastAddressChange","transactionType","merchantCity","merchantState","merchantZip","posOnPremises","recurringAuthInd","echoBuffer"], axis = 1, inplace = True)

In [7]:
df.head()

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionAmount,cardCVV,enteredCVV,cardLast4Digits,currentBalance,cardPresent,expirationDateKeyInMatch,isFraud
0,737265056,737265056,5000,5000.0,98.55,414,414,1803,0.0,False,False,False
1,737265056,737265056,5000,5000.0,74.51,486,486,767,0.0,True,False,False
2,737265056,737265056,5000,5000.0,7.47,486,486,767,0.0,False,False,False
3,737265056,737265056,5000,5000.0,7.47,486,486,767,0.0,False,False,False
4,830329091,830329091,5000,5000.0,71.18,885,885,3143,0.0,True,False,False


**Convert Boolean columns to Numeric ones**

In [8]:
df["cardPresent"] = df['cardPresent'] * 1
df["expirationDateKeyInMatch"] = df["expirationDateKeyInMatch"] * 1
df.head()

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionAmount,cardCVV,enteredCVV,cardLast4Digits,currentBalance,cardPresent,expirationDateKeyInMatch,isFraud
0,737265056,737265056,5000,5000.0,98.55,414,414,1803,0.0,0,0,False
1,737265056,737265056,5000,5000.0,74.51,486,486,767,0.0,1,0,False
2,737265056,737265056,5000,5000.0,7.47,486,486,767,0.0,0,0,False
3,737265056,737265056,5000,5000.0,7.47,486,486,767,0.0,0,0,False
4,830329091,830329091,5000,5000.0,71.18,885,885,3143,0.0,1,0,False


**Seperating the data for analysis**

*   Checking the dataset for class values 0 and 1 which we will store in a seperate data frame.



In [9]:
valid = df[df.isFraud == 0]
fraud = df[df.isFraud == 1]

#checking the count of new variables
print("Number of Records: {} and Number of Columns: {} for valid".format(valid.shape[0],valid.shape[1]))
print("Number of Records: {} and Number of Columns: {} for fraud".format(fraud.shape[0],fraud.shape[1]))

Number of Records: 773946 and Number of Columns: 12 for valid
Number of Records: 12417 and Number of Columns: 12 for fraud


**Building a sample dataset**

* Using undersampling we will create a dataset containing equal number of valid and fraud transactions.

* We have 773946 Valid transactions, taking 12417 random transactions from valid so we will have equal number of fraud and valid transactions.


In [10]:
# n is number of transacions we want to pick
valid_sample = valid.sample(n=12417)

#concatinating two data frams (valid_sample and fruad) so we have equal number of records.
# axis is 0 because we don't want to add data column wise, we want to add it row wise
new_df = pd.concat([valid_sample,fraud],axis = 0)

# we can confirm the data is random by looking at the index.
print(new_df.head(10))

print(new_df.tail(10))

        accountNumber  customerId  creditLimit  availableMoney  \
461645      255139921   255139921         5000         2315.89   
197005      148553299   148553299          250           21.00   
220784      245648346   245648346        10000         7865.84   
649413      794398270   794398270         2500         1170.77   
70856       126809470   126809470          250           95.70   
570156      436162049   436162049         7500         1781.89   
623435      414947246   414947246          250          250.00   
779770      590331215   590331215        15000         2656.40   
425184      543639500   543639500         7500         4057.04   
688793      797789017   797789017         7500         3740.64   

        transactionAmount  cardCVV  enteredCVV  cardLast4Digits  \
461645              38.60      389         389             5694   
197005             150.78      771         771             7590   
220784               4.07      804         804             6044   
64941

In [11]:
# now we have same number of dataframes in both
new_df['isFraud'].value_counts()

False    12417
True     12417
Name: isFraud, dtype: int64

**Splitting data into features and targets**

* We will create 2 variables x,y and we will drop the column isFraud.
* Note : we will use axis = 1 here that is column
* And we will store ifFraud to y variable


In [12]:
X = new_df.drop(columns='isFraud',axis = 1)
Y = new_df['isFraud']

#converting true false to 0 1
Y = Y*1  
#checking x and y values
print("Values of X \n",X)
print("Values of Y \n",Y)

Values of X 
         accountNumber  customerId  creditLimit  availableMoney  \
461645      255139921   255139921         5000         2315.89   
197005      148553299   148553299          250           21.00   
220784      245648346   245648346        10000         7865.84   
649413      794398270   794398270         2500         1170.77   
70856       126809470   126809470          250           95.70   
...               ...         ...          ...             ...   
785680      207667444   207667444         7500         5517.88   
785713      207667444   207667444         7500          764.48   
785888      428856030   428856030          250           61.94   
786054      657364505   657364505        20000        20000.00   
786112      899818521   899818521         2500          340.38   

        transactionAmount  cardCVV  enteredCVV  cardLast4Digits  \
461645              38.60      389         389             5694   
197005             150.78      771         771             

**Splitting the data into training and tesing**

* We will create 4 variables X_train, X_test, Y_train, Y_test where x has the features and y has the labels.

* We will split data into x and y randomly for training and testing purpose.

* We will use test size as 30% and use 70% of the data to train the model.

* We will use stratify so that the number of 1(True) and 0(False) is evenly distributed.

**Checking the datapoints in each variable**

* X.shape Total number of data points
* Y_train used for training 
* X_test used to test 

In [13]:
X.head()

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionAmount,cardCVV,enteredCVV,cardLast4Digits,currentBalance,cardPresent,expirationDateKeyInMatch
461645,255139921,255139921,5000,2315.89,38.6,389,389,5694,2684.11,0,0
197005,148553299,148553299,250,21.0,150.78,771,771,7590,229.0,1,0
220784,245648346,245648346,10000,7865.84,4.07,804,804,6044,2134.16,0,0
649413,794398270,794398270,2500,1170.77,190.53,161,161,3791,1329.23,0,0
70856,126809470,126809470,250,95.7,99.39,375,375,9468,154.3,0,0


**Checking dataset before loading**

In [14]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24834 entries, 461645 to 786112
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   accountNumber             24834 non-null  int64  
 1   customerId                24834 non-null  int64  
 2   creditLimit               24834 non-null  int64  
 3   availableMoney            24834 non-null  float64
 4   transactionAmount         24834 non-null  float64
 5   cardCVV                   24834 non-null  int64  
 6   enteredCVV                24834 non-null  int64  
 7   cardLast4Digits           24834 non-null  int64  
 8   currentBalance            24834 non-null  float64
 9   cardPresent               24834 non-null  int64  
 10  expirationDateKeyInMatch  24834 non-null  int64  
dtypes: float64(3), int64(8)
memory usage: 2.3 MB


**Training Model**

* Loading instance of our model to model.
* Training the logistic regression model with training data.
* Using fit to fit data to our regression model.

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, stratify= Y, random_state= 2  )

In [16]:
X_train.head()

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionAmount,cardCVV,enteredCVV,cardLast4Digits,currentBalance,cardPresent,expirationDateKeyInMatch
653368,570884863,570884863,5000,1373.27,336.47,289,289,2194,3626.73,1,0
390597,340865874,340865874,15000,10517.89,53.7,729,729,5166,4482.11,0,0
73242,367824223,367824223,5000,4653.48,29.06,459,459,4892,346.52,1,0
701186,304534021,304534021,15000,12579.68,255.61,560,560,3120,2420.32,0,0
430589,419709514,419709514,5000,1433.77,259.8,917,917,2178,3566.23,0,0


### **After we have fitted and trained the data we can give the same input of variable to different algorithms.**

**Training LogisticRegression**

In [29]:
model = LogisticRegression()
start = time.process_time()
model.fit(X_train,Y_train)
print("LR Training time :",time.process_time() - start)

LR Training time : 0.09511728299999334


**Training Random Forest**

In [42]:
rfm = RandomForestClassifier(n_estimators=80, oob_score=True,n_jobs=-1,random_state=101, max_features=None, min_samples_leaf= 30)
start = time.process_time()
rfm.fit(X_train,Y_train)
y_pred = rfm.predict(X_test)
print("RF Training time :",time.process_time() - start)

RF Training time : 10.32458462599999


**Calculating Performance**

In [23]:
#checking accuracy againt the test data
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(X_test_prediction, Y_test)


In [24]:
print("Accuracy of Logistic Regression :",testing_data_accuracy*100,"%")

Accuracy of Logistic Regression : 50.00671050865656 %


In [43]:
testing_data_accuracy_random = accuracy_score(y_pred, Y_test)

In [44]:
print("Accuracy of Random Forest :",testing_data_accuracy_random*100,'%')

Accuracy of Random Forest : 69.07797611058918 %
