In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
data = pd.read_csv('/content/new_file.csv')
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0.0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0.0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1.0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1.0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0.0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71891 entries, 0 to 71890
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            71891 non-null  int64  
 1   type            71890 non-null  object 
 2   amount          71890 non-null  float64
 3   nameOrig        71890 non-null  object 
 4   oldbalanceOrg   71890 non-null  float64
 5   newbalanceOrig  71890 non-null  float64
 6   nameDest        71890 non-null  object 
 7   oldbalanceDest  71890 non-null  float64
 8   newbalanceDest  71890 non-null  float64
 9   isFraud         71890 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 5.5+ MB


In [4]:
data.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
count,71891.0,71890.0,71890.0,71890.0,71890.0,71890.0,71890.0
mean,7.924622,164514.3,916957.2,933330.6,855519.0,1183650.0,0.001488
std,1.856901,331555.4,2803887.0,2842539.0,2399394.0,2861918.0,0.038551
min,1.0,0.63,0.0,0.0,0.0,0.0,0.0
25%,8.0,9118.472,0.0,0.0,0.0,0.0,0.0
50%,9.0,44874.74,20050.0,0.0,15226.0,22736.93,0.0
75%,9.0,199993.3,179933.2,199992.6,525448.9,989801.2,0.0
max,9.0,10000000.0,33797390.0,34008740.0,31306920.0,31976990.0,1.0


In [5]:
obj = (data.dtypes == 'object')
object_cols = list(obj[obj].index)
print("Categorical variables:", len(object_cols))

int_ = (data.dtypes == 'int')
num_cols = list(int_[int_].index)
print("Integer variables:", len(num_cols))

fl = (data.dtypes == 'float')
fl_cols = list(fl[fl].index)
print("Float variables:", len(fl_cols))

Categorical variables: 3
Integer variables: 1
Float variables: 6


In [6]:
data['isFraud'].value_counts()

Unnamed: 0_level_0,count
isFraud,Unnamed: 1_level_1
0.0,71783
1.0,107


### **Data Preprocessing**
This step includes the following :

Encoding of Type column
Dropping irrelevant columns like nameOrig, nameDest
Data Splitting



In [7]:
type_new = pd.get_dummies(data['type'], drop_first=True)
data_new = pd.concat([data, type_new], axis=1)
data_new.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0.0,False,False,True,False
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0.0,False,False,True,False
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1.0,False,False,False,True
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1.0,True,False,False,False
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0.0,False,False,True,False


In [8]:
data_new = data_new.dropna(subset=['isFraud'])

In [9]:
X = data_new.drop(['isFraud', 'type', 'nameOrig', 'nameDest'], axis=1)
y = data_new['isFraud']

In [10]:
X.shape, y.shape

((71890, 10), (71890,))

In [11]:
# Now let's split the data into 2 parts : Training and Testing.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

In [12]:
X_test.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,CASH_OUT,DEBIT,PAYMENT,TRANSFER
44366,9,11804.12,133751.86,121947.75,0.0,0.0,False,False,True,False
56899,9,866376.27,0.0,0.0,5968363.02,8842336.76,True,False,False,False
59930,9,268164.23,0.0,0.0,1746313.58,1760664.16,True,False,False,False
49273,9,184276.6,0.0,0.0,462311.79,332821.55,True,False,False,False
46210,9,10825.05,369.0,0.0,386394.75,397219.8,True,False,False,False


### **Model Training**
As the prediction is a classification problem so the models we will be using are :

**LogisticRegression**:  It predicts that the probability of a given data belongs to the particular category or not.
XGBClassifier : It refers to Gradient Boosted decision trees. In this algorithm, decision trees are created in sequential form and weights are assigned to all the independent variables which are then fed into the decision tree which predicts results.

**SVC** : SVC is used to find a hyperplane in an N-dimensional space that distinctly classifies the data points. Then it gives the output according the most nearby element.

**RandomForestClassifier**: Random forest classifier creates a set of decision trees from a randomly selected subset of the training set. Then, it collects the votes from different decision trees to decide the final prediction.

In [13]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score as ras
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [14]:
models = {
    "Logistic Regression": LogisticRegression(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "Random Forest": RandomForestClassifier(n_estimators=7, criterion='entropy', random_state=7)
}

# Training and evaluation
for name, model in models.items():
    model.fit(X_train, y_train)
    print(f'{name} :')

    train_preds = model.predict_proba(X_train)[:, 1]
    print('Training ROC AUC :', ras(y_train, train_preds))

    test_preds = model.predict_proba(X_test)[:, 1]
    print('Validation ROC AUC :', ras(y_test, test_preds))
    print()

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression :
Training ROC AUC : 0.9301734119773983
Validation ROC AUC : 0.9235290801403435



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost :
Training ROC AUC : 1.0
Validation ROC AUC : 0.9938251543711407

Random Forest :
Training ROC AUC : 0.9999984883893184
Validation ROC AUC : 0.9459211861540195



In [15]:
print(X.columns.tolist())

['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'CASH_OUT', 'DEBIT', 'PAYMENT', 'TRANSFER']


In [20]:
sample_input = pd.DataFrame([{
    'step': 1,
    'amount': 5000,
    'oldbalanceOrg': 7000,
    'newbalanceOrig': 2000,
    'oldbalanceDest': 10000,
    'newbalanceDest': 15000,
    'CASH_IN': 0,    # One-hot encoded from 'type'
    'CASH_OUT': 1,
    'DEBIT': 0,# One-hot encoded from 'type'
    'PAYMENT': 0 ,
    'TRANSFER': 0,


}])

In [21]:
sample_input = imputer.transform(sample_input)
for model in models:
    prediction_proba = model.predict_proba(sample_input)[0][1]
    prediction_class = model.predict(sample_input)[0]

    print(f"Model: {model.__class__.__name__}")
    print(f"Fraud Probability: {prediction_proba:.4f}")
    print(f"Predicted Class: {'Fraud' if prediction_class == 1 else 'Not Fraud'}")
    print()

Model: LogisticRegression
Fraud Probability: 0.3019
Predicted Class: Not Fraud

Model: XGBClassifier
Fraud Probability: 0.0000
Predicted Class: Not Fraud

Model: RandomForestClassifier
Fraud Probability: 0.0000
Predicted Class: Not Fraud



In [16]:
import joblib

for name, model in models.items():
    filename = name.replace(" ", "_").lower() + '.pkl'
    joblib.dump(model, filename)

In [17]:
!ls


logistic_regression.pkl  random_forest.pkl  xgboost.pkl
new_file.csv		 sample_data
