<a href="https://colab.research.google.com/github/aribashaikh870-droid/student-performance-prediction/blob/main/fraud_detection_intelligent_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

import pandas as pd

# Transactions CSV, read from the filesystem root.
# NOTE(review): presumably the file was uploaded to the Colab runtime root;
# adjust DATA_PATH if the data lives elsewhere.
DATA_PATH = '/fraud_detection.csv'

# Fail early with a useful message: list what *is* under '/' so a missing or
# misnamed upload is obvious (replaces the bare os.listdir('/') call whose
# result was never displayed).
if not os.path.isfile(DATA_PATH):
    raise FileNotFoundError(
        f"{DATA_PATH} not found; entries under '/': {sorted(os.listdir('/'))}"
    )

df = pd.read_csv(DATA_PATH)

# Preview the first five rows.
df.head()


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [2]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(1048575, 11)

In [3]:
# Print each column's dtype and non-null count plus memory usage.
# info must be *called*: the bare attribute `df.info` only echoes the
# bound-method repr instead of the summary.
df.info()

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0
mean,26.96617,158667.0,874009.5,893808.9,978160.0,1114198.0,0.001089097,0.0
std,15.62325,264940.9,2971751.0,3008271.0,2296780.0,2416593.0,0.03298351,0.0
min,1.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0
25%,15.0,12149.07,0.0,0.0,0.0,0.0,0.0,0.0
50%,20.0,76343.33,16002.0,0.0,126377.2,218260.4,0.0,0.0
75%,39.0,213761.9,136642.0,174600.0,915923.5,1149808.0,0.0,0.0
max,95.0,10000000.0,38900000.0,38900000.0,42100000.0,42200000.0,1.0,0.0


In [5]:
# Column labels of the loaded frame.
df.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [6]:
# Tally missing values column by column.
# Every count in the output below is 0, so the frame has no nulls to handle.
df.isna().sum()

Unnamed: 0,0
step,0
type,0
amount,0
nameOrig,0
oldbalanceOrg,0
newbalanceOrig,0
nameDest,0
oldbalanceDest,0
newbalanceDest,0
isFraud,0


In [7]:
# Build the model inputs: separate the label from the features, drop columns
# that are not used as features, and turn the text column 'type' into integers.
from sklearn.preprocessing import LabelEncoder

# Target: 1 = fraudulent transaction, 0 = legitimate.
y = df['isFraud']

# Features: everything except the target, the 'isFlaggedFraud' flag, and the
# two account-identifier columns ('nameOrig', 'nameDest').
x = df.drop(columns=['isFraud', 'isFlaggedFraud', 'nameOrig', 'nameDest'])

# Replace the transaction-type strings (e.g. 'PAYMENT', 'TRANSFER') with
# integer codes so that every feature column is numeric.
# NOTE(review): LabelEncoder is documented as a target encoder, and integer
# codes imply an ordering between transaction types that linear models will
# treat as meaningful -- consider one-hot encoding; confirm before changing.
le = LabelEncoder()
x['type'] = le.fit_transform(x['type'])

# Class distribution of the target (shows how rare the fraud class is).
y.value_counts()

Unnamed: 0_level_0,count
isFraud,Unnamed: 1_level_1
0,1047433
1,1142


In [8]:
# Data types of the feature columns; after encoding 'type' they are all numeric.
x.dtypes

Unnamed: 0,0
step,int64
type,int64
amount,float64
oldbalanceOrg,float64
newbalanceOrig,float64
oldbalanceDest,float64
newbalanceDest,float64


In [9]:
# Class distribution of the target `y` ('isFraud').
# The output shows 1,047,433 non-fraudulent transactions (0) against only
# 1,142 fraudulent ones (1), i.e. roughly 0.11% positives. With an imbalance
# this severe, accuracy alone says little about fraud detection quality, so
# precision, recall and F1 on the fraud class must be examined as well.
y.value_counts()

Unnamed: 0_level_0,count
isFraud,Unnamed: 1_level_1
0,1047433
1,1142


In [10]:
# Standardize every feature column of `x` to zero mean and unit variance with
# StandardScaler, so that columns measured on very different ranges become
# comparable for the models trained later.
#
# NOTE(review): the scaler is fitted on all rows of `x`, i.e. before any
# train/test split, so rows that later end up in the test set contribute to
# the mean and standard deviation learned here.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x)                   # learn the per-column mean and standard deviation
x_scaled = scaler.transform(x)  # rescale each column with those statistics

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the *unscaled* features 80/20. stratify=y keeps the rare fraud class at
# the same proportion in both parts; random_state fixes the shuffle.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only and apply it to both parts, so no
# test-set statistics leak into preprocessing. x_train / x_test are NumPy
# arrays, as they were when the already-scaled matrix was being split.
scaler = StandardScaler().fit(x_train_raw)
x_train = scaler.transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

In [12]:
# Baseline classifier: logistic regression, a linear model that separates two
# classes (here: fraud = 1, not fraud = 0).
from sklearn.linear_model import LogisticRegression

# Train on the training split; max_iter=1000 lets the solver take up to 1000
# iterations while it learns the feature/label relationship.
lr = LogisticRegression(max_iter=1000).fit(x_train, y_train)

# Predicted 0/1 labels for the test split, which the model has not seen.
y_pred_lr = lr.predict(x_test)


In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the logistic-regression predictions against the test labels.
# Requires y_test (from the train/test split cell) and y_pred_lr (from the
# logistic-regression cell) to exist in the kernel.
print("Logistic Regression")
for label, metric in (
    ("Accuracy :", accuracy_score),
    ("precision:", precision_score),
    ("recall:", recall_score),
    ("f1 score:", f1_score),
):
    print(label, metric(y_test, y_pred_lr))

Logistic Regression
Accuracy : 0.9990558615263572
precision: 0.9411764705882353
recall: 0.14035087719298245
f1 score: 0.24427480916030533


## Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed seed for reproducibility.
# n_jobs=-1 builds the trees on all available CPU cores; the fitted forest is
# the same for a given random_state, only the wall-clock time drops. Training
# on the full training split is the slow step of this notebook.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(x_train, y_train)

# Predicted 0/1 labels for the test split.
y_pred_rf = rf.predict(x_test)

In [15]:
# Score the random-forest predictions against the test labels, using the
# metric functions imported in the logistic-regression evaluation cell.
print("Random Forest")
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 score:", f1_score),
):
    print(label, metric(y_test, y_pred_rf))




Random Forest
Accuracy: 0.9997234341844885
Precision: 0.9885057471264368
Recall: 0.7543859649122807
F1 score: 0.8557213930348259


## Isolation Forest

In [16]:
from sklearn.ensemble import IsolationForest

# Unsupervised anomaly detector: it is fitted on the training features only
# (no labels are passed). contamination=0.001 sets the share of points the
# model treats as outliers; random_state fixes the seed.
iso = IsolationForest(contamination=0.001, random_state=42)
iso.fit(x_train)

# Rows predicted as -1 are relabelled 1 (fraud); every other row becomes 0.
# The result is a plain Python list of ints.
y_pred_iso = (iso.predict(x_test) == -1).astype(int).tolist()

In [18]:
# Score the isolation-forest predictions against the test labels, using the
# metric functions imported in the logistic-regression evaluation cell.
print("Isolation Forest")
print("Accuracy:", accuracy_score(y_test, y_pred_iso))  # label typo ("Acuracy") fixed
print("Precision:", precision_score(y_test, y_pred_iso))
print("Recall:", recall_score(y_test, y_pred_iso))
print("F1 score:", f1_score(y_test, y_pred_iso))

Isolation Forest
Acuracy: 0.9978065469804258
Precision: 0.0
Recall: 0.0
F1 score: 0.0


## Explainability: Random Forest feature importances

In [20]:
import pandas as pd

# Pair each feature column name with the importance score the fitted random
# forest assigned to it, ordered from most to least important.
importance = (
    pd.DataFrame({'Feature': x.columns, 'Importance': rf.feature_importances_})
    .sort_values('Importance', ascending=False)
)
importance


Unnamed: 0,Feature,Importance
0,step,0.287147
3,oldbalanceOrg,0.189341
2,amount,0.186031
6,newbalanceDest,0.1832
5,oldbalanceDest,0.099421
4,newbalanceOrig,0.028897
1,type,0.025964
