# Task 5 : Credit card fraud detection

In [None]:
# import necessary dependencies

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report # type: ignore
from sklearn.linear_model import LogisticRegression

In [None]:
# Location of the credit-card CSV. Set the CREDITCARD_CSV environment variable
# to run this notebook on another machine; when it is not set, the original
# hard-coded path is used, so existing behaviour is unchanged.
import os

data_path = os.environ.get(
    "CREDITCARD_CSV",
    r"C:\Users\HP\OneDrive\Desktop\codsoft\datasets\creditcard.csv",
)
credit_card_data = pd.read_csv(data_path)

# data analysis

In [None]:
credit_card_data

In [None]:
# short alias for the loaded DataFrame; this binds a second name to the same
# object, it does not make a copy
ccd = credit_card_data

In [None]:
ccd.info()

In [None]:
# no object data type so we wouldn't need any conversions in here

In [None]:
ccd.isnull().sum()

In [None]:
# no null values, that's great

In [None]:
# distribution of fraudulent and legitimate classes
ccd['Class'].value_counts()

In [None]:
# Class 1 marks fraudulent transactions and Class 0 marks legitimate ones; the counts above show that the dataset is highly imbalanced

Data Visualization

In [None]:
# visualizing the class distribution in percentage

In [None]:
# share of each class as a percentage of all rows: computed once, then
# printed and drawn as a pie chart
rows_per_class = ccd.groupby('Class')['Class'].count()
class_percent = rows_per_class / ccd['Class'].count() * 100
print(class_percent)
class_percent.plot.pie()

In [None]:
# percentage of rows per class, rounded to two decimals
classes = ccd['Class'].value_counts()
total_rows = ccd['Class'].count()
normal_value = round(classes[0] / total_rows * 100, 2)  # label 0
fraud_values = round(classes[1] / total_rows * 100, 2)  # label 1
print(normal_value, fraud_values, sep='\n')

In [None]:
# let's check the correlation of the features

In [None]:
# pairwise correlation between the columns of ccd; the bare name on the last
# line makes the notebook display the resulting table
corr = ccd.corr()
corr

In [None]:
# draw the correlation matrix as an annotated heatmap on a large canvas
fig, ax = plt.subplots(figsize=(27, 19))
sns.heatmap(corr, annot=True, cmap='spring', ax=ax)
plt.show()

In [None]:
# separate the data according to the type of transaction, i.e. fraud or legit

In [None]:
# rows whose Class label is 0
legit = ccd.loc[ccd['Class'] == 0]

In [None]:
# rows whose Class label is 1
fraud = ccd.loc[ccd['Class'] == 1]

In [None]:
legit.Amount.describe()

In [None]:
fraud.Amount.describe()

In [None]:
# we can observe that the mean amount spent for fraud transactions is actually more than for the legit ones

In [None]:
ccd.groupby('Class').describe()

In [None]:
ccd.groupby('Class').mean()

In [None]:
# there's a significant difference in the mean value for our normal transaction and mean value for our fraud transactions

In [None]:
# now to balance the data for legit and fraud transaction value points 
# we will use sampling for creating a new dataset of normal transactions with 492 entries being selected randomly out of 284315

In [None]:
# Undersample the legitimate class down to the size of the fraud class.
# Taking n from `fraud` avoids the hard-coded row count, and a fixed
# random_state makes the sample (and everything computed from it)
# reproducible between runs.
normal_sample = legit.sample(n=len(fraud), random_state=3)

In [None]:
# now merge the two datasets for fraud and legit transactions, which have an equal number of sample points

In [None]:
new_dataset = pd.concat([normal_sample, fraud], axis = 0) # axis=0 specifies row-wise joining: the fraud rows are stacked below the sampled legit rows

In [None]:
new_dataset

In [None]:
new_dataset['Class'].value_counts()

In [None]:
new_dataset.groupby('Class').mean() 

In [None]:
# Replace the raw 'Time' column (interpreted as seconds) with a derived hour
# column: convert it to a timedelta and keep only the hours component.
delta_time = pd.to_timedelta(new_dataset['Time'], unit='s')
hours_component = delta_time.dt.components['hours']
new_dataset['time_hour'] = hours_component.astype(int)
# the raw time column is no longer needed once the derived column exists
new_dataset.drop(columns='Time', inplace=True)

In [None]:
new_dataset

# separating the features and target variables

In [None]:
# feature matrix: every column except the target label
x = new_dataset.drop(columns='Class')

In [None]:
# target vector: the Class label of every row
y = new_dataset.loc[:, 'Class']

In [None]:
x.shape

In [None]:
y.shape

# splitting the data into training and testing data 

In [None]:
# hold out 25% of the rows for testing; stratifying on y keeps the class
# ratio the same in the training and test parts
split_parts = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=3,
    stratify=y,
)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# collect every feature column name in a plain Python list
cols = x.columns.tolist()

In [None]:
# boolean row masks for the two classes
normal_entries = new_dataset.Class == 0
fraud_entries = new_dataset.Class == 1

# Overlay the per-class density of every feature, three plots per row.
# The number of grid rows follows the number of features, so the loop no
# longer breaks when there are more than 30 columns; with 30 columns this
# gives the same 10x3 grid and 20x70 canvas as before.
plots_per_row = 3
n_rows = max(1, -(-len(cols) // plots_per_row))  # ceiling division
plt.figure(figsize=(20, 7 * n_rows))
for n, col in enumerate(cols):
    plt.subplot(n_rows, plots_per_row, n + 1)
    # .loc selects rows and column in one step instead of chained indexing
    sns.histplot(x.loc[normal_entries, col], color='blue', kde=True,
                 stat='density', label='legit')
    sns.histplot(x.loc[fraud_entries, col], color='red', kde=True,
                 stat='density', label='fraud')
    plt.title(col, fontsize=17)
    plt.legend()  # tells the reader which colour is which class
plt.show()


In [None]:
# Fit a logistic-regression classifier on the training split.
# max_iter is raised above the library default of 100: the features are not
# scaled, so the solver may otherwise stop before it has converged.
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)
# predictions on the training rows (used below for the training accuracy)
y_pred = model.predict(x_train)
# predictions on the held-out test rows
pred_test = model.predict(x_test)

# Model evaluation

In [None]:
# creating confusion matrix
from sklearn.metrics import confusion_matrix
def Plot_confusion_matrix(y_test, y_pred):
    """Compute the confusion matrix for a set of predictions and draw it.

    Parameters
    ----------
    y_test : true class labels.
    y_pred : predicted class labels, in the same order as ``y_test``.

    Returns
    -------
    The matrix produced by ``confusion_matrix(y_test, y_pred)``. The
    function previously returned nothing, so callers that ignore the
    return value are unaffected.
    """
    # use the y_pred argument; the earlier version read a notebook global
    # and silently ignored what the caller passed in
    cm = confusion_matrix(y_test, y_pred)
    plt.clf()  # start from an empty figure
    # the matrix was computed but never drawn before, so plt.show() displayed
    # an empty figure; render it as an annotated heatmap of integer counts
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted class')
    plt.ylabel('Actual class')
    plt.title('Confusion matrix')
    plt.show()
    return cm

    

In [None]:
# accuracy on training data, as a percentage rounded to two decimals.
# Arguments follow the (y_true, y_pred) order of the sklearn signature; the
# value is the same either way because accuracy is symmetric.
acc_score = round(accuracy_score(y_train, y_pred) * 100, 2)

In [None]:
print('the accuracy score for training data of our model is :', acc_score)

In [None]:
# from here on y_pred holds the predictions for the test rows
y_pred = model.predict(x_test)
# accuracy on test data, as a percentage rounded to two decimals; arguments
# follow the (y_true, y_pred) order of the sklearn signature
acc_score = round(accuracy_score(y_test, y_pred) * 100, 2)

In [None]:
print('the accuracy score of our model is :', acc_score)

In [None]:
from sklearn import metrics

In [None]:
# score of the fitted model on the test split, shown as a percentage
# rounded to two decimals
test_fraction = model.score(x_test, y_test)
score = round(test_fraction * 100, 2)
print('score of our model is :', score)

In [None]:
# Per-class precision / recall / F1 on the test split.
# classification_report expects (y_true, y_pred); with the arguments reversed
# precision and recall trade places and the support column counts predicted
# labels instead of actual ones.
class_report = classification_report(y_test, y_pred)
# print the table on its own lines so its header stays aligned with the columns
print('classification report of our model: ', class_report, sep='\n')

In [None]:
# we have achieved a model with decent accuracy score

# Thank You