<a href="https://colab.research.google.com/github/ashu0109/Cognorise_InfoTech_Datascience/blob/main/Credit%20Card%20Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
import pickle
import warnings

warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 500)

# Loading Dataset which is CSV file in my Drive
# Connecting to the drive for dataset

df_train = pd.read_csv('/content/fraudTest.csv', low_memory=False, index_col=0)
df_test = pd.read_csv('/content/fraudTest.csv', low_memory=False, index_col=0)

# Prints the shapes of the training and test DataFrames to check the number of rows and columns.

df_train.shape
df_test.shape


(58343, 22)

In [5]:
# Checks if the columns in the training and test DataFrames are the same.

list(df_train.columns) == list(df_test.columns)

# Prints the shape of the combined DataFrame.

df = pd.concat([df_train, df_test],ignore_index=True)
df.shape

(116686, 22)

In [6]:
# Counts the occurrences of each unique value in the 'is_fraud' column, providing insight into the class distribution.

df.is_fraud.value_counts()


0.0    116216
1.0       468
Name: is_fraud, dtype: int64

In [7]:
# specified columns from the DataFrame, removing unnecessary features.

def clean_df(df):
    return df.drop(['cc_num','first', 'last', 'street', 'city', 'state', 'zip', 'dob', 'trans_num','trans_date_trans_time'],axis=1)

# clean the DataFrame and prints the first two rows.

df = clean_df(df)
df.head(2)

Unnamed: 0,merchant,category,amt,gender,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud
0,fraud_Kirlin and Sons,personal_care,2.86,M,33.9659,-80.9355,333497,Mechanical engineer,1371817000.0,33.986391,-81.200714,0.0
1,fraud_Sporer-Keebler,personal_care,29.84,F,40.3207,-110.436,302,"Sales professional, IT",1371817000.0,39.450498,-109.960431,0.0


In [8]:
df.head()

Unnamed: 0,merchant,category,amt,gender,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud
0,fraud_Kirlin and Sons,personal_care,2.86,M,33.9659,-80.9355,333497,Mechanical engineer,1371817000.0,33.986391,-81.200714,0.0
1,fraud_Sporer-Keebler,personal_care,29.84,F,40.3207,-110.436,302,"Sales professional, IT",1371817000.0,39.450498,-109.960431,0.0
2,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,F,40.6729,-73.5365,34496,"Librarian, public",1371817000.0,40.49581,-74.196111,0.0
3,fraud_Haley Group,misc_pos,60.05,M,28.5697,-80.8191,54767,Set designer,1371817000.0,28.812398,-80.883061,0.0
4,fraud_Johnston-Casper,travel,3.19,M,44.2529,-85.017,1126,Furniture designer,1371817000.0,44.959148,-85.884734,0.0


In [9]:
# Splits the DataFrame into training and testing sets using train_test_split.

train, test = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
train.shape, test.shape


((93348, 12), (23338, 12))

In [10]:
# Defines a function encode to label encode categorical columns in the DataFrame

def encode(df):
    df_obj = df.select_dtypes(include=['object'])
    encoders = {}
    for col in df_obj.columns:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        encoders[col] = encoder
    with open('LE_mdl_v1.pkl', 'wb') as f:
        pickle.dump(encoders, f)
    return df

train = encode(train)
train.head(2)

Unnamed: 0,merchant,category,amt,gender,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud
0,128,1,54.09,0,39.1657,-84.233,31394,222,1372536000.0,39.270535,-83.537479,0.0
1,152,0,12.25,1,40.0987,-84.6342,22930,277,1372425000.0,39.702252,-83.908581,0.0


In [11]:
x = train.drop(columns=['is_fraud'])
y = train['is_fraud']
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=42)

In [12]:
# Initializes three machine learning models: Logistic Regression, Random Forest, and Decision Tree.

model1 = LogisticRegression()
model2 = RandomForestClassifier()
model3 = DecisionTreeClassifier()

In [13]:
# Defines a function model_train to train a given model, make predictions, print accuracy and classification report

def model_train(model, x_train, y_train, x_test, y_test):
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)
    print('Accuracy Score: ',accuracy_score(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    with open(str(model)[:3] + '_mdl.pkl', 'wb') as f:
        pickle.dump(model,f)