# TITLE: Credit Card Fraud Detection
*   AUTHOR: Arman Shaikh
*   DOMAIN: DATA SCIENCE
*   AIM   : To build a ML model to identify fraudulent transactions

IMPORT REQUIRED LIBRARIES

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


LOAD AND EXPLORE THE DATASET


In [None]:
# Read the credit-card transactions CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='/content/creditcard.csv')


In [None]:
# Preview the first five rows of the dataset
print(df.head())

# Summarise the column dtypes and non-null counts. DataFrame.info() writes its
# report to stdout itself and returns None, so it must not be wrapped in
# print() -- doing so appends a stray "None" line to the output.
df.info()


   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [None]:
# Count the missing (NaN) entries in each column
print(df.isna().sum(axis=0))

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64


DATA PREPROCESSING

In [None]:
# Keep only labelled transactions. 'Class' can contain missing values (the
# null count above reports one); an unlabelled row can neither supervise
# training nor be scored, and a NaN among the test labels would make the
# evaluation metrics raise.
labelled = df.dropna(subset=['Class'])

# Separate features and target variable
X = labelled.drop('Class', axis=1)
# With the NaN rows gone the label no longer needs a float dtype; store it as
# an integer (assumes 'Class' only holds the 0/1 codes -- TODO confirm).
y = labelled['Class'].astype(int)


In [None]:
# Split the dataset into training (80%) and testing (20%) sets.
# Fraud is a very rare class, so stratify on the label to keep the same fraud
# ratio in both sets instead of leaving it to chance. A NaN label is not a
# usable stratum, so fall back to a plain random split if any label is missing.
stratify_labels = y if y.notna().all() else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

TRAIN THE MODEL

In [None]:
# Initialize the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Rows whose label is missing cannot supervise training. Drop them instead of
# imputing the target: an imputed label is invented data, and with the
# 'most_frequent' strategy it would always be the majority class.
labelled_rows = pd.notna(y_train)
X_train = X_train[labelled_rows]
y_train = y_train[labelled_rows]

# Initialize an imputer to replace missing feature values with the mean of each column
imputer = SimpleImputer(strategy='mean')

# Fit the imputer on the training data only, then apply the same column means
# to the test data, so no statistic computed from the test set reaches training
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Train the model on the labelled, imputed training data
model.fit(X_train, y_train)


MODEL EVALUATION

In [None]:
# Predict the class of every held-out transaction
y_pred = model.predict(X_test)

# Print each evaluation metric under its heading, computing them one at a time
# in the order: confusion matrix, per-class report, overall accuracy
for heading, metric in (
    ("Confusion Matrix:\n", confusion_matrix),
    ("\nClassification Report:\n", classification_report),
    ("\nAccuracy Score:", accuracy_score),
):
    print(heading, metric(y_test, y_pred))



Confusion Matrix:
 [[10689     0]
 [    5    21]]

Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     10689
         1.0       1.00      0.81      0.89        26

    accuracy                           1.00     10715
   macro avg       1.00      0.90      0.95     10715
weighted avg       1.00      1.00      1.00     10715


Accuracy Score: 0.9995333644423705


**Explanation**
* Import Libraries: We import the necessary libraries for data manipulation, model training, and evaluation.

* Load and Explore the Dataset: Load the dataset and explore its structure, checking for any missing values.

* Data Preprocessing: Split the dataset into features (X) and target (y). The target variable (Class) indicates whether a transaction is fraudulent (1) or not (0). We also split the data into training and testing sets to evaluate the model's performance.

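* Train the Model: We use a Random Forest Classifier to train the model. This is a popular choice for classification tasks due to its robustness; with options such as `class_weight='balanced'` it can also be adapted to imbalanced datasets like this one, where fraudulent transactions are very rare.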

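* Evaluate the Model: After training, we evaluate the model using the test set. We use the confusion matrix, classification report, and accuracy score to understand how well the model performs in identifying fraudulent transactions. Because fraudulent transactions are so rare, accuracy alone is misleading (predicting "not fraud" for every transaction would already score close to 1.00); the precision and recall of the fraud class (1) are the more informative numbers.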