<a href="https://colab.research.google.com/github/ankitojha2705/ankitojha2705-CRISP_DM_SEMMA_AND_KDD-Models/blob/main/KDD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Data Selection
- Objective: Identify and gather the data needed for analysis.
- Action: Download and load the dataset using pandas.

In [None]:
import os

import kagglehub
import pandas as pd

# Download the dataset from Kaggle; the returned value is used below as the
# directory that contains the CSV file.
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")

# Build the CSV location with os.path.join instead of manual '/' concatenation,
# so the separator is handled correctly (including a trailing slash on `path`).
# Adjust the file name if necessary.
file_path = os.path.join(path, 'creditcard.csv')

# Load the dataset
df = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("Dataset Loaded Successfully")
print(df.head())


Dataset Loaded Successfully
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

    

# Step 2: Data Preprocessing
- Objective: Clean the data to improve its quality. This includes handling missing values, removing duplicates, and correcting data types.
- Action: Check for missing values and duplicates, then address them.

In [None]:
# Report how many null entries each column contains
print("Missing Values in Each Column:")
null_counts = df.isna().sum()
print(null_counts)

# Impute any gaps with the per-column median (this dataset normally has none)
column_medians = df.median()
df = df.fillna(column_medians)

# Report the number of fully duplicated rows, then drop them
duplicate_count = df.duplicated().sum()
print("Number of Duplicate Rows:", duplicate_count)
df = df.drop_duplicates()

# Show the dtype of every column (no conversion is performed here)
print("\nData Types:", df.dtypes, sep="\n")


Missing Values in Each Column:
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64
Number of Duplicate Rows: 1081

Data Types:
Time      float64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Amount    float64


# Step 3: Data Transformation
- Objective: Transform the data for better analysis. This can include encoding categorical variables and scaling numerical features.
- Action: Since this dataset has only numerical features, we’ll focus on scaling.

In [None]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Select the feature columns by name rather than by position, so the target is
# excluded even if 'Class' is not the last column (raises KeyError if absent).
feature_columns = df.columns.drop('Class')

# Scale the numerical features on a copy; `df` itself is left unscaled.
# Assigning by column label replaces each column with the scaled float values
# instead of forcing them into the existing column dtypes.
# NOTE(review): the scaler is fit on every row before the train/test split
# performed later in the notebook, so test-set statistics influence the
# scaling -- consider fitting on the training rows only.
df_scaled = df.copy()
df_scaled[feature_columns] = scaler.fit_transform(df[feature_columns])

print("\nData After Scaling:")
print(df_scaled.head())



Data After Scaling:
       Time        V1        V2        V3        V4        V5        V6  \
0 -1.996823 -0.701082 -0.041687  1.680101  0.976623 -0.247020  0.348012   
1 -1.996823  0.608792  0.164138  0.109279  0.318998  0.042258 -0.060980   
2 -1.996802 -0.700336 -0.811337  1.174270  0.270648 -0.366756  1.352655   
3 -1.996802 -0.499064 -0.109972  1.187383 -0.608355 -0.008814  0.937245   
4 -1.996781 -0.597606  0.535539  1.025470  0.287092 -0.297036  0.072873   

         V7        V8        V9  ...       V21       V22       V23       V24  \
0  0.193700  0.084434  0.333534  ... -0.024777  0.383483 -0.177444  0.110157   
1 -0.065656  0.072903 -0.231703  ... -0.311372 -0.881454  0.162081 -0.561503   
2  0.643223  0.210788 -1.381169  ...  0.343094  1.065068  1.457772 -1.138484   
3  0.192079  0.320843 -1.264664  ... -0.149093  0.007299 -0.305465 -1.941446   
4  0.481517 -0.228725  0.747917  ... -0.012516  1.101780 -0.220709  0.232904   

        V25       V26       V27       V28    Am

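# Step 4: Data Mining
- Objective: Apply machine-learning algorithms to the transformed data to learn patterns that distinguish the two classes.
- Action: Split the scaled data into training and test sets (70/30), then train a Logistic Regression and a Random Forest classifier.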

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Split the data into features and target
X = df_scaled.drop('Class', axis=1)  # Features
y = df_scaled['Class']  # Target

# Split into training and testing sets (70% train / 30% test).
# stratify=y keeps the class proportions identical in both partitions; without
# it the share of the (presumably rare) positive class in the test set varies
# by chance, which makes precision/recall estimates unstable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Train Logistic Regression (max_iter raised to 1000 to give the solver more
# iterations to converge)
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Train Random Forest (fixed seed for reproducible trees)
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

print("\nModels Trained Successfully")



Models Trained Successfully


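# Step 5: Interpretation/Evaluation
- Objective: Assess how well the trained models perform on data they have not seen.
- Action: Compute accuracy, precision, recall, and F1 score on the test set for both models.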

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict on the held-out test set with each trained model
y_pred_log_reg = log_reg.predict(X_test)
y_pred_rf = rf.predict(X_test)

# (printed label, scoring function) pairs, in the order they are reported
metric_table = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)

# Print the same four metrics for each model, Logistic Regression first
for heading, predictions in (
    ("\nLogistic Regression Metrics:", y_pred_log_reg),
    ("\nRandom Forest Metrics:", y_pred_rf),
):
    print(heading)
    for label, scorer in metric_table:
        print(label, scorer(y_test, predictions))



Logistic Regression Metrics:
Accuracy: 0.999177612255927
Precision: 0.8809523809523809
Recall: 0.5522388059701493
F1 Score: 0.6788990825688074

Random Forest Metrics:
Accuracy: 0.9995065673535563
Precision: 0.9509803921568627
Recall: 0.7238805970149254
F1 Score: 0.8220338983050848
