In [2]:
import pandas as pd
import gdown
import ipaddress
import numpy as np
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler


In [None]:
# Load Fraud_Data, Credit Card Data, and IP-to-Country data
fraud_df = pd.read_csv("/content/Fraud_Data.csv")
creditcard_df = pd.read_csv("/content/creditcard1.csv")
ip_country_df = pd.read_csv("/content/IpAddress_to_Country.csv")


In [None]:
# ============================================================
# STEP 3: Data Cleaning - IP Address to Country Data
# ============================================================

# Convert lower_bound_ip_address to integer (it was a float)
ip_country_df['lower_bound_ip_address'] = ip_country_df['lower_bound_ip_address'].astype(int)


In [None]:
# ============================================================
# STEP 4: Data Cleaning - Fraud Data
# ============================================================

# Remove duplicate rows from fraud_df (if any)
fraud_df.drop_duplicates(inplace=True)

# Check for missing values and print a summary
print("Missing values in Fraud Data:")
print(fraud_df.isna().sum())

# Convert timestamp columns to datetime objects.
# (Assuming columns are named 'signup_time' and 'purchase_time')
fraud_df['signup_time'] = pd.to_datetime(fraud_df['signup_time'], errors='coerce')
fraud_df['purchase_time'] = pd.to_datetime(fraud_df['purchase_time'], errors='coerce')


Missing values in Fraud Data:
user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64


In [None]:
# ============================================================
# STEP 5: Convert IP Address Strings to Integer
# ============================================================

# Define a function that converts an IPv4 address (string) to an integer.
def ip_to_int(ip_str):
    try:
        return int(ipaddress.IPv4Address(ip_str))
    except Exception:
        return np.nan

# Apply the conversion to create a new column 'ip_int'
fraud_df['ip_int'] = fraud_df['ip_address'].apply(ip_to_int)


In [None]:
# ============================================================
# STEP 6: Merge Fraud Data with IP-to-Country Data
# ============================================================

# Define a function that maps an IP (as an integer) to a country using the IP ranges
def map_ip_to_country(ip_int):
    # Find the row in ip_country_df where ip_int falls between the lower and upper bounds
    row = ip_country_df[(ip_country_df['lower_bound_ip_address'] <= ip_int) &
                        (ip_country_df['upper_bound_ip_address'] >= ip_int)]
    if not row.empty:
        return row.iloc[0]['country']
    else:
        return np.nan

# Create a new column 'country' in fraud_df by applying the mapping function
fraud_df['country'] = fraud_df['ip_int'].apply(map_ip_to_country)

# Save the merged DataFrame to a CSV file named 'merged_ip.csv'
fraud_df.to_csv("merged_ip.csv", index=False)

print("Merged file saved as merged_ip.csv")


Merged file saved as merged_ip.csv


In [None]:

# ============================================================
# STEP 7: Feature Engineering - Time-Based Features
# ============================================================

# Extract the hour of day and day of week from the purchase_time column
fraud_df['purchase_hour'] = fraud_df['purchase_time'].dt.hour
fraud_df['purchase_dayofweek'] = fraud_df['purchase_time'].dt.dayofweek


In [None]:
# ============================================================
# STEP 8: Normalize a Key Feature (purchase_value)
# ============================================================

# Using MinMaxScaler to normalize the 'purchase_value' column (assumed to be the purchase amount)
scaler = MinMaxScaler()
# Reshape is required because scaler expects a 2D array
fraud_df['purchase_value_scaled'] = scaler.fit_transform(fraud_df[['purchase_value']])


In [None]:
# ============================================================
# STEP 9: Exploratory Data Analysis (EDA) - Quick Look
# ============================================================

# Print a summary of the fraud dataset to inspect data types and new columns
print("\nFraud Data Info:")
print(fraud_df.info())

# Display the first few rows to verify the changes
print("\nFraud Data Sample:")
print(fraud_df.head())

# Optionally, you can also print summary statistics
print("\nFraud Data Summary Statistics:")
print(fraud_df.describe())



Fraud Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 16 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   user_id                151112 non-null  int64         
 1   signup_time            151112 non-null  datetime64[ns]
 2   purchase_time          151112 non-null  datetime64[ns]
 3   purchase_value         151112 non-null  int64         
 4   device_id              151112 non-null  object        
 5   source                 151112 non-null  object        
 6   browser                151112 non-null  object        
 7   sex                    151112 non-null  object        
 8   age                    151112 non-null  int64         
 9   ip_address             151112 non-null  float64       
 10  class                  151112 non-null  int64         
 11  ip_int                 0 non-null       float64       
 12  country                0 n

In [None]:
# ============================================================
# STEP 10: (Optional) Quick EDA on Credit Card Data
# ============================================================

print("\nCredit Card Data Info:")
print(creditcard_df.info())
print("\nCredit Card Data Sample:")
print(creditcard_df.head())


Credit Card Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7973 entries, 0 to 7972
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    7973 non-null   int64  
 1   V1      7973 non-null   float64
 2   V2      7973 non-null   float64
 3   V3      7973 non-null   float64
 4   V4      7973 non-null   float64
 5   V5      7973 non-null   float64
 6   V6      7973 non-null   float64
 7   V7      7973 non-null   float64
 8   V8      7973 non-null   float64
 9   V9      7973 non-null   float64
 10  V10     7973 non-null   float64
 11  V11     7973 non-null   float64
 12  V12     7973 non-null   float64
 13  V13     7973 non-null   float64
 14  V14     7973 non-null   float64
 15  V15     7972 non-null   float64
 16  V16     7972 non-null   float64
 17  V17     7972 non-null   float64
 18  V18     7972 non-null   float64
 19  V19     7972 non-null   float64
 20  V20     7972 non-null   float64
 21  V21     7972 

In [None]:
# Save the cleaned and merged Fraud Data with IP-to-Country information
fraud_df.to_csv("cleaned_fraud_data.csv", index=False)
print("Cleaned fraud data saved as 'cleaned_fraud_data.csv'.")

# Save the cleaned Credit Card Data as well
creditcard_df.to_csv("cleaned_creditcard_data.csv", index=False)
print("Cleaned credit card data saved as 'cleaned_creditcard_data.csv'.")

# (Optional) Save the cleaned IP-to-Country data if needed
ip_country_df.to_csv("cleaned_ip_country_data.csv", index=False)
print("Cleaned IP-to-Country data saved as 'cleaned_ip_country_data.csv'.")


Cleaned fraud data saved as 'cleaned_fraud_data.csv'.
Cleaned credit card data saved as 'cleaned_creditcard_data.csv'.
Cleaned IP-to-Country data saved as 'cleaned_ip_country_data.csv'.


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the cleaned credit card dataset
df_cc = pd.read_csv(r"/content/cleaned_creditcard_data.csv")

# Count rows with NaN in any column
total_rows_with_nan = df_cc.isna().any(axis=1).sum()

print(f"Total rows with NaN values in any column: {total_rows_with_nan}")

# Count rows with NaN in specific column(s)
rows_with_nan_in_class = df_cc['Class'].isna().sum()
print(f"Rows with NaN in 'Class' column: {rows_with_nan_in_class}")

# You can also list columns with NaN and their count
nan_counts_per_column = df_cc.isna().sum()
print("\nNaN counts per column:")
print(nan_counts_per_column)

Total rows with NaN values in any column: 1
Rows with NaN in 'Class' column: 1

NaN counts per column:
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer  # Import SimpleImputer

# Load the cleaned credit card dataset
df_cc = pd.read_csv(r"/content/cleaned_creditcard_data.csv")

# Separate features and target
X = df_cc.drop('Class', axis=1)
y = df_cc['Class']

# Handle NaN values in 'y' (if any)
y.fillna(y.mode()[0], inplace=True)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Impute NaN values in X_train and X_test using SimpleImputer
imputer = SimpleImputer(strategy='mean')  # or 'median', 'most_frequent'
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Scale the features after imputation
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)  # Use imputed data
X_test_scaled = scaler.transform(X_test_imputed)      # Use imputed data

In [11]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.20.1-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.20.1 (from mlflow)
  Downloading mlflow_skinny-2.20.1-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.20.1->mlflow)
  Downloading databricks_sdk-0.43.0-py3-none-any.whl.metadata (38 kB)
Collecting Mako (from alembic!=1.10.0,<2->mlflow)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.6-py3-none-any.whl.metadata (11 kB)
Colle

In [15]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score

# Define a dictionary of models to train
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "DecisionTree": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42)
}

# Set an MLflow experiment name (this creates or uses an existing experiment)
mlflow.set_experiment("CreditCard_Fraud_Detection")

for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        # Train the model
        model.fit(X_train_scaled, y_train)
        # Make predictions
        y_pred = model.predict(X_test_scaled)
        # Calculate accuracy and print a classification report
        acc = accuracy_score(y_test, y_pred)
        print(f"{model_name} Accuracy: {acc:.4f}")
        print(classification_report(y_test, y_pred))

        # Log parameters and metrics to MLflow
        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("accuracy", acc)
        mlflow.sklearn.log_model(model, model_name)


LogisticRegression Accuracy: 0.9994
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1590
         1.0       0.83      1.00      0.91         5

    accuracy                           1.00      1595
   macro avg       0.92      1.00      0.95      1595
weighted avg       1.00      1.00      1.00      1595





DecisionTree Accuracy: 0.9994
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1590
         1.0       1.00      0.80      0.89         5

    accuracy                           1.00      1595
   macro avg       1.00      0.90      0.94      1595
weighted avg       1.00      1.00      1.00      1595





RandomForest Accuracy: 0.9994
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1590
         1.0       1.00      0.80      0.89         5

    accuracy                           1.00      1595
   macro avg       1.00      0.90      0.94      1595
weighted avg       1.00      1.00      1.00      1595





GradientBoosting Accuracy: 0.9994
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1590
         1.0       1.00      0.80      0.89         5

    accuracy                           1.00      1595
   macro avg       1.00      0.90      0.94      1595
weighted avg       1.00      1.00      1.00      1595





In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Build a simple MLP model
mlp_model = Sequential([
    Dense(64, activation='relu', input_dim=X_train_scaled.shape[1]),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
mlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=3)

mlp_history = mlp_model.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=20,
    callbacks=[early_stop],
    verbose=1
)

loss, accuracy = mlp_model.evaluate(X_test_scaled, y_test)
print("MLP Test Accuracy:", accuracy)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.7902 - loss: 0.4246 - val_accuracy: 0.9984 - val_loss: 0.0193
Epoch 2/20
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9957 - loss: 0.0454 - val_accuracy: 0.9984 - val_loss: 0.0057
Epoch 3/20
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9981 - loss: 0.0198 - val_accuracy: 1.0000 - val_loss: 0.0011
Epoch 4/20
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9981 - loss: 0.0130 - val_accuracy: 1.0000 - val_loss: 6.2069e-04
Epoch 5/20
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9988 - loss: 0.0068 - val_accuracy: 1.0000 - val_loss: 5.9896e-04
Epoch 6/20
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9986 - loss: 0.0068 - val_accuracy: 0.9992 - val_loss: 0.0010
Epoch 7/20
[1m160/1

In [17]:
import numpy as np
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten

# Reshape data: (samples, timesteps, channels)
X_train_cnn = np.expand_dims(X_train_scaled, axis=2)
X_test_cnn = np.expand_dims(X_test_scaled, axis=2)

cnn_model = Sequential([
    Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train_scaled.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
cnn_history = cnn_model.fit(
    X_train_cnn, y_train,
    validation_split=0.2,
    epochs=10,
    callbacks=[early_stop],
    verbose=1
)
loss, accuracy = cnn_model.evaluate(X_test_cnn, y_test)
print("CNN Test Accuracy:", accuracy)


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9882 - loss: 0.1152 - val_accuracy: 1.0000 - val_loss: 0.0023
Epoch 2/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9975 - loss: 0.0154 - val_accuracy: 0.9992 - val_loss: 0.0016
Epoch 3/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9992 - loss: 0.0033 - val_accuracy: 0.9992 - val_loss: 0.0031
Epoch 4/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9995 - loss: 0.0021 - val_accuracy: 0.9992 - val_loss: 0.0016
Epoch 5/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9998 - loss: 0.0024 - val_accuracy: 1.0000 - val_loss: 4.1795e-04
Epoch 6/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9995 - loss: 7.6096e-04 - val_accuracy: 0.9992 - val_loss: 0.0026
Epoch 7/10
[1m160/160[0m [32

In [18]:
from tensorflow.keras.layers import SimpleRNN, LSTM

# Simple RNN Model
rnn_model = Sequential([
    SimpleRNN(32, input_shape=(X_train_scaled.shape[1], 1)),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])
rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
rnn_history = rnn_model.fit(
    X_train_cnn, y_train,
    validation_split=0.2,
    epochs=10,
    callbacks=[early_stop],
    verbose=1
)
loss, accuracy = rnn_model.evaluate(X_test_cnn, y_test)
print("RNN Test Accuracy:", accuracy)

# LSTM Model
lstm_model = Sequential([
    LSTM(32, input_shape=(X_train_scaled.shape[1], 1)),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_history = lstm_model.fit(
    X_train_cnn, y_train,
    validation_split=0.2,
    epochs=10,
    callbacks=[early_stop],
    verbose=1
)
loss, accuracy = lstm_model.evaluate(X_test_cnn, y_test)
print("LSTM Test Accuracy:", accuracy)


Epoch 1/10


  super().__init__(**kwargs)


[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.8270 - loss: 0.3294 - val_accuracy: 0.9984 - val_loss: 0.0148
Epoch 2/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9968 - loss: 0.0204 - val_accuracy: 0.9984 - val_loss: 0.0134
Epoch 3/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9961 - loss: 0.0240 - val_accuracy: 0.9984 - val_loss: 0.0122
Epoch 4/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9972 - loss: 0.0170 - val_accuracy: 0.9984 - val_loss: 0.0128
Epoch 5/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9981 - loss: 0.0105 - val_accuracy: 0.9984 - val_loss: 0.0130
Epoch 6/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9971 - loss: 0.0151 - val_accuracy: 0.9984 - val_loss: 0.0129
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━

  super().__init__(**kwargs)


[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.9966 - loss: 0.3064 - val_accuracy: 0.9984 - val_loss: 0.0126
Epoch 2/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9971 - loss: 0.0188 - val_accuracy: 0.9984 - val_loss: 0.0079
Epoch 3/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9958 - loss: 0.0124 - val_accuracy: 0.9984 - val_loss: 0.0022
Epoch 4/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9965 - loss: 0.0083 - val_accuracy: 0.9984 - val_loss: 0.0017
Epoch 5/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.9976 - loss: 0.0092 - val_accuracy: 0.9984 - val_loss: 0.0018
Epoch 6/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9981 - loss: 0.0048 - val_accuracy: 1.0000 - val_loss: 0.0021
Epoch 7/10
[1m160/160[0m [32m━━