# Feature Engineering and Data Preprocessing


In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
fraud_df = pd.read_csv("../data/raw/Fraud_Data.csv")
ip_df = pd.read_csv("../data/raw/IpAddress_to_Country.csv")


# Convert IP addresses to integer format

In [17]:
def ip_to_int(ip):
    parts = [int(x) for x in ip.split('.')]
    return parts[0] * 256**3 + parts[1] * 256**2 + parts[2] * 256 + parts[3]

fraud_df['ip_int'] = fraud_df['ip_address'].apply(ip_to_int)

AttributeError: 'float' object has no attribute 'split'

In [11]:
fraud_df["ip_address_int"] = fraud_df["ip_address"].astype(int)


In [12]:
ip_df["lower_bound_ip_address"] = ip_df["lower_bound_ip_address"].astype(int)
ip_df["upper_bound_ip_address"] = ip_df["upper_bound_ip_address"].astype(int)

ip_df = ip_df.sort_values("lower_bound_ip_address")
fraud_df = fraud_df.sort_values("ip_address_int")


In [13]:
merged_df = pd.merge_asof(
    fraud_df,
    ip_df,
    left_on="ip_address_int",
    right_on="lower_bound_ip_address",
    direction="backward"
)

merged_df = merged_df[
    merged_df["ip_address_int"] <= merged_df["upper_bound_ip_address"]
]


In [14]:
merged_df["signup_time"] = pd.to_datetime(merged_df["signup_time"])
merged_df["purchase_time"] = pd.to_datetime(merged_df["purchase_time"])

merged_df["hour_of_day"] = merged_df["purchase_time"].dt.hour
merged_df["day_of_week"] = merged_df["purchase_time"].dt.dayofweek
merged_df["time_since_signup"] = (
    merged_df["purchase_time"] - merged_df["signup_time"]
).dt.total_seconds()


In [15]:
from sklearn.preprocessing import StandardScaler

num_cols = ["purchase_value", "age", "time_since_signup"]
scaler = StandardScaler()
merged_df[num_cols] = scaler.fit_transform(merged_df[num_cols])

merged_df = pd.get_dummies(
    merged_df,
    columns=["browser", "source", "sex", "country"],
    drop_first=True
)


In [16]:
merged_df.to_csv("../data/processed/fraud_processed.csv", index=False)


## Summary
- IP geolocation successfully integrated
- Time-based and behavioral features engineered
- Dataset prepared for modeling


In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the fraud dataset and the IP-to-country mapping
fraud_df = pd.read_csv('../data/raw/Fraud_Data.csv')
ip_df = pd.read_csv('../data/raw/IpAddress_to_Country.csv')

# Convert IP addresses to integer format, handle invalid IPs
def ip_to_int(ip):
    try:
        # Only process valid IP addresses (strings)
        parts = [int(x) for x in ip.split('.')]
        return parts[0] * 256**3 + parts[1] * 256**2 + parts[2] * 256 + parts[3]
    except Exception as e:
        # If the IP is invalid or NaN, return a default value (or NaN)
        return np.nan

# Apply the conversion function
fraud_df['ip_int'] = fraud_df['ip_address'].apply(ip_to_int)

# Optional: Remove rows with invalid IP addresses (NaN)
valid_fraud_df = fraud_df.dropna(subset=['ip_int'])

# Check if there are valid rows after filtering
if valid_fraud_df.empty:
    print("No valid IP addresses found. The dataset has been emptied after filtering.")
else:
    # Sort the data for range-based merging
    valid_fraud_df = valid_fraud_df.sort_values('ip_int')
    ip_df = ip_df.sort_values('lower_bound_ip_address')

    # Perform range-based merge to match IP addresses with countries
    fraud_geo_df = pd.merge_asof(
        valid_fraud_df,
        ip_df,
        left_on='ip_int',
        right_on='lower_bound_ip_address',
        direction='backward'
    )

    # Group by country and count fraudulent transactions
    fraud_by_country = fraud_geo_df[fraud_geo_df['class'] == 1].groupby('country').size().sort_values(ascending=False)

    # Plot the fraud count by country
    plt.figure(figsize=(12,6))
    fraud_by_country.head(10).plot(kind='bar', color='salmon')
    plt.title('Top 10 Countries by Fraudulent Transactions')
    plt.xlabel('Country')
    plt.ylabel('Number of Fraudulent Transactions')
    plt.xticks(rotation=45)
    plt.tight_layout()

    # Save the image
    plt.savefig('notebooks/images/fraud_by_country.png', dpi=300)
    plt.show()


No valid IP addresses found. The dataset has been emptied after filtering.
