# In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

# Read both input tables: the purchase records and the IP-range lookup.
fraud_data, ip_data = (
    pd.read_csv(csv_path)
    for csv_path in ('Fraud_Data.csv', 'IpAddress_to_Country.csv')
)

# Handle Missing Values
# The median is only defined for numeric data, so fitting a median imputer on
# the whole frame fails on the string/timestamp columns. Impute only the
# numeric columns that actually contain gaps (complete columns keep their
# original dtype), then drop rows still missing a non-numeric value.
numeric_cols = fraud_data.select_dtypes(include='number').columns
non_numeric_cols = fraud_data.columns.difference(numeric_cols)

# Skip all-NaN columns: SimpleImputer discards them on transform, which would
# make the result narrower than the columns being assigned to.
cols_to_impute = [
    col for col in numeric_cols
    if fraud_data[col].isna().any() and fraud_data[col].notna().any()
]

imputer = SimpleImputer(strategy='median')
if cols_to_impute:
    fraud_data[cols_to_impute] = imputer.fit_transform(fraud_data[cols_to_impute])

if len(non_numeric_cols) > 0:
    fraud_data = fraud_data.dropna(subset=list(non_numeric_cols))

# Keep a contiguous 0..n-1 index, as the original frame rebuild produced.
fraud_data = fraud_data.reset_index(drop=True)

# Data Cleaning
# Remove exact duplicate rows and renumber so the index stays contiguous
# (0..n-1) for any later positional/index-aligned operations.
fraud_data = fraud_data.drop_duplicates(ignore_index=True)

# Correct data types
fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time'])
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'])
# Use an explicit 64-bit integer: plain `int` maps to a 32-bit type on some
# platforms (e.g. Windows with NumPy < 2), which cannot hold IPv4 values
# above 2**31 - 1.
fraud_data['ip_address'] = fraud_data['ip_address'].astype('int64')

# Exploratory Data Analysis (EDA)
def _show_chart(title, draw):
    """Open a 12x6 figure, call ``draw`` to render into it, title it and show it."""
    plt.figure(figsize=(12, 6))
    draw()
    plt.title(title)
    plt.show()


# Univariate analysis: purchase-value histogram and per-class row counts.
_show_chart(
    'Distribution of Purchase Value',
    lambda: sns.histplot(fraud_data['purchase_value'], bins=30, kde=True),
)
_show_chart(
    'Class Distribution',
    lambda: sns.countplot(x='class', data=fraud_data),
)

# Bivariate analysis: purchase value against class, and against age by class.
_show_chart(
    'Purchase Value by Class',
    lambda: sns.boxplot(x='class', y='purchase_value', data=fraud_data),
)
_show_chart(
    'Purchase Value vs. Age',
    lambda: sns.scatterplot(x='purchase_value', y='age', hue='class', data=fraud_data),
)

# Merge Datasets for Geolocation Analysis
def _ip_to_int(value):
    """Return the integer form of an IPv4 address.

    Accepts dotted-quad strings (``'1.2.3.4'``) as well as values that are
    already numeric (int, float, or a numeric string). Simply deleting the
    dots is not a valid conversion: ``'1.2.3.4'`` and ``'12.3.4'``-style
    inputs would collide and the ordering of addresses would be lost.

    Raises:
        ValueError: if the value cannot be interpreted as an address/number.
    """
    import ipaddress

    if isinstance(value, str):
        text = value.strip()
        if text.count('.') == 3:
            return int(ipaddress.IPv4Address(text))
        return int(float(text))
    return int(value)


# Convert both range bounds to 64-bit integers.
ip_data['lower_bound_ip_address'] = (
    ip_data['lower_bound_ip_address'].apply(_ip_to_int).astype('int64')
)
ip_data['upper_bound_ip_address'] = (
    ip_data['upper_bound_ip_address'].apply(_ip_to_int).astype('int64')
)

# Merge Fraud_Data.csv with IpAddress_to_Country.csv
# An address belongs to the range whose lower bound is the closest one at or
# below it, so this is a range lookup, not an equality join. merge_asof needs
# both sides sorted on their key and the keys to share a dtype.
ip_ranges = ip_data.sort_values('lower_bound_ip_address')
transactions = fraud_data.reset_index(drop=True)
transactions['ip_address'] = transactions['ip_address'].astype('int64')
transactions['_row_order'] = np.arange(len(transactions))  # to restore order

fraud_data = pd.merge_asof(
    transactions.sort_values('ip_address', kind='stable'),
    ip_ranges,
    left_on='ip_address',
    right_on='lower_bound_ip_address',
    direction='backward',
)

# The backward match only guarantees lower_bound <= ip. Blank out matches
# whose range ends before the address (gaps between ranges), mirroring the
# NaN columns a left join gives unmatched rows.
in_range = fraud_data['ip_address'] <= fraud_data['upper_bound_ip_address']
for ip_col in ip_data.columns:
    fraud_data[ip_col] = fraud_data[ip_col].where(in_range)

# Put the rows back in their original order with a clean 0..n-1 index.
fraud_data = (
    fraud_data.sort_values('_row_order')
    .drop(columns='_row_order')
    .reset_index(drop=True)
)

# Feature Engineering
# Parse both timestamp columns once and derive every time feature from them.
signup_ts = pd.to_datetime(fraud_data['signup_time'])
purchase_ts = pd.to_datetime(fraud_data['purchase_time'])
fraud_data['signup_time'] = signup_ts
fraud_data['purchase_time'] = purchase_ts

# Seconds elapsed between signing up and making the purchase.
signup_to_purchase = purchase_ts - signup_ts
fraud_data['time_diff'] = signup_to_purchase.dt.total_seconds()

# Calendar features of the purchase moment: hour (0-23) and weekday (Mon=0).
fraud_data['hour_of_day'] = purchase_ts.dt.hour
fraud_data['day_of_week'] = purchase_ts.dt.dayofweek

# Normalization and Scaling
# Standardize the continuous columns in place: fit the scaler on them, then
# overwrite them with the transformed values.
columns_to_scale = ['purchase_value', 'age', 'time_diff']
scaler = StandardScaler()
scaler.fit(fraud_data[columns_to_scale])
fraud_data[columns_to_scale] = scaler.transform(fraud_data[columns_to_scale])

# Encode Categorical Features
# One-hot encode the categorical columns and swap them for indicator columns.
categorical_features = ['source', 'browser', 'sex']
encoder = OneHotEncoder()
encoded_matrix = encoder.fit_transform(fraud_data[categorical_features]).toarray()

# Reuse fraud_data's own index: concat(axis=1) aligns rows by index label, so
# a fresh 0..n-1 index would misalign rows (and append NaN-filled ones)
# whenever fraud_data's index is not a default RangeIndex.
encoded_features = pd.DataFrame(
    encoded_matrix,
    columns=encoder.get_feature_names_out(categorical_features),
    index=fraud_data.index,
)
fraud_data = pd.concat([fraud_data, encoded_features], axis=1)
fraud_data = fraud_data.drop(categorical_features, axis=1)

# Write the processed frame to CSV, leaving out the row index.
output_path = 'cleaned_fraud_data.csv'
fraud_data.to_csv(output_path, index=False)
