In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import os
# Add the project root directory to Python path
import sys
# Put the parent of the current working directory on the import path so that
# project packages one level up can be imported (presumably this is what makes
# `src.*` importable -- confirm the notebook is launched from its own folder).
#
# The previous form called os.path.dirname('__file__') on a *string literal*,
# which always yields '' -- so the result never depended on any real file
# location, only on the working directory. This spells that out directly.
PROJECT_ROOT = os.path.abspath(os.pardir)

# Guarded so that re-running this cell does not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
from src.data_merger import ProcessedDataMerger

In [4]:
 # Initialize the processed data merger
# Relative locations of the two pre-processed CSV inputs.
fraud_processed_path = '../data/processed/fraud_processed.csv'
credit_processed_path = '../data/processed/credit_processed.csv'

# The merger takes the fraud path first and the credit path second.
merger = ProcessedDataMerger(
    fraud_processed_path,
    credit_processed_path,
)

# Banner: headline followed by a 60-character rule, one per line.
print(
    "Starting comprehensive fraud detection analysis...",
    "=" * 60,
    sep="\n",
)

Starting comprehensive fraud detection analysis...


In [5]:
# Load processed data
# Runs first; the calls below are made on this same `merger` instance.
# Presumably this reads the two CSV paths passed to the constructor --
# confirm in src.data_merger.
merger.load_processed_data()
    
# Create comprehensive analysis
# The result is kept in `analysis` because later cells index into it
# (e.g. analysis['fraud_dataset'] and analysis['credit_dataset']).
analysis = merger.create_comprehensive_analysis()
    
# Print summary
# Passes the freshly built `analysis` object back to the merger for printing.
merger.print_analysis_summary(analysis)

2025-07-20 11:44:32,408 - INFO - Loading processed fraud data...
2025-07-20 11:44:32,962 - INFO - Fraud processed data loaded: (151112, 13)
2025-07-20 11:44:32,963 - INFO - Loading processed credit card data...
2025-07-20 11:44:36,360 - INFO - Credit processed data loaded: (283726, 31)
2025-07-20 11:44:36,361 - INFO - Creating comprehensive analysis...
2025-07-20 11:44:36,362 - INFO - Analyzing fraud data with geolocation...
2025-07-20 11:44:36,423 - INFO - Fraud rate: 0.0936
2025-07-20 11:44:36,423 - INFO - Countries found: 1
2025-07-20 11:44:36,424 - INFO - High-risk countries: 0
2025-07-20 11:44:36,424 - INFO - Analyzing credit card data...
  amount_fraud_analysis = self.credit_data.groupby('amount_category').agg({
2025-07-20 11:44:36,513 - INFO - Credit card fraud rate: 0.001667
2025-07-20 11:44:36,514 - INFO - Average transaction amount: $88.47
2025-07-20 11:44:36,514 - INFO - Comprehensive analysis completed!





In [6]:
# Heading for the per-country table: title line, then a 40-character rule.
print("\nDETAILED GEOLOCATION SUMMARY:", "=" * 40, sep="\n")

# Build the summary only after the heading has been printed, then show just
# its first ten rows.
geolocation_summary = merger.get_geolocation_summary()
first_ten_rows = geolocation_summary.head(10)
print(first_ten_rows)
    

2025-07-20 11:44:39,002 - INFO - Creating geolocation summary...



DETAILED GEOLOCATION SUMMARY:
           Total_Transactions  Fraud_Count  Avg_Purchase_Value  \
country                                                          
Australia              151112        14151             36.9354   

           Total_Purchase_Value  Unique_Users  Fraud_Rate  \
country                                                     
Australia               5581378        151112      0.0936   

           Avg_Transaction_Value  
country                           
Australia                  36.94  


In [7]:
# Base path handed to the merger together with the analysis object.
# NOTE(review): judging by the names listed below, the merger writes several
# suffixed files derived from this path -- confirm in src.data_merger.
output_path = '../data/merged/comprehensive_analysis.csv'
merger.save_analysis_results(output_path, analysis)

# List the derived report paths: swap the '.csv' extension for each suffix.
print("\nAnalysis results saved to:")
for report_suffix in ('_geolocation_summary.csv', '_country_analysis.csv', '_summary.csv'):
    print(f"  - {output_path.replace('.csv', report_suffix)}")
    

2025-07-20 11:44:44,099 - INFO - Saving analysis results to ../data/merged/comprehensive_analysis.csv...
2025-07-20 11:44:44,100 - INFO - Creating geolocation summary...
2025-07-20 11:44:44,147 - INFO - Geolocation summary saved to ../data/merged/comprehensive_analysis_geolocation_summary.csv
2025-07-20 11:44:44,150 - INFO - Country analysis saved to ../data/merged/comprehensive_analysis_country_analysis.csv
2025-07-20 11:44:44,153 - INFO - Summary saved to ../data/merged/comprehensive_analysis_summary.csv



Analysis results saved to:
  - ../data/merged/comprehensive_analysis_geolocation_summary.csv
  - ../data/merged/comprehensive_analysis_country_analysis.csv
  - ../data/merged/comprehensive_analysis_summary.csv


In [9]:
# Show some key insights pulled from the `analysis` mapping built earlier.
print("\nKEY INSIGHTS:")
print("="*40)

# Fraud rate comparison: each rate is shown both as a fraction and as a percentage.
fraud_rate = analysis['fraud_dataset']['fraud_rate']
credit_rate = analysis['credit_dataset']['fraud_rate']
print(f"• Fraud dataset fraud rate: {fraud_rate:.4f} ({fraud_rate*100:.2f}%)")
print(f"• Credit card dataset fraud rate: {credit_rate:.6f} ({credit_rate*100:.4f}%)")

# Country insights.
# Assumes both tables are indexed by country and already ordered so that the
# first row is the "top" one -- TODO confirm against src.data_merger.
top_countries = analysis['fraud_dataset']['top_countries']
if not top_countries.empty:
    # Select the column *before* the row. Taking the row first
    # (`.iloc[0][...]`) builds a Series spanning every column, which upcasts
    # the count to float and made it print as e.g. "151,112.0".
    top_country_transactions = int(top_countries['Total_Transactions'].iloc[0])
    print(f"\n• Top country by transactions: {top_countries.index[0]} ({top_country_transactions:,} transactions)")
else:
    # Mirrors the emptiness guard used for high_risk_countries below, instead
    # of raising IndexError on `.index[0]`.
    print("\n• Top country by transactions: n/a (no country data)")

high_risk_countries = analysis['fraud_dataset']['high_risk_countries']
if not high_risk_countries.empty:
    worst_country = high_risk_countries.iloc[0]
    print(f"• Highest fraud rate country: {high_risk_countries.index[0]} ({worst_country['Fraud_Rate']:.4f} fraud rate)")

# Credit card insights.
credit_stats = analysis['credit_dataset']['amount_stats']
print(f"\n• Average credit card transaction: ${credit_stats['mean']:.2f}")
print(f"• Credit card transactions analyzed: {analysis['credit_dataset']['total_records']:,}")


KEY INSIGHTS:
• Fraud dataset fraud rate: 0.0936 (9.36%)
• Credit card dataset fraud rate: 0.001667 (0.1667%)

• Top country by transactions: Australia (151,112.0 transactions)

• Average credit card transaction: $88.47
• Credit card transactions analyzed: 283,726
