In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import xgboost as xgb

In [2]:
df = pd.read_csv("Fraud.csv")

In [3]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0.0,0.0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0.0,0.0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1.0,0.0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1.0,0.0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0.0,0.0


In [4]:
df.tail()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
4734026,332,PAYMENT,7942.18,C1626972053,0.0,0.0,M1100119869,0.0,0.0,0.0,0.0
4734027,332,PAYMENT,10609.5,C1523043868,0.0,0.0,M833810286,0.0,0.0,0.0,0.0
4734028,332,CASH_OUT,132714.18,C488360954,153668.04,20953.85,C1881062348,572387.75,705101.93,0.0,0.0
4734029,332,PAYMENT,27966.84,C1336030451,20953.85,0.0,M728871989,0.0,0.0,0.0,0.0
4734030,332,CASH_IN,260229.42,C228521898,,,,,,,


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4734031 entries, 0 to 4734030
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         float64
 10  isFlaggedFraud  float64
dtypes: float64(7), int64(1), object(3)
memory usage: 397.3+ MB


In [6]:
df.shape

(4734031, 11)

In [7]:
df.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,4734031.0,4734031.0,4734030.0,4734030.0,4734030.0,4734030.0,4734030.0,4734030.0
mean,182.8565,179897.8,840851.8,862600.2,1055498.0,1185914.0,0.0007885459,6.337095e-07
std,99.89669,650063.1,2915411.0,2952086.0,3018792.0,3388540.0,0.02806999,0.0007960586
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,132.0,13099.15,0.0,0.0,0.0,0.0,0.0,0.0
50%,191.0,75608.28,14059.0,0.0,136723.7,219812.8,0.0,0.0
75%,260.0,209241.2,107877.0,146356.5,944467.2,1117674.0,0.0,0.0
max,332.0,92445520.0,43818860.0,43686620.0,355553400.0,355553400.0,1.0,1.0


In [8]:
type(df)

pandas.core.frame.DataFrame

# 1. Data Cleaning

In [9]:
# Handling missing values
df = df.dropna()

In [10]:
# Handling outliers (you might want to use more advanced techniques here)
# For simplicity, let's assume we remove rows with extreme values
df = df[(df['amount'] < df['amount'].quantile(0.99)) & (df['amount'] > df['amount'].quantile(0.01))]

In [11]:
import numpy as np

In [12]:
# Handling multicollinearity (you may want to perform more advanced analysis)
# For simplicity, let's drop highly correlated columns
correlation_matrix = df.corr().abs()
upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column] > 0.9)]
df = df.drop(to_drop, axis=1)

  correlation_matrix = df.corr().abs()
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(np.bool))


# 2. Fraud Detection Model

In [13]:
# Assume 'isFraud' is the target variable, and other columns are features
X = df.drop(['isFraud', 'isFlaggedFraud'], axis=1)
y = df['isFraud']

In [14]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# Use ColumnTransformer to handle categorical variables
# For simplicity, let's assume 'type' is a categorical variable
categorical_features = ['type']

In [16]:
# Create a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.select_dtypes(include=['number']).columns),
        ('cat', OneHotEncoder(), categorical_features)
    ])

In [17]:
# Create the final pipeline with the model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', xgb.XGBClassifier())
])

In [18]:
# Fit the pipeline on training data
pipeline.fit(X_train, y_train)

# 4. Performance Evaluation

In [19]:
# Predictions
y_pred = pipeline.predict(X_test)

In [20]:
# Confusion Matrix and Classification Report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[927262     37]
 [   213    358]]

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    927299
         1.0       0.91      0.63      0.74       571

    accuracy                           1.00    927870
   macro avg       0.95      0.81      0.87    927870
weighted avg       1.00      1.00      1.00    927870



In [21]:
# ROC-AUC Score
y_prob = pipeline.predict_proba(X_test)[:, 1]
print(f"\nROC-AUC Score: {roc_auc_score(y_test, y_prob)}")


ROC-AUC Score: 0.998255700841747


# 5. Key Predictive Factors

In [22]:
# Feature Importance
# Print feature names after preprocessing
feature_names = preprocessor.transformers_[0][2].tolist() + list(pipeline.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features))
feature_importance = pd.Series(pipeline.named_steps['model'].feature_importances_, index=feature_names)
print("\nFeature Importance:")
print(feature_importance.sort_values(ascending=False))


Feature Importance:
type_CASH_IN      0.317711
type_PAYMENT      0.197322
type_TRANSFER     0.188026
type_CASH_OUT     0.075385
oldbalanceDest    0.064072
oldbalanceOrg     0.061659
amount            0.059093
step              0.036731
type_DEBIT        0.000000
dtype: float32


# 6. Sense-making of Factors
Interpretation based on domain knowledge or further analysis

# 7. Prevention Recommendations
Based on the analysis, provide recommendations for prevention strategies

# 8. Monitoring and Evaluation
Determine metrics for post-implementation evaluation and regularly assess effectiveness
