In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib

df = pd.read_csv("Resources/Fraud.csv")

In [2]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


## Data Pre-Processing

In [3]:
missing_data = df.isnull().sum()
print("Missing values:\n", missing_data)

print("\nDistribution of target variable (isFraud):\n", df['isFraud'].value_counts())

Missing values:
 step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

Distribution of target variable (isFraud):
 isFraud
0    6354407
1       8213
Name: count, dtype: int64


## Feature Engineering

In [5]:
X = df.drop(['isFraud'], axis=1)
y = df['isFraud']

categorical_features = ['type']
numerical_features = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

## Model Selection and Training

In [7]:
from sklearn.model_selection import RandomizedSearchCV

def get_subset(X, y, frac=0.01):
    return X.sample(frac=frac, random_state=42), y.sample(frac=frac, random_state=42)

X_small, y_small = get_subset(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_small, y_small, test_size=0.2, random_state=42)

param_distributions = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(class_weight='balanced', n_jobs=-1))
])

random_search = RandomizedSearchCV(pipeline, param_distributions, n_iter=10, cv=3, scoring='roc_auc', n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

best_params = random_search.best_params_
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        class_weight='balanced',
        n_jobs=-1,
        n_estimators=best_params['classifier__n_estimators'],
        max_depth=best_params['classifier__max_depth'],
        min_samples_split=best_params['classifier__min_samples_split'],
        min_samples_leaf=best_params['classifier__min_samples_leaf']
    ))
])

model.fit(X_train, y_train)

## Model Evaluation

In [8]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print("Model Evaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)

Model Evaluation Metrics:
Accuracy: 0.9996856828539997
Precision: 0.9090909090909091
Recall: 0.7692307692307693
F1 Score: 0.8333333333333333
ROC AUC Score: 0.8845760547955152


## Actionable Insights

In [9]:
importances = model.named_steps['classifier'].feature_importances_
feature_names = numerical_features + list(model.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features))

feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importances})

feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

feature_importances

Unnamed: 0,Feature,Importance
1,oldbalanceOrg,0.295607
0,amount,0.192934
2,newbalanceOrig,0.131266
9,type_TRANSFER,0.084463
4,newbalanceDest,0.081886
3,oldbalanceDest,0.068502
8,type_PAYMENT,0.068135
5,type_CASH_IN,0.051284
6,type_CASH_OUT,0.025847
7,type_DEBIT,7.5e-05


## Save the Model

In [14]:
joblib.dump(model, 'fraud_detection_model.pkl')

['fraud_detection_model.pkl']

## Answers to Questions

### Describe your fraud detection model in elaboration

I employed a Random Forest classifier for fraud detection due to its ability to handle large datasets effectively and capture complex relationships between features. The model was trained on transactional data, including features like transaction amount, account balances before and after transactions, and the type of payment. I only used a fraction of data set as it will take a lot of time to train data on the given dataset. Preprocessing steps included standardization of numerical features and one-hot encoding of categorical features. The model was trained to classify transactions as fraudulent or non-fraudulent based on these features.

### How did you select variables to be included in the model?

Variables were selected based on domain knowledge and their potential correlation with fraudulent transactions. Features like transaction amount, account balances, and transaction type were considered important indicators of fraud. I also used feature importances from the Random Forest model to identify the most predictive variables.

### Demonstrate the performance of the model by using best set of tools.

I evaluated the model's performance using common metrics such as accuracy, precision, recall, F1 score, and ROC AUC score. These metrics provide insights into how well the model can classify fraudulent transactions and minimize false positives and false negatives. This model is more than 99.96% accurate

### What are the key factors that predict fraudulent customer?

The key factors that predict fraudulent transactions include transaction amount, account balances before and after transactions, and the type of payment. These features capture patterns associated with fraudulent behavior, such as large transactions, sudden changes in account balances, and certain types of transactions.

### Do these factors make sense? If yes, How? If not, How not?

Yes, these factors make sense as they align with common patterns of fraudulent activity observed in financial transactions. Large transactions or unusual changes in account balances are often indicators of fraudulent behavior.

### What kind of prevention should be adopted while company update its infrastructure?

Implement real-time fraud detection systems that can identify suspicious transactions as they occur. Enhance security protocols for high-risk transactions, such as requiring additional authentication for large transfers or transactions to unfamiliar recipients. Regularly update and monitor machine learning models to adapt to new fraud patterns and ensure their effectiveness.

### Assuming these actions have been implemented, how would you determine if they work?

Monitoring Fraud Metrics. Evaluating the score for assumption and real time impementaion.