In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Location of the transaction log CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/paysim1/PS_20174392719_1491204439457_log.csv"

data = pd.read_csv(DATA_PATH)

# Report the size explicitly and preview only the first rows instead of
# rendering the whole frame as the cell output.
print(f"Loaded {data.shape[0]:,} rows x {data.shape[1]} columns")
data.head()

In [None]:
# test = data['nameOrig'].duplicated()

In [None]:
# data.iloc[59]

In [None]:
# data.iloc[834]

In [None]:
# len(data[data['nameOrig'].duplicated(keep=False)].index)

In [None]:
# len(data[data['nameDest'].duplicated(keep=False)].index)

In [None]:
# data = data.drop(columns=['nameDest', 'nameOrig', 'isFlaggedFraud'])

In [None]:
from sklearn import preprocessing

# Columns that are replaced in place by integer label codes.
categorical_columns = ["type", "nameOrig", "nameDest"]

# One fitted encoder per column: refitting a single shared LabelEncoder
# overwrites its classes_, which would leave no way to map the codes of the
# earlier columns (e.g. "type") back to their original labels.
label_encoders = {}
for column in categorical_columns:
    encoder = preprocessing.LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder

# Kept for backward compatibility: this cell used to leave `label_encoder`
# fitted on the last encoded column ("nameDest").
label_encoder = label_encoders["nameDest"]

In [None]:
# Pairwise correlations between all columns of the encoded dataset.
correlation_matrix = data.corr()

# Draw the annotated heatmap on an explicitly created 10x8 axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()


In [None]:
# Separate the target label from the predictor columns.
y = data.loc[:, 'isFraud']
x = data.drop(columns='isFraud')

In [None]:
from imblearn.over_sampling import SMOTE

# Fix the seed so the synthetic minority rows (and everything derived from
# them) are identical on every Restart & Run All.
oversample = SMOTE(random_state=42)

# NOTE(review): this resamples the full dataset. Any evaluation set carved
# out of resampled data contains synthetic points interpolated from rows that
# may also be used for training, which inflates scores; when evaluating a
# model, resample the training portion only.
x, y = oversample.fit_resample(x, y)

In [None]:
# Re-attach the resampled target as the last column of the feature frame.
df_combined = pd.concat(objs=(x, y), axis="columns")

In [None]:
# Preview the first rows only rather than rendering the whole resampled frame.
df_combined.head()

In [None]:
# Correlations recomputed on the resampled frame (features plus target).
correlation_matrix = df_combined.corr()

# Same annotated heatmap layout as before, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# errors='ignore' keeps this cell re-runnable: df_combined is rebound to its
# own result, so a second execution would otherwise raise KeyError because
# the column has already been removed.
df_combined = df_combined.drop(columns=['isFlaggedFraud'], errors='ignore')

In [None]:
# Split the resampled frame back into target and predictors.
y = df_combined.loc[:, 'isFraud']
x = df_combined.drop(columns='isFraud')

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Split the encoded, un-resampled transactions FIRST and oversample the
# training portion only. Splitting data that was already SMOTE-resampled puts
# synthetic rows (interpolated from neighbours that may sit in the training
# set) into the test set, so every reported metric would be optimistic and
# would not describe performance on real transactions.
features = data.drop(columns=['isFraud', 'isFlaggedFraud'])
target = data['isFraud']

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42, stratify=target
)

# Balance the classes in the training data only; the test set keeps the
# original class distribution. The seed makes the synthetic rows reproducible.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report ,f1_score , recall_score,precision_score
from xgboost import XGBClassifier

In [None]:
# Seed shared by the stochastic estimators so reruns give the same scores.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the display name used in the result tables.
models = {
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    # NOTE(review): XGBoost >= 2.0 documents tree_method='hist' together with
    # device='cuda' as the replacement for 'gpu_hist'/gpu_id — confirm against
    # the installed version before switching.
    "XGBoost": XGBClassifier(
        use_label_encoder=False,  # Avoid warnings
        eval_metric="logloss",  # Avoid warnings
        tree_method='gpu_hist',  # Use GPU-accelerated histogram tree method
        gpu_id=0,  # Index of the GPU to train on
        random_state=RANDOM_STATE,
    ),
}
results = {}

for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # labels=[0, 1] pins the matrix to 2x2 even if a model never predicts one
    # class, so the flattened form below is always TN, FP, FN, TP.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 yields 0 (without a warning) when a model predicts no
    # positives at all.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    # Store results
    results[name] = {
        'Accuracy': accuracy,
        'Confusion Matrix': cm.flatten(),  # Flatten to get TN, FP, FN, TP
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
    }



In [None]:
# Show the raw per-model metrics dictionary built by the evaluation loop.
results

In [None]:
# Convert to DataFrame for easier plotting: one row per model, one column per
# metric.
df_scores = pd.DataFrame(results).T

# Metric -> bar colour, listed in the order the 2x2 panels are filled
# (left to right, top to bottom).
metric_colors = {
    'Accuracy': 'skyblue',
    'Precision': 'lightgreen',
    'Recall': 'salmon',
    'F1 Score': 'lightcoral',
}

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

for ax, (metric, color) in zip(axes.flat, metric_colors.items()):
    # df_scores is object-typed because the 'Confusion Matrix' column holds
    # arrays, so cast each score column to float before plotting.
    ax.bar(df_scores.index, df_scores[metric].astype(float), color=color)
    ax.set_title(f'{metric} Comparison')
    ax.set_ylabel(metric)
    # Rotate x labels for better readability. tick_params avoids re-setting
    # tick labels without fixing the tick positions first.
    ax.tick_params(axis='x', labelrotation=45)

plt.tight_layout()
plt.show()

In [None]:
# %pip (not !pip) installs into the environment of the running kernel.
%pip install -q shap

In [None]:
import shap
# Explain the XGBoost model taken from the `models` dictionary.
best_model = models["XGBoost"]
explainer = shap.Explainer(best_model)
# Compute SHAP values for every row of the test set.
shap_values = explainer(X_test)
# Display the shape of the result as the cell output.
shap_values.shape

In [None]:
# Waterfall plot of the per-feature contributions for one test row.
waterfall_row = 5
shap.plots.waterfall(shap_values[waterfall_row, :])

In [None]:
# Bar plot of the per-feature contributions for one test row.
bar_row = 4
shap.plots.bar(shap_values[bar_row, :])

In [None]:
# Summary plot over all test rows and all features.
all_shap_values = shap_values[:, :]
all_features = X_test.iloc[:, :]
shap.summary_plot(shap_values=all_shap_values, features=all_features)

In [None]:
# Display the shape of the SHAP values again (same expression as the cell
# that computed them).
shap_values.shape

In [None]:
from transformers import pipeline
import numpy as np

# Function to generate a natural language explanation based on SHAP values
def generate_nlp_report(shap_values, features, top_n=6):
    """
    Generates a one-sentence natural language report naming the most
    influential features of a single prediction.

    Args:
        shap_values: SHAP values for ONE prediction: either an object exposing
            a 1-D ``.values`` array (such as one row of the SHAP result) or a
            plain 1-D array-like with one entry per feature.
        features: A pandas DataFrame whose ``columns`` name the features, in
            the same order as the SHAP values.
        top_n: Maximum number of features to mention, ranked by absolute SHAP
            value. Values <= 0 mention no feature.

    Returns:
        The report as a string, features listed from most to least influential.

    Raises:
        ValueError: If the SHAP values are not one-dimensional.
    """
    # Accept both an Explanation-like object and a bare array.
    raw_values = getattr(shap_values, "values", shap_values)
    shap_abs_values = np.abs(np.asarray(raw_values, dtype=float))
    if shap_abs_values.ndim != 1:
        raise ValueError(
            "shap_values must hold one value per feature for a single "
            f"prediction; got an array with shape {shap_abs_values.shape}"
        )

    # Descending rank, then truncate. Slicing the ascending order with
    # [-top_n:] would return EVERY feature when top_n is 0.
    top_n = max(int(top_n), 0)
    top_features_indices = np.argsort(shap_abs_values)[::-1][:top_n]

    reasons = [
        f"{features.columns[i]} with an influence of {shap_abs_values[i]:.4f}"
        for i in top_features_indices
    ]

    prefix = "This transaction was flagged as fraud due to"
    if not reasons:
        return prefix + "."
    return f"{prefix}: {', '.join(reasons)}."

# Example explanation for a single prediction
# Row index 1 selects the SECOND row of the SHAP values (indexing is zero-based)
fraud_shap_values = shap_values[1, :] # SHAP values of all features for that row
# NOTE(review): nothing here checks that this row is a fraud case, although the
# generated report says the transaction "was flagged as fraud" — confirm the row choice.
# Alternative indexing form kept from the original author (not verified to be equivalent):
# fraud_shap_values = shap_values[1]

explanation = generate_nlp_report(fraud_shap_values, X_test)
print(explanation)

In [None]:
# Human-readable reason tag for each model feature, keyed by column name.
feature_descriptions = {
    "step": "Anomalous Timing of Transactions",
    "type" : "High-Risk Transaction Type",
    "amount": "Unusual Transaction Amount",
    "nameOrig": "Suspicious Originating Account",
    "oldbalanceOrg": "Dramatic Balance Decrease",
    "newbalanceOrig": "Dangerously Low Balance Post-Transaction",
    "nameDest": "Transactions to Unverified Accounts",
    "oldbalanceDest": "Low Initial Balance in Destination Account",
    "newbalanceDest": "Significant Increase in Destination Balance"
}

def extract_top_reasons(shap_values, feature_names, top_n=3):
    """
    Extracts the top N reasons contributing to a prediction based on SHAP values.

    Parameters:
    - shap_values: SHAP values for ONE instance: either an object exposing a
      1-D ``.values`` array or a plain 1-D array-like, one entry per feature.
    - feature_names: Sequence of feature names, in the same order as the SHAP
      values.
    - top_n: Number of top features to extract, ranked by absolute SHAP value.
      Values <= 0 yield an empty list.

    Returns:
    - List of reason descriptions, most influential first. A feature without
      an entry in ``feature_descriptions`` is reported by its own name.

    Raises:
    - ValueError: If the SHAP values are not one-dimensional.
    """
    # Accept both an Explanation-like object and a bare array.
    raw_values = getattr(shap_values, "values", shap_values)
    shap_abs = np.abs(np.asarray(raw_values, dtype=float))
    if shap_abs.ndim != 1:
        raise ValueError(
            "shap_values must hold one value per feature for a single "
            f"instance; got an array with shape {shap_abs.shape}"
        )

    # Descending rank, then truncate: this matches the most-important-first
    # order used by generate_nlp_report and avoids the [-0:] slice, which
    # would return every feature when top_n is 0.
    top_n = max(int(top_n), 0)
    top_indices = np.argsort(shap_abs)[::-1][:top_n]

    # Map feature names to descriptions, falling back to the raw name.
    reasons = [
        feature_descriptions.get(feature_names[i], feature_names[i])
        for i in top_indices
    ]

    return reasons


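# How to obtain the Gemini API key used below:
# 1. Go to https://aistudio.google.com/app/apikey
# 2. Create an API key
# 3. Save it as a Kaggle secret named "gemini_api" (it is read with UserSecretsClient); do not paste the key into the code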

In [None]:
import google.generativeai as genai
import os

# Read the Gemini API key from the Kaggle secret labelled "gemini_api" so the
# key itself never appears in the notebook source.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
API = user_secrets.get_secret("gemini_api")

def generate_from_gimini(resones, name, model_name="gemini-1.5-flash"):
    """
    Asks a Gemini model to draft a bank fraud-notification email.

    Parameters:
    - resones: Iterable of short reason tags that are placed at the start of
      the prompt, each followed by a space.
    - name: Customer name inserted into the prompt.
    - model_name: Identifier of the Gemini model to call; the default is the
      model this notebook originally hard-coded.

    Returns:
    - The generated email text. The text is also printed, as before, so
      existing callers that ignore the return value still show the email.
    """
    genai.configure(api_key=API)
    model = genai.GenerativeModel(model_name)

    # Every tag is followed by a single space, which keeps the prompt layout
    # identical to the original string concatenation.
    text = "".join(f"{reason} " for reason in resones)
    promet = f"""{text} 
    take those tags and make email that is from a bank that tell the customer {name} that his transaction is a fraud based on those tags"""

    response = model.generate_content(promet)
    email_text = response.text
    print(email_text)
    # Return the text so callers that store the result do not receive None.
    return email_text

In [None]:
# Find indices of fraudulent transactions in the test set
fraud_indices = np.where(y_test.values == 1)[0]

if len(fraud_indices) == 0:
    print("No fraudulent transactions detected in the test set.")
else:
    # Select the first fraudulent transaction
    fraud_index = fraud_indices[0]

    # Get the SHAP values for this transaction
    transaction_shap_values = shap_values[1, :]  # shap_values[1] corresponds to the positive class (fraud)

    # Get the feature names
    feature_names = X_test.columns

    # Extract top reasons
    top_reasons = extract_top_reasons(transaction_shap_values, feature_names, top_n=3)

    # Generate the email
    customer_name = "John Doe"  # Replace with actual customer name as needed
    email_content = generate_from_gimini(top_reasons, customer_name)
