In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV, learning_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz

import graphviz

from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    precision_recall_curve,
    average_precision_score,
    accuracy_score,
    classification_report,
    roc_curve,
    roc_auc_score,
)


# 1. Understand the Problem and Data

In [None]:
# Load the raw transaction table and the IP-range -> country lookup table.
# Both paths are relative, so the CSVs must sit in the working directory.
fraud, ipaddress_mapping = (
    pd.read_csv(csv_name)
    for csv_name in ("Fraud_Data.csv", "IpAddress_to_Country.csv")
)

In [None]:
def get_country(ip, mapping):
    """Return the country whose IP range contains ``ip``.

    Parameters
    ----------
    ip : number
        Numeric IP address to look up.
    mapping : pandas.DataFrame
        Lookup table with ``lower_bound_ip_address``,
        ``upper_bound_ip_address`` and ``country`` columns.

    Returns
    -------
    The ``country`` value of the first row whose inclusive bounds contain
    ``ip``, or ``np.nan`` when no row matches.
    """
    above_lower = mapping["lower_bound_ip_address"] <= ip
    below_upper = mapping["upper_bound_ip_address"] >= ip
    hits = mapping.loc[above_lower & below_upper, "country"]
    if hits.empty:
        return np.nan
    return hits.iloc[0]

# Resolve the country for every transaction's IP address.
# NOTE(review): each lookup scans the whole mapping table, so this cell gets
# slow as either table grows.
fraud["country"] = fraud["ip_address"].map(
    lambda ip_value: get_country(ip_value, ipaddress_mapping)
)


In [None]:
# Work on a copy so the feature columns added below never touch `fraud`.
df = fraud.copy()

# 2. Feature Engineering

In [None]:
# Parse both timestamp columns so the `.dt` accessor can be used on them.
for time_col in ("signup_time", "purchase_time"):
    df[time_col] = pd.to_datetime(df[time_col])

In [None]:
# Tenure: elapsed time between signup and purchase.
tenure = df["purchase_time"] - df["signup_time"]
df['tenure_days'] = tenure.dt.days
# total_seconds() is the whole elapsed duration. `.dt.seconds` would only give
# the within-day remainder (0-86399) and silently drop the days component, so
# a multi-day tenure could look like a one-second tenure.
df['tenure_seconds'] = tenure.dt.total_seconds()

# Signup calendar features
df['signup_dow'] = df['signup_time'].dt.dayofweek  # 0 = Monday ... 6 = Sunday (use .dt.day_name() for names)
df['signup_hour'] = df['signup_time'].dt.hour
df['signup_week'] = df['signup_time'].dt.isocalendar().week  # ISO 8601 week number (1-53)

# Purchase calendar features (same definitions as for signup)
df['purchase_dow'] = df['purchase_time'].dt.dayofweek
df['purchase_hour'] = df['purchase_time'].dt.hour
df['purchase_week'] = df['purchase_time'].dt.isocalendar().week

In [None]:
# Distinct users per device, broadcast back onto every row of that device,
# plus a 0/1 flag for devices used by more than one user.
users_per_device = df.groupby("device_id")["user_id"].transform("nunique")
df["shared_device_user_cnt"] = users_per_device
df["shared_device_flag"] = (users_per_device > 1).astype("int64")

# Same pair of features keyed on the IP address.
users_per_ip = df.groupby("ip_address")["user_id"].transform("nunique")
df["shared_ip_user_cnt"] = users_per_ip
df["shared_ip_flag"] = (users_per_ip > 1).astype("int64")

In [None]:
# Show the column names available after feature engineering.
df.columns

In [None]:
# Hold out 30% of the rows. Stratifying on the label keeps the class ratio of
# this imbalanced target the same in both splits.
train_df, test_df = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df["class"]
)
# Explicit copies: the cells below add encoded columns to both frames, which
# would otherwise risk pandas' SettingWithCopyWarning.
train_df, test_df = train_df.copy(), test_df.copy()

In [None]:
# Target encoding for `country`: the mean of `class` per country, learned
# from the training rows only.
encoding_map = train_df["class"].groupby(train_df["country"]).mean()

In [None]:
# Fallback for countries without an entry in the encoding map:
# the overall mean of `class` in the training split.
default_value = train_df['class'].mean()  # Fallback value: global mean of the target variable in the training set

In [None]:
# Encode both splits with the training-derived map. Countries with no entry
# in the map (missing or unseen in training) receive `default_value`.
for split_df in (train_df, test_df):
    split_df['country_encoded'] = (
        split_df['country'].map(encoding_map).fillna(default_value)
    )

In [None]:
# Integer-code `source`: learn the categories from the training rows ...
le = LabelEncoder().fit(train_df['source'])
train_df['source_encoded'] = le.transform(train_df['source'])

# ... and reuse the fitted encoder on the test rows
# (transform raises ValueError for a label it did not see during fit).
test_df['source_encoded'] = le.transform(test_df['source'])

In [None]:
# Integer-code `browser`: learn the categories from the training rows ...
le = LabelEncoder().fit(train_df['browser'])
train_df['browser_encoded'] = le.transform(train_df['browser'])

# ... and reuse the fitted encoder on the test rows
# (transform raises ValueError for a label it did not see during fit).
test_df['browser_encoded'] = le.transform(test_df['browser'])

In [None]:
# Integer-code `sex`: learn the categories from the training rows ...
le = LabelEncoder().fit(train_df['sex'])
train_df['sex_encoded'] = le.transform(train_df['sex'])

# ... and reuse the fitted encoder on the test rows
# (transform raises ValueError for a label it did not see during fit).
test_df['sex_encoded'] = le.transform(test_df['sex'])

# 3. Split Data for Training

In [None]:
# Model inputs, grouped by origin; the concatenation order is the column
# order the model sees.
feature = (
    ["signup_dow", "signup_week", "signup_hour"]            # signup calendar
    + ["purchase_dow", "purchase_week", "purchase_hour"]    # purchase calendar
    + ["purchase_value"]
    + ["source_encoded", "browser_encoded", "sex_encoded"]  # integer-coded categoricals
    + ["age", "country_encoded"]
    + ["tenure_seconds"]
    + ["shared_device_user_cnt", "shared_ip_user_cnt"]      # users per device / IP
)

# Name of the label column.
target = "class"


In [None]:
# Features (X) and target (y) of the training split.
X = train_df[feature]
y = train_df[target]

# Fit on the whole training split and evaluate on the held-out test split.
# Splitting train_df a second time would leave test_df unused and would score
# the model on rows whose labels already fed the `country_encoded` target
# encoding (target leakage).
X_train, y_train = X, y
X_test, y_test = test_df[feature], test_df[target]

In [None]:
# Class counts in the full dataset.
df["class"].value_counts()

In [None]:
# Class counts in the training frame.
train_df["class"].value_counts()

In [None]:
# Class counts of the labels the model is fitted on.
y_train.value_counts()

In [None]:
# Class counts of the labels the model is evaluated on.
y_test.value_counts()

# 4. Train Predictive Models

## i. General Guidelines

`max_depth` (Limits the depth of the tree)
- Prevents overfitting by controlling the tree's complexity.
- Rule of thumb:
    - For small datasets (<10,000 samples): Use **max_depth=3-10**.
    - For large datasets (>10,000 samples): Experiment with larger values.
- Proportional approach:
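    - Note: the **sqrt(n_features)** heuristic for classification problems applies to `max_features` (the number of features considered per split), not to tree depth; choose `max_depth` by validation.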

`min_samples_split` (Minimum samples required to split a node)
- Ensures a split only occurs if enough samples are present, reducing overfitting.
- Rule of thumb:
    - Set min_samples_split to 2-5% of the dataset size (**int(0.02 * n_samples)**).
    - For imbalanced datasets, **adjust according to the minority class size**.

`min_samples_leaf` (Minimum samples per leaf node)
- Ensures that leaf nodes have enough data to make meaningful predictions.
- Rule of thumb:
    - Use 1-10% of the dataset size (`int(0.01 * n_samples)`).
    - For imbalanced datasets, ensure leaf nodes contain enough minority samples.


## ii. For Imbalanced Datasets (0/1 Classes)

When you have an imbalanced dataset (e.g., class 0: 90%, class 1: 10%), ensure that the minority class (class 1) is well-represented in the splits

**max_depth**
- Prevent deep trees that may overfit the majority class.
- Start with smaller depths, such as max_depth=5, and gradually increase while monitoring performance.

**min_samples_split**
- Set to ensure splits occur only if both classes are represented

**min_samples_leaf**
- Ensure leaf nodes contain meaningful samples for both classes

## iii. Additional Hyperparameters to Tune

**criterion (Splitting Criterion)**
- Defines how the split quality is measured.
- Options:
    - "gini" (default): Gini Impurity.
    - "entropy": Information Gain.

**max_features (Number of Features to Consider for Splits)**

- Limits the number of features to consider at each split.
- Options:
    - "sqrt": Square root of the total number of features (common for classification).
    - "log2": Logarithm base 2 of total features.
    - None: All features.

**min_weight_fraction_leaf (Minimum Weighted Fraction of Samples in a Leaf)**

- Forces a minimum fraction of the weighted input samples in a leaf node.
- Useful for datasets with weighted samples.

## iv. Considerations for Tuning

**1. Avoid Overfitting**
- Limit tree depth using max_depth.
- Set a minimum number of samples per leaf (min_samples_leaf) or split (min_samples_split).

**2. Improve Generalization**
- Use max_features to reduce the chance of overfitting specific features.
- Limit the number of leaf nodes (max_leaf_nodes) to simplify the tree.

**3. Optimize for Imbalanced Datasets**
- Use min_samples_leaf to ensure that leaf nodes have enough samples from minority classes.


***Keep in mind that a very large param_grid can lead to long search times. Start with fewer combinations and refine the grid iteratively based on results.***

In [None]:
# Rule-of-thumb starting values, scaled to the number of class-1 rows in the
# full dataset.
# NOTE(review): none of these names is referenced by the search cells visible
# in this notebook - confirm whether they are still needed.
n_minority_samples = df["class"].value_counts().loc[1]
max_depth = 5
min_samples_split = max(2, int(n_minority_samples * 0.05))
min_samples_leaf = max(1, int(n_minority_samples * 0.01))

In [None]:
# Search space: lists are sampled uniformly; the scipy `randint` objects draw
# integers from [low, high), i.e. the upper bound is exclusive.
param_distributions = dict(
    max_depth=[3, 5, 10, 20, None],
    min_samples_split=stats.randint(2, 50),
    min_samples_leaf=stats.randint(1, 20),
    class_weight=["balanced", None],
    criterion=["gini", "entropy"],
)

# 50 random draws from the space, each scored by 5-fold cross-validated ROC AUC.
random_search = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_distributions,
    n_iter=50,
    scoring="roc_auc",
    cv=5,
    verbose=2,
    random_state=42,
)
random_search.fit(X_train, y_train)

In [None]:
# Report the parameter combination with the best cross-validated score.
best_params = random_search.best_params_
print("Best Parameters:", best_params)

In [None]:
# Keep a handle on the search's best estimator; every evaluation cell below
# works on `best_tree`.
best_tree = random_search.best_estimator_

In [None]:
# Hard class predictions for X_test; the confusion matrix below is built
# from these.
y_pred = best_tree.predict(X_test)

In [None]:
# Rendering options for the Graphviz export.
graphviz_options = dict(
    out_file=None,                       # return the DOT text instead of writing a file
    feature_names=X_train.columns,       # label split nodes with the training column names
    class_names=["Class 0", "Class 1"],  # placeholder class labels
    filled=True,                         # colour nodes by class distribution
    rounded=True,                        # rounded node corners
    special_characters=True,             # allow special characters in feature names
)
dot_data = export_graphviz(best_tree, **graphviz_options)

# Wrap the DOT text and show the tree inline in the notebook.
graph = graphviz.Source(dot_data)
display(graph)

In [None]:
# Size of the fitted tree, read from the low-level `tree_` structure.
fitted_structure = best_tree.tree_
tree_depth, num_nodes = fitted_structure.max_depth, fitted_structure.node_count

print(f"Tree Depth: {tree_depth}")
print(f"Number of Nodes: {num_nodes}")


# 5. Model Evaluation

## i. Feature Importance

In [None]:
# Importance score of each feature in the fitted tree; the plot below pairs
# entry i with X_train.columns[i].
importances = best_tree.feature_importances_

In [None]:
# Ascending sort, so the most important feature ends up at the top of the
# horizontal bar chart.
indices = np.argsort(importances)
bar_positions = range(len(indices))
bar_labels = [X_train.columns[i] for i in indices]

fig, ax = plt.subplots(figsize=(15, 5))
ax.barh(bar_positions, importances[indices], color="skyblue")
ax.set_yticks(bar_positions)
ax.set_yticklabels(bar_labels)
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Feature")
ax.set_title("Feature Importance - Best Model from Randomized Search")
fig.tight_layout()
plt.show()

## ii. ROC Curve

The ROC (Receiver Operating Characteristic) curve is a graphical representation that illustrates the performance of a classification model at various thresholds. It is commonly used for binary classification problems to evaluate the **trade-off between sensitivity (True Positive Rate) and the False Positive Rate (1 − specificity)**.

- True Positives: Correctly identified fraudulent transactions.
- False Positives: Legitimate transactions mistakenly flagged as fraudulent.
- The ROC curve helps you visualize the trade-off between catching fraud (TPR) and wrongly flagging legitimate transactions (FPR).

- The closer the ROC curve is to the top-left corner, the better the model.
- AUC (Area Under Curve):
    - Ranges from 0 to 1.
    - 0.5: Random guessing.
    - 1.0: Perfect classifier.
- Diagonal Line:
    - Represents random guessing (baseline).


In [None]:
# Training split: positive-class (column 1) probabilities, ROC points and AUC.
y_train_proba = best_tree.predict_proba(X_train)[:, 1]
fpr_train, tpr_train, _ = roc_curve(y_train, y_train_proba)
auc_train = roc_auc_score(y_train, y_train_proba)

# Test split: same three quantities.
y_test_proba = best_tree.predict_proba(X_test)[:, 1]
fpr_test, tpr_test, _ = roc_curve(y_test, y_test_proba)
auc_test = roc_auc_score(y_test, y_test_proba)

In [None]:
# Train and test ROC curves on one axis, with the chance diagonal as reference.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_train, tpr_train, color="blue", label=f"Train ROC Curve (AUC = {auc_train:.2f})")
ax.plot(fpr_test, tpr_test, color="green", label=f"Test ROC Curve (AUC = {auc_test:.2f})")
ax.plot([0, 1], [0, 1], color="red", linestyle="--", label="Random Guess")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve - Train and Test")
ax.legend(loc="lower right")
ax.grid(alpha=0.4)
plt.show()

**Interpreting the Results**
- If the train AUC is much higher than the test AUC:
    - The model may be overfitting to the training data.
- If both curves are similar:
    - The model generalizes well to unseen data.

## iii. Confusion Matrix

A confusion matrix gives you detailed counts for:
- True Positives (TP)
- True Negatives (TN)
- False Positives (FP)
- False Negatives (FN)

Use confusion matrices to analyze misclassifications (e.g., whether your model misclassifies the minority class).

In [None]:
# Counts of true vs. predicted class on the test labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Render the matrix as a heat map.
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap="Blues")
plt.show()


## iv. Precision-Recall Curve

- Useful for imbalanced datasets where the positive class is rare.
- Shows the trade-off between Precision and Recall at different thresholds.

In [None]:
# Predicted probability of the positive class (column 1) for every X_test row;
# input to the precision-recall curve and average-precision score below.
y_pred_proba = best_tree.predict_proba(X_test)[:, 1]  # Only positive class

In [None]:
# Precision and recall at every probability threshold on the test split.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, label="Precision-Recall Curve")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve")
ax.legend()
plt.show()


**Insights from Your Plot**
- High Precision for Low Recall
    - At the start of the curve (left side), precision is close to 1.0, indicating the model is very confident about the few predictions it makes at low recall values.
    - As recall increases, the model starts to classify more samples as positive, but precision decreases due to an increasing number of false positives.
- Sharp Drop in Precision
    - The steep decline indicates that as the threshold is lowered, the model includes more false positives, causing precision to degrade rapidly.
- Flat at Higher Recall
    - At the far right, the curve flattens out, indicating that recall is maximized (all positives are classified as positive), but precision is low due to many false positives.

**When to Use a PR Curve?**
- Imbalanced Datasets
    - PR curves are particularly useful for imbalanced datasets where the positive class is rare. The ROC curve may give an overly optimistic view of performance.
- Focus on False Positives and False Negatives
    - If minimizing false positives (e.g., fraud detection) or false negatives (e.g., medical diagnosis) is critical, PR curves give a clearer picture.

In [None]:
# Average precision condenses the precision-recall curve into one number.
# `average_precision_score` is imported with the other metrics in the
# notebook's top import cell rather than here.
ap_score = average_precision_score(y_test, y_pred_proba)
print(f"Average Precision Score: {ap_score:.2f}")

## v. Learning Curves

- Shows how performance changes with increasing training data.
- Helps diagnose **overfitting** or **underfitting**.

In [None]:
# Accuracy of `best_tree` refitted on growing subsets of the training data,
# with 5-fold cross-validation and the default train-size grid.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=best_tree, X=X_train, y=y_train, cv=5, scoring="accuracy"
)

# Average over the folds (axis 1) to get one point per training-set size.
mean_train_score = train_scores.mean(axis=1)
mean_validation_score = test_scores.mean(axis=1)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(train_sizes, mean_train_score, label="Training Score")
ax.plot(train_sizes, mean_validation_score, label="Validation Score")
ax.set_xlabel("Training Set Size")
ax.set_ylabel("Accuracy")
ax.set_title("Learning Curve")
ax.legend()
plt.show()


**Key Observations**
- Fluctuating Training Score
    - The training score (blue line) shows significant oscillations as the training set size increases.
    - This could be due to variability in the data subsets used during training, especially with smaller training sizes. If the dataset isn't well-balanced or has noisy data, this can happen.
- Relatively Stable Validation Score
    - The validation score (orange line) remains relatively constant with small improvements as the training size increases.
    - This suggests that the model generalizes well and is not significantly overfitting the training data.
- Training Score Above Validation Score
    - The training score is slightly higher than the validation score, which is typical.
    - A large gap between the two would suggest overfitting, but here the gap is small, indicating a good generalization balance.
- Close Convergence
    - The training and validation scores are close to each other as the training set size increases, suggesting that the model's performance is consistent and not underfitting or overfitting significantly.

**Key Takeaways**

- Model is likely performing well
    - The training and validation scores are close, and there’s no significant drop in validation score, indicating that the model is learning effectively without overfitting.
- Instability in Training Score
    - The fluctuations in the training score could point to:
        - Data imbalance in the training subsets.
        - A need for more consistent cross-validation folds.
- Potential Issue with Sampling
    - The large fluctuations in the training score may indicate an issue with how training samples are selected. Consider stratified sampling if the dataset is imbalanced.

**1. Improve Data Sampling**
- Check Data Quality
    - Ensure the training data is well-distributed and not imbalanced. If imbalanced, use techniques like oversampling (SMOTE) or undersampling.
- Use Stratified Sampling
    - Ensure each fold in cross-validation maintains the same class proportions as the overall dataset (e.g., with `StratifiedKFold`).

**2. Tune Hyperparameters**
Even though `best_tree` comes from a parameter search, consider adding or refining these:
- Increase max_depth
    - A shallow tree might underfit the training data. Increase depth and observe if training accuracy improves while keeping an eye on overfitting.
- Decrease min_samples_split and min_samples_leaf:
    - These might be too high, preventing the tree from growing fully. Lower values can increase granularity.
- Try Different Scorers
    - Accuracy might not always be the best metric. For imbalanced datasets, consider:
        - scoring="f1" for a balance between precision and recall.
        - scoring="roc_auc" for probability-based classifiers.

**3. Use More Data**
- Expand the Training Set
    - If possible, collect more training data or augment the dataset with similar examples.
- Data Augmentation
    - For specific use cases (e.g., images, text), apply transformations to create synthetic data.

**4. Optimize Features**
- Feature Engineering
    - Add meaningful features, interactions, or transformations (e.g., log, polynomial).
- Feature Selection
    - Remove unimportant or noisy features. Use feature importances or recursive feature elimination (RFE).

**5. Try a More Complex Model**
- Boosting Algorithms
    - Use Gradient Boosting (e.g., XGBoost, LightGBM) for better generalization.
- Bagging Algorithms
    - Use Random Forests or Extra Trees for ensemble learning.

**6. Cross-Validation with More Folds**
- Increase the number of folds in cross-validation to get a more robust evaluation

In [None]:
# Same learning curve as before, but averaged over 20 folds instead of 5.
learning_curve_settings = dict(cv=20, scoring="accuracy")
train_sizes, train_scores, test_scores = learning_curve(
    best_tree, X_train, y_train, **learning_curve_settings
)


In [None]:
# Fold-averaged training and validation accuracy per training-set size.
fig, ax = plt.subplots(figsize=(8, 6))
for fold_scores, curve_label in (
    (train_scores, "Training Score"),
    (test_scores, "Validation Score"),
):
    ax.plot(train_sizes, fold_scores.mean(axis=1), label=curve_label)
ax.set_xlabel("Training Set Size")
ax.set_ylabel("Accuracy")
ax.set_title("Learning Curve")
ax.legend()
plt.show()

**7. Plot Learning Curve Smoothly**
- Use a more evenly distributed set of train_sizes for smoother curves

In [None]:
# Ten evenly spaced training fractions (10% ... 100%), 10-fold CV.
# `train_sizes` comes back as the absolute number of samples used at each step.
train_sizes, train_scores, test_scores = learning_curve(
    best_tree,
    X_train,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=10,
    scoring="accuracy",
)


In [None]:
# Fold-averaged training and validation accuracy per training-set size.
mean_train_score = train_scores.mean(axis=1)
mean_validation_score = test_scores.mean(axis=1)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(train_sizes, mean_train_score, label="Training Score")
ax.plot(train_sizes, mean_validation_score, label="Validation Score")
ax.set_xlabel("Training Set Size")
ax.set_ylabel("Accuracy")
ax.set_title("Learning Curve")
ax.legend()
plt.show()

**8. Analyze Misclassifications**
- Evaluate misclassified examples to identify patterns or weaknesses in the model.
- Use a confusion matrix to gain insights

# =========================================================================================

**1. Change Scoring Metric**
- If you care more about Precision or Recall (based on PR curve analysis), change the scoring metric:
    - For a balanced focus: scoring="f1".
    - To focus on minimizing false negatives: scoring="recall".
    - To optimize class separation: scoring="roc_auc".

**2. Optimize Cross-Validation**
- If your dataset is imbalanced, use stratified k-fold cross-validation to ensure equal class proportions in each fold

**3. Adjust the Number of Iterations (n_iter)**
- If you have sufficient computation resources, increase n_iter to explore more combinations

***Given your best parameters***

- criterion: Keep it as 'gini'; it is a categorical option, so there is no numeric range to refine.
- max_depth=3: Explore slightly larger and smaller values (e.g., [2, 3, 4, 5]).
- min_samples_leaf=3: Test a range around this value (e.g., [2, 3, 4, 5]).
- min_samples_split=13: Narrow the range (e.g., [10, 13, 15, 18]).

In [None]:
# Narrower, list-only search space for the second pass.
# An integer `cv` already yields stratified folds for a classifier in
# scikit-learn, so no explicit StratifiedKFold object is passed.
param_distributions = dict(
    max_depth=[3, 5, 7, 10],
    min_samples_split=[5, 8, 10, 15],
    min_samples_leaf=[10, 15, 20, 25],
    criterion=["gini"],
)

# 30 random draws, each scored by 5-fold cross-validated ROC AUC.
random_search = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_distributions,
    n_iter=30,
    scoring="roc_auc",
    cv=5,
    verbose=2,
    random_state=42,
)

random_search.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the refined search.
random_search.best_params_

In [None]:
# Rebind `best_tree` to the refined search's best estimator; the cells below
# re-run the same evaluation on it.
best_tree = random_search.best_estimator_

In [None]:
# Hard class predictions of the refined model for X_test; used by the
# confusion matrix below.
y_pred = best_tree.predict(X_test)

In [None]:
# Rendering options for the Graphviz export.
graphviz_options = dict(
    out_file=None,                       # return the DOT text instead of writing a file
    feature_names=X_train.columns,       # label split nodes with the training column names
    class_names=["Class 0", "Class 1"],  # placeholder class labels
    filled=True,                         # colour nodes by class distribution
    rounded=True,                        # rounded node corners
    special_characters=True,             # allow special characters in feature names
)
dot_data = export_graphviz(best_tree, **graphviz_options)

# Wrap the DOT text; this time the tree is written to disk and opened in an
# external viewer instead of being displayed inline.
graph = graphviz.Source(dot_data)

graph.render("decision_tree")  # Saves as 'decision_tree.pdf'
graph.view()  # Opens the rendered file in the default viewer

In [None]:
# Feature importances of the refined tree.
importances = best_tree.feature_importances_

# Ascending sort, so the most important feature ends up at the top of the
# horizontal bar chart.
indices = np.argsort(importances)
bar_positions = range(len(indices))
bar_labels = [X_train.columns[i] for i in indices]

fig, ax = plt.subplots(figsize=(15, 5))
ax.barh(bar_positions, importances[indices], color="skyblue")
ax.set_yticks(bar_positions)
ax.set_yticklabels(bar_labels)
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Feature")
ax.set_title("Feature Importance - Best Model from Randomized Search")
fig.tight_layout()
plt.show()

In [None]:
# Training split: positive-class (column 1) probabilities, ROC points and AUC.
y_train_proba = best_tree.predict_proba(X_train)[:, 1]
fpr_train, tpr_train, _ = roc_curve(y_train, y_train_proba)
auc_train = roc_auc_score(y_train, y_train_proba)

# Test split: same three quantities.
y_test_proba = best_tree.predict_proba(X_test)[:, 1]
fpr_test, tpr_test, _ = roc_curve(y_test, y_test_proba)
auc_test = roc_auc_score(y_test, y_test_proba)

# Both curves on one axis, with the chance diagonal as reference.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_train, tpr_train, color="blue", label=f"Train ROC Curve (AUC = {auc_train:.2f})")
ax.plot(fpr_test, tpr_test, color="green", label=f"Test ROC Curve (AUC = {auc_test:.2f})")
ax.plot([0, 1], [0, 1], color="red", linestyle="--", label="Random Guess")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve - Train and Test")
ax.legend(loc="lower right")
ax.grid(alpha=0.4)
plt.show()

In [None]:
# Counts of true vs. predicted class on the test labels for the refined model.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Render the matrix as a heat map.
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap="Blues")
plt.show()


In [None]:
# Positive-class (column 1) probabilities of the refined model on X_test.
positive_class_column = 1
y_pred_proba = best_tree.predict_proba(X_test)[:, positive_class_column]

# Precision and recall at every probability threshold.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, label="Precision-Recall Curve")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve")
ax.legend()
plt.show()


In [None]:
# Average precision of the refined model on the test split.
# `average_precision_score` is imported with the other metrics in the
# notebook's top import cell rather than here.
ap_score = average_precision_score(y_test, y_pred_proba)
print(f"Average Precision Score: {ap_score:.2f}")

In [None]:
# Learning curve of the refined model: ten evenly spaced training fractions
# (10% ... 100%), 10-fold CV, accuracy scoring.
train_sizes, train_scores, test_scores = learning_curve(
    best_tree,
    X_train,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=10,
    scoring="accuracy",
)

# Average over the folds (axis 1) to get one point per training-set size.
mean_train_score = train_scores.mean(axis=1)
mean_validation_score = test_scores.mean(axis=1)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(train_sizes, mean_train_score, label="Training Score")
ax.plot(train_sizes, mean_validation_score, label="Validation Score")
ax.set_xlabel("Training Set Size")
ax.set_ylabel("Accuracy")
ax.set_title("Learning Curve")
ax.legend()
plt.show()


- Analyze Feature Importance: Identify the most critical features from the feature importance plot, such as tenure_seconds, shared_device_user_cnt, purchase_hour, and country_encoded. These features are likely to provide meaningful patterns for fraud detection.
- Inspect the Decision Tree: Extract key decision splits and paths from the decision tree. Look for combinations of conditions where features interact, leading to a fraud classification. For instance, tenure_seconds <= 1.5 combined with shared_device_user_cnt > 1.5 suggests that purchases made within seconds of signup, on a device shared by multiple users, are suspicious.
- Formulate Combined Rules:
    - Combine multiple high-risk feature conditions. For example, rules like "short account tenure + shared devices" can signal a potential fraud pattern.
    - Analyze value ranges of critical features, such as specific ranges of purchase_hour that might represent unusual purchase times.
- Incorporate Business Logic:
    - Leverage domain knowledge to enhance the rules. For example, shared devices might indicate account abuse, while transactions occurring during atypical hours could flag suspicious activity.
    - Consider the context behind these rules, such as small test payments or accounts created for short-term misuse.
- Create Generalized Rule Structures:
    - Design rules that combine conditions rather than relying on a single feature. For example, a rule like "short tenure accounts and purchases from high-risk countries" can better capture fraud patterns.
    - Highlight the potential impact of these rules, such as temporarily flagging or freezing transactions for additional verification.
- Ensure Comprehensive Coverage:
    - Include multiple conditions from the decision tree to account for various fraud scenarios rather than focusing on isolated features.
    - Use different branches and paths from the tree to identify diverse patterns and ensure the rules address a broad spectrum of fraud behaviors.

**1. Short Tenure + Shared Device**
- Rule: tenure_seconds <= 1.5 AND shared_device_user_cnt > 1.5
- Reason: New accounts sharing devices with multiple users are likely fraudulent.
- Action: Block the account or require additional identity verification.


In [None]:
# Peek at two random rows (no seed is set, so the rows differ between runs).
df.sample(2)

In [None]:
# List the columns available for writing the rules below.
df.columns

In [None]:
# Rule 1: flag rows (1 = flagged, 0 = not) where the tenure is at most one
# second AND the device is used by two or more users.
is_short_tenure = df["tenure_seconds"] <= 1
is_shared_device = df["shared_device_user_cnt"] >= 2
df["predict_1"] = np.where(is_short_tenure & is_shared_device, 1, 0)


In [None]:
# Confusion counts of rule 1 (`predict_1`) against the true label.
true_positives = len(df[(df["class"] == 1) & (df["predict_1"] == 1)])
false_positives = len(df[(df["class"] == 0) & (df["predict_1"] == 1)])
false_negatives = len(df[(df["class"] == 1) & (df["predict_1"] == 0)])

# Precision = TP / (TP + FP); recall = TP / (TP + FN).
# Either ratio is undefined when its denominator is zero (the rule flags
# nothing, or there are no positive rows); report NaN instead of raising
# ZeroDivisionError.
flagged_total = true_positives + false_positives
positive_total = true_positives + false_negatives
precision = true_positives / flagged_total if flagged_total else float("nan")
recall = true_positives / positive_total if positive_total else float("nan")

# Print the results
print(f"Precision: {precision:.2%}")
print(f"Recall: {recall:.2%}")

**2. Short Tenure + Unusual Purchase Time**
- Rule: tenure_seconds <= 1.5 AND purchase_hour <= 2.5
- Reason: Fraudsters often make purchases at odd hours after creating new accounts.
- Action: Freeze the transaction or request further verification.


In [None]:
# Rule 2: class counts among rows with tenure <= 1 second and purchase hour <= 2.
df.loc[(df["tenure_seconds"] <= 1) & (df["purchase_hour"] <= 2), "class"].value_counts()

In [None]:
# Rule 2: the same selection, shown as class proportions instead of counts.
df.loc[(df["tenure_seconds"] <= 1) & (df["purchase_hour"] <= 2), "class"].value_counts(normalize=True)

**3. High-Risk Country + Shared IP**
- Rule: country_encoded > 0.052 AND shared_ip_user_cnt > 2.0
- Reason: Multiple accounts from the same high-risk IP indicate potential proxy or bot activity.
- Action: Trigger additional verification steps, like SMS or ID checks.

In [None]:
# List the columns of `df` before writing rule 3.
df.columns

In [None]:
# Countries whose training-set target encoding exceeds 0.149.
train_df.loc[train_df["country_encoded"] > 0.149, "country"].unique()

In [None]:
# Rule 3: class counts among rows from the listed countries whose IP address
# is shared by more than two users.
df.loc[
    df["country"].isin(['Denmark', 'Ecuador', 'Chile', 'Armenia', 'Honduras', 'Lithuania',
       'Ireland', 'Sri Lanka', 'Egypt', 'New Zealand', 'Peru', 'Tunisia',
       'Luxembourg', 'Kuwait', 'Senegal', 'Bolivia', 'Namibia', 'Malta',
       'Malawi', 'Uzbekistan', 'Afghanistan', 'Turkmenistan'])
    & (df["shared_ip_user_cnt"] > 2),
    "class",
].value_counts()

**4. Short Tenure + Low Purchase Amount + Odd Purchase Time**
- Rule: tenure_seconds <= 1.5 AND purchase_value <= 10.5 AND purchase_hour <= 2.5
- Reason: Fraudsters often test stolen payment methods with small transactions at odd hours.
- Action: Decline such transactions and investigate the account.

In [None]:
# Rule 4: class counts among rows with tenure <= 1 second, purchase hour <= 2
# and purchase value <= 10.5.
df.loc[
    (df["tenure_seconds"] <= 1)
    & (df["purchase_hour"] <= 2)
    & (df["purchase_value"] <= 10.5),
    "class",
].value_counts()

**5. Unusual Purchase Time + Shared Device + High-Risk Country**
- Rule: purchase_hour <= 2.5 AND shared_device_user_cnt > 1.5 AND country_encoded > 0.052
- Reason: A combination of unusual time, shared device, and risky location indicates fraud.
- Action: Block transactions and escalate for investigation.


In [None]:
# Rule 5: odd purchase hour + device shared by several users + high-risk country.
# `df` itself has no `country_encoded` column, so rebuild the country risk
# from the training-derived `encoding_map`, with `default_value` for
# countries that have no entry in the map.
country_risk = df["country"].map(encoding_map).fillna(default_value)
df[
    (df["purchase_hour"] <= 2)
    & (df["shared_device_user_cnt"] >= 2)
    & (country_risk > 0.052)
]["class"].value_counts()

**6. Short Tenure + Unusual Signup Day + Shared Device**
- Rule: tenure_seconds <= 1.5 AND signup_dow in [1, 3] AND shared_device_user_cnt > 1.5
- Reason: Many fake accounts are registered on specific days using shared devices.
- Action: Restrict account activities and request additional verification.

In [None]:
# Rule 6: flag rows (1 = flagged, 0 = not) where the tenure is at most one
# second, the signup weekday is 1 or 3 (Tuesday / Thursday, with 0 = Monday)
# and the device is used by two or more users.
is_short_tenure = df["tenure_seconds"] <= 1
is_flagged_signup_day = df["signup_dow"].isin([1, 3])
is_shared_device = df["shared_device_user_cnt"] >= 2
df["predict_6"] = np.where(
    is_short_tenure & is_flagged_signup_day & is_shared_device, 1, 0
)


In [None]:
# Confusion counts of rule 6 (`predict_6`) against the true label.
true_positives = len(df[(df["class"] == 1) & (df["predict_6"] == 1)])
false_positives = len(df[(df["class"] == 0) & (df["predict_6"] == 1)])
false_negatives = len(df[(df["class"] == 1) & (df["predict_6"] == 0)])

# Precision = TP / (TP + FP); recall = TP / (TP + FN).
# Either ratio is undefined when its denominator is zero (the rule flags
# nothing, or there are no positive rows); report NaN instead of raising
# ZeroDivisionError.
flagged_total = true_positives + false_positives
positive_total = true_positives + false_negatives
precision = true_positives / flagged_total if flagged_total else float("nan")
recall = true_positives / positive_total if positive_total else float("nan")

# Print the results
print(f"Precision: {precision:.2%}")
print(f"Recall: {recall:.2%}")