In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

from xgboost import XGBClassifier

# 1. Understand the Problem and Data

In [None]:
fraud = pd.read_csv("Fraud_Data.csv")

In [None]:
fraud.info()

In [None]:
fraud.isna().mean()

In [None]:
# One summary line per column: name, number of distinct values, and the first five distinct values.
for column_name in fraud.columns:
    distinct_values = fraud[column_name].unique()
    # str.join only accepts strings, so every previewed value is converted with str() first.
    preview = ", ".join(str(value) for value in distinct_values[:5])
    print(f"{column_name:<30}{len(distinct_values):<30}{preview}")

In [None]:
fraud.columns

In [None]:
# Convert both timestamp columns to datetime so the .dt accessor can be used on them later.
for time_column in ("signup_time", "purchase_time"):
    fraud[time_column] = pd.to_datetime(fraud[time_column])

In [None]:
fraud.describe()

In [None]:
fraud["class"].value_counts()

# 2. Feature Engineering

In [None]:
ipaddress_mapping = pd.read_csv("IpAddress_to_Country.csv")

In [None]:
ipaddress_mapping.sample(3)

In [None]:
ipaddress_mapping.info()

In [None]:
def get_country(ip, mapping):
    """Return the country whose IP range contains ``ip``.

    Parameters
    ----------
    ip : number
        Numeric IP address to look up.
    mapping : pandas.DataFrame
        Table with ``lower_bound_ip_address``, ``upper_bound_ip_address``
        and ``country`` columns; both bounds are inclusive.

    Returns
    -------
    The ``country`` value of the first matching row, or ``np.nan`` when no
    range contains ``ip``.
    """
    above_lower = mapping["lower_bound_ip_address"].le(ip)
    below_upper = mapping["upper_bound_ip_address"].ge(ip)
    countries = mapping.loc[above_lower & below_upper, "country"]
    if countries.empty:
        return np.nan
    return countries.iloc[0]

# Look up the country for every transaction's IP address (one scan of the mapping table per row).
fraud["country"] = fraud["ip_address"].apply(get_country, mapping=ipaddress_mapping)


In [None]:
df = fraud.copy()

In [None]:
df.info()

In [None]:
df[df["country"].isna()]["class"].value_counts()

In [None]:
df.columns

In [None]:
# Elapsed time between signup and purchase
tenure = df["purchase_time"] - df["signup_time"]
df['tenure_days'] = tenure.dt.days
# .dt.seconds only returns the within-day remainder (0-86399) and drops whole days,
# so a purchase made 30 days after signup would look like one made minutes after.
# .dt.total_seconds() returns the full elapsed time (as float).
df['tenure_seconds'] = tenure.dt.total_seconds()

# Signup
df['signup_dow'] = df['signup_time'].dt.dayofweek  # 0 = Monday ... 6 = Sunday; use .dt.day_name() for names
df['signup_hour'] = df['signup_time'].dt.hour
df['signup_week'] = df['signup_time'].dt.isocalendar().week  # ISO 8601 week number (1-53)

# Purchase
df['purchase_dow'] = df['purchase_time'].dt.dayofweek
df['purchase_hour'] = df['purchase_time'].dt.hour
df['purchase_week'] = df['purchase_time'].dt.isocalendar().week


In [None]:
# For each device and each IP address: how many distinct users appear on it, plus a 0/1 flag
# marking keys that are shared by more than one user.
for key_column, prefix in (("device_id", "shared_device"), ("ip_address", "shared_ip")):
    count_column = f"{prefix}_user_cnt"
    df[count_column] = df.groupby(key_column)["user_id"].transform("nunique")
    df[f"{prefix}_flag"] = (df[count_column] > 1).astype("int64")

In [None]:
df.columns

In [None]:
df.country.isna().mean()

In [None]:
# Fraud rate by signup day of week, compared against the overall fraud rate.
data = df.groupby("signup_dow")["class"].mean().reset_index()

# pandas' .dt.dayofweek counts from Monday (0) to Sunday (6).
day_of_week_map = {
    0: "Monday",
    1: "Tuesday",
    2: "Wednesday",
    3: "Thursday",
    4: "Friday",
    5: "Saturday",
    6: "Sunday",
}

data["signup_dow"] = data["signup_dow"].map(day_of_week_map)

fig, ax = plt.subplots(figsize=(20, 5))
sns.barplot(data=data, x="signup_dow", y="class", ax=ax)
# Dashed reference line at the overall fraud rate.
ax.axhline(y=df["class"].mean(), linestyle="--", color="r", linewidth=2)

# Label every bar with its fraud rate as a percentage.
for p in ax.patches:
    percentage = f"{p.get_height()*100:.2f}%"
    ax.text(
        x=p.get_x() + p.get_width() / 2,
        y=p.get_height() + 0.001,
        s=percentage,
        ha="center",
        fontsize=12,
    )

# The plotted values are fractions in [0, 1], so 1.0 (not the default 100) must map to 100%.
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1))
plt.show()

In [None]:
# Fraud rate by purchase day of week, compared against the overall fraud rate.
data = df.groupby("purchase_dow")["class"].mean().reset_index()

# pandas' .dt.dayofweek counts from Monday (0) to Sunday (6).
day_of_week_map = {
    0: "Monday",
    1: "Tuesday",
    2: "Wednesday",
    3: "Thursday",
    4: "Friday",
    5: "Saturday",
    6: "Sunday",
}

data["purchase_dow"] = data["purchase_dow"].map(day_of_week_map)
overall_avg = df["class"].mean()
fig, ax = plt.subplots(figsize=(20, 5))
sns.barplot(data=data, x="purchase_dow", y="class", ax=ax)

# Dashed reference line at the overall fraud rate.
ax.axhline(
    y=overall_avg,
    ls="--",
    lw=2,
    color="r",
    label=f"Average: {overall_avg*100:.2f}%",
)

ax.legend(
    loc="center left",  # Which point of the legend box is pinned to bbox_to_anchor
    bbox_to_anchor=(1.01, 0.5),  # Axes coordinates: just outside the right edge, vertically centred
    borderaxespad=0,  # No extra padding between the legend and the anchor point
    frameon=False,  # Draw the legend without a surrounding frame
)

# The plotted values are fractions in [0, 1], so 1.0 (not the default 100) must map to 100%.
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1))

# Label every bar with its fraud rate as a percentage.
for p in ax.patches:
    percentage = f"{p.get_height()*100:.2f}%"
    ax.text(
        p.get_x() + p.get_width() / 2,
        p.get_height() + 0.001,
        percentage,
        ha="center",
        fontsize=12,
    )

plt.tight_layout()
plt.show()



In [None]:
# Fraud rate per acquisition source, compared against the overall fraud rate.
data = df.groupby("source")["class"].mean().reset_index()

overall_avg = df["class"].mean()
fig, ax = plt.subplots(figsize=(20, 5))
sns.barplot(data=data, x="source", y="class", ax=ax)

# Dashed, semi-transparent reference line at the overall fraud rate.
ax.axhline(
    y=overall_avg,
    ls="--",
    lw=2,
    color="r",
    label=f"Average: {overall_avg*100:.2f}%",
    alpha=0.4,
)

ax.legend(
    loc="center left",  # Which point of the legend box is pinned to bbox_to_anchor
    bbox_to_anchor=(1.01, 0.5),  # Axes coordinates: just outside the right edge, vertically centred
    borderaxespad=0,  # No extra padding between the legend and the anchor point
    frameon=False,  # Draw the legend without a surrounding frame
    # Other legend options worth knowing: shadow, fancybox, framealpha, edgecolor,
    # facecolor, markerscale, handlelength, labelspacing.
)

# The plotted values are fractions in [0, 1], so 1.0 (not the default 100) must map to 100%.
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1))

# Label every bar with its fraud rate as a percentage.
for p in ax.patches:
    percentage = f"{p.get_height()*100:.2f}%"
    ax.text(
        p.get_x() + p.get_width() / 2,
        p.get_height() + 0.001,
        percentage,
        ha="center",
        fontsize=12,
    )

plt.tight_layout()
plt.show()



In [None]:
# Distribution of tenure_seconds as a single horizontal box plot.
fig, ax = plt.subplots(figsize=(20, 4))
sns.boxplot(data=df, x="tenure_seconds", ax=ax)

plt.tight_layout()
plt.show()

## i. Categorical Encoding

In [None]:
# Hold out 30% of the rows; the fixed random_state makes the split reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)

### a. country

**Best Practice for Decision Trees**

- Low Cardinality: Use Label Encoding or One-Hot Encoding.
- High Cardinality: Use Target Encoding or Frequency Encoding.
- Tree Models: Prefer Label Encoding, since tree-based models can split directly on the integer codes.


If there are many unique countries (e.g., >50), **Target Encoding** or **Frequency Encoding** is preferred. **One-Hot Encoding** should be avoided due to the "curse of dimensionality."

**Target Encoding**
- Replace each country with the mean of the target variable (class) for that country.
- This approach can capture the relationship between a country and the likelihood of fraud.

- Benefits
    - Captures how likely a country is associated with the target variable (e.g., fraud).
    - Reduces dimensionality compared to One-Hot Encoding.
- Caution
    - May lead to data leakage if applied on the entire dataset. Use it carefully with proper train-test splitting and cross-validation.

**Step-by-Step Process for Target Encoding with Train-Test Splitting**
1. Split Your Data: First, split your dataset into training and testing sets. Ensure the test set is completely unseen during the encoding process.
2. Compute Encoding on the Training Set: For the `country` column, compute the mean of the target variable (`class`) in the training set. Use this mapping to encode the training and test data.
3. Apply the Encoding: 
    - Replace the country column in both training and test sets using the computed encoding_map.
    - For unseen categories in the test set (i.e., categories that do not exist in the training set), assign a default value (e.g., the overall mean of the target in the training set).

In [None]:
df.country.value_counts()

In [None]:
# Compute the encoding map from training data
encoding_map = train_df.groupby('country')['class'].mean()

In [None]:
default_value = train_df['class'].mean()  # Fallback value: global mean of the target variable in the training set

In [None]:
# Encode country in both splits with the map learned from the training data.
# Missing countries and countries absent from the map fall back to default_value.
for split_df in (train_df, test_df):
    split_df['country_encoded'] = split_df['country'].map(encoding_map).fillna(default_value)

### b. source

**Recommendation for source**

- Use One-Hot Encoding if you’re using linear models or want to avoid any ordinal relationships.
- Use Label Encoding if you’re using tree-based models (like Decision Trees, Random Forest, or XGBoost) for simplicity and efficiency.
- Use Target Encoding if source has a strong correlation with the target variable and you're confident in avoiding data leakage.

In [None]:
train_df.source.value_counts()

In [None]:
# Label Encoding
le = LabelEncoder()
train_df['source_encoded'] = le.fit_transform(train_df['source'])

# Apply the same encoding to the test set
test_df['source_encoded'] = le.transform(test_df['source'])

### c. browser

**Key Considerations**
- browser has moderate cardinality (e.g., Chrome, Safari, Firefox, etc.).
- Encoding options depend on the number of unique categories and how you want to handle the relationships between them.

In [None]:
# Label Encoding
le = LabelEncoder()
train_df['browser_encoded'] = le.fit_transform(train_df['browser'])

# Apply the same encoding to the test set
test_df['browser_encoded'] = le.transform(test_df['browser'])

### d. sex

**Key Considerations**
- sex typically has low cardinality (e.g., Male, Female, Other).
- Encoding is straightforward since there are usually only two or three unique values.

In [None]:
# Label Encoding
le = LabelEncoder()
train_df['sex_encoded'] = le.fit_transform(train_df['sex'])

# Apply the same encoding to the test set
test_df['sex_encoded'] = le.transform(test_df['sex'])

In [None]:
train_df.sample(3)

# 3. Split Data for Training

In [None]:
# Define features and target
feature = [
    "signup_dow",
    "signup_week",
    "signup_hour",
    "purchase_dow",
    "purchase_week",
    "purchase_hour",
    "purchase_value",
    "source_encoded",
    "browser_encoded",
    "sex_encoded",
    "age",
    "country_encoded",
    "tenure_seconds",
    "shared_device_user_cnt",
    "shared_ip_user_cnt",
]
target = "class"


In [None]:
# Feature matrix and target, both taken from the encoded training frame
X, y = train_df[feature], train_df[target]

# Carve a 30% evaluation subset out of train_df (test_df is not used in this split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# 4. Train Predictive Models

## i. Decision Tree

***General Guidelines***

`max_depth` (Limits the depth of the tree)
- Prevents overfitting by controlling the tree's complexity.
- Rule of thumb:
    - For small datasets (<10,000 samples): Use **max_depth=3-10**.
    - For large datasets (>10,000 samples): Experiment with larger values.
- Proportional approach:
    - Set based on the number of features (**sqrt(n_features)** for classification problems).

In [None]:
df.shape

`min_samples_split` (Minimum samples required to split a node)
- Ensures a split only occurs if enough samples are present, reducing overfitting.
- Rule of thumb:
    - Set min_samples_split to 2-5% of the dataset size (**int(0.02 * n_samples)**).
    - For imbalanced datasets, **adjust according to the minority class size**.

`min_samples_leaf` (Minimum samples per leaf node)
- Ensures that leaf nodes have enough data to make meaningful predictions.
- Rule of thumb:
    - Use 1-10% of the dataset size (`int(0.01 * n_samples)`).
    - For imbalanced datasets, ensure leaf nodes contain enough minority samples.

***For Imbalanced Datasets (0/1 Classes)***

When you have an imbalanced dataset (e.g., class 0: 90%, class 1: 10%), ensure that the minority class (class 1) is well-represented in the splits.

In [None]:
df["class"].value_counts(normalize=True)

**max_depth**
- Prevent deep trees that may overfit the majority class.
- Start with smaller depths, such as max_depth=5, and gradually increase while monitoring performance.

In [None]:
max_depth = 5 

**min_samples_split**
- Set to ensure splits occur only if both classes are represented

In [None]:
# Count the fraud (class 1) rows in the data the tree is actually fitted on.
# Counting on the full `df` would scale min_samples_split / min_samples_leaf to a
# population larger than the one passed to fit().
n_minority_samples = int((y_train == 1).sum())

In [None]:
min_samples_split = max(2, int(0.05 * n_minority_samples))

**min_samples_leaf**
- Ensure leaf nodes contain meaningful samples for both classes

In [None]:
min_samples_leaf = max(1, int(0.01 * n_minority_samples))

In [None]:
# Hyperparameters chosen in the cells above, gathered in one place.
tree_params = {
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "random_state": 42,
}
tree = DecisionTreeClassifier(**tree_params)
tree.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = tree.predict(X_test)


In [None]:
accuracy_score(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred))

#### a. Check for Overfitting

**1. Evaluate Training and Test Accuracy**
- If the training accuracy is high, but the test accuracy is much lower, the model is likely overfitting.

**Overfitting Sign**
- High training accuracy (e.g., 95% or higher).
- Significantly lower test accuracy (e.g., below 70%).

In [None]:
# Predict on the rows the tree was fitted on and on the held-out rows
y_train_pred, y_test_pred = tree.predict(X_train), tree.predict(X_test)

# A large gap between the two accuracies indicates overfitting
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

for split_name, split_acc in (("Training", train_acc), ("Test", test_acc)):
    print(f"{split_name} Accuracy: {split_acc:.2f}")

**2. Cross-Validation**
- Cross-validation evaluates the model's performance on multiple subsets of the data to check for consistency and reduce overfitting.

**Overfitting Sign**
- Large variability in cross-validation scores.
- Mean cross-validation score significantly lower than training accuracy.

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the tree on the training rows (default scoring of the estimator)
cv_scores = cross_val_score(estimator=tree, X=X_train, y=y_train, cv=5)

print(f"Cross-Validation Scores: {cv_scores}")
mean_cv_score = cv_scores.mean()
print(f"Mean Cross-Validation Score: {mean_cv_score:.2f}")


**3. Analyze Tree Complexity**

- Complex trees with deep depths or many nodes are prone to overfitting because they can memorize the training data.

**Overfitting Sign**
- Very deep trees (e.g., depth > 10 for small datasets).
- Excessive number of nodes compared to the size of the training data.

In [None]:
# Size of the fitted tree: depth reached and total number of nodes
fitted_structure = tree.tree_
tree_depth = fitted_structure.max_depth
num_nodes = fitted_structure.node_count

print(f"Tree Depth: {tree_depth}")
print(f"Number of Nodes: {num_nodes}")


#### b. Avoid Overfitting

**1. Prune the Tree**
- Control the maximum depth, minimum samples per split, or minimum samples per leaf during training.

**2. Use Cross-Validation**
- Cross-validation ensures the model generalizes well to unseen data and reduces overfitting.

**3. Use Regularization Parameters**
- ccp_alpha: (Cost-Complexity Pruning)
    - Prunes branches that add little predictive power.
    - Larger ccp_alpha values result in smaller trees.

**4. Use Ensemble Models**
- Random Forests or Gradient Boosted Trees reduce overfitting by averaging predictions or using regularization.

In [None]:
from sklearn.tree import export_graphviz
import graphviz


# Serialise the fitted Decision Tree (`tree`) to DOT source so Graphviz can draw it
dot_data = export_graphviz(
    tree,  # The fitted DecisionTreeClassifier
    out_file=None,  # Return the DOT source as a string instead of writing a file
    feature_names=X_train.columns,  # Column names of the training features
    class_names=["Class 0", "Class 1"],  # Display names for the two target classes
    filled=True,  # Colour the nodes
    rounded=True,  # Draw node boxes with rounded corners
    special_characters=True  # Do not ignore special characters
)

# Wrap the DOT source in a graphviz object that the notebook can render
graph = graphviz.Source(dot_data)

display(graph)
# Alternatives to inline display:
# graph.render("decision_tree")  # Saves as 'decision_tree.pdf'
# graph.view()  # Opens the rendered file in the default viewer


#### c. Evaluate Performance

In [None]:
from sklearn.model_selection import learning_curve

# Accuracy of the tree for increasing training-set sizes, 5-fold cross-validated
train_sizes, train_scores, test_scores = learning_curve(
    estimator=tree, X=X_train, y=y_train, cv=5, scoring="accuracy"
)

# Average the per-fold scores for every training-set size
mean_train_scores = train_scores.mean(axis=1)
mean_validation_scores = test_scores.mean(axis=1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_sizes, mean_train_scores, label="Training Score")
ax.plot(train_sizes, mean_validation_scores, label="Validation Score")
ax.set_xlabel("Training Set Size")
ax.set_ylabel("Score")
ax.set_title("Learning Curve")
ax.legend()
plt.show()


In [None]:
importance = tree.feature_importances_
feature_names = X.columns

In [None]:
# Order the features from least to most important so the largest bar ends up on top
sorted_idx = np.argsort(importance)
sorted_importance = importance[sorted_idx]
sorted_feature_names = feature_names[sorted_idx]

# Wide figure with one horizontal bar per feature
fig, ax = plt.subplots(figsize=(20, 5))
ax.barh(sorted_feature_names, sorted_importance, color="skyblue")

# Labels and title
ax.set_xlabel("Feature Importance", fontsize=12)
ax.set_ylabel("Feature", fontsize=12)
ax.set_title("Fraud Model Feature Importance", fontsize=14, fontweight="bold")

# Vertical gridlines make the bar lengths easier to compare
ax.grid(axis="x", linestyle="--", alpha=0.7)

# Hide the top and right spines for a cleaner look
for side in ("top", "right"):
    ax.spines[side].set_visible(False)

ax.tick_params(axis="both", which="major", labelsize=10)

fig.tight_layout()
plt.show()

In [None]:
min_samples_leaf

In [None]:
min_samples_split

#### d. Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each Decision Tree hyperparameter
param_grid = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 5, 10],
    criterion=["gini", "entropy"],
)

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy
grid_search = GridSearchCV(estimator=tree, param_grid=param_grid, cv=5, scoring="accuracy")
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)


In [None]:
from sklearn.metrics import accuracy_score

# Best estimator found by the grid search
best_tree = grid_search.best_estimator_

# Score its predictions on the held-out rows
y_pred = best_tree.predict(X_test)
best_tree_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", best_tree_accuracy)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
# Size of the tuned tree: depth reached and total number of nodes
tuned_structure = best_tree.tree_
best_tree_depth = tuned_structure.max_depth
num_nodes = tuned_structure.node_count

print(f"best_tree Depth: {best_tree_depth}")
print(f"Number of Nodes: {num_nodes}")

In [None]:
# Importances of the tuned tree, paired with the feature column names
importance = best_tree.feature_importances_
feature_names = X.columns

# Order the features from least to most important so the largest bar ends up on top
sorted_idx = np.argsort(importance)
sorted_importance = importance[sorted_idx]
sorted_feature_names = feature_names[sorted_idx]

# Wide figure with one horizontal bar per feature
fig, ax = plt.subplots(figsize=(20, 5))
ax.barh(sorted_feature_names, sorted_importance, color="skyblue")

# Labels and title
ax.set_xlabel("Feature Importance", fontsize=12)
ax.set_ylabel("Feature", fontsize=12)
ax.set_title("Fraud Model Feature Importance", fontsize=14, fontweight="bold")

# Vertical gridlines make the bar lengths easier to compare
ax.grid(axis="x", linestyle="--", alpha=0.7)

# Hide the top and right spines for a cleaner look
for side in ("top", "right"):
    ax.spines[side].set_visible(False)

ax.tick_params(axis="both", which="major", labelsize=10)

fig.tight_layout()
plt.show()

In [None]:
# Serialise the tuned tree (`best_tree`) to DOT source so Graphviz can draw it.
# Relies on `export_graphviz` and `graphviz` imported in an earlier cell.
dot_data = export_graphviz(
    best_tree,  # The best estimator returned by the grid search
    out_file=None,  # Return the DOT source as a string instead of writing a file
    feature_names=X_train.columns,  # Column names of the training features
    class_names=["Class 0", "Class 1"],  # Display names for the two target classes
    filled=True,  # Colour the nodes
    rounded=True,  # Draw node boxes with rounded corners
    special_characters=True  # Do not ignore special characters
)

# Wrap the DOT source in a graphviz object that the notebook can render
graph = graphviz.Source(dot_data)

display(graph)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Predicted probability of the positive class (column 1) for both splits
y_train_proba = best_tree.predict_proba(X_train)[:, 1]
y_test_proba = best_tree.predict_proba(X_test)[:, 1]

# ROC curve points (the thresholds returned by roc_curve are not needed)
fpr_train, tpr_train = roc_curve(y_train, y_train_proba)[:2]
fpr_test, tpr_test = roc_curve(y_test, y_test_proba)[:2]

# Area under each curve
auc_train = roc_auc_score(y_train, y_train_proba)
auc_test = roc_auc_score(y_test, y_test_proba)

In [None]:

# Train and test ROC curves on one axes, with the chance diagonal for reference
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_train, tpr_train, label=f"Train ROC Curve (AUC = {auc_train:.2f})", color="blue")
ax.plot(fpr_test, tpr_test, label=f"Test ROC Curve (AUC = {auc_test:.2f})", color="green")
ax.plot([0, 1], [0, 1], color="red", linestyle="--", label="Random Guess")  # Diagonal line
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve - Train and Test")
ax.legend(loc="lower right")
ax.grid(alpha=0.4)
plt.show()


- The closer the ROC curve is to the top-left corner, the better the model.
- AUC (Area Under Curve):
    - Ranges from 0 to 1.
    - 0.5: Random guessing.
    - 1.0: Perfect classifier.
- Diagonal Line:
    - Represents random guessing (baseline).


**Interpreting the Results**
- If the train AUC is much higher than the test AUC:
    - The model may be overfitting to the training data.
- If both curves are similar:
    - The model generalizes well to unseen data.

## ii. Random Forest

In [None]:
# Fit a Random Forest with default hyperparameters and a fixed seed
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Score the hard predictions on the held-out rows
y_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", rf_accuracy)
print(classification_report(y_test, y_pred))

In [None]:
importance = rf_model.feature_importances_
feature_names = X.columns

In [None]:
# Order the features from least to most important so the largest bar ends up on top
sorted_idx = np.argsort(importance)
sorted_importance = importance[sorted_idx]
sorted_feature_names = feature_names[sorted_idx]

# Wide figure with one horizontal bar per feature
fig, ax = plt.subplots(figsize=(20, 5))
ax.barh(sorted_feature_names, sorted_importance, color="skyblue")

# Labels and title
ax.set_xlabel("Feature Importance", fontsize=12)
ax.set_ylabel("Feature", fontsize=12)
ax.set_title("Fraud Model Feature Importance", fontsize=14, fontweight="bold")

# Vertical gridlines make the bar lengths easier to compare
ax.grid(axis="x", linestyle="--", alpha=0.7)

# Hide the top and right spines for a cleaner look
for side in ("top", "right"):
    ax.spines[side].set_visible(False)

ax.tick_params(axis="both", which="major", labelsize=10)

fig.tight_layout()
plt.show()

In [None]:
# Inspect the first tree of the Random Forest fitted above.
# `rf_model` is used because `rf` is only created in a later cell (NameError on a fresh
# kernel), and the estimator gets its own name so the Decision Tree stored in `tree`
# is not overwritten.
first_rf_tree = rf_model.estimators_[0]

# Check the depth of the tree
depth = first_rf_tree.tree_.max_depth
print(f"Depth of the tree: {depth}")

# Check the number of nodes in the tree
num_nodes = first_rf_tree.tree_.node_count
print(f"Number of nodes in the tree: {num_nodes}")

In [None]:
from sklearn.tree import plot_tree


In [None]:
# Train a small Random Forest (10 trees) used only for visualisation
rf = RandomForestClassifier(n_estimators=10, random_state=42)
rf.fit(X_train, y_train)

# Visualize one tree from the forest
plt.figure(figsize=(16, 10))
plot_tree(
    rf.estimators_[0],  # Extract the first tree
    # A plain list is accepted by every scikit-learn release; a pandas Index may be
    # rejected by releases whose parameter validation requires a list.
    feature_names=list(X_train.columns),
    class_names=["Class 0", "Class 1"],  # Display names for the two target classes
    filled=True,
    rounded=True,
    fontsize=10,
)
plt.title("Decision Tree from Random Forest", fontsize=14)
plt.show()

## iii. XGBoost

In [None]:
# Fit an XGBoost classifier with default hyperparameters and a fixed seed
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)

# Score the hard predictions on the held-out rows
y_pred = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, y_pred)
xgb_report = classification_report(y_test, y_pred)

print("Accuracy:", xgb_accuracy)
print("\nClassification Report:\n", xgb_report)

# 5. Model Evaluation

In [None]:
# ROC-AUC of the Random Forest on the held-out rows, from its class-1 probabilities
class_probabilities = rf_model.predict_proba(X_test)
y_prob = class_probabilities[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print("ROC-AUC Score:", roc_auc)

# 6. Fine-Tune the Model

In [None]:
# from sklearn.model_selection import GridSearchCV

# # Define the parameter grid
# param_grid = {
#     'n_estimators': [100, 200],
#     'max_depth': [10, 20, None],
#     'min_samples_split': [2, 5],
#     'min_samples_leaf': [1, 2]
# }

# # Grid search
# grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='roc_auc')
# grid_search.fit(X_train, y_train)

# # Best parameters
# print("Best parameters:", grid_search.best_params_)
