In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_curve, roc_curve, auc, accuracy_score, classification_report, confusion_matrix, average_precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay
import pickle

In [2]:
# Load in Data
# Read the ads training CSV from the working directory into `df`.
df = pd.read_csv(filepath_or_buffer='train_data_ads.csv')

In [3]:
# Split data for training
# Keep only the float/int columns of `df`; the original note says the random
# forest fit best on numeric features in a prior analysis.
df_numeric = df.select_dtypes(include=[float, int])

# `gender` is the target; every other numeric column is a predictor.
X_all = df_numeric.drop('gender', axis=1)
y_all = df_numeric['gender']

# Hold out 30% of the rows for testing; the seed is fixed so the partition is
# the same on every run.
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(
    X_all,
    y_all,
    test_size=0.3,
    random_state=13,
)

In [None]:
# Train random forest on all data
# Baseline model: default hyperparameters with a fixed seed, fitted on every
# numeric predictor in the training split.
rf_all = RandomForestClassifier(random_state=13)
rf_all.fit(X=X_train_all, y=y_train_all)

In [None]:
# Evaluate the model
# Predict on the held-out split, then compute overall accuracy and the
# classification report for those predictions.
y_pred_all = rf_all.predict(X_test_all)

class_report_all = classification_report(y_true=y_test_all, y_pred=y_pred_all)
accuracy_all = accuracy_score(y_true=y_test_all, y_pred=y_pred_all)

print("Accuracy:", accuracy_all)
print("Classification Report:\n", class_report_all)

In [None]:
# Identify the most important features in the dataset for the random forest
feature_names = X_all.columns
importances = rf_all.feature_importances_

# Pair each predictor column with its importance score and rank the pairs
# from most to least important.
feature_importances = pd.DataFrame(
    {'feature': feature_names, 'importance': importances}
).sort_values(by='importance', ascending=False)

# Show the ten highest-ranked features.
top_features = feature_importances.iloc[:10]
print(top_features)

# Select the top 5 features for subsequent training
N = 5
selected_features = top_features['feature'].iloc[:N].tolist()

In [None]:
# Change the input and test data to correspond to the selected features
# The five predictors are pinned by name. `selected_features` is the same list
# with the target column `gender` appended, derived here so the two lists
# cannot drift apart.
selected_features_x = ['emui_dev', 'series_group', 'residence', 'age', 'city']
selected_features = selected_features_x + ['gender']

# The pinned list replaces the top-N list computed from `feature_importances`.
# Warn (rather than fail) when the two disagree, so that a changed dataset or
# library version cannot silently leave the model training on stale features.
ranked_top = feature_importances['feature'].head(len(selected_features_x)).tolist()
if set(ranked_top) != set(selected_features_x):
    import warnings

    warnings.warn(
        f"Pinned features {selected_features_x} differ from the current "
        f"top-{len(selected_features_x)} importance ranking {ranked_top}."
    )

# Restrict both splits to the pinned predictor columns.
X_train = X_train_all[selected_features_x]
X_test = X_test_all[selected_features_x]

In [None]:
# Fit the new model and find its accuracy and classification report
# Same seeded forest configuration as the baseline, fitted on the reduced
# feature matrix.
rf_classifier = RandomForestClassifier(random_state=13)
rf_classifier.fit(X=X_train, y=y_train_all)

# Predict on the reduced test matrix and score against the held-out labels.
y_pred = rf_classifier.predict(X_test)
class_report = classification_report(y_true=y_test_all, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test_all, y_pred=y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", class_report)

In [None]:
# Binarize the classes for evaluating Precision-Recall Curve and ROC
# Each label vector becomes an indicator matrix with one column per class,
# in the order 2, 3, 4.
y_train_bin, y_test_bin = (
    label_binarize(labels, classes=[2, 3, 4])
    for labels in (y_train_all, y_test_all)
)

# One-vs-rest wrapper around a seeded random forest, fitted on the indicator
# matrix so that a score is produced for each class column.
rf_for_plot = OneVsRestClassifier(estimator=RandomForestClassifier(random_state=13))
rf_for_plot.fit(X=X_train, y=y_train_bin)

In [None]:
# Precision-Recall Plot
# One-vs-rest scores for the held-out rows; column i of `y_pred_proba` pairs
# with column i of `y_test_bin` (classes 2, 3, 4 in that order).
y_pred_proba = rf_for_plot.predict_proba(X_test)
colors = ['blue', 'gold', 'red']

fig, ax = plt.subplots(figsize=(12, 8))
for i, color in enumerate(colors):
    precision, recall, _ = precision_recall_curve(y_test_bin[:, i], y_pred_proba[:, i])
    # Report average precision in the legend so each curve carries a summary
    # metric, the same way the ROC plot reports AUC.
    avg_precision = average_precision_score(y_test_bin[:, i], y_pred_proba[:, i])
    pr_display = PrecisionRecallDisplay(
        precision=precision,
        recall=recall,
        average_precision=avg_precision,
    )
    # Column index i corresponds to class label i + 2.
    pr_display.plot(ax=ax, name=f'Gender {i + 2}', color=color)
ax.set_title("Multi-Class Precision-Recall Curve")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.legend()
ax.grid(True)

# Save
fig.savefig('precision_recall_curve.pdf')
# Close this specific figure so it is released after being written to disk.
plt.close(fig)

In [None]:
# ROC Plot
# Draw one one-vs-rest ROC curve per class column on a shared axes, reusing
# `colors`, `y_test_bin` and `y_pred_proba` from the earlier cells.
roc_fig, roc_ax = plt.subplots(figsize=(12, 8))
for i, color in zip(range(3), colors):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_pred_proba[:, i])
    roc_auc = auc(fpr, tpr)
    roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc)
    # Column index i corresponds to class label i + 2.
    roc_display.plot(ax=roc_ax, name=f'Gender {i + 2}', color=color)
roc_ax.set_title("Multi-Class ROC Curve")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.legend()
roc_ax.grid(True)

# Save
roc_fig.savefig('roc_curve.pdf')
plt.close(roc_fig)