In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_curve, roc_curve, auc, accuracy_score, classification_report, confusion_matrix, average_precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, label_binarize
from sklearn.decomposition import PCA
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay
import pickle

In [None]:
# Read train_data_ads.csv and split its columns into numeric and categorical groups by dtype.
df = pd.read_csv('train_data_ads.csv')
numerical_features = df.select_dtypes(include=[int, float]).columns
categorical_features = df.select_dtypes(include='object').columns

In [None]:
# Cap the cardinality of categorical columns so a later one-hot encoding stays small
def limit_onehot(df, columns, top_n=10):
    """Collapse infrequent categories of the given columns into 'Other'.

    For each column, the ``top_n`` most frequent values are kept and every
    other value is replaced with the string 'Other'. Missing values are not
    counted by ``value_counts`` and are therefore also replaced with 'Other'.

    The input frame is left untouched; the work is done on a copy so callers
    that keep a reference to the original data do not see it change.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to limit.
    columns : iterable of str
        Names of the columns whose categories should be capped.
    top_n : int, default 10
        Number of most frequent categories to keep per column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the listed columns capped; other columns are unchanged.
    """
    limited = df.copy()
    for col in columns:
        values = limited[col]
        top_categories = values.value_counts().nlargest(top_n).index
        limited[col] = values.where(values.isin(top_categories), other='Other')
    return limited

# Keep the 10 most frequent levels of every categorical column; all other values become 'Other'.
# NOTE: `df` is rebound here, so re-running this cell applies the cap again to already-capped data.
df = limit_onehot(df, columns=categorical_features, top_n=10)

In [None]:
# Column-wise preprocessing: standardize the numeric columns and one-hot encode the
# categorical ones (first level of each feature dropped, dense output).
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = (
    'cat',
    OneHotEncoder(drop='first', sparse_output=False),
    categorical_features,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [None]:
# Run PCA: preprocess the features, then project them onto the first two principal components.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # Fixed seed: on large inputs PCA's default 'auto' solver may choose the
    # randomized SVD, which would otherwise make the result vary between runs.
    ('pca', PCA(n_components=2, random_state=42))
])

# NOTE: despite its name, `pca` holds the projected data (one column per
# component), not the fitted estimator; that lives in pipeline.named_steps['pca'].
pca = pipeline.fit_transform(df)

In [None]:
# Report the fraction of variance explained by each of the two retained components.
pca_step = pipeline.named_steps['pca']
explained_variance_ratio = pca_step.explained_variance_ratio_
print(f'Explained variance by component: {explained_variance_ratio}')

In [None]:
# Visualize PCA: scatter the samples in PC1/PC2 space, colored by gender code.
colors = {'(2)': 'blue', '(3)': 'yellow', '(4)': 'red'}
gender_colors = df['gender'].map({2: 'blue', 3: 'yellow', 4: 'red'})

fig, ax = plt.subplots(figsize=(8, 6))

# Proxy handles (one marker per color) so the legend swatches are opaque,
# while the scatter points themselves are drawn with alpha=0.05.
legend_labels = [
    plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=swatch, markersize=10)
    for swatch in colors.values()
]
ax.legend(legend_labels, colors.keys(), title="Gender")
ax.scatter(pca[:, 0], pca[:, 1], alpha=0.05, c=gender_colors)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA Scatterplot of First Two Components')
ax.grid(True)

# Save the figure to disk, then close it.
fig.savefig('ads_pca.pdf')
plt.close(fig)

In [None]:
# Pearson correlation of each principal component with the gender column.
# Stacking the two component rows with gender gives a 3x3 matrix; its last
# column holds each row's correlation with gender.
# NOTE(review): gender looks like a category code (2/3/4) and, if numeric, may itself
# be among the PCA inputs - confirm this correlation is meaningful.
corr_matrix = np.corrcoef(pca.T, df['gender'])
correlations = corr_matrix[:2, -1]
pc1_corr, pc2_corr = correlations
print("Correlation of PC1 with gender:", pc1_corr)
print("Correlation of PC2 with gender:", pc2_corr)