In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

%matplotlib inline


In [None]:
# Load the raw ranking export, parse its timestamp column, and take a first look.
FILE_PATH = "Rank.csv"  # adjust if needed

df = pd.read_csv(FILE_PATH)

# errors="coerce" converts any value that cannot be parsed into NaT instead of raising,
# so report how many rows are affected rather than letting them pass unnoticed.
df["UpdateDT"] = pd.to_datetime(df["UpdateDT"], errors="coerce")
n_missing_dates = int(df["UpdateDT"].isna().sum())
if n_missing_dates:
    print(f"Warning: {n_missing_dates} row(s) have a missing or unparseable UpdateDT (stored as NaT).")

# Order rows chronologically within each subject; later cells rely on this ordering
# when they take row-to-row differences per SubjectID.
df = df.sort_values(["SubjectID", "UpdateDT"])

print("Data shape:", df.shape)
display(df.head())

print("\nInfo:")
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line to the output).
df.info()

print("\nDescribe:")
display(df.describe(include="all"))

# Histogram (30 bins) with a KDE overlay of the Rank column.
sns.histplot(data=df, x="Rank", kde=True, bins=30)
plt.title("Distribution of Rank (Continuous Score)")
plt.show()


In [None]:
# Driver columns are recognised purely by their "Driver" name prefix.
is_driver_col = df.columns.str.startswith("Driver")
driver_cols = df.columns[is_driver_col].tolist()

# One-hot encode those nominal columns; drop_first=True omits one level per column
# so the dummies are not perfectly collinear (the "dummy variable trap").
df_encoded = pd.get_dummies(df, columns=driver_cols, drop_first=True)

for label, frame in (("before", df), ("after", df_encoded)):
    print(f"Shape {label} encoding:", frame.shape)


In [None]:
# Optional time-derived features; the whole step is skipped when there is no UpdateDT column.

if "UpdateDT" in df_encoded.columns:
    update_times = df_encoded["UpdateDT"]

    # Whole days elapsed since the earliest timestamp in the data.
    earliest_date = update_times.min()
    df_encoded["DaysFromEarliest"] = update_times.sub(earliest_date).dt.days

    # Whole days since the preceding row of the same SubjectID. A subject's first
    # row has no predecessor (diff is missing there), so it is filled with 0.
    per_subject_gap = update_times.groupby(df_encoded["SubjectID"]).diff()
    df_encoded["TimeDiff"] = per_subject_gap.dt.days.fillna(0)


In [None]:
# Build the clustering feature matrix: every encoded column except identifiers/timestamps.

# Toggle for the target column:
#   True  -> "Rank" takes part in the clustering (Option A)
#   False -> "Rank" is held out so clusters can be compared on it afterwards (Option B)
include_rank = True

# SubjectID and the raw UpdateDT are never clustered on; "Rank" joins them for Option B.
exclude_cols = (
    ["SubjectID", "UpdateDT"]
    if include_rank
    else ["SubjectID", "UpdateDT", "Rank"]
)

# drop() returns a new frame, so X is independent of df_encoded;
# errors="ignore" tolerates excluded names that are not present.
X = df_encoded.drop(columns=exclude_cols, errors="ignore")
feature_cols = X.columns.tolist()

# Report only the features that actually contain missing values.
missing = X.isna().sum()
print("\nMissing values in features:")
print(missing.loc[missing.gt(0)])

# Simple imputation: remaining gaps become 0 (assumes missingness is minimal).
X = X.fillna(0)

# Standardise every feature column.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

print("Feature matrix shape:", X_scaled.shape)


In [None]:

# Sweep candidate cluster counts and collect two model-selection diagnostics per k:
#   * inertia   -> plotted for the "elbow" heuristic
#   * silhouette score of the resulting labelling
k_values = range(2, 8)

inertias = []
sils = []
for k in k_values:
    # n_init is pinned explicitly. Without it the number of restarts depends on the
    # installed scikit-learn version (default was 10, changed to "auto" in 1.4, with a
    # FutureWarning on 1.2-1.3), which makes the curves below differ between environments.
    kmeans_temp = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans_temp.fit(X_scaled)
    inertias.append(kmeans_temp.inertia_)
    sils.append(silhouette_score(X_scaled, kmeans_temp.labels_))

# Side-by-side panels: inertia on the left, silhouette on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
ax1.plot(k_values, inertias, marker='o')
ax1.set_title("Elbow Method (Inertia)")
ax1.set_xlabel("k")
ax1.set_ylabel("Inertia")

ax2.plot(k_values, sils, marker='o')
ax2.set_title("Silhouette Scores")
ax2.set_xlabel("k")
ax2.set_ylabel("Score")

plt.show()


In [None]:
# Fit the final K-Means model and summarise the resulting clusters.
best_k = 4  # pick from the elbow/silhouette results

# n_init is pinned explicitly: the library default changed from 10 to "auto" in
# scikit-learn 1.4 (FutureWarning on 1.2-1.3), so leaving it unset would make the
# cluster assignment depend on the installed version.
kmeans = KMeans(n_clusters=best_k, n_init=10, random_state=42)
kmeans.fit(X_scaled)
cluster_labels = kmeans.labels_

print("Cluster label counts:")
print(pd.Series(cluster_labels).value_counts())

# Attach labels back to a copy of df_encoded for interpretation. cluster_labels is a
# plain array, so the assignment is positional (row i of X_scaled -> row i of the copy).
clustered_df = df_encoded.copy()
clustered_df["Cluster"] = cluster_labels

print("\nMean of 'Rank' by cluster (if included in features):")
if "Rank" in clustered_df.columns:
    print(clustered_df.groupby("Cluster")["Rank"].mean())

# Per-cluster mean of every column whose name starts with "Driver"
# (after encoding these are the dummy indicator columns).
print("\nSample driver factor frequencies by cluster:")
driver_dummy_cols = [c for c in clustered_df.columns if c.startswith("Driver")]
means_by_cluster = clustered_df.groupby("Cluster")[driver_dummy_cols].mean()
display(means_by_cluster)


In [None]:
# Project the scaled features onto their first two principal components so the
# cluster assignment can be inspected on a 2-D scatter plot.
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(X_scaled)

# Store each component next to the cluster labels.
for component_idx, column_name in enumerate(("pca_1", "pca_2")):
    clustered_df[column_name] = pca_result[:, component_idx]

scatter_options = dict(
    x="pca_1",
    y="pca_2",
    hue="Cluster",   # colour points by cluster label
    palette="Set1",
    alpha=0.7,
)

plt.figure(figsize=(8, 6))
sns.scatterplot(data=clustered_df, **scatter_options)
plt.title("K-Means Clusters (PCA=2 components)")
plt.show()


In [None]:

"""
In this section, we demonstrate how to train a predictive model (e.g., RandomForestRegressor)
for the continuous 'Rank' score. We’ll do a simple example with a time-based split (older data
as train, newer data as test). If you prefer a random split, you can just import train_test_split
from sklearn and ignore the timestamps.

Steps:
1. Create a new DataFrame from df_encoded.
2. Decide on a time-based cutoff (e.g., 80th percentile).
3. Separate features vs. target (Rank).
4. Train a RandomForestRegressor.
5. Evaluate with MSE, MAE, Spearman correlation.
6. Show how to do predictions.
"""

import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import spearmanr

# 1) Copy df_encoded and keep only rows that can take part in a time-based split.
#    - A missing UpdateDT (NaT) compares False against the cutoff in both directions,
#      so such rows would otherwise vanish from train AND test without any notice.
#    - A missing Rank cannot be used as a regression target.
model_df = df_encoded.copy()
usable_rows = model_df["UpdateDT"].notna() & model_df["Rank"].notna()
n_unusable = int((~usable_rows).sum())
if n_unusable:
    print(f"Dropping {n_unusable} row(s) with missing UpdateDT or Rank before modelling.")
model_df = model_df[usable_rows]

# 2) Time-based train/test split (80% oldest => train, 20% newest => test).
cutoff_date = model_df["UpdateDT"].quantile(0.80)  # or another approach
train_df = model_df[model_df["UpdateDT"] <= cutoff_date]
test_df  = model_df[model_df["UpdateDT"] >  cutoff_date]

print(f"Training set size: {len(train_df)}")
print(f"Test set size    : {len(test_df)}")

# If many rows share the latest timestamp, the 80th percentile can equal the maximum
# and leave nothing after the cutoff; fail here with a clear message instead of
# letting the scaler/metrics raise an obscure error further down.
if train_df.empty or test_df.empty:
    raise ValueError(
        "Time-based split produced an empty train or test set "
        f"(cutoff={cutoff_date}); choose a different cutoff or use a random split."
    )

# 3) Define features & target
# Exclude ID, date, and 'Rank' (since 'Rank' is the target we want to predict).
exclude_cols2 = ["SubjectID", "UpdateDT", "Rank"]
feature_cols2 = [col for col in model_df.columns if col not in exclude_cols2]

X_train = train_df[feature_cols2]
y_train = train_df["Rank"]
X_test  = test_df[feature_cols2]
y_test  = test_df["Rank"]

# 4) (Optional) Scale features for the model.
# The scaler is fitted on the training rows only and then applied unchanged to the
# test rows, so no test-set statistics leak into training.
scaler2 = StandardScaler()
X_train_scaled = scaler2.fit_transform(X_train)
X_test_scaled  = scaler2.transform(X_test)

# 5) Initialize and train a RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)

# 6) Predictions & evaluation on the held-out (newest) rows
y_pred = rf.predict(X_test_scaled)

# Mean Squared Error, Mean Absolute Error, Spearman correlation
mse_val = mean_squared_error(y_test, y_pred)
mae_val = mean_absolute_error(y_test, y_pred)
spearman_corr, _ = spearmanr(y_test, y_pred)

print("\n--- RandomForestRegressor Evaluation (Time-based Split) ---")
print(f"Test MSE: {mse_val:.2f}")
print(f"Test MAE: {mae_val:.2f}")
print(f"Spearman Correlation: {spearman_corr:.3f}")

# 7) Save the model for future usage
# NOTE(review): only the forest is pickled. It was trained on scaler2-transformed
# inputs in feature_cols2 order, and neither scaler2 nor feature_cols2 is written to
# disk here, so they must be available separately wherever the model is reloaded.
with open("random_forest_model.pkl", "wb") as f:
    pickle.dump(rf, f)

print("\nModel saved to 'random_forest_model.pkl'")


In [None]:

"""
If you want to predict rank on new/unseen data, you must:
1. Apply the same one-hot encoding to the new data (matching old factor columns).
2. Scale using the same scaler (fit on training data).
3. Load the trained model and call .predict().

Below is a simplified example. If you have a 'new_data.csv' with the same columns,
you would do something like:
"""

# Example demonstration (commented out):
# The example is wrapped in a string literal so that none of it executes; it sketches
# how to encode, scale, and score a new CSV with the objects created in the training cell.
# NOTE(review): feature_cols2 is built from df_encoded's columns, which also contain the
# engineered DaysFromEarliest and TimeDiff columns whenever UpdateDT is present. The
# example below does not recreate them on new_df_encoded, so selecting feature_cols2
# would presumably raise a KeyError unless they are added first - confirm before use.
# NOTE(review): scaler2 and feature_cols2 are taken from the running kernel; the pickle
# written by the training cell contains only the forest itself.
"""
new_df = pd.read_csv('new_data.csv')
new_df['UpdateDT'] = pd.to_datetime(new_df['UpdateDT'], errors='coerce')
# Sort, etc., if needed

# 1) One-hot encode the driver columns in the same way. You must replicate the exact
#    dummy column structure. Usually, you'd re-use the same method or store the columns.
#    For example:
driver_cols_new = [c for c in new_df.columns if c.startswith('Driver')]
new_df_encoded = pd.get_dummies(new_df, columns=driver_cols_new, drop_first=True)

# Make sure new_df_encoded has the same columns as X_train did
# If any columns are missing, you might need to add them with default 0. 
# If new columns appear, you should drop or handle them consistently.

# 2) Scale using the same scaler
X_new = new_df_encoded[feature_cols2]  # same feature_cols2 as before
X_new_scaled = scaler2.transform(X_new)

# 3) Load the model and predict
with open('random_forest_model.pkl', 'rb') as f:
    loaded_rf = pickle.load(f)

new_preds = loaded_rf.predict(X_new_scaled)
print(new_preds)
"""

