In [0]:
# Import dependencies
import pandas as pd
from pyspark.sql.functions import col
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Load data from Spark table
df_spark = spark.table("default.lol_raw_data_updated")

# Convert to Pandas DataFrame (collects the whole table onto the driver)
df = df_spark.toPandas()

# Parse both timestamp columns so they can be subtracted from each other
for date_col in ['OBS DATE', 'LASTLOGIN']:
    df[date_col] = pd.to_datetime(df[date_col])

# A row with a missing date cannot be labelled: the date difference is NaN and
# `NaN >= 31` evaluates to False, which would silently mark the row as
# "not churned". Drop such rows before labelling. `.copy()` gives an independent
# frame so later column assignments do not raise SettingWithCopyWarning.
has_both_dates = df['OBS DATE'].notna() & df['LASTLOGIN'].notna()
df = df.loc[has_both_dates].copy()

# Churn label: 1 if inactive 31+ days
days_inactive = (df['OBS DATE'] - df['LASTLOGIN']).dt.days
df['CHURN'] = (days_inactive >= 31).astype(int)

# Normalise the tier text: upper-case it and keep the first run of A-Z letters.
# expand=False makes str.extract return a Series rather than a one-column frame.
tier_text = df['TIER'].astype(str).str.upper()
df['TIER_CLEANED'] = tier_text.str.extract(r'([A-Z]+)', expand=False)

# Ordinal encoding of the ranked tiers, GOLD = 3 up to CHALLENGER = 9.
# NOTE(review): tiers below GOLD are not in this mapping, so .map() yields NaN
# for them — confirm that excluding those rows is intended.
ranked_tiers = [
    'GOLD', 'PLATINUM', 'EMERALD',
    'DIAMOND', 'MASTER', 'GRANDMASTER', 'CHALLENGER',
]
tier_order = {tier: rank for rank, tier in enumerate(ranked_tiers, start=3)}
df['TIER_ENCODED'] = df['TIER_CLEANED'].map(tier_order)

# Clean 'WIN PER' and 'LAST 20 WIN PER': strip the 'L' and '%' characters, keep only
# rows whose remaining text is a plain non-negative decimal, then convert to float.
#
# The text is validated with a full-match pattern instead of
# `.str.replace('.', '').str.isnumeric()`: that check accepted values such as
# "1.2.3" or "²", which then made astype(float) raise. Those rows are now dropped
# like any other unparsable value.
win_pct_pattern = r'(?:\d+\.?\d*|\.\d+)'
for colname in ['WIN PER', 'LAST 20 WIN PER']:
    cleaned = (
        df[colname].astype(str)
        .str.replace('L', '', regex=False)
        .str.replace('%', '', regex=False)
    )
    is_number = cleaned.str.fullmatch(win_pct_pattern)
    # .copy() makes the filtered frame independent, so the assignment below
    # does not trigger SettingWithCopyWarning.
    df = df.loc[is_number].copy()
    df[colname] = cleaned.loc[is_number].astype(float)


# Columns fed to the model as predictors
features = [
    'LP',
    'LEVEL',
    'AVSCORE',
    'WIN PER',
    'LAST 20 WIN PER',
    'TOTAL MATCH',
    'LOSING STREAK',
    'NO CHAMPIONS PLAYED',
    'TIER_ENCODED',
    'NO TEAM GAMES',
    'NO TEAM PARTICIPANTS',
    'TEAM WIN',
    'TEAM LOSE',
]

# Keep only the predictors plus the label, discarding any row with a missing value
df_model = df.loc[:, features + ['CHURN']].dropna()

# Separate the predictors (X) from the label (y)
X, y = df_model[features], df_model['CHURN']

# Hold out 25% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# Fit a depth-limited random forest; class_weight='balanced' reweights the
# classes inversely to their frequency in the training labels.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train)

# Score the held-out split
y_pred = rf_model.predict(X_test)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Confusion Matrix:")
print(test_confusion)

print("\nClassification Report:")
print(test_report)


Confusion Matrix:
[[45976 18136]
 [ 3204  6556]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.72      0.81     64112
           1       0.27      0.67      0.38      9760

    accuracy                           0.71     73872
   macro avg       0.60      0.69      0.60     73872
weighted avg       0.85      0.71      0.75     73872



In [0]:
import mlflow
import mlflow.sklearn

# Enable scikit-learn autologging so the fit below is recorded by MLflow
mlflow.sklearn.autolog()

# Refit the already-configured estimator inside an explicit MLflow run
with mlflow.start_run():
    rf_model.fit(X=X_train, y=y_train)




In [0]:
import joblib

# Save your trained model
# Persist the fitted estimator to a compressed pickle (zlib level 3) in the
# current working directory.
joblib.dump(value=rf_model, filename="rf_churn_model.pkl", compress=3)

print("✅ Model saved!")


✅ Model saved!
