In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix, precision_score, recall_score, accuracy_score, f1_score,
    roc_auc_score, average_precision_score
)
import lightgbm as lgb
import xgboost as xgb
import re
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE, ADASYN
from scipy.stats import ks_2samp

# Delimiters accepted between items of a serialized list: comma, semicolon,
# caret, or any run of whitespace.
_splitter = re.compile(r"[,;\^\s]+")


def count_listish(x):
    """Count the items in a loosely formatted, serialized list value.

    Args:
        x: Any value. It is converted with ``str()`` before parsing, so
            strings such as ``"1^2^3"`` or ``"[1, 2, 3]"`` and plain scalars
            are all accepted.

    Returns:
        int: Number of non-empty tokens after splitting on ``_splitter``.
        ``None``, a NaN float, an empty/blank string and ``"[]"`` all give 0.
    """
    is_nan_float = isinstance(x, float) and pd.isna(x)
    if x is None or is_nan_float:
        return 0

    text = str(x).strip()
    if text in ("", "[]"):
        return 0

    # `text` is non-empty here, so indexing both ends is safe. One pair of
    # enclosing square brackets is removed before tokenizing.
    if text[0] == "[" and text[-1] == "]":
        text = text[1:-1]

    # Leading/trailing delimiters produce empty tokens; those are not counted.
    return sum(1 for token in _splitter.split(text) if token)

def eval_at_threshold(y_true, y_prob, thr=0.5, label="Model"):
    """Print and return binary-classification metrics at one decision threshold.

    Args:
        y_true: Ground-truth labels, passed straight to the sklearn metrics.
        y_prob: Predicted scores. Must support ``>=`` against a scalar and
            expose ``.astype`` (e.g. a NumPy array or pandas Series).
        thr: Scores greater than or equal to this value are predicted as 1.
        label: Name shown in the printed report header.

    Returns:
        dict: Keys ``thr``, ``precision``, ``recall``, ``accuracy``, ``f1``,
        ``roc_auc`` and ``pr_auc``. The last two are computed from the raw
        scores and therefore do not depend on ``thr``.
    """
    hard_labels = (y_prob >= thr).astype(int)
    # labels=[0, 1] keeps the matrix 2x2 even if one class is never predicted.
    matrix = confusion_matrix(y_true, hard_labels, labels=[0,1])

    metrics = {"thr": thr}
    metrics["precision"] = precision_score(y_true, hard_labels, zero_division=0)
    metrics["recall"] = recall_score(y_true, hard_labels, zero_division=0)
    metrics["accuracy"] = accuracy_score(y_true, hard_labels)
    metrics["f1"] = f1_score(y_true, hard_labels, zero_division=0)
    # Ranking metrics use the scores themselves, not the thresholded labels.
    metrics["roc_auc"] = roc_auc_score(y_true, y_prob)
    metrics["pr_auc"] = average_precision_score(y_true, y_prob)

    print(f"\n{label} @ threshold={thr:.3f}")
    print("Confusion Matrix:\n", matrix)
    print(f"Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, "
          f"Accuracy: {metrics['accuracy']:.4f}, F1: {metrics['f1']:.4f}, "
          f"ROC-AUC: {metrics['roc_auc']:.4f}, PR-AUC: {metrics['pr_auc']:.4f}")
    return metrics

def best_threshold_by_f1(y_true, y_prob, grid=None):
    """Return the threshold from ``grid`` that gives the highest F1 score.

    Args:
        y_true: Ground-truth binary labels.
        y_prob: Predicted scores; must support ``>=`` and ``.astype``.
        grid: Iterable of candidate thresholds. Defaults to 49 evenly spaced
            values from 0.02 to 0.50 inclusive.

    Returns:
        The first candidate reaching the maximum F1 (ties keep the earliest
        one), or 0.5 when ``grid`` is empty.
    """
    candidates = np.linspace(0.02, 0.50, 49) if grid is None else grid
    scored = [
        (f1_score(y_true, (y_prob >= cutoff).astype(int), zero_division=0), cutoff)
        for cutoff in candidates
    ]
    # max() returns the first maximal element, so the earliest threshold wins
    # on ties; the default covers an empty grid.
    _, winner = max(scored, key=lambda pair: pair[0], default=(-1, 0.5))
    return winner

# Load the four raw CSV tables (ads/feeds x train/test) from the working
# directory, in the same order as the names on the left-hand side.
ads_train, ads_test, feeds_train, feeds_test = [
    pd.read_csv(csv_name)
    for csv_name in (
        "train_data_ads.csv",
        "test_data_ads.csv",
        "train_data_feeds.csv",
        "test_data_feeds.csv",
    )
]
print("Data loaded")

# preprocessing ads data
def preprocess_ads(df):
    """Return a cleaned copy of an ads-domain frame; the input is not modified.

    Steps, each applied only to columns that are present:
      * coded columns are coerced to integers, with -1 for missing or
        non-numeric values;
      * ``ad_click_list_*`` / ``ad_close_list_*`` columns are replaced by
        their item counts (via ``count_listish``);
      * ``pt_d`` is parsed into new ``hour`` and ``dayofweek`` columns, with
        -1 where the timestamp cannot be parsed.

    Args:
        df: pandas DataFrame holding raw ads rows.

    Returns:
        pandas.DataFrame: The transformed copy.
    """
    code_columns = (
        'age', 'gender', 'residence', 'city', 'city_rank', 'series_dev', 'series_group',
        'emui_dev', 'device_name', 'device_size', 'net_type', 'task_id', 'adv_id',
        'creat_type_cd', 'adv_prim_id', 'inter_type_cd', 'slot_id', 'site_id',
        'spread_app_id', 'hispace_app_tags', 'app_second_class',
    )
    history_columns = (
        'ad_click_list_001', 'ad_click_list_002', 'ad_click_list_003',
        'ad_close_list_001', 'ad_close_list_002', 'ad_close_list_003',
    )

    out = df.copy()

    for name in code_columns:
        if name not in out.columns:
            continue
        # Keep the original numeric codes; -1 is the sentinel for values that
        # are missing or cannot be read as a number.
        as_number = pd.to_numeric(out[name], errors='coerce')
        out[name] = as_number.fillna(-1).astype(int)

    for name in history_columns:
        if name in out.columns:
            out[name] = out[name].fillna('[]').apply(count_listish)

    if 'pt_d' in out.columns:
        raw_stamp = out['pt_d'].astype(str)
        stamp = pd.to_datetime(raw_stamp, format='%Y%m%d%H%M', errors='coerce')
        # Fall back to a date-only layout only when no row matched the
        # minute-resolution layout.
        if stamp.isna().all():
            stamp = pd.to_datetime(raw_stamp, format='%Y%m%d', errors='coerce')
        out['hour'] = stamp.dt.hour.fillna(-1).astype(int)
        out['dayofweek'] = stamp.dt.dayofweek.fillna(-1).astype(int)

    return out

# preprocessing feeds data
def preprocess_feeds(df):
    """Return a cleaned copy of a feeds-domain frame; the input is not modified.

    Steps, each applied only to columns that are present:
      * list-like columns are replaced by their item counts
        (via ``count_listish``);
      * coded columns are coerced to integers, with -1 for missing or
        non-numeric values;
      * ``e_et`` is parsed into new ``e_hour`` and ``e_dayofweek`` columns,
        with -1 where the timestamp cannot be parsed;
      * ``label`` is remapped from {1, -1} to {1, 0}; any other value
        becomes NaN.

    Args:
        df: pandas DataFrame holding raw feeds rows.

    Returns:
        pandas.DataFrame: The transformed copy.
    """
    list_cols_feeds = (
        'u_newsCatInterests', 'u_newsCatDislike', 'u_newsCatInterestsST',
        'u_click_ca2_news', 'i_entities',
    )
    categorical_cols_feeds = (
        'u_phonePrice', 'u_browserLifeCycle', 'u_browserMode', 'u_feedLifeCycle',
        # These three are also list columns. They have already been turned
        # into integer counts when this pass reaches them, so the numeric
        # coercion leaves their values unchanged.
        'u_newsCatInterests', 'u_newsCatDislike', 'u_newsCatInterestsST',
        'i_s_sourceId', 'i_regionEntity', 'i_cat',
        # The list previously held only 'I_dtype', which breaks the lowercase
        # 'i_' prefix used by every other item column and was silently skipped
        # by the membership guard below when the frame uses 'i_dtype'. Both
        # spellings are accepted. NOTE(review): confirm the real column name.
        'I_dtype', 'i_dtype',
        'e_ch', 'e_m', 'e_pl', 'e_section',
    )

    df = df.copy()

    # Counts first: the overlapping columns must be counted before the
    # numeric coercion, which would otherwise turn list strings into -1.
    for col in list_cols_feeds:
        if col in df.columns:
            df[col] = df[col].fillna('[]').apply(count_listish)

    for col in categorical_cols_feeds:
        if col in df.columns:
            # Keep the original numeric codes; -1 is the sentinel for values
            # that are missing or cannot be read as a number.
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(-1).astype(int)

    # extract time
    if 'e_et' in df.columns:
        raw_stamp = df['e_et'].astype(str)
        s = pd.to_datetime(raw_stamp, format='%Y%m%d%H%M', errors='coerce')
        # Fall back to a date-only layout only when no row matched the
        # minute-resolution layout.
        if s.isna().all():
            s = pd.to_datetime(raw_stamp, format='%Y%m%d', errors='coerce')
        df['e_hour'] = s.dt.hour.fillna(-1).astype(int)
        df['e_dayofweek'] = s.dt.dayofweek.fillna(-1).astype(int)

    if 'label' in df.columns:
        # Series.map leaves values outside the mapping as NaN.
        df['label'] = df['label'].map({1: 1, -1: 0})

    return df

def aggregate_feeds_user(df):
    """Collapse feeds rows to one row per ``u_userId`` using column means.

    Args:
        df: pandas DataFrame with a ``u_userId`` column. Any of the user-level
            feature columns listed below, plus ``label``, are averaged when
            present; all other columns are ignored.

    Returns:
        pandas.DataFrame: ``u_userId`` followed by one ``feeds_<column>``
        column per averaged input column.
    """
    user_features = (
        'u_phonePrice', 'u_browserLifeCycle', 'u_browserMode', 'u_feedLifeCycle',
        'u_refreshTimes', 'u_newsCatInterests', 'u_newsCatDislike',
        'u_newsCatInterestsST', 'u_click_ca2_news',
    )
    to_average = [name for name in user_features if name in df.columns]
    if 'label' in df.columns:
        to_average.append('label')

    per_user = df.groupby('u_userId').agg(dict.fromkeys(to_average, 'mean'))
    # Prefix while u_userId is still the index so the key keeps its own name.
    return per_user.add_prefix('feeds_').reset_index()

# Clean each raw table.
train_ads_p, test_ads_p = preprocess_ads(ads_train), preprocess_ads(ads_test)
train_feeds_p, test_feeds_p = preprocess_feeds(feeds_train), preprocess_feeds(feeds_test)
print("Data preprocessed")

# Per-user feed statistics are computed over train and test feeds together.
all_feeds = pd.concat([train_feeds_p, test_feeds_p], axis=0, ignore_index=True)
feeds_user_agg = aggregate_feeds_user(all_feeds)


def _attach_feed_stats(ads_frame):
    """Left-join the per-user feed aggregates onto an ads frame.

    Every NaN in the joined frame (not only those from users without feed
    rows) is filled with 0, and the duplicate join key is dropped.
    """
    joined = ads_frame.merge(
        feeds_user_agg, left_on='user_id', right_on='u_userId', how='left')
    return joined.fillna(0).drop(columns=['u_userId'], errors='ignore')


train_merged = _attach_feed_stats(train_ads_p)
test_merged = _attach_feed_stats(test_ads_p)
print("Data merged")

# Separate the target from the feature matrix; y is None when the merged
# training frame carries no label column.
target = 'label'
y = train_merged[target] if target in train_merged.columns else None
X = train_merged.drop(columns=[target], errors='ignore')
print("Data prepared")

# Remove the raw pt_d and log_id columns from both feature frames when present.
non_feature_cols = ['pt_d', 'log_id']
X = X.drop(columns=non_feature_cols, errors='ignore')
test_merged = test_merged.drop(columns=non_feature_cols, errors='ignore')

# Drop every column that is still of object dtype.
X = X.drop(columns=X.select_dtypes(include=['object']).columns)
test_merged = test_merged.drop(
    columns=test_merged.select_dtypes(include=['object']).columns)

# 80/20 stratified hold-out with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42)
print("Data split")

# Generate SMOTE + ADASYN
# Class balance of the training split before any oversampling.
class_counts = np.bincount(y_train)
print("Original training class counts:", class_counts)

neg = class_counts[0]
pos = class_counts[1]
scale_pos_weight_val = neg / pos
print("Recommended scale_pos_weight:", scale_pos_weight_val)


def _oversample_and_dump(sampler, counts_msg, csv_path, saved_msg):
    """Resample the training split with `sampler` and write the result to CSV.

    Returns the resampled features, the resampled labels, and the combined
    frame (features plus a "label" column) that was written to `csv_path`.
    """
    X_res, y_res = sampler.fit_resample(X_train, y_train)
    print(counts_msg, np.bincount(y_res))

    frame = pd.DataFrame(X_res, columns=X_train.columns)
    frame["label"] = y_res.values
    frame.to_csv(csv_path, index=False)
    print(saved_msg)
    return X_res, y_res, frame


# NOTE(review): X_train still contains the integer-coded categorical columns
# produced by preprocess_ads. SMOTE and ADASYN synthesize rows by interpolating
# between neighbours, so synthetic rows may carry non-integer codes. Confirm
# this is intended (imbalanced-learn provides SMOTENC for mixed data).

# SMOTE
smote = SMOTE(sampling_strategy="auto", random_state=42, k_neighbors=5)
X_train_sm, y_train_sm, smote_df = _oversample_and_dump(
    smote,
    "After SMOTE class counts:",
    "synthetic_train_SMOTE_raw.csv",
    "SMOTE data saved to csv",
)

# ADASYN
adasyn = ADASYN(sampling_strategy="auto", random_state=42, n_neighbors=5)
X_train_ada, y_train_ada, adasyn_df = _oversample_and_dump(
    adasyn,
    "After ADASYN class counts:",
    "synthetic_train_ADASYN_raw.csv",
    "ADASYN data saved to csv",
)
