In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing           import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition           import TruncatedSVD
from sklearn.linear_model            import SGDClassifier, LogisticRegression
from sklearn.calibration             import CalibratedClassifierCV
from sklearn.ensemble                import (
    HistGradientBoostingClassifier,
    RandomForestClassifier,
    AdaBoostClassifier,
    StackingClassifier
)
from sklearn.tree                    import DecisionTreeClassifier
from sklearn.naive_bayes             import GaussianNB
from lightgbm                        import LGBMClassifier
from catboost                        import CatBoostClassifier

# Load the data
train      = pd.read_csv('train.csv')
test       = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')



In [2]:
# Prepare text and categorical encodings
le_lang = LabelEncoder().fit(pd.concat([train.lang, test.lang]).fillna('NA'))
le_loc  = LabelEncoder().fit(pd.concat([train.location, test.location]).fillna('NA'))

# TF–IDF vectorizer on descriptions, capped at 1k features
tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1,2)).fit(
    pd.concat([train.description.fillna(''), test.description.fillna('')])
)

# SVD to reduce TF–IDF down to 20 components
svd = TruncatedSVD(n_components=20, random_state=42)

# Precompute frequency counts for freqency encoding
lang_freq = train.lang.fillna('NA').value_counts()
loc_freq  = train.location.fillna('NA').value_counts()

def preprocess(df, fit_svd=False):
    df = df.copy()

    # Time features
    df['created_at'] = pd.to_datetime(df['created_at'])
    df['acct_weekday'] = df['created_at'].dt.weekday
    df['acct_hour']    = df['created_at'].dt.hour

    # Log-scale skewed counts
    for c in ['followers_count','friends_count','statuses_count','favourites_count']:
        df[f'log1p_{c}'] = np.log1p(df[c])

    # Activity ratios
    df['statuses_per_day']      = df['statuses_count']      / (df['account_age_days'] + 1)
    df['favourites_per_day']    = df['favourites_count']     / (df['account_age_days'] + 1)
    df['friends_followers_ratio']= df['friends_count']       / (df['followers_count'] + 1)
    df['tweets_fav_ratio']      = df['statuses_count']      / (df['favourites_count'] + 1)

    # Description text stats
    desc = df['description'].fillna('')
    df['desc_len']         = desc.str.len()
    df['has_desc']         = (df['desc_len'] > 0).astype(int)
    df['desc_hashtags']    = desc.str.count(r'#\w+')
    df['desc_mentions']    = desc.str.count(r'@\w+')
    df['desc_urls']        = desc.str.count(r'https?://')
    df['desc_upper_ratio'] = desc.str.count(r'[A-Z]') / (df['desc_len'] + 1)
    df['desc_digit_ratio'] = desc.str.count(r'\d')   / (df['desc_len'] + 1)

    # Boolean flags
    for flag in ['default_profile','default_profile_image','geo_enabled','verified']:
        df[flag] = df[flag].astype(int)

    # Frequency encoding for low-card columns
    df['lang_freq']     = df['lang'].map(lang_freq).fillna(0).astype(int)
    df['location_freq'] = df['location'].map(loc_freq).fillna(0).astype(int)

    # Label encoding
    df['lang_le'] = le_lang.transform(df['lang'].fillna('NA'))
    df['loc_le']  = le_loc.transform(df['location'].fillna('NA'))

    # TF–IDF
    tf_mat = tfidf.transform(desc)
    if fit_svd:
        svd.fit(tf_mat)
    tf_svd = svd.transform(tf_mat)

    # Drop raw/text columns not needed
    df = df.drop(columns=[
        'id','screen_name',
        'profile_background_image_url','profile_image_url',
        'description','created_at','lang','location'
    ], errors='ignore')

    return df, tf_mat, tf_svd


In [3]:
# Preprocess train, test
train_df, train_tf, train_svd = preprocess(train, fit_svd=True)
test_df,  test_tf,  test_svd  = preprocess(test)

y_train = train['target']



In [4]:
# Assemble full feature matrix
tf_cols  = [f'tfidf_{i}' for i in range(train_tf.shape[1])]
svd_cols = [f'svd_{i}'   for i in range(train_svd.shape[1])]

X_train = pd.concat([
    train_df.reset_index(drop=True),
    pd.DataFrame(train_tf.toarray(), columns=tf_cols),
    pd.DataFrame(train_svd,        columns=svd_cols)
], axis=1)

X_test = pd.concat([
    test_df.reset_index(drop=True),
    pd.DataFrame(test_tf.toarray(), columns=tf_cols),
    pd.DataFrame(test_svd,         columns=svd_cols)
], axis=1)

# Drop stray index/target columns
for df_ in (X_train, X_test):
    for c in ('index','target'):
        if c in df_.columns:
            df_.drop(columns=c, inplace=True)



In [5]:
# Define tuned base learners
base_models = {
    'sgd': CalibratedClassifierCV(
        SGDClassifier(
            loss='log_loss',
            penalty='elasticnet',
            alpha=1e-4,
            l1_ratio=0.15,
            learning_rate='invscaling',
            eta0=0.01,
            max_iter=5000,
            tol=1e-5,
            class_weight='balanced',
            random_state=42
        ),
        cv=3, n_jobs=-1
    ),
    'hgb': HistGradientBoostingClassifier(
        loss='log_loss',
        learning_rate=0.02,
        max_iter=500,
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=20,
        max_leaf_nodes=31,
        l2_regularization=0.1,
        random_state=42
    ),
    'lgb': LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metric='auc',
        learning_rate=0.01,
        n_estimators=800,
        num_leaves=31,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.5,
        reg_lambda=0.5,
        scale_pos_weight=(y_train==0).sum()/(y_train==1).sum(),
        random_state=42,
        n_jobs=-1,
        verbose=-1
    ),
    'cat': CatBoostClassifier(
        iterations=800,
        learning_rate=0.01,
        depth=8,
        l2_leaf_reg=3,
        bagging_temperature=0.5,
        border_count=128,
        verbose=False,
        random_seed=42
    ),
    'rf': RandomForestClassifier(
        n_estimators=800,
        max_depth=20,
        min_samples_split=10,
        min_samples_leaf=2,
        max_features='sqrt',
        class_weight='balanced_subsample',
        random_state=42,
        n_jobs=-1
    ),
    'ada': AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=1, class_weight='balanced'),
        n_estimators=300,
        learning_rate=1.0,
        algorithm='SAMME.R',
        random_state=42
    ),
    'nb': GaussianNB(var_smoothing=1e-9)
}


In [6]:
# Build a small stacking ensemble
stack = StackingClassifier(
    estimators=[(name, mdl) for name, mdl in base_models.items()],
    final_estimator=LogisticRegression(C=1.0, max_iter=1000),
    cv=5, n_jobs=-1, passthrough=False
)



In [7]:
# Train & make predictions
stack.fit(X_train, y_train)
submission['target'] = stack.predict_proba(X_test)[:, 1]

# Save to disk
submission.to_csv('submission.csv', index=False)
print(submission.head())

   index    target
0      0  0.030306
1      1  0.875539
2      2  0.077401
3      3  0.042577
4      4  0.967537
