In [None]:
# ==============================================================================
# CELL 1: SETUP AND LOAD YOUR STAGE 1 CLASSIFIER
# ==============================================================================
print("Installing all necessary libraries...")
!pip install transformers datasets scikit-learn pandas openpyxl lightgbm vaderSentiment --quiet

import pandas as pd
import numpy as np
import lightgbm as lgb
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
import torch
import gc
import re
import warnings
# Silence every library warning for the rest of the session.
warnings.filterwarnings("ignore")

# Prefer the GPU when one is attached to the runtime, otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# --- Mount Google Drive ---
# force_remount=True re-mounts even when the drive is already mounted.
print("\nMounting Google Drive...")
drive.mount('/content/drive', force_remount=True)

# --- 1. LOAD STAGE 1 CLASSIFIER (Your Winning RoBERTa Model) ---
# Loads the fine-tuned tokenizer and sequence classifier from Drive and moves
# the model to `device`.
STAGE_1_PATH = "/content/drive/MyDrive/my_best_ROBERTA_model3"
print(f"Loading your winning classifier from: {STAGE_1_PATH}...")
try:
    s1_tokenizer = AutoTokenizer.from_pretrained(STAGE_1_PATH)
    s1_model = AutoModelForSequenceClassification.from_pretrained(STAGE_1_PATH).to(device)
    # Stage 1 is only used for inference in this notebook; make evaluation
    # mode explicit so dropout can never be active while scoring.
    s1_model.eval()
    print("Stage 1 Classifier loaded successfully.")
except Exception as e:
    print(f"*** FATAL ERROR: Could not load your model. {e} ***")
    print("Please check the folder path and contents.")
    # The error really is fatal: without s1_tokenizer / s1_model every later
    # cell fails with a NameError. Re-raise so the run stops here instead.
    raise

Installing all necessary libraries...
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m126.0/126.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hUsing device: cuda

Mounting Google Drive...
Mounted at /content/drive
Loading your winning classifier from: /content/drive/MyDrive/my_best_ROBERTA_model3...
Stage 1 Classifier loaded successfully.


In [None]:
# ==============================================================================
# CELL 2: LOAD DATA & GENERATE ALL FEATURES FOR STAGE 2
# ==============================================================================

# --- 1. Load Full 300k Dataset ---
def load_data_from_drive(filename="Copy of behaviour_simulation_train.xlsx"):
    """Read the training workbook from the mounted Drive folder.

    The columns 'dates' and 'inferred company' are renamed to 'date' and
    'company', and 'date' is converted with ``pd.to_datetime``.

    Args:
        filename: Name of the Excel file inside '/content/drive/MyDrive/'.

    Returns:
        The prepared DataFrame, or None when reading or preparing the data
        raised an exception (the error message is printed).
    """
    file_path = f'/content/drive/MyDrive/{filename}'
    print(f"\nAttempting to load full training data from: {file_path}")
    try:
        raw_frame = pd.read_excel(file_path)
        print(f"Data loaded successfully! Shape: {raw_frame.shape}")
        prepared = raw_frame.rename(columns={'dates': 'date', 'inferred company': 'company'})
        prepared['date'] = pd.to_datetime(prepared['date'])
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"An error occurred: {e}")
        return None
    return prepared

full_train_df = load_data_from_drive()
# load_data_from_drive() signals failure by returning None. Stop here with a
# clear message rather than failing later with an AttributeError on None.
if full_train_df is None:
    raise RuntimeError("Training data could not be loaded; see the error printed above.")

# --- 2. Define Helper Functions ---
def format_input_text(row):
    """Render one row as the single-line text fed to the Stage 1 classifier.

    Args:
        row: Mapping or Series with 'content', 'company', 'media' and a
            'date' value exposing ``.hour`` and ``.day_name()``.

    Returns:
        A string of the form
        "Brand: ... | Day: ... | Hour: ... | Media: yes/no | Tweet: ...".
    """
    stamp = row['date']
    company = str(row['company']).strip()
    tweet_text = str(row['content']).strip()
    hour, day = stamp.hour, stamp.day_name()
    # Any non-missing 'media' value counts as "has media".
    has_media = "no" if pd.isna(row['media']) else "yes"
    return f"Brand: {company} | Day: {day} | Hour: {hour} | Media: {has_media} | Tweet: {tweet_text}"

def create_manual_features(df):
    """Return a copy of ``df`` extended with hand-crafted feature columns.

    Added or replaced columns, in order: 'content' (missing values become
    empty strings), 'text_len', 'word_count', 'hour', 'dayofweek',
    'has_media', 'company_cat', 'username_cat' and 'sentiment' (VADER
    compound score). The input frame itself is not modified.

    Args:
        df: Frame with 'content', 'date' (datetime), 'media', 'company' and
            'username' columns.

    Returns:
        The enriched copy of ``df``.
    """
    print("Generating manual features (text length, time, sentiment)...")
    features = df.copy()
    vader = SentimentIntensityAnalyzer()

    # Text statistics are computed on a gap-free string column.
    features['content'] = features['content'].fillna('').astype(str)
    features['text_len'] = features['content'].apply(len)
    features['word_count'] = features['content'].apply(lambda tweet: len(tweet.split()))

    # Every discrete feature is stored with the 'category' dtype
    # (the original note says this prevents a ValueError downstream).
    categorical_sources = {
        'hour': features['date'].dt.hour,
        'dayofweek': features['date'].dt.dayofweek,
        'has_media': features['media'].notna(),
        'company_cat': features['company'],
        'username_cat': features['username'],
    }
    for column_name, values in categorical_sources.items():
        features[column_name] = values.astype('category')

    features['sentiment'] = features['content'].apply(
        lambda tweet: vader.polarity_scores(tweet)['compound']
    )

    print("Manual features complete.")
    return features

# --- 3. Generate Transformer Probabilities (This is the slow part) ---
# Build the Stage 1 input string for every row with format_input_text and
# store it in a 'text' column. NOTE: this adds the column to full_train_df
# in place, so later cells see it as well.
print("Generating formatted text for Transformer...")
full_train_df['text'] = full_train_df.apply(format_input_text, axis=1)

# Convert to a 'Dataset' to use the fast .map() function
# Only the 'text' column is handed over; all other columns stay in pandas.
hf_dataset = Dataset.from_pandas(full_train_df[['text']])

def predict_probabilities(batch):
    """Score one batch of formatted tweets with the Stage 1 classifier.

    Args:
        batch: Mapping whose 'text' entry holds the input strings of the batch.

    Returns:
        Dict with the keys 'prob_low', 'prob_medium', 'prob_high' and
        'prob_viral', holding softmax columns 0, 1, 2 and 3 respectively
        as NumPy arrays.
    """
    encoded = s1_tokenizer(
        batch['text'],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256,
    ).to(device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        class_probs = torch.softmax(s1_model(**encoded).logits, dim=1)
    class_probs = class_probs.cpu().numpy()
    column_names = ('prob_low', 'prob_medium', 'prob_high', 'prob_viral')
    return {name: class_probs[:, position] for position, name in enumerate(column_names)}

# Run predict_probabilities over the whole dataset in batches of 64 rows.
# The 'prob_*' entries it returns are read back as columns of prob_dataset
# when the features are combined.
print(f"Running Stage 1 Classifier to generate probabilities for {len(full_train_df)} tweets...")
print("(This will take 20-30 minutes, please be patient!)")
prob_dataset = hf_dataset.map(predict_probabilities, batched=True, batch_size=64)
print("Probability generation complete.")

# --- 4. Combine All Features into X and y ---
# X = hand-crafted features + Stage 1 class probabilities, one row per tweet.
# y = log1p(likes), the regression target for Stage 2.
print("Combining all features...")
prob_df = prob_dataset.to_pandas()
manual_features_df = create_manual_features(full_train_df)

manual_feature_cols = ['text_len', 'word_count', 'hour', 'dayofweek', 'sentiment', 'has_media', 'company_cat', 'username_cat']
prob_feature_cols = ['prob_low', 'prob_medium', 'prob_high', 'prob_viral']

# pd.concat(axis=1) aligns rows on index labels, not on position. If the two
# frames ever carry different indexes the result silently gains NaN rows, so
# fail loudly instead.
if not prob_df.index.equals(manual_features_df.index):
    raise ValueError(
        "Index mismatch between Stage 1 probabilities and manual features; "
        "cannot combine them row by row."
    )

X = pd.concat([manual_features_df[manual_feature_cols], prob_df[prob_feature_cols]], axis=1)
y = np.log1p(full_train_df['likes'])  # Our target is log(1 + likes)

# We also need the original text and likes for the "highlight reel"
original_text = manual_features_df['content']
original_likes = full_train_df['likes']

print(f"Final training dataset 'X' created with shape: {X.shape}")


Attempting to load full training data from: /content/drive/MyDrive/Copy of behaviour_simulation_train.xlsx
Data loaded successfully! Shape: (300000, 7)
Generating formatted text for Transformer...
Running Stage 1 Classifier to generate probabilities for 300000 tweets...
(This will take 20-30 minutes, please be patient!)


Map:   0%|          | 0/300000 [00:00<?, ? examples/s]

Probability generation complete.
Combining all features...
Generating manual features (text length, time, sentiment)...
Manual features complete.
Final training dataset 'X' created with shape: (300000, 12)


In [None]:
# ==============================================================================
# CELL 3: TRAIN STAGE 2 REGRESSOR (LightGBM)
# ==============================================================================

print("Creating the identical 80/20 split for training and validation...")
# Goal: hold out the same 60,000 rows that the Stage 1 classifier was
# validated on, by re-creating the split over row positions of the 300k frame.
# NOTE(review): this reproduces the Stage 1 split only if Stage 1 used the same
# call on the same row order (test_size=0.2, random_state=42, no stratify).
# random_state alone does not reproduce a split that was made with stratify=...;
# confirm against the Stage 1 training notebook, otherwise rows the classifier
# was trained on can end up in this validation set.
indices = np.arange(len(full_train_df))
train_indices, val_indices = train_test_split(indices, test_size=0.2, random_state=42)

print(f"Split complete. Training on {len(train_indices)}, validating on {len(val_indices)}.")

# Create the final train/val sets for LightGBM
# Positional selection (.iloc) because the split was made over row positions.
X_train, y_train = X.iloc[train_indices], y.iloc[train_indices]
X_val, y_val = X.iloc[val_indices], y.iloc[val_indices]

# Also grab the original text/likes for the validation set
val_text = original_text.iloc[val_indices]
val_actual_likes = original_likes.iloc[val_indices]


# --- Train the LightGBM Model ---
print("\nTraining Stage 2 LightGBM Regressor... (This will take 5-10 minutes)")
categorical_features = ['has_media', 'company_cat', 'username_cat', 'dayofweek', 'hour']

# Hyper-parameters: L1 objective, with RMSE as the monitored metric.
lgbm_params = dict(
    objective='regression_l1',
    metric='rmse',
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,
    n_jobs=-1,
)
lgbm_regressor = lgb.LGBMRegressor(**lgbm_params)

# Early stopping watches the validation set with a patience of 100 rounds.
lgbm_regressor.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
    categorical_feature=categorical_features,
    callbacks=[lgb.early_stopping(100)],
)

print("\n--- Stage 2 Model Evaluation ---")
val_preds_log = lgbm_regressor.predict(X_val)
# Back-transform from the log1p scale, then replace negative counts with zero.
val_preds_real = np.expm1(val_preds_log)
val_preds_real = np.where(val_preds_real < 0, 0, val_preds_real)

# RMSE is reported on the original 'likes' scale, not the log scale.
val_mse = mean_squared_error(val_actual_likes, val_preds_real)
final_rmse = np.sqrt(val_mse)
print(f"Final Model RMSE (on real 'likes' scale): {final_rmse:.4f}")

print("\nCreating results DataFrame for 'Highlight Reel'...")
# --- Create the "Highlight Reel" DataFrame ---
# One row per validation tweet: text, true likes and predicted likes.
df_results = pd.DataFrame({
    'Tweet Text': val_text,
    'Actual Likes': val_actual_likes,
    # Round to the nearest whole like. A bare astype(int) truncates toward
    # zero, which biases every prediction low and distorts the error ranking.
    'Predicted Likes': np.rint(val_preds_real).astype(int)
})

# This is the key column for finding the "best" predictions
# Error is signed (positive = under-prediction); Absolute Error is its magnitude.
df_results['Error'] = df_results['Actual Likes'] - df_results['Predicted Likes']
df_results['Absolute Error'] = df_results['Error'].abs()

print("Results DataFrame created.")

Creating the identical 80/20 split for training and validation...
Split complete. Training on 240000, validating on 60000.

Training Stage 2 LightGBM Regressor... (This will take 5-10 minutes)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051984 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3252
[LightGBM] [Info] Number of data points in the train set: 240000, number of used features: 11
[LightGBM] [Info] Start training from score 4.343805
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[446]	valid_0's rmse: 0.78147

--- Stage 2 Model Evaluation ---
Final Model RMSE (on real 'likes' scale): 4039.1569

Creating results DataFrame for 'Highlight Reel'...
Results DataFrame created.


In [None]:
# ==============================================================================
# CELL 4: THE "HIGHLIGHT REEL" (BEST PREDICTIONS)
# ==============================================================================

# Show tweet text in full instead of letting pandas truncate it.
pd.set_option('display.max_colwidth', None)

banner_rule = "=" * 50
print("\n" + banner_rule)
print("     HIGHLIGHT REEL: THE 20 BEST PREDICTIONS")
print("  (Sorted by smallest absolute error, highest likes first)")
print(banner_rule)

# Smallest absolute error first; ties are broken by the higher like count.
best_predictions = df_results.sort_values(
    by=['Absolute Error', 'Actual Likes'],
    ascending=[True, False],
)

print(best_predictions.head(20))


# Tweets with more actual likes than this are shown in the near-miss table.
# NOTE(review): presumably mirrors the Stage 1 'viral' class boundary - confirm.
VIRAL_LIKES_THRESHOLD = 10000

print("\n" + "="*50)
print("     'ALMOST' REEL: THE 20 MOST IMPRESSIVE 'NEAR MISSES'")
# The table is filtered on actual likes only (the prediction is not filtered),
# so the subtitle describes exactly that.
print(f"  (Tweets with more than {VIRAL_LIKES_THRESHOLD} actual likes, smallest absolute error first)")
print("="*50)

# Filter for *actual* viral tweets, then sort by error
impressive_misses = (
    df_results[df_results['Actual Likes'] > VIRAL_LIKES_THRESHOLD]
    .sort_values(by='Absolute Error')
)

print(impressive_misses.head(20))


     HIGHLIGHT REEL: THE 20 BEST PREDICTIONS
  (Sorted by smallest absolute error, highest likes first)
                                                                                                                                                                                                                                                                                                      Tweet Text  \
175545                                                                                                                                                               Britain's Princess Charlotte, youngest child of the Duke and Duchess of Cambridge, started nursery school on Monday <hyperlink> <hyperlink>   
57495                                                                                                        The US surgeon general calls on healthy Americans to donate blood. The Red Cross is facing a "severe" shortage due to blood drive cancellations in response to coronavirus