In [None]:
# ==============================================================================
# CELL 1 (NEW): SETUP FOR STAGE 2
# ==============================================================================
print("Installing LightGBM for our Stage 2 Regressor...")
!pip install lightgbm scikit-learn pandas vaderSentiment --quiet

# We need all the libraries from both models
import pandas as pd
import numpy as np
import lightgbm as lgb
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
import re
import warnings
warnings.filterwarnings("ignore")

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# --- Mount Drive and Load the FULL 300k Excel file ---
print("\nMounting Google Drive...")
# force_remount=True re-mounts even if /content/drive is already mounted.
drive.mount('/content/drive', force_remount=True)

def load_data_from_drive(filename="Copy of behaviour_simulation_train.xlsx"):
    """Read the training workbook from the mounted Google Drive.

    Args:
        filename: Name of the Excel file inside ``/content/drive/MyDrive``.

    Returns:
        The loaded DataFrame with ``dates`` renamed to ``date`` (converted with
        ``pd.to_datetime``) and ``inferred company`` renamed to ``company``.
        Returns ``None`` when any step fails; the error is printed, not raised.
    """
    file_path = f'/content/drive/MyDrive/{filename}'
    column_aliases = {'dates': 'date', 'inferred company': 'company'}
    try:
        print(f"\nAttempting to load full training data from: {file_path}")
        loaded = pd.read_excel(file_path)
        print(f"Data loaded successfully! Shape: {loaded.shape}")
        # Normalise the column names first, then parse the timestamp column.
        loaded = loaded.rename(columns=column_aliases)
        loaded['date'] = pd.to_datetime(loaded['date'])
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    return loaded

# This is our full 300,000 row dataset.
# NOTE: load_data_from_drive() returns None when loading fails, so later cells
# that use full_train_df will error if the "An error occurred" message was printed.
full_train_df = load_data_from_drive()

Installing LightGBM for our Stage 2 Regressor...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25h
Mounting Google Drive...
Mounted at /content/drive

Attempting to load full training data from: /content/drive/MyDrive/Copy of behaviour_simulation_train.xlsx
Data loaded successfully! Shape: (300000, 7)


In [None]:
# ==============================================================================
# CELL 2 (NEW): LOAD YOUR STAGE 1 CLASSIFIER
# ==============================================================================
# Define the base model name (must match what you trained)
BASE_MODEL_NAME = "distilbert-base-uncased"
# This is the path to YOUR saved model weights in Google Drive
SAVED_MODEL_PATH = "/content/drive/MyDrive/my_best_tweet_model"

print(f"Loading tokenizer '{BASE_MODEL_NAME}' from Hugging Face Hub...")
# NOTE(review): the except branch below only prints the error. If either load
# fails, `tokenizer` and/or `classifier_model` stay undefined and the next cells
# will fail with a NameError rather than at this point.
try:
    # --- LOAD TOKENIZER FROM WEB ---
    # The tokenizer comes from the base checkpoint name, not from SAVED_MODEL_PATH.
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
    print("Tokenizer loaded successfully from web.")

    # --- LOAD FINE-TUNED MODEL FROM DRIVE ---
    print(f"Loading fine-tuned Stage 1 Classifier from: {SAVED_MODEL_PATH}...")
    # The model is moved to `device` (set in Cell 1) right after loading.
    classifier_model = AutoModelForSequenceClassification.from_pretrained(SAVED_MODEL_PATH).to(device)
    print("Stage 1 Classifier loaded successfully from Drive.")

except Exception as e:
     print(f"\n*** AN UNEXPECTED ERROR OCCURRED: {e} ***")

Loading tokenizer 'distilbert-base-uncased' from Hugging Face Hub...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizer loaded successfully from web.
Loading fine-tuned Stage 1 Classifier from: /content/drive/MyDrive/my_best_tweet_model...
Stage 1 Classifier loaded successfully from Drive.


In [None]:
# ==============================================================================
# CELL 3 (NEW, CORRECTED): GENERATE ALL FEATURES FOR STAGE 2
# ==============================================================================

# --- 1. Define our helper functions ---
def format_input_text(row):
    """Build the single-line prompt that the Stage 1 classifier consumes.

    Args:
        row: Mapping/Series with ``content``, ``company``, ``date`` (a pandas
            Timestamp) and ``media`` entries.

    Returns:
        A string of the form
        ``"Brand: <company> | Day: <day name> | Hour: <hour> | Media: yes|no | Tweet: <text>"``.
        ``content`` and ``company`` are passed through ``str()`` and stripped;
        ``Media`` is "yes" whenever ``media`` is not null.
    """
    tweet_part = f"Tweet: {str(row['content']).strip()}"
    brand_part = f"Brand: {str(row['company']).strip()}"
    posted_at = row['date']
    hour_part = f"Hour: {posted_at.hour}"
    day_part = f"Day: {posted_at.day_name()}"
    media_part = "Media: no" if pd.isna(row['media']) else "Media: yes"
    return " | ".join([brand_part, day_part, hour_part, media_part, tweet_part])

def create_manual_features(df):
    """Return a copy of ``df`` extended with the hand-crafted Stage 2 features.

    Added columns: ``text_len``, ``word_count``, ``hour``, ``dayofweek``,
    ``has_media``, ``company_cat``, ``username_cat`` and ``sentiment``.
    ``content`` is also replaced by its NaN-free string version.

    Args:
        df: Frame with ``content``, ``date`` (datetime), ``media``, ``company``
            and ``username`` columns. It is not modified.

    Returns:
        The augmented copy.
    """
    print("Generating manual features (text length, time, sentiment)...")
    vader = SentimentIntensityAnalyzer()
    features = df.copy()

    # Text-size features, computed on a NaN-free string version of the tweet.
    tweet_text = features['content'].fillna('').astype(str)
    features['content'] = tweet_text
    features['text_len'] = tweet_text.map(len)
    features['word_count'] = tweet_text.map(lambda text: len(text.split()))

    # Calendar, media and identity columns are stored with the pandas
    # 'category' dtype so LightGBM can treat them as categorical features.
    posted_at = features['date']
    categorical_sources = {
        'hour': posted_at.dt.hour,
        'dayofweek': posted_at.dt.dayofweek,
        'has_media': features['media'].notna(),
        'company_cat': features['company'],
        'username_cat': features['username'],
    }
    for column, values in categorical_sources.items():
        features[column] = values.astype('category')

    # VADER compound score of the tweet text.
    def compound_score(text):
        return vader.polarity_scores(text)['compound']

    features['sentiment'] = features['content'].map(compound_score)

    print("Manual features complete.")
    return features

# --- 2. Generate Transformer Probabilities (This is the slow part) ---
print("Generating formatted text for Transformer...")
# Adds a 'text' column to full_train_df in place (row-wise apply of format_input_text).
full_train_df['text'] = full_train_df.apply(format_input_text, axis=1)

# Convert to a 'Dataset' to use the fast .map() function.
# Only the 'text' column is handed to the Hugging Face dataset.
hf_dataset = Dataset.from_pandas(full_train_df[['text']])

def predict_probabilities(batch):
    """Score one batch of formatted tweets with the Stage 1 classifier.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping whose ``'text'`` entry is the list of input strings.

    Returns:
        Dict with ``prob_low``, ``prob_medium``, ``prob_high`` and
        ``prob_viral``: softmax columns 0-3, one value per input row.
    """
    # Tokenise with padding/truncation to at most 256 tokens, then move to `device`.
    encoded = tokenizer(
        batch['text'],
        max_length=256,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    encoded = encoded.to(device)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        outputs = classifier_model(**encoded)

    # Softmax over the class axis, brought back to the host as a NumPy array.
    class_probs = torch.softmax(outputs.logits, dim=1).cpu().numpy()

    column_names = ('prob_low', 'prob_medium', 'prob_high', 'prob_viral')
    return {name: class_probs[:, idx] for idx, name in enumerate(column_names)}

print("Running Stage 1 Classifier to generate probabilities for 300k tweets...")
print("(This will take a significant amount of time, maybe 20-30 minutes)")
# Runs predict_probabilities over the dataset in batches of 64 rows; the four
# returned probability arrays become new dataset columns.
prob_dataset = hf_dataset.map(predict_probabilities, batched=True, batch_size=64)
print("Probability generation complete.")

# --- 3. Combine All Features ---
print("Combining all features...")
# Convert probability dataset back to pandas
prob_df = prob_dataset.to_pandas()

# Create manual features
manual_features_df = create_manual_features(full_train_df)

# Define our feature names
manual_feature_cols = ['text_len', 'word_count', 'hour', 'dayofweek', 'sentiment', 'has_media', 'company_cat', 'username_cat']
prob_feature_cols = ['prob_low', 'prob_medium', 'prob_high', 'prob_viral']

# This is our final, feature-rich dataset for Stage 2.
# The column-wise concat aligns the two frames on their index.
# NOTE(review): assumes prob_df and full_train_df share the same row index
# (e.g. both a default 0..n-1 index) -- confirm if rows were filtered earlier.
X = pd.concat([
    manual_features_df[manual_feature_cols],
    prob_df[prob_feature_cols]
], axis=1)

# This is our target variable: log1p of likes, i.e. log(1 + likes)
y = np.log1p(full_train_df['likes'])

print(f"Final training dataset 'X' created with shape: {X.shape}")

Generating formatted text for Transformer...
Running Stage 1 Classifier to generate probabilities for 300k tweets...
(This will take a significant amount of time, maybe 20-30 minutes)


Map:   0%|          | 0/300000 [00:00<?, ? examples/s]

Probability generation complete.
Combining all features...
Generating manual features (text length, time, sentiment)...
Manual features complete.
Final training dataset 'X' created with shape: (300000, 12)


In [None]:
# ==============================================================================
# CELL 4 (NEW): TRAIN STAGE 2 REGRESSOR (LightGBM)
# ==============================================================================

print("Training Stage 2 LightGBM Regressor...")

# We need to tell LightGBM which features are categorical
# (these are the columns create_manual_features stored with the 'category' dtype).
categorical_features = ['has_media', 'company_cat', 'username_cat', 'dayofweek', 'hour']

# Split our new dataset for training and validation (80/20, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the LightGBM model.
# The objective is L1 while the monitored metric is RMSE; both are computed on
# the log1p(likes) target defined in Cell 3.
lgbm_regressor = lgb.LGBMRegressor(
    objective='regression_l1', # L1 (MAE) is robust to outliers
    metric='rmse',
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,
    n_jobs=-1
)

# Train the model; stop early once the validation metric has not improved
# for 100 rounds.
lgbm_regressor.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
    callbacks=[lgb.early_stopping(100)],
    categorical_feature=categorical_features
)

# --- Evaluate our new model ---
print("\n--- Stage 2 Model Evaluation ---")
val_preds_log = lgbm_regressor.predict(X_val)

# Convert log predictions back to the real number of likes (inverse of log1p)
val_preds_real = np.expm1(val_preds_log)
y_val_real = np.expm1(y_val)

# Ensure no negative predictions
val_preds_real[val_preds_real < 0] = 0

# Calculate the final RMSE! This is your Task 1 score.
# This RMSE is on the raw likes scale, unlike the log-scale RMSE that
# LightGBM reports during training.
final_rmse = np.sqrt(mean_squared_error(y_val_real, val_preds_real))
print(f"Final Model RMSE (on real 'likes' scale): {final_rmse:.4f}")

Training Stage 2 LightGBM Regressor...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022166 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3252
[LightGBM] [Info] Number of data points in the train set: 240000, number of used features: 11
[LightGBM] [Info] Start training from score 4.343805
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[450]	valid_0's rmse: 0.73472

--- Stage 2 Model Evaluation ---
Final Model RMSE (on real 'likes' scale): 3931.6662


In [None]:
# ==============================================================================
# CELL 5 (NEW): SAVE THE STAGE 2 REGRESSOR
# ==============================================================================
import joblib

# NOTE: despite the ".txt" extension this file is a joblib pickle of the whole
# LGBMRegressor object, not a LightGBM text model. It must be read back with
# joblib.load (as Cell 6 does).
REGRESSOR_PATH = "/content/drive/MyDrive/my_lgbm_regressor.txt"
print(f"Saving Stage 2 Regressor to: {REGRESSOR_PATH}")

# Save the trained model
joblib.dump(lgbm_regressor, REGRESSOR_PATH)

print("Stage 2 Regressor saved successfully.")

Saving Stage 2 Regressor to: /content/drive/MyDrive/my_lgbm_regressor.txt
Stage 2 Regressor saved successfully.


In [None]:
# ==============================================================================
# CELL 5b: SAVE THE MISSING SCHEMA FILE
# (Run this in your original TRAINING notebook)
# ==============================================================================
import pandas as pd

try:
    # 'X' is the "master spreadsheet" (features) you created in Cell 3
    # We are just grabbing its "blueprint" (the columns and data types).
    # X.dtypes is a Series indexed by column name; for the categorical columns
    # the dtype objects carry the full list of training categories.
    schema_to_save = X.dtypes

    # Define the save path (must match the loading path)
    SCHEMA_PATH = "/content/drive/MyDrive/my_data_schema.pkl"

    # Save the schema blueprint to your Drive
    schema_to_save.to_pickle(SCHEMA_PATH)

    print(f"\n--- SUCCESS! ---")
    print(f"File '{SCHEMA_PATH}' has been created.")
    print("You can now go back to your INFERENCE notebook and re-run the loading cell.")

# NameError means 'X' does not exist in this kernel session.
except NameError:
    print(f"\n*** ERROR: The 'X' dataframe is no longer in memory. ***")
    print("Please go back and re-run Cell 3 (the long feature generation cell) to recreate 'X'.")
    print("After that, run this cell (Cell 5b) again.")
# Any other failure (e.g. writing to Drive) is reported but not re-raised.
except Exception as e:
    print(f"An unexpected error occurred: {e}")


--- SUCCESS! ---
File '/content/drive/MyDrive/my_data_schema.pkl' has been created.
You can now go back to your INFERENCE notebook and re-run the loading cell.


In [None]:
# ==============================================================================
# CELL 6 (NEW): FINAL 2-STAGE INFERENCE
# ==============================================================================
import joblib
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# --- Load BOTH models ---
print("Loading Stage 1 (Classifier) and Stage 2 (Regressor)...")

# NOTE(review): each try/except below only prints the error. A failed load
# leaves the corresponding global undefined (or stale from an earlier cell),
# and the "All models loaded successfully." line further down is printed
# regardless of whether any load failed.

# --- Model 1: Classifier (Transformer) ---
STAGE_1_PATH = "/content/drive/MyDrive/my_best_tweet_model"
BASE_MODEL_NAME = "distilbert-base-uncased"
# The tokenizer is loaded by base checkpoint name, not from STAGE_1_PATH.
try:
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
    print("Tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    # No fallback is implemented here.

# Pick the device, then load the fine-tuned classifier onto it.
try:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    classifier_model = AutoModelForSequenceClassification.from_pretrained(STAGE_1_PATH).to(device)
    print(f"Stage 1 Classifier loaded successfully on device: {device}")
except Exception as e:
    print(f"Error loading classifier model: {e}")
    # No fallback is implemented here.


# --- Model 2: Regressor (LightGBM) ---
# The file is the joblib pickle written in Cell 5 (the ".txt" suffix is only a name).
STAGE_2_PATH = "/content/drive/MyDrive/my_lgbm_regressor.txt"
try:
    regressor_model = joblib.load(STAGE_2_PATH)
    print("Stage 2 Regressor loaded successfully.")
except Exception as e:
    print(f"Error loading regressor model: {e}")
    # No fallback is implemented here.


print("All models loaded successfully.")

# --- Regenerate Factorizers and Mappings for Categorical Features ---
# NOTE(review): this section needs `full_train_df` from Cell 1 to still be in
# memory; it is not reloaded here.
print("Regenerating factorizers and mappings for categorical features...")
# pd.factorize returns (codes, uniques); element [1] holds the unique values
# in order of first appearance.
company_factorizer_train = pd.factorize(full_train_df['company'])
username_factorizer_train = pd.factorize(full_train_df['username'])
dayofweek_train_levels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
hour_train_levels = list(range(24))
has_media_train_levels = [0, 1] # 0 for no, 1 for yes

# Create mappings from original value to factorized ID
company_to_id = {val: id for id, val in enumerate(company_factorizer_train[1])}
username_to_id = {val: id for id, val in enumerate(username_factorizer_train[1])}
dayofweek_to_id = {day: i for i, day in enumerate(dayofweek_train_levels)}
hour_to_id = {i: i for i in hour_train_levels}


# Define categorical dtypes for the prediction frame.
# NOTE(review): these do not all match what Cell 3 trained on. There,
# 'dayofweek' came from .dt.dayofweek (integers) and 'has_media' from
# .notna() (booleans), whereas the dtypes below use day names and 0/1.
# The company/username categories are the raw names, while the *_to_id maps
# above yield integer positions -- casting those integers to these dtypes
# produces NaN.
company_dtype = pd.CategoricalDtype(categories=company_factorizer_train[1])
username_dtype = pd.CategoricalDtype(categories=username_factorizer_train[1])
dayofweek_dtype = pd.CategoricalDtype(categories=dayofweek_train_levels)
hour_dtype = pd.CategoricalDtype(categories=hour_train_levels)
has_media_dtype = pd.CategoricalDtype(categories=has_media_train_levels)


print("Factorizers and categorical dtypes regenerated.")

# --- Define categorical feature names for LightGBM prediction ---
categorical_feature_names_for_predict = ['has_media', 'company_cat', 'username_cat', 'dayofweek', 'hour']


# --- Define the full prediction pipeline ---
def predict_likes_twostage(tweet_text, company, username, day, hour, has_media):
    """Predict the number of likes for one tweet with the two-stage pipeline.

    Stage 1 (transformer classifier) turns the formatted tweet into four class
    probabilities. Stage 2 (LightGBM regressor) combines them with the manual
    features and predicts log1p(likes), which is converted back with expm1.

    Args:
        tweet_text: Raw tweet text.
        company: Company/brand name as it appears in the training data.
        username: Account name as it appears in the training data.
        day: Day name, e.g. "Friday".
        hour: Hour of day as an int in 0..23.
        has_media: "yes" when the tweet has media; anything else means no.

    Returns:
        The predicted number of likes, clipped at 0.
    """
    # --- 1. Create Stage 1 Input (Transformer) ---
    formatted_text = f"Brand: {company} | Day: {day} | Hour: {hour} | Media: {has_media} | Tweet: {tweet_text}"
    inputs = tokenizer(formatted_text, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)

    # --- 2. Get Stage 1 Probabilities ---
    with torch.no_grad():
        logits = classifier_model(**inputs).logits
    probabilities = torch.softmax(logits, dim=1).cpu().numpy()[0]  # 1D array of 4 probs

    # --- 3. Create Stage 2 Input (LGBM) ---
    # The categorical columns must carry the same RAW values that Cell 3 turned
    # into categories during training:
    #   hour        -> int hour of day
    #   dayofweek   -> int 0..6 with Monday=0 (pandas .dt.dayofweek)
    #   has_media   -> bool
    #   company_cat / username_cat -> the original strings
    # LightGBM re-aligns pandas categoricals to the training categories BY VALUE
    # at predict time, so sending factorize IDs (ints) for string categories, or
    # ints for day-name categories, turns every such feature into NaN.
    feature_names = ['text_len', 'word_count', 'hour', 'dayofweek', 'sentiment',
                     'has_media', 'company_cat', 'username_cat',
                     'prob_low', 'prob_medium', 'prob_high', 'prob_viral']

    feature_row = {
        'text_len': len(tweet_text),
        'word_count': len(tweet_text.split()),
        'hour': hour,
        # Unknown day names map to None, which becomes a missing category.
        'dayofweek': dayofweek_to_id.get(day),
        'sentiment': SentimentIntensityAnalyzer().polarity_scores(tweet_text)['compound'],
        'has_media': has_media == "yes",
        'company_cat': company,
        'username_cat': username,
        'prob_low': probabilities[0],
        'prob_medium': probabilities[1],
        'prob_high': probabilities[2],
        'prob_viral': probabilities[3],
    }

    # Column order must be EXACTLY as it was in training (Cell 3).
    X_pred = pd.DataFrame([feature_row], columns=feature_names)

    # Cast to categorical dtypes whose categories hold the same kind of values
    # as in training. Values outside the categories (unseen company, username,
    # bad hour/day) become NaN, which LightGBM treats as missing.
    categorical_dtypes = {
        'hour': hour_dtype,
        'dayofweek': pd.CategoricalDtype(categories=list(range(7))),
        'has_media': pd.CategoricalDtype(categories=[False, True]),
        'company_cat': company_dtype,
        'username_cat': username_dtype,
    }
    for column, dtype in categorical_dtypes.items():
        X_pred[column] = X_pred[column].astype(dtype)

    # --- 4. Get Stage 2 Prediction (Log scale) ---
    # Categorical columns are picked up from their pandas dtype;
    # `categorical_feature` is a fit-time argument, not a predict-time one.
    pred_log = regressor_model.predict(X_pred)[0]

    # --- 5. Convert to Real Likes ---
    pred_real = np.expm1(pred_log)
    pred_real = max(0, pred_real)  # Ensure no negative likes

    return pred_real

# --- Run a Final Test ---
print("\n--- Running a test 2-Stage prediction ---")
test_tweet = "SAY HIS NAME!! #CannonHinnant Some low life ghetto Thug Murdered a shot a 5 year old boy in the head!It took the mainstream media a week to talk about it! SO SAD!THIS IS WHY I SAY ALL LIVES MATTERCannon Hinnant life mattered too! Say his name <hyperlink>"
test_company = "williams" # Use the actual company name
test_day = "Friday"
test_hour = 23
test_media = "yes"
test_username = "w_terrence" # Use the actual username

# Warn (without stopping) when a test value is absent from the lookup tables
# built from the training data earlier in this cell.
if test_company not in company_to_id:
    print(f"Warning: Test company '{test_company}' not found in training data. Using placeholder ID (-1).")
if test_username not in username_to_id:
     print(f"Warning: Test username '{test_username}' not found in training data. Using placeholder ID (-1).")
if test_day not in dayofweek_to_id:
     print(f"Warning: Test day '{test_day}' not found in dayofweek mapping. Using placeholder ID (-1).")
if test_hour not in hour_to_id:
     print(f"Warning: Test hour '{test_hour}' not found in hour mapping. Using placeholder ID (-1).")


# Argument order follows the function signature:
# (tweet_text, company, username, day, hour, has_media).
final_likes_prediction = predict_likes_twostage(
    test_tweet,
    test_company,
    test_username,
    test_day,
    test_hour,
    test_media
)

print(f"\n---> Final Predicted Likes: {final_likes_prediction:.0f}")

Loading Stage 1 (Classifier) and Stage 2 (Regressor)...
Tokenizer loaded successfully.
Stage 1 Classifier loaded successfully on device: cuda
Stage 2 Regressor loaded successfully.
All models loaded successfully.
Regenerating factorizers and mappings for categorical features...
Factorizers and categorical dtypes regenerated.

--- Running a test 2-Stage prediction ---

---> Final Predicted Likes: 17663


In [None]:
# ==============================================================================
# CELL 6 (NEW, CORRECTED): FINAL 2-STAGE INFERENCE (Fixed fillna)
# ==============================================================================
import joblib
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# --- Load BOTH models AND the schema ---
print("Loading Stage 1 (Classifier), Stage 2 (Regressor), and Schema...")

# --- Model 1: Classifier (Transformer) ---
STAGE_1_PATH = "/content/drive/MyDrive/my_best_tweet_model"
BASE_MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
# NOTE(review): `device` is not assigned in this cell; it must already exist
# from an earlier cell, otherwise this line raises NameError on a fresh kernel.
classifier_model = AutoModelForSequenceClassification.from_pretrained(STAGE_1_PATH).to(device)

# --- Model 2: Regressor (LightGBM) ---
# Joblib pickle written by Cell 5 (the ".txt" suffix is only a name).
STAGE_2_PATH = "/content/drive/MyDrive/my_lgbm_regressor.txt"
regressor_model = joblib.load(STAGE_2_PATH)

# --- Schema (The Dtypes) ---
# Pickled X.dtypes Series written by Cell 5b: index = feature names in
# training order, values = dtypes (categorical ones include their categories).
SCHEMA_PATH = "/content/drive/MyDrive/my_data_schema.pkl"
train_schema = pd.read_pickle(SCHEMA_PATH)

print("All models and schema loaded successfully.")

# --- Define the full prediction pipeline ---
def predict_likes_twostage_hybrid(tweet_text, company, username, day, hour, has_media,
                                    classifier_threshold=0.90, safe_low_prediction=50.0):
    """Predict likes for one tweet, short-circuiting confidently "low" tweets.

    Stage 1 (transformer classifier) produces four class probabilities. If the
    "low" probability exceeds ``classifier_threshold`` the function returns
    ``safe_low_prediction`` directly. Otherwise the Stage 2 LightGBM regressor
    predicts log1p(likes) from the manual features plus the probabilities, and
    the result is mapped back with expm1.

    Args:
        tweet_text: Raw tweet text.
        company: Company/brand name; unseen names become a missing category.
        username: Account name; unseen names become a missing category.
        day: Day name such as "Friday" (case-insensitive for the Stage 2
            feature); an unrecognised name becomes a missing category.
        hour: Hour of day as an int in 0..23.
        has_media: "yes" when the tweet has media; anything else means no.
        classifier_threshold: "Low" probability above which Stage 2 is skipped.
        safe_low_prediction: Value returned when Stage 2 is skipped.

    Returns:
        ``safe_low_prediction`` or the Stage 2 prediction clipped at 0.
    """
    # --- 1. Create Stage 1 Input (Transformer) ---
    formatted_text = f"Brand: {company} | Day: {day} | Hour: {hour} | Media: {has_media} | Tweet: {tweet_text}"
    inputs = tokenizer(formatted_text, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)

    # --- 2. Get Stage 1 Probabilities ---
    with torch.no_grad():
        logits = classifier_model(**inputs).logits
    probabilities = torch.softmax(logits, dim=1).cpu().numpy()[0]  # 1D array of 4 probs

    # --- 3. Hybrid shortcut: trust a confident "low" classification ---
    if probabilities[0] > classifier_threshold:
        return safe_low_prediction

    # --- 4. Otherwise build the Stage 2 (LGBM) input ---
    # Training derived 'dayofweek' from the tweet's own date (.dt.dayofweek,
    # Monday=0), so the feature must come from `day`. A fixed calendar date
    # would yield the same weekday for every tweet.
    day_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday')
    day_key = str(day).strip().capitalize()
    day_index = day_names.index(day_key) if day_key in day_names else np.nan

    data = {
        'text_len': len(tweet_text),
        'word_count': len(tweet_text.split()),
        'hour': hour,
        'dayofweek': day_index,
        'sentiment': SentimentIntensityAnalyzer().polarity_scores(tweet_text)['compound'],
        # Training built this column from .notna(), i.e. boolean categories.
        'has_media': has_media == "yes",
        'company_cat': company,
        'username_cat': username,
        'prob_low': probabilities[0],
        'prob_medium': probabilities[1],
        'prob_high': probabilities[2],
        'prob_viral': probabilities[3]
    }

    # Columns are taken in the saved training order.
    X_pred = pd.DataFrame(data, index=[0], columns=train_schema.index)

    # Apply the saved dtypes. Categorical columns get the full set of training
    # categories; values outside that set become NaN, which LightGBM handles
    # as missing, so they are deliberately NOT filled.
    numerical_cols = []
    for col, dtype in train_schema.items():
        if isinstance(dtype, pd.CategoricalDtype):
            X_pred[col] = pd.Categorical(X_pred[col], categories=dtype.categories)
        else:
            X_pred[col] = X_pred[col].astype(dtype)
            numerical_cols.append(col)

    # Only the non-categorical columns are NaN-filled with 0.
    X_pred[numerical_cols] = X_pred[numerical_cols].fillna(0)

    # --- 5. Get Stage 2 Prediction (Log scale) ---
    pred_log = regressor_model.predict(X_pred)[0]

    # --- 6. Convert to Real Likes ---
    pred_real = np.expm1(pred_log)
    pred_real = max(0, pred_real)  # Ensure no negative likes

    return pred_real

# --- Run a Final Test ---
print("\n--- Running a test with the FINAL HYBRID prediction logic ---")

test_tweet = "SAY HIS NAME!! #CannonHinnant Some low life ghetto Thug Murdered a shot a 5 year old boy in the head!It took the mainstream media a week to talk about it! SO SAD!THIS IS WHY I SAY ALL LIVES MATTERCannon Hinnant life mattered too! Say his name <hyperlink>"
test_company = "williams" # Use the actual company name
test_day = "Friday"
test_hour = 23
test_media = "yes"
test_username = "w_terrence" # Use the actual username

# Argument order follows the function signature:
# (tweet_text, company, username, day, hour, has_media); the threshold and the
# safe low value keep their defaults.
final_likes_prediction = predict_likes_twostage_hybrid(
    test_tweet,
    test_company,
    test_username,
    test_day,
    test_hour,
    test_media
)

print(f"\n---> Final Predicted Likes: {final_likes_prediction:.0f}")

# --- Test a "boring" tweet ---
print("\n--- Running a test on a 'boring' tweet ---")
boring_tweet = "Our Q3 earnings are in line with expectations. See the full report here: [link]"
boring_company = "Microsoft"
boring_day = "Tuesday"
boring_hour = 8
boring_media = "no"
boring_username = "Microsoft"

boring_likes_prediction = predict_likes_twostage_hybrid(
    boring_tweet,
    boring_company,
    boring_username,
    boring_day,
    boring_hour,
    boring_media
)
print(f"\n---> Final Predicted Likes: {boring_likes_prediction:.0f}")

Loading Stage 1 (Classifier), Stage 2 (Regressor), and Schema...
All models and schema loaded successfully.

--- Running a test with the FINAL HYBRID prediction logic ---

---> Final Predicted Likes: 16673

--- Running a test on a 'boring' tweet ---

---> Final Predicted Likes: 192
