# Like Count Prediction


In [49]:
import re
import json
import gzip
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from google.colab import drive
import xgboost as xgb

# Mount Google Drive into the Colab filesystem; the dataset paths below
# live under the mount point.
drive.mount('/content/drive')

# ================================
# STEP 0: Dataset locations and per-user lookup tables
# ================================
# Filled by the loading step: username -> list of posts / profile dict.
username2profile_train = dict()
username2posts_train = dict()

test_data_path = "/content/drive/MyDrive/released_dataset/test-regression-round3.jsonl"
train_data_path = "/content/drive/MyDrive/released_dataset/training-dataset.jsonl.gz"

# ================================
# STEP 1: Load Data
# ================================
# The training file is gzip-compressed JSON Lines: one JSON object per line,
# each carrying a "profile" dict and that user's "posts".
with gzip.open(train_data_path, "rt", encoding="utf-8") as fh:
    for sample in map(json.loads, fh):
        profile = sample["profile"]
        username = profile["username"]
        # A username seen again later in the file overwrites the earlier entry.
        username2profile_train[username] = profile
        username2posts_train[username] = sample["posts"]

print("Finished loading training dataset.")

# --- Define helper function for word count ---
def word_count(caption):
    """Return the number of whitespace-separated tokens in ``caption``."""
    tokens = caption.split()
    return len(tokens)

# ================================
# STEP 2: Calculate average like_count, comment_count and word count per user
# ================================
def _mean_or_zero(per_user_values):
    """Return the mean of the dict's values as a float, or 0.0 if it is empty."""
    return float(np.mean(list(per_user_values.values()))) if per_user_values else 0.0


# username -> mean like count / comment count / caption word count per post.
user2avg_likes_train = {}
user2avg_comments_train = {}
user2avg_wordcount_train = {}
for uname, posts in username2posts_train.items():
    likes_sum = 0.0
    comments_sum = 0.0
    words_sum = 0.0
    n_posts = 0
    for post in posts:
        # Missing or null fields count as 0 / empty caption.
        likes = post.get("like_count", 0) or 0
        # Only like counts are clamped at zero; comment counts are summed as-is.
        likes_sum += 0.0 if likes < 0 else likes
        comments_sum += post.get("comments_count", 0) or 0
        words_sum += word_count(post.get("caption", "") or "")
        n_posts += 1

    # Users with no posts get 0.0 for every average.
    if n_posts > 0:
        user2avg_likes_train[uname] = likes_sum / n_posts
        user2avg_wordcount_train[uname] = words_sum / n_posts
        user2avg_comments_train[uname] = comments_sum / n_posts
    else:
        user2avg_likes_train[uname] = 0.0
        user2avg_wordcount_train[uname] = 0.0
        user2avg_comments_train[uname] = 0.0

# Unweighted means of the per-user averages; used as fallbacks for unknown users.
global_avg_likes = _mean_or_zero(user2avg_likes_train)
global_avg_comment = _mean_or_zero(user2avg_comments_train)
global_avg_word_count = _mean_or_zero(user2avg_wordcount_train)

# ================================
# Helper Functions for Feature Extraction
# ================================
def count_hashtags(caption):
    """Return how many ``#`` + word-character runs occur in ``caption``."""
    hashtag_matches = re.finditer(r"#\w+", caption)
    return sum(1 for _ in hashtag_matches)

def count_mentions(caption):
    """Return how many ``@`` + word-character runs occur in ``caption``.

    Any ``@`` followed by a word character counts, so the domain part of an
    e-mail address is counted as well.
    """
    mention_matches = re.finditer(r"@\w+", caption)
    return sum(1 for _ in mention_matches)

def count_emojis(caption):
    """Return the number of characters in ``caption`` within U+1F600-U+1F64F.

    Only that single code-point range is matched; characters outside it are
    not counted.
    """
    return sum(
        1
        for _ in re.finditer("[\U0001F600-\U0001F64F]", caption, flags=re.UNICODE)
    )

def count_urls(caption):
    """Return how many ``http://`` or ``https://`` URLs occur in ``caption``."""
    url_matches = re.finditer(r"http[s]?://\S+", caption)
    return sum(1 for _ in url_matches)

def punctuation_count(caption):
    """Return how many ``!``, ``?`` and ``.`` characters ``caption`` contains."""
    mark_matches = re.finditer(r"[!?.]", caption)
    return sum(1 for _ in mark_matches)

def text_length(caption):
    """Return the length of ``caption`` in characters."""
    n_chars = len(caption)
    return n_chars

def word_count(caption):
    """Return the number of whitespace-separated tokens in ``caption``."""
    # NOTE: this re-declares the ``word_count`` helper defined earlier in the
    # file; both definitions compute the same value.
    return sum(1 for _ in caption.split())

# ================================
# STEP 3: Build the post-level training rows with extra features
# ================================
# One dict per training post; turned into a DataFrame in the next step.
train_rows = []

for uname, posts in username2posts_train.items():
    # Profile- and user-level values are the same for every post of a user,
    # so they are resolved once per user.
    profile = username2profile_train.get(uname, {})
    follower_count = profile.get("follower_count", 0) or 0
    following_count = profile.get("following_count", 0) or 0
    user_avg_likes = user2avg_likes_train.get(uname, global_avg_likes)
    user_avg_comments = user2avg_comments_train.get(uname, global_avg_comment)
    user_avg_word_count = user2avg_wordcount_train.get(uname, global_avg_word_count)

    # Boolean profile flags as 0/1.  ``or False`` maps a missing or null flag
    # to 0 instead of letting ``int(None)`` raise.
    is_business = int(profile.get("is_business_account") or False)
    is_supervision_enabled = int(profile.get("is_supervision_enabled") or False)
    is_verified = int(profile.get("is_verified") or False)
    is_professional_account = int(profile.get("is_professional_account") or False)

    # Interaction feature: follower count, zeroed for non-business accounts.
    follower_business = follower_count * is_business

    for post in posts:
        # Target: missing/null -> 0, negatives clamped to 0, and a resulting
        # 0 is stored as 0.01 (presumably to keep the target strictly
        # positive -- TODO confirm the intent).
        like_count = post.get("like_count", 0) or 0
        if like_count <= 0:
            like_count = 0.01

        comments_count = post.get("comments_count", 0) or 0
        pid = post.get("id", 0) or 0  # unique ID
        caption = post.get("caption", "") or ""

        # Time features stay NaN when the timestamp is missing or does not
        # match the expected "%Y-%m-%d %H:%M:%S" layout.
        post_hour = np.nan
        post_dayofweek = np.nan
        post_month = np.nan
        timestamp_str = post.get("timestamp", None)
        if timestamp_str:
            try:
                timestamp = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S")
            except (ValueError, TypeError):
                # ValueError: wrong layout; TypeError: value is not a string.
                pass
            else:
                post_hour = timestamp.hour
                post_dayofweek = timestamp.weekday()  # Monday=0, Sunday=6
                post_month = timestamp.month

        # Caption-derived features.
        num_hashtags = count_hashtags(caption)
        num_mentions = count_mentions(caption)
        num_emojis = count_emojis(caption)
        num_urls = count_urls(caption)
        num_punctuations = punctuation_count(caption)
        caption_length = text_length(caption)
        caption_word_count = word_count(caption)

        # How far this post's comment count is below the user's average.
        comment_diff = user_avg_comments - comments_count

        # r is the post's comment count relative to the user's average
        # (0 when the user has no comments on average).  (r - 1) / (r + 1)
        # maps r = 0 to -1, r = 1 to 0, and approaches +1 as r grows.
        if user_avg_comments > 0:
            r = comments_count / user_avg_comments
        else:
            r = 0.0
        norm_comment_ratio = (r - 1) / (r + 1)

        row_dict = {
            "id": pid,
            "username": uname,
            "like_count": like_count,
            "user_avg_likes": user_avg_likes,
            "user_avg_comments": user_avg_comments,
            "comment_count": comments_count,
            "comment_diff": comment_diff,
            "norm_comment_ratio": norm_comment_ratio,
            "following_count": following_count,
            "follower_count": follower_count,
            "follower_business": follower_business,
            "is_business": is_business,
            "is_verified": is_verified,
            "is_professional_account": is_professional_account,
            "is_supervision_enabled": is_supervision_enabled,
            "num_hashtags": num_hashtags,
            "num_mentions": num_mentions,
            "num_emojis": num_emojis,
            "num_urls": num_urls,
            "num_punctuations": num_punctuations,
            "caption_length": caption_length,
            "caption_word_count": caption_word_count,
            "user_avg_word_count": user_avg_word_count,
            "post_hour": post_hour,
            "post_dayofweek": post_dayofweek,
            "post_month": post_month,
        }

        train_rows.append(row_dict)

# Debug aid: dump every training row to a local JSON file.  Values the json
# module cannot serialize natively are written via str().
output_json_path = "train_rows.json"
with open(output_json_path, "w", encoding="utf-8") as debug_out:
    json.dump(
        train_rows,
        debug_out,
        indent=4,
        default=str,
    )
print(f"train_rows saved to {output_json_path}.")

# ================================
# STEP 4: Load DataFrame and Prepare Features/Target
# ================================
# One DataFrame row per training post.
df_reg_train = pd.DataFrame(train_rows)
print("Post-level train DataFrame shape:", df_reg_train.shape)
print(df_reg_train.head(5))

# Regression target and the subset of columns fed to the model.  The order of
# feature_cols defines the column order of every feature matrix built from it.
target_col = "like_count"
feature_cols = [
    "user_avg_likes",
    "following_count",
    "follower_count",
    "num_hashtags",
    "is_business",
    "is_verified",
    "user_avg_comments",
    "comment_count",
    "norm_comment_ratio",
    "post_dayofweek",
]

# Full (unsplit, unscaled) feature matrix and target vector.
X_all = df_reg_train.loc[:, feature_cols].values
y_all = df_reg_train.loc[:, target_col].values

# ================================
# STEP 5: Split Overall Data into Training and Validation Sets
# (Select exactly 3000 random samples for the validation set)
# ================================
np.random.seed(42)  # for reproducibility
validation_size = 3000
total_samples = df_reg_train.shape[0]

# Draw the validation rows without replacement; every remaining row position
# (kept in ascending order) goes to the training split.
all_indices = np.arange(total_samples)
val_indices = np.random.choice(all_indices, size=validation_size, replace=False)
is_train_row = np.ones(total_samples, dtype=bool)
is_train_row[val_indices] = False
train_indices = all_indices[is_train_row]

df_train = df_reg_train.iloc[train_indices].copy()
# The validation frame is re-indexed 0..N-1 so positions in the prediction
# array line up with its index labels.
df_val = df_reg_train.iloc[val_indices].copy().reset_index(drop=True)

print(f"Overall Training set: {df_train.shape[0]} samples")
print(f"Overall Validation set: {df_val.shape[0]} samples (should be exactly 3000)")

# ================================
# STEP 6: Normalize the Data and Train the Model
# ================================
# Feature matrices and target as float32 arrays.
X_train = df_train[feature_cols].values.astype(np.float32)
X_val = df_val[feature_cols].values.astype(np.float32)
y_train = df_train[target_col].values.astype(np.float32)

# Scaling statistics come from the training split only and are then applied
# unchanged to the validation split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Random forest used for the validation run.
model = RandomForestRegressor(n_estimators=50, max_depth=100, random_state=42)
model.fit(X_train_scaled, y_train)




# ================================
# STEP 7: Evaluate on the Validation Set and Adjust Predictions
# ================================
y_val_pred = model.predict(X_val_scaled)

# Predictions are currently used as-is.  Any custom post-prediction
# adjustment should be applied to this list; position i corresponds to row i
# of df_val.
adjusted_preds = list(y_val_pred)

validation_results = pd.DataFrame({
    'Actual': df_val[target_col],
    'Predicted': adjusted_preds
})
print("\n=== Actual vs Predicted Like Counts (first 100 rows) ===")
print(validation_results.head(100))

# Score: sum over validation rows of log10(|actual - predicted| + epsilon).
# epsilon keeps log10 finite when a prediction is exact.
epsilon = 1e-10
individual_errors = np.log10(np.abs(df_val[target_col] - np.array(adjusted_preds)) + epsilon)
sum_log_errors = np.sum(individual_errors)
print(f"\nSum of log10(individual absolute errors) = {sum_log_errors:.4f}")

# ================================
# STEP 8: Save the Predictions to CSV
# ================================
# Attach the predictions to the validation frame and write every column
# (without the index) to a CSV in the working directory.
df_val["predicted_like_count"] = adjusted_preds
output_csv_path = "predicted_like_counts_validation.csv"
df_val.to_csv(
    output_csv_path,
    index=False,
)
print(f"Validation predictions saved to {output_csv_path}.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Finished loading training dataset.
train_rows saved to train_rows.json.
Post-level train DataFrame shape: (187302, 26)
                  id    username  like_count  user_avg_likes  \
0  17990918969458720  deparmedya         6.0       11.542857   
1  18219250732221045  deparmedya        22.0       11.542857   
2  18311380465102328  deparmedya        19.0       11.542857   
3  18089518138361507  deparmedya        19.0       11.542857   
4  18012743929758497  deparmedya        21.0       11.542857   

   user_avg_comments  comment_count  comment_diff  norm_comment_ratio  \
0           0.342857              0      0.342857           -1.000000   
1           0.342857              1     -0.657143            0.489362   
2           0.342857              0      0.342857           -1.000000   
3           0.342857              1     -0.657143            0.489362   
4 

In [50]:
print("\nRetraining on the entire dataset for final inference...")
# The final model is fit on all training posts using the unscaled X_all
# matrix, with 100 trees and no explicit depth limit; these settings differ
# from the model evaluated on the validation split.
final_model = RandomForestRegressor(random_state=42, n_estimators=100)
final_model.fit(X_all, y_all)

# ================================
# STEP 10: Generate Predictions on Official Test Set Using the Same Feature Extraction
# ================================
print("\n=== STEP 10: Generate Predictions on Official Test Set ===")

# Destination file and the post-id -> predicted like count mapping that the
# prediction loop fills in.
output_improved_path = "prediction-regression-improved.json"
predictions_improved = {}

# 1) Load the test data: JSON Lines, one post per line.
with open(test_data_path, "r", encoding="utf-8") as test_f:
    official_test_posts = [json.loads(line) for line in test_f]


# Feature vectors are collected for every test post first and scored with a
# single predict() call, instead of calling the forest once per post.
test_post_ids = []
test_feature_rows = []

for sample in official_test_posts:
    post_id = sample.get("id", "")
    username = sample.get("username", "")

    # Users not present in the training data get an empty profile (profile
    # features default to 0) and the global average likes/comments.
    profile = username2profile_train.get(username, {})

    user_avg_likes = user2avg_likes_train.get(username, global_avg_likes)
    user_avg_comments = user2avg_comments_train.get(username, global_avg_comment)
    following_count = profile.get("following_count", 0) or 0
    follower_count = profile.get("follower_count", 0) or 0
    # ``or False`` maps a missing or null flag to 0 instead of letting
    # ``int(None)`` raise.
    is_business = int(profile.get("is_business_account") or False)
    is_verified = int(profile.get("is_verified") or False)

    # Post-level data
    caption = sample.get("caption", "") or ""
    comments_count = sample.get("comments_count", 0) or 0

    # Extract text feature: number of hashtags
    num_hashtags = count_hashtags(caption)

    # Normalized comment ratio, same formula as in the training rows:
    # r = comments / user average (0 if the average is 0), mapped through
    # (r - 1) / (r + 1).
    if user_avg_comments > 0:
        r = comments_count / user_avg_comments
    else:
        r = 0.0
    norm_comment_ratio = (r - 1) / (r + 1)

    # Day of week stays NaN when the timestamp is missing or does not match
    # the expected "%Y-%m-%d %H:%M:%S" layout.
    post_dayofweek = np.nan
    timestamp_str = sample.get("timestamp", None)
    if timestamp_str:
        try:
            timestamp = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S")
        except (ValueError, TypeError):
            # ValueError: wrong layout; TypeError: value is not a string.
            pass
        else:
            post_dayofweek = timestamp.weekday()  # Monday=0, Sunday=6

    # Feature vector; the order must match the feature_cols list used to
    # build the training matrix.
    row_feat = [
        user_avg_likes,
        following_count,
        follower_count,
        num_hashtags,
        is_business,
        is_verified,
        user_avg_comments,
        comments_count,
        norm_comment_ratio,
        post_dayofweek,
    ]

    test_post_ids.append(post_id)
    test_feature_rows.append(row_feat)

# predict() rejects an empty input, so it is skipped when there are no posts.
if test_feature_rows:
    raw_predictions = final_model.predict(test_feature_rows)
    for post_id, pred_like in zip(test_post_ids, raw_predictions):
        # Truncate to an integer like count and never report a negative one.
        # If a post id repeats, the later post's prediction is kept.
        predictions_improved[post_id] = max(int(pred_like), 0)


# Write the post-id -> predicted like count mapping as indented JSON.
with open(output_improved_path, "w", encoding='utf-8') as predictions_out:
    json.dump(
        predictions_improved,
        predictions_out,
        indent=4,
    )

print(f"Saved improved predictions to: {output_improved_path}")
print("=== End of Pipeline ===")


Retraining on the entire dataset for final inference...

=== STEP 10: Generate Predictions on Official Test Set ===
Saved improved predictions to: prediction-regression-improved.json
=== End of Pipeline ===
