Alper Arda OLGUN 00031199
codes for project part 3

In [16]:
import numpy as np
import pandas as pd
import json
import gzip
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import re
from tqdm import tqdm

# --- Helper Functions ---
def preprocess_text(text: str):
    # Lower casing Turkish Text, don't use str.lower :)
    text = text.casefold()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove special characters and punctuation
    text = re.sub(r'[^a-zçğıöşü0-9\s#@]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# --- Load Data ---
train_classification_df = pd.read_csv("train-classification.csv")
train_classification_df = train_classification_df.rename(columns={'Unnamed: 0': 'user_id', 'label': 'category'})

# Unifying labels
train_classification_df["category"] = train_classification_df["category"].apply(str.lower)
username2_category = train_classification_df.set_index("user_id").to_dict()["category"]

train_data_path = "training-dataset.jsonl.gz"

username2posts_train = dict()
username2profile_train = dict()
username2posts_test = dict()
username2profile_test = dict()

with gzip.open(train_data_path, "rt") as fh:
    for line in fh:
        sample = json.loads(line)
        profile = sample["profile"]
        username = profile["username"]
        if username in username2_category:
            username2posts_train[username] = sample["posts"]
            username2profile_train[username] = profile
        else:
            username2posts_test[username] = sample["posts"]
            username2profile_test[username] = profile

# Profile Dataframe
train_profile_df = pd.DataFrame(username2profile_train).T.reset_index(drop=True)
test_profile_df = pd.DataFrame(username2profile_test).T.reset_index(drop=True)

# --- Preprocessing Text Data ---
corpus = []
train_usernames = []

for username, posts in username2posts_train.items():
    train_usernames.append(username)
    cleaned_captions = []
    for post in posts:
        post_caption = post.get("caption", "")
        if post_caption is None:
            continue
        post_caption = preprocess_text(post_caption)
        if post_caption != "":
            cleaned_captions.append(post_caption)
    user_post_captions = "\n".join(cleaned_captions)
    corpus.append(user_post_captions)

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
vectorizer.fit(corpus)

x_post_train = vectorizer.transform(corpus)
y_train = [username2_category.get(uname, "NA") for uname in train_usernames]

# --- Prepare Test Data ---
test_usernames = []
test_corpus = []
for username, posts in username2posts_test.items():
    test_usernames.append(username)
    cleaned_captions = []
    for post in posts:
        post_caption = post.get("caption", "")
        if post_caption is None:
            continue
        post_caption = preprocess_text(post_caption)
        if post_caption != "":
            cleaned_captions.append(post_caption)
    user_post_captions = "\n".join(cleaned_captions)
    test_corpus.append(user_post_captions)

x_post_test = vectorizer.transform(test_corpus)

# --- Classification Model ---
# Train-test split for classification
X_train, X_val, y_train, y_val = train_test_split(x_post_train, y_train, test_size=0.2, stratify=y_train)

# Use LightGBM Classifier without early stopping
classifier = lgb.LGBMClassifier(n_estimators=1000, learning_rate=0.01, max_depth=10)
classifier.fit(X_train, y_train)  # Removed eval_set and eval_metric parameters

# --- Regression Model ---
# Predict like count using average of previous posts
def predict_like_count(username, current_post=None):
    def get_avg_like_count(posts):
        total = 0.
        for post in posts:
            if current_post is not None and post["id"] == current_post["id"]:
                continue
            like_count = post.get("like_count", 0)
            if like_count is None:  # Handle the case where like_count is None
                like_count = 0
            total += like_count
        if len(posts) == 0:
            return 0.
        return total / len(posts)
    
    if username in username2posts_train:
        return get_avg_like_count(username2posts_train[username])
    elif username in username2posts_test:
        return get_avg_like_count(username2posts_test[username])
    else:
        return -1

# --- Generate Prediction for Classification & Regression ---
# Classification Predictions
test_classification_path = "test-classification-round3.dat"
test_regression_path = "test-regression-round3.jsonl"

# Read test usernames
test_unames = []
with open(test_classification_path, "rt") as fh:
    for line in fh:
        test_unames.append(line.strip())

# Predict classification
test_pred_class = classifier.predict(x_post_test)

output_classification = dict()
for index, uname in enumerate(test_unames):
    output_classification[uname] = test_pred_class[index]

# Save classification output
with open("prediction-classification-round3.json", "w") as f:
    json.dump(output_classification, f, indent=4)

# Regression Predictions
output_regression = {}
with open(test_regression_path, "rt") as fh:
    for line in fh:
        sample = json.loads(line)
        pred_val = predict_like_count(sample["username"])
        output_regression[sample["username"]] = int(pred_val)  # Store as integer

# Save regression output in the required format
with open("prediction-regression-round3.json", "w") as f:
    json.dump(output_regression, f, indent=4)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064105 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 166139
[LightGBM] [Info] Number of data points in the train set: 2192, number of used features: 4506
[LightGBM] [Info] Start training from score -2.662132
[LightGBM] [Info] Start training from score -2.139610
[LightGBM] [Info] Start training from score -2.216106
[LightGBM] [Info] Start training from score -1.678854
[LightGBM] [Info] Start training from score -5.389985
[LightGBM] [Info] Start training from score -1.696118
[LightGBM] [Info] Start training from score -2.913446
[LightGBM] [Info] Start training from score -3.192760
[LightGBM] [Info] Start training from score -2.068552
[LightGBM] [Info] Start training from score -2.232984
