In [1]:
import numpy as np
import pandas as pd
import gzip
import json
from pprint import pprint
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from google.colab import drive
drive.mount('/content/drive')
#@title Turkish StopWords

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
turkish_stopwords = stopwords.words('turkish')
english_stopwords = stopwords.words('english')

Mounted at /content/drive


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
train_classification_df = pd.read_csv("/content/drive/MyDrive/released_dataset/train-classification.csv")
train_classification_df = train_classification_df.rename(columns={'Unnamed: 0': 'user_id', 'label': 'category'})
train_classification_df["category"] = train_classification_df["category"].apply(str.lower)
username2_category = train_classification_df.set_index("user_id").to_dict()["category"]

In [3]:
# stats about the labels
train_classification_df.groupby("category").count()

Unnamed: 0_level_0,user_id
category,Unnamed: 1_level_1
art,191
entertainment,323
fashion,299
food,511
gaming,13
health and lifestyle,503
mom and children,149
sports,113
tech,346
travel,294


In [4]:
username2_category["kravatistan"]

'fashion'

In [7]:
train_data_path = "/content/drive/MyDrive/released_dataset/training-dataset.jsonl.gz"

username2posts_train = dict()
username2profile_train = dict()

username2posts_test = dict()
username2profile_test = dict()


with gzip.open(train_data_path, "rt") as fh:
  for line in fh:
    sample = json.loads(line)

    profile = sample["profile"]
    username = profile["username"]
    if username in username2_category:
      # train data info
      username2posts_train[username] = sample["posts"]
      username2profile_train[username] = profile


    else:
      # it is test data info
      username2posts_test[username] = sample["posts"]
      username2profile_test[username] = profile


In [8]:
# Profile Dataframe
train_profile_df = pd.DataFrame(username2profile_train).T.reset_index(drop=True)
test_profile_df = pd.DataFrame(username2profile_test).T.reset_index(drop=True)

train_profile_df.head(4)

Unnamed: 0,username,id,full_name,biography,category_name,post_count,follower_count,following_count,is_business_account,is_private,...,business_category_name,overall_category_name,category_enum,is_verified_by_mv4b,is_regulated_c18,profile_pic_url,should_show_category,should_show_public_contacts,show_account_transparency_details,profile_picture_base64
0,deparmedya,3170700063,Depar Medya,#mediaplanning #mediabuying #sosyalmedya,Local business,,1167,192,True,False,...,,,LOCAL,False,False,https://instagram.fsaw2-3.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,kafesfirin,266439571,KAFES FIRIN,📍Söğütözü📍FTZ AVM\n🛒Ankara macro▲center v...,Brand,,11997,17,True,False,...,,,BRAND,False,False,https://instagram.fada1-13.fna.fbcdn.net/v/t51...,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,vimerang,2367195567,Vimerang,Dijital İletişim Yönetimi🎬info@vimerang.comq,,,2321,454,True,False,...,Creators & Celebrities,,VIDEO_CREATOR,False,False,https://instagram.fist19-1.fna.fbcdn.net/v/t51...,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,mustafa_yalcinn38,9606564254,Mustafa Yalçın,Talas Belediye Başkanı,Politician,,13647,29,True,False,...,,,POLITICIAN,False,False,https://instagram.fist1-4.fna.fbcdn.net/v/t51....,True,False,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [9]:
test_profile_df.head(2)

Unnamed: 0,username,id,full_name,biography,category_name,post_count,follower_count,following_count,is_business_account,is_private,...,business_category_name,overall_category_name,category_enum,is_verified_by_mv4b,is_regulated_c18,profile_pic_url,should_show_category,should_show_public_contacts,show_account_transparency_details,profile_picture_base64
0,beyazyakaliyiz,8634457436,Selam Beyaz Yakalı,Beyaz yakalıların dünyasına hoşgeldiniz 😀😀😀,Personal blog,,1265,665,True,False,...,,,PERSONAL_BLOG,False,False,https://instagram.fist6-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,totalenergies_istasyonlari,7066643793,TotalEnergies İstasyonları,TotalEnergies İstasyonları resmi Instagram hes...,Energy Company,,28025,4,True,False,...,,,ENERGY_COMPANY,False,False,https://instagram.fsaw2-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [27]:

from datetime import datetime


# -------------------------------
WORDS_TO_EXCLUDE = {
    "gmail", "belediye", "kasım", "eylül", "eylul", "cumhuriyet", "cumhuriyetyasinde",
    "cumhuriyetyaşında", "ahmet", "murat", "türkiye", "emre", "ali", "can",
    "esma", "turkiye", "10kasım", "ekim", "cumhuriyetyaşında",
    "cumhuriyetbayramı", "atatürk", "30ağustos", "zaferbayramı",
    "ekimcumhuriyetbayramı", "ağustoszaferbayramı", "çokyaşacumhuriyet",
    "mustafakemalatatürk", "none", "your", "istanbul", "ıstanbul",
    "beğeni", "favori", "takip", "me", "with", "for", "you", "and", "we",
    "olarak", "olan", "ben", "sen", "biz", "bir", "yeni", "of", "to", "our",
    "olsun", "ol", "the", "end", "in", "on", "at", "işbirliği", "isbirligi"
}

# -------------------------------
# 1) Preprocessing Functions
# -------------------------------
def quick_extract_emojis(text: str) -> list:
    """
    Extract emojis using a simpler regex that matches common Emoji Unicode ranges.
    Returns a list of each emoji found in `text`.
    """
    if not text:
        return []

    emoji_pattern = re.compile(
        r'[\U0001F300-\U0001F6FF\U0001F700-\U0001F77F\u2600-\u26FF]+'
    )
    found_emojis = emoji_pattern.findall(text)
    return found_emojis

def preprocess_text(text: str) -> str:
    """
    Preprocess the text by:
    - casefolding
    - removing URLs
    - removing special characters (except #,@)
    - removing digits
    - removing extra whitespace
    - removing all WORDS_TO_EXCLUDE
    (Does NOT remove emojis, because we handle them in quick_extract_emojis.)
    """
    if not text:
        return ""
    # Lowercase
    text = text.casefold()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Keep only letters, digits, spaces, #, and @ (note: emojis are removed here,
    # but we’re capturing them *before* in quick_extract_emojis).
    text = re.sub(r'[^a-zçğıöşü0-9\s#@]', '', text)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Split into tokens
    tokens = text.split()
    # Remove excluded tokens
    filtered_tokens = [t for t in tokens if t not in WORDS_TO_EXCLUDE]

    return " ".join(filtered_tokens)

def extract_tags(original_text: str) -> list:
    """
    Extract all hashtags (#word) and @mentions (e.g. @user),
    then apply preprocess_text to each extracted tag,
    and exclude anything in WORDS_TO_EXCLUDE.
    """
    if not original_text:
        return []
    # Identify hashtags (#\w+) and mentions (@\w+)
    hashtags = re.findall(r'#(\w+)', original_text)
    mentions = re.findall(r'@(\w+)', original_text)
    # Combine, lowercase
    raw_tags = [tag.lower() for tag in (hashtags + mentions)]

    # Preprocess each tag to remove digits, extra chars, etc.
    filtered_tags = []
    for tag in raw_tags:
        clean_tag = preprocess_text(tag)
        if clean_tag and clean_tag not in WORDS_TO_EXCLUDE:
            filtered_tags.append(clean_tag)

    return filtered_tags

# ------------------------------------------------------------------------------
# Suppose we already have these (from your dataset):
#   username2posts_train   (dict: username -> list of posts)
#   username2profile_train (dict: username -> user profile info)
#   username2_category     (dict: username -> category label)
#   turkish_stopwords      (list of stopwords)
# ------------------------------------------------------------------------------
train_usernames = list(username2posts_train.keys())

# -------------------------------
# 2) Build the corpora
# -------------------------------
capbio_corpus = []   # merged captions+biography (TEXT only)
fullname_corpus = []
tags_corpus = []
cat_name_corpus = []
emoji_corpus = []    # corpus of emojis (extracted separately)
avg_post_hour_list = []  # NEW: average post hour per user

for uname in train_usernames:
    posts = username2posts_train[uname]

    user_captions = []
    user_tags = []
    user_emojis = []
    total_hours = 0
    hour_count = 0

    # (A) Gather captions, emojis, tags, and extract post hours
    for post in posts:
        raw_caption = post.get("caption", "")

        # Extract timestamp and calculate post hour if available
        timestamp = post.get("timestamp")
        if timestamp:
            try:
                # Assuming timestamp is a UNIX timestamp (seconds since epoch)
                post_dt = datetime.fromtimestamp(int(timestamp))
                total_hours += post_dt.hour
                hour_count += 1
            except Exception as e:
                pass  # If conversion fails, ignore this post's timestamp

        # 1) Extract emojis from the raw caption
        found_emojis = quick_extract_emojis(raw_caption)
        if found_emojis:
            # Combine them in a single string for that user
            user_emojis.append(" ".join(found_emojis))

        # 2) Preprocess text (which does remove emojis, but we already extracted them)
        clean_caption = preprocess_text(raw_caption)
        if clean_caption:
            user_captions.append(clean_caption)

        # 3) Extract hashtags/mentions
        found_tags = extract_tags(raw_caption)
        user_tags.extend(found_tags)

    # (B) Biography
    raw_bio = username2profile_train[uname].get("biography", "")
    # Extract emojis from bio
    bio_emojis = quick_extract_emojis(raw_bio)
    if bio_emojis:
        user_emojis.append(" ".join(bio_emojis))

    # Preprocess bio
    clean_bio = preprocess_text(raw_bio)
    if clean_bio:
        user_captions.append(clean_bio)

    # Extract tags from bio
    bio_tags = extract_tags(raw_bio)
    user_tags.extend(bio_tags)

    # Merge all user_captions
    final_capbio_text = "\n".join(user_captions)
    capbio_corpus.append(final_capbio_text)

    # Merge all extracted emojis for that user
    final_emoji_text = " ".join(user_emojis) if user_emojis else ""
    emoji_corpus.append(final_emoji_text)

    # (C) Full name
    raw_fullname = username2profile_train[uname].get("full_name", "")
    clean_fullname = preprocess_text(raw_fullname)
    fullname_corpus.append(clean_fullname)

    # (D) Final tags (remove duplicates)
    final_tags_text = " ".join(list(set(user_tags)))
    tags_corpus.append(final_tags_text)

    # (E) Category Name
    raw_cat_name = username2profile_train[uname].get("category_name", "")
    clean_cat_name = preprocess_text(raw_cat_name)
    cat_name_corpus.append(clean_cat_name)

    # (F) Average Post Hour (NEW CHANNEL)
    if hour_count > 0:
        avg_hour = total_hours / hour_count
    else:
        avg_hour = 0  # default if no valid timestamps
    avg_post_hour_list.append(avg_hour)

# Build your target vector
y_train = [username2_category.get(u, "NA") for u in train_usernames]

# -------------------------------
# 3) Vectorize each text channel
# -------------------------------
capbio_vectorizer = TfidfVectorizer(
    stop_words=turkish_stopwords,
    max_features=5000,
    strip_accents=None
)
X_capbio_train = capbio_vectorizer.fit_transform(capbio_corpus)

fullname_vectorizer = TfidfVectorizer(
    stop_words=None,
    max_features=5000,
    strip_accents=None
)
X_fullname_train = fullname_vectorizer.fit_transform(fullname_corpus)

tags_vectorizer = TfidfVectorizer(
    stop_words=None,
    max_features=500,
    strip_accents=None
)
X_tags_train = tags_vectorizer.fit_transform(tags_corpus)

cat_name_vectorizer = TfidfVectorizer(
    stop_words=None,
    max_features=200,
    strip_accents=None
)
X_cat_name_train = cat_name_vectorizer.fit_transform(cat_name_corpus)

# --- EMOJI CHANNEL ---
# We can treat each emoji as a separate token if we want. Use a simple
# token_pattern to avoid splitting multi-char emojis (like keycaps, etc.).
emoji_vectorizer = TfidfVectorizer(
    token_pattern=r'\S+',  # splits on whitespace, so each emoji is a "token"
    max_features=150,     # optional limit
    strip_accents=None
)
X_emoji_train = emoji_vectorizer.fit_transform(emoji_corpus)

# -------------------------------
# 4) (Optionally) Add Boolean + Ratio + Average Post Hour Features
# -------------------------------
bool_features = []
ratios = []

for uname in train_usernames:
    user_profile = username2profile_train[uname]

    # Booleans
    is_business = user_profile.get("is_business_account", False)
    is_supervision_enabled = user_profile.get("is_supervision_enabled", False)
    is_verified = user_profile.get("is_verified", False)
    is_professional_account = user_profile.get("is_professional_account", False)

    # Follower/following
    follower_count = user_profile.get("follower_count", 0)
    following_count = user_profile.get("following_count", 0)
    ratio = float(follower_count) / (float(following_count) + 1.0)

    bool_features.append([
        1 if is_business else 0,
        1 if is_supervision_enabled else 0,
        1 if is_verified else 0,
        1 if is_professional_account else 0,
    ])
    ratios.append(ratio)

bool_features = np.array(bool_features, dtype=np.float32)
ratios = np.array(ratios, dtype=np.float32)
# Normalize ratio
ratio_min, ratio_max = ratios.min(), ratios.max()
if ratio_max == ratio_min:
    ratio_normed = np.zeros_like(ratios)
else:
    ratio_normed = (ratios - ratio_min) / (ratio_max - ratio_min)
ratio_normed = ratio_normed.reshape(-1, 1)

# NEW: Process Average Post Hour as an additional numerical feature.
avg_post_hour = np.array(avg_post_hour_list, dtype=np.float32)
# Normalize average post hour (hours are between 0 and 23)
hour_min, hour_max = avg_post_hour.min(), avg_post_hour.max()
if hour_max == hour_min:
    avg_post_hour_normed = np.zeros_like(avg_post_hour)
else:
    avg_post_hour_normed = (avg_post_hour - hour_min) / (hour_max - hour_min)
avg_post_hour_normed = avg_post_hour_normed.reshape(-1, 1)

X_bool_features = csr_matrix(bool_features)
X_ratio_features = csr_matrix(ratio_normed)
X_avg_post_hour = csr_matrix(avg_post_hour_normed)

# -------------------------------
# 5) Combine everything horizontally
# -------------------------------
X_train_combined = hstack([
    X_capbio_train,
    X_fullname_train,
    X_tags_train,
    X_cat_name_train,
    X_emoji_train,     # <--- the EMOJI channel
    X_bool_features,
    X_ratio_features,
    X_avg_post_hour   # <--- the new AVERAGE POST HOUR channel
])

# y_train remains the same

# -------------------------------
# 6) (OPTIONAL) Print top 10 terms per channel, per category
# -------------------------------
unique_categories = sorted(set(y_train))

def print_top_10_terms_per_category(X_matrix, feature_names, channel_label="CHANNEL"):
    print(f"\n========== {channel_label} ==========")
    for cat in unique_categories:
        cat_indices = [i for i, c in enumerate(y_train) if c == cat]
        if not cat_indices:
            continue

        submatrix = X_matrix[cat_indices, :]
        sum_vec = submatrix.sum(axis=0).A1
        if len(cat_indices) > 0:
            sum_vec /= len(cat_indices)

        top_10_idx = np.argsort(sum_vec)[::-1][:10]
        print(f"Category: {cat}")
        for idx in top_10_idx:
            print(f"   {feature_names[idx]:20s} : {sum_vec[idx]:.3f}")
        print("---")

# A) CAPTIONS + BIO
capbio_features = capbio_vectorizer.get_feature_names_out()
print_top_10_terms_per_category(X_capbio_train, capbio_features, "CAPTIONS + BIOGRAPHY")

# B) FULL NAME
fullname_features = fullname_vectorizer.get_feature_names_out()
print_top_10_terms_per_category(X_fullname_train, fullname_features, "FULL NAME")

# C) TAGS (#HASHTAGS + @MENTIONS)
tags_features = tags_vectorizer.get_feature_names_out()
print_top_10_terms_per_category(X_tags_train, tags_features, "TAGS")

# D) CATEGORY NAME
cat_name_features = cat_name_vectorizer.get_feature_names_out()
print_top_10_terms_per_category(X_cat_name_train, cat_name_features, "CATEGORY NAME")

# E) EMOJIS
emoji_features = emoji_vectorizer.get_feature_names_out()
print_top_10_terms_per_category(X_emoji_train, emoji_features, "EMOJIS")



Category: art
   tiyatro              : 0.034
   art                  : 0.033
   sanat                : 0.028
   iyi                  : 0.020
   design               : 0.019
   kadar                : 0.019
   istanbul             : 0.018
   biletler             : 0.018
   ilk                  : 0.018
   bi                   : 0.018
---
Category: entertainment
   biletler             : 0.023
   sahne                : 0.020
   tiyatro              : 0.020
   eğlence              : 0.020
   akşam                : 0.019
   güzel                : 0.019
   başkanımız           : 0.018
   iyi                  : 0.018
   günü                 : 0.018
   müzik                : 0.018
---
Category: fashion
   fashion              : 0.031
   kodu                 : 0.025
   moda                 : 0.023
   işbirliği            : 0.023
   ürün                 : 0.023
   reklam               : 0.022
   şık                  : 0.021
   indirim              : 0.021
   style                : 0.019
   coll

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

from sklearn.decomposition import TruncatedSVD

# 1. Split the Data
X_train, X_val, Y_train, y_val = train_test_split(X_train_combined, y_train, test_size=0.1, random_state=42)
print(f"After split:\nX_train shape: {X_train.shape}, X_val shape: {X_val.shape}")

# 2. Scale the data BEFORE applying SMOTE
scaler = StandardScaler(with_mean=False)  # for sparse matrices
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# 3. Apply SMOTE to the SCALED training set
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, Y_train)
print("Class distribution after SMOTE on training set:")
print(pd.Series(y_train_balanced).value_counts())

# 4. Apply TruncatedSVD
n_components = 2000
svd = TruncatedSVD(n_components=n_components, random_state=42)
X_train_svd = svd.fit_transform(X_train_balanced)
X_val_svd = svd.transform(X_val_scaled)

# Print explained variance ratio
cumulative_variance_ratio = np.cumsum(svd.explained_variance_ratio_)
print(f"Cumulative explained variance ratio: {cumulative_variance_ratio[-1]:.3f}")
print(f"Features after SVD: {X_train_svd.shape[1]}")



After split:
X_train shape: (2466, 9888), X_val shape: (275, 9888)
Class distribution after SMOTE on training set:
mom and children        460
food                    460
health and lifestyle    460
fashion                 460
sports                  460
entertainment           460
art                     460
travel                  460
tech                    460
gaming                  460
Name: count, dtype: int64
Cumulative explained variance ratio: 0.940
Features after SVD: 2000


In [29]:
# 5. Train Logistic Regression
model = LogisticRegression(
    random_state=42,
    max_iter=2000,
    C=1.0,
    class_weight= None,
    solver='saga'
)
model.fit(X_train_svd, y_train_balanced)

# 6. Evaluate the Model
y_pred = model.predict(X_val_svd)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.2f}")
print(classification_report(y_val, y_pred))

Validation Accuracy: 0.65
                      precision    recall  f1-score   support

                 art       0.50      0.25      0.33        20
       entertainment       0.35      0.43      0.39        28
             fashion       0.48      0.73      0.58        22
                food       0.90      0.90      0.90        51
              gaming       0.00      0.00      0.00         5
health and lifestyle       0.62      0.67      0.65        46
    mom and children       0.86      0.32      0.46        19
              sports       0.80      0.80      0.80        15
                tech       0.66      0.78      0.72        37
              travel       0.74      0.72      0.73        32

            accuracy                           0.65       275
           macro avg       0.59      0.56      0.56       275
        weighted avg       0.66      0.65      0.64       275



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [32]:
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train_combined)


from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)


from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_balanced, y_train_balanced)

print("\n[Final Model Trained on All Data]")


test_data_path = "/content/drive/MyDrive/released_dataset/test-classification-round3.dat"
print(f"\nReading official test file: {test_data_path}")


test_unames_official = []
with open(test_data_path, "rt") as fh:
    for line in fh:
        line = line.strip()
        if line and line.lower() != "screenname":
            test_unames_official.append(line)

print("\nFirst 5 test usernames from official list:", test_unames_official[:5])

# Prepare corpora and features for official test users
test_capbio_corpus = []
test_fullname_corpus = []
test_tags_corpus = []
test_cat_name_corpus = []
test_emoji_corpus = []
test_bool_features = []
test_ratios = []
test_avg_post_hours = []

for uname in test_unames_official:
    if uname in username2posts_test:
        posts = username2posts_test[uname]
        profile = username2profile_test[uname]
    elif uname in username2posts_train:
        # If username is in training data, use training data for prediction
        posts = username2posts_train[uname]
        profile = username2profile_train[uname]
    else:
        # If username is not found in either, handle accordingly (e.g., use empty data)
        print(f"Warning: Username '{uname}' not found in train or test sets.")
        posts = []
        profile = {}

    # --- (A) Gather captions, emojis, tags, and compute avg post hour ---
    user_captions = []
    user_tags = []
    user_emojis = []
    total_hours = 0
    hour_count = 0

    for post in posts:
        raw_caption = post.get("caption", "")
        # Extract timestamp and calculate post hour if available
        timestamp = post.get("timestamp")
        if timestamp:
            try:
                post_dt = datetime.fromtimestamp(int(timestamp))
                total_hours += post_dt.hour
                hour_count += 1
            except Exception as e:
                pass

        # 1) Extract emojis
        found_emojis = quick_extract_emojis(raw_caption)
        if found_emojis:
            user_emojis.append(" ".join(found_emojis))
        # 2) Preprocess text
        clean_caption = preprocess_text(raw_caption)
        if clean_caption:
            user_captions.append(clean_caption)
        # 3) Extract tags
        found_tags = extract_tags(raw_caption)
        user_tags.extend(found_tags)

    # --- (B) Biography processing ---
    raw_bio = profile.get("biography", "")
    bio_emojis = quick_extract_emojis(raw_bio)
    if bio_emojis:
        user_emojis.append(" ".join(bio_emojis))
    clean_bio = preprocess_text(raw_bio)
    if clean_bio:
        user_captions.append(clean_bio)
    bio_tags = extract_tags(raw_bio)
    user_tags.extend(bio_tags)

    # --- (C) Merge captions + biography ---
    final_capbio_text = "\n".join(user_captions)
    test_capbio_corpus.append(final_capbio_text)

    # --- (D) Emojis ---
    final_emoji_text = " ".join(user_emojis) if user_emojis else ""
    test_emoji_corpus.append(final_emoji_text)

    # --- (E) Full name ---
    raw_fullname = profile.get("full_name", "")
    clean_fullname = preprocess_text(raw_fullname)
    test_fullname_corpus.append(clean_fullname)

    # --- (F) Tags ---
    final_tags_text = " ".join(list(set(user_tags)))
    test_tags_corpus.append(final_tags_text)

    # --- (G) Category Name ---
    raw_cat_name = profile.get("category_name", "")
    clean_cat_name = preprocess_text(raw_cat_name)
    test_cat_name_corpus.append(clean_cat_name)

    # --- (H) Boolean Features ---
    is_business = profile.get("is_business_account", False)
    is_supervision_enabled = profile.get("is_supervision_enabled", False)
    is_verified = profile.get("is_verified", False)
    is_professional_account = profile.get("is_professional_account", False)
    test_bool_features.append([
        1 if is_business else 0,
        1 if is_supervision_enabled else 0,
        1 if is_verified else 0,
        1 if is_professional_account else 0,
    ])

    # --- (I) Ratio Feature ---
    follower_count = profile.get("follower_count", 0)
    following_count = profile.get("following_count", 0)
    ratio = float(follower_count) / (float(following_count) + 1.0)
    test_ratios.append(ratio)


    if hour_count > 0:
        avg_hour = total_hours / hour_count
    else:
        avg_hour = 0
    test_avg_post_hours.append(avg_hour)


test_ratios = np.array(test_ratios, dtype=np.float32)
if test_ratios.ptp() == 0:
    test_ratio_normed = np.zeros_like(test_ratios)
else:
    test_ratio_normed = (test_ratios - test_ratios.min()) / test_ratios.ptp()
test_ratio_normed = test_ratio_normed.reshape(-1, 1)


test_avg_post_hours = np.array(test_avg_post_hours, dtype=np.float32)
if test_avg_post_hours.ptp() == 0:
    test_avg_post_hours_normed = np.zeros_like(test_avg_post_hours)
else:
    test_avg_post_hours_normed = (test_avg_post_hours - test_avg_post_hours.min()) / test_avg_post_hours.ptp()
test_avg_post_hours_normed = test_avg_post_hours_normed.reshape(-1, 1)


X_capbio_test = capbio_vectorizer.transform(test_capbio_corpus)
X_fullname_test = fullname_vectorizer.transform(test_fullname_corpus)
X_tags_test = tags_vectorizer.transform(test_tags_corpus)
X_cat_name_test = cat_name_vectorizer.transform(test_cat_name_corpus)
X_emoji_test = emoji_vectorizer.transform(test_emoji_corpus)


from scipy.sparse import csr_matrix

test_bool_features = np.array(test_bool_features, dtype=np.float32)
X_bool_test = csr_matrix(test_bool_features)

X_ratio_test = csr_matrix(test_ratio_normed)
X_avg_post_hour_test = csr_matrix(test_avg_post_hours_normed)




X_test_combined = hstack([
    X_capbio_test,
    X_fullname_test,
    X_tags_test,
    X_cat_name_test,
    X_emoji_test,
    X_bool_test,
    X_ratio_test,
    X_avg_post_hour_test
])

X_test_scaled = scaler.transform(X_test_combined)

print("\nPredicting categories for official test users...")
test_pred = model.predict(X_test_scaled)

import json

output = {uname: pred for uname, pred in zip(test_unames_official, test_pred)}
output_save_path = "output_classification.json"
with open(output_save_path, "w") as of:
    json.dump(output, of, indent=4, ensure_ascii=False)

print(f"Classification predictions saved to: {output_save_path}")


[Final Model Trained on All Data]

Reading official test file: /content/drive/MyDrive/released_dataset/test-classification-round3.dat

First 5 test usernames from official list: ['livapastanesi', 'barisgross', 'tusasshop', 'etolyadigital', 'tugrulonur']

Predicting categories for official test users...
Classification predictions saved to: output_classification.json
