In [4]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import gzip
import json
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    LabelEncoder,
    StandardScaler
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from imblearn.combine import SMOTEENN

# Download stopwords
nltk.download('stopwords')
turkish_stopwords = stopwords.words('turkish')

# -----------------------------
# 1. Load Your Data
# -----------------------------
df1 = pd.read_csv("annotated_users_4.csv")

df2 = pd.read_csv("annotated_users_1.csv")
df3 = pd.read_csv("annotated_users_2.csv")
df4 = pd.read_csv("annotated_users_3.csv")

frames = [df2, df3, df4]
df = pd.concat(frames, ignore_index=True)

columns_to_drop = ['url', 'influencerMention', 'accountType']

def clean_data(df):
    return df.drop(columns=columns_to_drop)

df = clean_data(df)

frames = [df, df1]
final_df = pd.concat(frames, ignore_index=True)

# Remove duplicates and NaN
final_df.drop_duplicates(inplace=True)
final_df.dropna(inplace=True)

# Capitalize influencerCategory
final_df['influencerCategory'] = final_df['influencerCategory'].str.capitalize()

# Drop certain classes
classes_to_drop = ['Gaming']
final_df = final_df[~final_df['influencerCategory'].isin(classes_to_drop)]

print(f'The cleaned data has {final_df.shape[0]} rows and {final_df.shape[1]} columns')

# -----------------------------
# 2. Define Username Cleaning
# -----------------------------
def clean_username(username):
    """
    Comprehensive username cleaning function that:
    1. Removes numbers
    2. Removes punctuation
    3. Removes stop words (English)
    4. Removes special characters
    5. Replaces common separators with space
    """
    # Convert to lowercase
    username = str(username).lower()

    # Replace common separators with space
    username = re.sub(r'[._-]', ' ', username)

    # Remove numbers
    username = re.sub(r'\d+', '', username)

    # Remove punctuation
    username = username.translate(str.maketrans('', '', string.punctuation))

    # Split into words
    words = username.split()

    # Remove English stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Remove very short terms (likely not meaningful)
    words = [word for word in words if len(word) > 2]

    # Join words back together
    clean_u = ' '.join(words)

    # Remove any remaining special characters
    clean_u = re.sub(r'[^a-zA-Z\s]', '', clean_u)

    return clean_u.strip()

def preprocess_data(df):
    """
    Preprocess the entire dataset:
     - Clean usernames
     - Filter out empty processed usernames
    """
    processed_df = df.copy()
    processed_df['processed_username'] = processed_df['username'].apply(clean_username)
    processed_df = processed_df[processed_df['processed_username'].str.len() > 0].reset_index(drop=True)
    return processed_df

new_df = preprocess_data(final_df)

# Keep only the processed_username and influencerCategory in that order
new_df = new_df.reindex(columns=['processed_username', 'influencerCategory'])

print("\nProcessed DataFrame:")
print(new_df.head())

# -----------------------------
# 3. TF-IDF Vectorizer and Encoding
# -----------------------------
tfidf = TfidfVectorizer(
    analyzer='char_wb', 
    ngram_range=(2, 5),
    min_df=3,  
    max_df=0.9,  
    sublinear_tf=True,  
    max_features=10000, 
    stop_words=turkish_stopwords  
)

X = tfidf.fit_transform(new_df['processed_username'])
print(f"\nTF-IDF Matrix Shape: {X.shape}")  

# Scale
scaler = StandardScaler(with_mean=False)
X = scaler.fit_transform(X)

# Label encode influencerCategory
le = LabelEncoder()
y = le.fit_transform(new_df['influencerCategory'])

# -----------------------------
# 4. Handle Imbalance + Split
# -----------------------------
smote = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, 
    y_resampled, 
    train_size=0.8, 
    stratify=y_resampled, 
    random_state=2022
)

# -----------------------------
# 5. Train Multiple Classifiers
# -----------------------------
classifiers = {
    'Random Forest': RandomForestClassifier(
        n_estimators=200,
        max_depth=20,
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=42
    ),
    'SVM': SVC(kernel='linear', random_state=42),
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

results = {}
best_classifier_name = None
best_accuracy = 0
best_model = None

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    results[name] = {
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
        'report': classification_report(y_test, y_test_pred, target_names=le.classes_)
    }
    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        best_classifier_name = name
        best_model = clf

print("\nClassification Results:\n")
for name, result in results.items():
    print(f"\n{name}:")
    print(f"Training Accuracy: {result['train_accuracy']:.4f}")
    print(f"Testing Accuracy: {result['test_accuracy']:.4f}")
    print("\nDetailed Classification Report:")
    print(result['report'])

print(f"\nBest Classifier: {best_classifier_name} with accuracy: {best_accuracy:.4f}")

# -----------------------------
# 6. Save Predictions to JSON (Internal Test Split) - Removed to Avoid Errors
# -----------------------------
# The following section has been removed to prevent the "positional indexers are out-of-bounds" error.
# It attempted to map predictions back to usernames using incorrect indices.

# If you still need to map predictions to usernames for the internal test split,
# consider alternative approaches such as tracking indices before resampling.

# -----------------------------
# 7. Read the New Test File and Predict
# -----------------------------
def predict_and_save_new_data(test_file, model, tfidf_vectorizer, data_scaler, label_encoder, output_file="outputclassification.json"):
    """
    Reads usernames from `test_file`, cleans them, 
    transforms them using the fitted pipeline, 
    predicts categories, and saves to JSON.
    """
    # Read lines from the test file
    with open(test_file, 'r', encoding='utf-8') as f:
        test_usernames = f.read().splitlines()
    
    # Clean each username
    cleaned_test_usernames = [clean_username(u) for u in test_usernames]
    
    # Transform with existing TF-IDF and scaler
    X_test_transformed = tfidf_vectorizer.transform(cleaned_test_usernames)
    X_test_transformed = data_scaler.transform(X_test_transformed)
    
    # Predict
    y_preds = model.predict(X_test_transformed)
    # Convert numeric predictions back to category names
    category_preds = label_encoder.inverse_transform(y_preds)
    
    # Build dictionary {original_username: predicted_category}
    results_dict = {}
    for original, pred in zip(test_usernames, category_preds):
        results_dict[original] = pred
    
    # Save to JSON
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(results_dict, json_file, indent=4, ensure_ascii=False)
    
    print(f"Predictions have been saved to {output_file}")
    return results_dict

# Use the best model found
predict_and_save_new_data(
    test_file="test-classification-round3.dat",
    model=best_model,
    tfidf_vectorizer=tfidf,
    data_scaler=scaler,
    label_encoder=le,
    output_file="outputclassification.json"
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aminzaka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


The cleaned data has 2977 rows and 2 columns

Processed DataFrame:
    processed_username influencerCategory
0       ozlem ozdemirr      Entertainment
1        caglaralmendi            Fashion
2  modabelliniofficial            Fashion
3            bkmonline      Entertainment
4         minikkusblog   Mom and children

TF-IDF Matrix Shape: (2973, 7981)

Classification Results:


Random Forest:
Training Accuracy: 0.9194
Testing Accuracy: 0.8295

Detailed Classification Report:
                      precision    recall  f1-score   support

                 Art       0.98      0.81      0.89        77
       Entertainment       0.80      0.83      0.81        29
             Fashion       0.68      0.84      0.75        55
                Food       0.53      0.71      0.61        14
Health and lifestyle       1.00      0.75      0.86         4
    Mom and children       0.97      0.84      0.90        92
              Sports       0.95      0.82      0.88        98
                Tech   

{'livapastanesi': 'Health and lifestyle',
 'barisgross': 'Travel',
 'tusasshop': 'Fashion',
 'etolyadigital': 'Tech',
 'tugrulonur': 'Travel',
 'tulugozlu': 'Art',
 'gokidy': 'Mom and children',
 'cengizgumus_official': 'Entertainment',
 'krossbisiklet': 'Tech',
 'haribochamallows': 'Art',
 'ozatashipyard': 'Art',
 'yenisafak': 'Sports',
 'iamsiddeshjadhav': 'Art',
 'burcinterzioglu': 'Fashion',
 'steakhousegunaydin': 'Travel',
 'baselifeclub': 'Travel',
 'benismailyildirimm': 'Tech',
 'imuneksfarma': 'Tech',
 'dogakoyucatalca': 'Tech',
 'sena.sener': 'Fashion',
 'kandilliborsarestaurant': 'Food',
 'selamiersoyy': 'Sports',
 'deutz_fahr_turkey': 'Tech',
 'cevaheer': 'Fashion',
 'tezatsanat': 'Art',
 'filtresizcom': 'Fashion',
 'palomamarina_suites': 'Travel',
 'westchocolatemarina': 'Travel',
 'sebnemcapa': 'Travel',
 'rozetsepeti': 'Entertainment',
 'ececesmioglu': 'Fashion',
 'ustapidecitr': 'Travel',
 'gocaagonyali': 'Sports',
 'maestro.sanat.kursu': 'Art',
 'oztayteksofficial': 'Fa