In [61]:
import pandas as pd
import json
import gzip

def load_jsonl(file_path):
    """
    Load a .jsonl or .jsonl.gz file into a Pandas DataFrame.
    Handles both compressed and uncompressed formats.
    Expands nested dictionaries into separate columns.
    """
    data = []
    if file_path.endswith('.gz'):
        with gzip.open(file_path, 'rt', encoding='utf-8') as file:
            for line in file:
                data.append(json.loads(line))
    else:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                data.append(json.loads(line))
    df = pd.DataFrame(data)

    # Flatten nested dictionaries into separate columns
    for column in df.columns:
        if isinstance(df[column].iloc[0], dict):  # Check if column contains dictionaries
            nested_df = pd.json_normalize(df[column])
            nested_df.columns = [f"{column}.{subcol}" for subcol in nested_df.columns]
            df = pd.concat([df.drop(columns=[column]), nested_df], axis=1)

    return df

# File paths
training_dataset_path = "/Users/borasacir/Documents/projects/CS-412-Project/project-deliverables/training-dataset.jsonl.gz"  
train_classification_path = "/Users/borasacir/Documents/projects/CS-412-Project/project-deliverables/train-classification.csv" 

# Load datasets
try:
    training_data = load_jsonl(training_dataset_path)
    # Remove 'profile.' prefix from column names
    training_data.columns = training_data.columns.str.replace('^profile\\.', '', regex=True)
    print("Updated column names in training_data:", training_data.columns)
    
    classification_labels = pd.read_csv(train_classification_path)
    classification_labels.columns = ["username"] + classification_labels.columns[1:].tolist()
except Exception as e:
    print("Error loading datasets:", e)
    raise

# Normalize username columns
training_data['username'] = training_data['username'].astype(str).str.strip().str.lower()
classification_labels['username'] = classification_labels['username'].astype(str).str.strip().str.lower()

# Debugging Step 1: Check unique usernames in each dataset
print("Unique usernames in training_data:", training_data['username'].nunique())
print("Unique usernames in classification_labels:", classification_labels['username'].nunique())

# Debugging Step 2: Check samples of usernames
print("Sample usernames from training_data:", training_data['username'].head())
print("Sample usernames from classification_labels:", classification_labels['username'].head())

# Debugging Step 3: Identify mismatched usernames
missing_in_training = set(classification_labels['username']) - set(training_data['username'])
missing_in_labels = set(training_data['username']) - set(classification_labels['username'])

print("Usernames in classification_labels but not in training_data:", list(missing_in_training)[:10])
print("Usernames in training_data but not in classification_labels:", list(missing_in_labels)[:10])

# Merge datasets
try:
    merged_data = training_data.merge(classification_labels, on="username", how="inner")
    print(f"Number of rows after merge: {len(merged_data)}")
    if len(merged_data) == 0:
        print("No matching usernames found. Please check the mismatched usernames above.")
    else:
        print("Merged data preview:")
        print(merged_data.head())
except Exception as e:
    print("Error during merge:", e)
    raise

# Check missing values per column
print("Missing values per column in the merged dataset:")
print(merged_data.isnull().sum())

# Drop rows only if essential columns are missing
essential_columns = ['username', 'label']  # Adjust as needed
merged_data = merged_data.dropna(subset=essential_columns)

# Fill missing values in non-essential columns
non_essential_columns = [col for col in merged_data.columns if col not in essential_columns]
merged_data[non_essential_columns] = merged_data[non_essential_columns].fillna("Unknown")

# Final preview of cleaned data
print(f"Number of rows after cleaning: {len(merged_data)}")
print("Cleaned data preview:")
print(merged_data.head())

Updated column names in training_data: Index(['posts', 'username', 'id', 'full_name', 'biography', 'category_name',
       'post_count', 'follower_count', 'following_count',
       'is_business_account', 'is_private', 'is_verified',
       'highlight_reel_count', 'bio_links', 'entities', 'ai_agent_type',
       'fb_profile_biolink', 'restricted_by_viewer', 'country_block',
       'eimu_id', 'external_url', 'fbid', 'has_clips',
       'hide_like_and_view_counts', 'is_professional_account',
       'is_supervision_enabled', 'is_guardian_of_viewer',
       'is_supervised_by_viewer', 'is_supervised_user', 'is_embeds_disabled',
       'is_joined_recently', 'business_address_json',
       'business_contact_method', 'business_email', 'business_phone_number',
       'business_category_name', 'overall_category_name', 'category_enum',
       'is_verified_by_mv4b', 'is_regulated_c18', 'profile_pic_url',
       'should_show_category', 'should_show_public_contacts',
       'show_account_transparency

In [62]:
training_data.head()

Unnamed: 0,posts,username,id,full_name,biography,category_name,post_count,follower_count,following_count,is_business_account,...,business_category_name,overall_category_name,category_enum,is_verified_by_mv4b,is_regulated_c18,profile_pic_url,should_show_category,should_show_public_contacts,show_account_transparency_details,profile_picture_base64
0,[{'caption': 'Cumhuriyetimizin 100.yılı kutlu ...,deparmedya,3170700063,Depar Medya,#mediaplanning #mediabuying #sosyalmedya,Local business,,1167,192,True,...,,,LOCAL,False,False,https://instagram.fsaw2-3.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,[{'caption': 'Bu diyaloğun yaşanmadığı bir onl...,beyazyakaliyiz,8634457436,Selam Beyaz Yakalı,Beyaz yakalıların dünyasına hoşgeldiniz 😀😀😀,Personal blog,,1265,665,True,...,,,PERSONAL_BLOG,False,False,https://instagram.fist6-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,[{'caption': 'Bugün bir fincan köpüklü Türk ka...,kafesfirin,266439571,KAFES FIRIN,📍Söğütözü📍FTZ AVM\n🛒Ankara macro▲center v...,Brand,,11997,17,True,...,,,BRAND,False,False,https://instagram.fada1-13.fna.fbcdn.net/v/t51...,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,[{'caption': 'Saygı ve özlemle🖤 #atatürk #10k...,vimerang,2367195567,Vimerang,Dijital İletişim Yönetimi🎬info@vimerang.comq,,,2321,454,True,...,Creators & Celebrities,,VIDEO_CREATOR,False,False,https://instagram.fist19-1.fna.fbcdn.net/v/t51...,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,[{'caption': 'Başöğretmenimiz Gazi Mustafa Kem...,totalenergies_istasyonlari,7066643793,TotalEnergies İstasyonları,TotalEnergies İstasyonları resmi Instagram hes...,Energy Company,,28025,4,True,...,,,ENERGY_COMPANY,False,False,https://instagram.fsaw2-1.fna.fbcdn.net/v/t51....,True,True,True,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [63]:
training_data.columns

Index(['posts', 'username', 'id', 'full_name', 'biography', 'category_name',
       'post_count', 'follower_count', 'following_count',
       'is_business_account', 'is_private', 'is_verified',
       'highlight_reel_count', 'bio_links', 'entities', 'ai_agent_type',
       'fb_profile_biolink', 'restricted_by_viewer', 'country_block',
       'eimu_id', 'external_url', 'fbid', 'has_clips',
       'hide_like_and_view_counts', 'is_professional_account',
       'is_supervision_enabled', 'is_guardian_of_viewer',
       'is_supervised_by_viewer', 'is_supervised_user', 'is_embeds_disabled',
       'is_joined_recently', 'business_address_json',
       'business_contact_method', 'business_email', 'business_phone_number',
       'business_category_name', 'overall_category_name', 'category_enum',
       'is_verified_by_mv4b', 'is_regulated_c18', 'profile_pic_url',
       'should_show_category', 'should_show_public_contacts',
       'show_account_transparency_details', 'profile_picture_base64'],
 

In [64]:
training_data.shape

(5415, 45)

In [67]:
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Extract features from the `posts` column
 # Extract features from the `posts` column
def extract_post_features(df):
    def analyze_posts(posts):
        # Check if posts is None or not a list
        if posts is None or not isinstance(posts, list):
            return {
                "num_posts": 0,
                "avg_word_count": 0,
                "avg_hashtags": 0,
                "avg_mentions": 0,
                "total_chars": 0
            }
        
        num_posts = len(posts)
        total_words = sum(len(post.get("caption", "").split()) for post in posts if isinstance(post.get("caption", ""), str))
        total_chars = sum(len(post.get("caption", "")) for post in posts if isinstance(post.get("caption", ""), str))
        total_hashtags = sum(post.get("caption", "").count("#") for post in posts if isinstance(post.get("caption", ""), str))
        total_mentions = sum(post.get("caption", "").count("@") for post in posts if isinstance(post.get("caption", ""), str))

        return {
            "num_posts": num_posts,
            "avg_word_count": total_words / num_posts if num_posts > 0 else 0,
            "avg_hashtags": total_hashtags / num_posts if num_posts > 0 else 0,
            "avg_mentions": total_mentions / num_posts if num_posts > 0 else 0,
            "total_chars": total_chars,
        }

    # Apply the function to the posts column
    post_features = df["posts"].apply(analyze_posts)
    post_features_df = pd.DataFrame(list(post_features))
    df = pd.concat([df, post_features_df], axis=1)
    df = df.drop(columns=["posts"])  # Drop the original `posts` column
    return df

# Normalize numerical features
def normalize_numerical_features(df, numerical_cols):
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    return df

# Encode categorical variables
def encode_categorical_features(df, categorical_cols):
    label_encoders = {}
    for col in categorical_cols:
        df[col] = df[col].fillna("Unknown")  # Handle missing values
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le
    return df, label_encoders

# Create derived features
def create_derived_features(df):
    df["follower_following_ratio"] = df["follower_count"] / (df["following_count"] + 1)  # Avoid division by zero
    df["is_business_account_flag"] = df["is_business_account"].astype(int)
    df["is_verified_flag"] = df["is_verified"].astype(int)
    return df

# Perform feature engineering
try:
    # Step 1: Extract features from the `posts` column
    training_data = extract_post_features(training_data)
    
    # Step 2: Normalize numerical features
    numerical_cols = ["follower_count", "following_count", "post_count", "num_posts", "avg_word_count", "avg_hashtags", "avg_mentions", "total_chars"]
    training_data = normalize_numerical_features(training_data, numerical_cols)
    
    # Step 3: Encode categorical features
    categorical_cols = ["category_name"]
    training_data, label_encoders = encode_categorical_features(training_data, categorical_cols)
    
    # Step 4: Create derived features
    training_data = create_derived_features(training_data)

    print("Feature engineering completed. Here's a preview of the processed data:")
    print(training_data.head())

except Exception as e:
    print("Error during feature engineering:", e)
    raise

Feature engineering completed. Here's a preview of the processed data:
                     username          id                    full_name  \
0                  deparmedya  3170700063                  Depar Medya   
1              beyazyakaliyiz  8634457436           Selam Beyaz Yakalı   
2                  kafesfirin   266439571                  KAFES FIRIN   
3                    vimerang  2367195567                     Vimerang   
4  totalenergies_istasyonlari  7066643793  TotalEnergies İstasyonları   

                                           biography  category_name  \
0           #mediaplanning #mediabuying #sosyalmedya            230   
1        Beyaz yakalıların dünyasına hoşgeldiniz 😀😀😀            293   
2  📍Söğütözü📍FTZ AVM\n🛒Ankara macro▲center v...             53   
3       Dijital İletişim Yönetimi🎬info@vimerang.comq            405   
4  TotalEnergies İstasyonları resmi Instagram hes...            133   

   post_count  follower_count  following_count  is_busine

In [72]:
from sklearn.model_selection import train_test_split

# Verify if the correct target column is present
if 'category_name' not in training_data.columns:
    print("The target column 'category_name' is not present in the data. Available columns are:")
    print(training_data.columns)
else:
    # Define the features (X) and the target (y)
    X = training_data.drop(columns=['category_name', 'username', 'id', 'full_name', 'biography', 'profile_picture_base64'])
    y = training_data['category_name']

    # Perform an 80-20 split for training and validation
    test_size = 0.2  # 20% for validation
    random_state = 42  # For reproducibility

    # Stratify ensures the label distribution is preserved in both splits
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Print the shape of the resulting splits
    print(f"Training set shape: {X_train.shape}, {y_train.shape}")
    print(f"Validation set shape: {X_val.shape}, {y_val.shape}")

    # Check label distribution in both splits
    print("Label distribution in training set:")
    print(y_train.value_counts(normalize=True))
    print("\nLabel distribution in validation set:")
    print(y_val.value_counts(normalize=True))

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [69]:
training_data.columns

Index(['username', 'id', 'full_name', 'biography', 'category_name',
       'post_count', 'follower_count', 'following_count',
       'is_business_account', 'is_private', 'is_verified',
       'highlight_reel_count', 'bio_links', 'entities', 'ai_agent_type',
       'fb_profile_biolink', 'restricted_by_viewer', 'country_block',
       'eimu_id', 'external_url', 'fbid', 'has_clips',
       'hide_like_and_view_counts', 'is_professional_account',
       'is_supervision_enabled', 'is_guardian_of_viewer',
       'is_supervised_by_viewer', 'is_supervised_user', 'is_embeds_disabled',
       'is_joined_recently', 'business_address_json',
       'business_contact_method', 'business_email', 'business_phone_number',
       'business_category_name', 'overall_category_name', 'category_enum',
       'is_verified_by_mv4b', 'is_regulated_c18', 'profile_pic_url',
       'should_show_category', 'should_show_public_contacts',
       'show_account_transparency_details', 'profile_picture_base64',
       'num