In [62]:
# import packages
from dotenv import load_dotenv
import fasttext
from huggingface_hub import hf_hub_download
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
import os
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from transformers import pipeline

# Data Extraction

In [142]:
# Google Drive file id of the raw dataset CSV.
file_id = "1WPXfR8a8gmiShFoNbGyY9NFfMpQ1A61a"
download_url = f"https://drive.google.com/uc?id={file_id}"

# The first CSV column is used as the row index rather than as a feature.
raw_data = pd.read_csv(download_url, index_col = 0)

# Work on a copy: later cells add columns to `df`, and a plain alias would
# add the same columns to `raw_data`, leaving no untouched raw frame.
df = raw_data.copy()
df.head()

Unnamed: 0,created_at,default_profile,default_profile_image,description,favourites_count,followers_count,friends_count,geo_enabled,id,lang,location,profile_background_image_url,profile_image_url,screen_name,statuses_count,verified,average_tweets_per_day,account_age_days,account_type
0,2016-10-15 21:32:11,False,False,"Blame @xaiax, Inspired by @MakingInvisible, us...",4,1589,4,False,787405734442958848,en,unknown,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/7874121826...,best_in_dumbest,11041,False,7.87,1403,bot
1,2016-11-09 05:01:30,False,False,Photographing the American West since 1980. I ...,536,860,880,False,796216118331310080,en,Estados Unidos,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/8023296328...,CJRubinPhoto,252,False,0.183,1379,human
2,2017-06-17 05:34:27,False,False,Scruffy looking nerf herder and @twitch broadc...,3307,172,594,True,875949740503859204,en,"Los Angeles, CA",http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1278890453...,SVGEGENT,1001,False,0.864,1159,human
3,2016-07-21 13:32:25,True,False,Wife.Godmother.Friend.Feline Fanatic! Assistan...,8433,517,633,True,756119643622735875,en,"Birmingham, AL",,http://pbs.twimg.com/profile_images/1284884924...,TinkerVHELPK5,1324,False,0.889,1489,human
4,2012-01-15 16:32:35,False,False,Loan coach at @mancity & Aspiring DJ,88,753678,116,True,464781334,en,"England, United Kingdom",http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/9952566258...,JoleonLescott,4202,True,1.339,3138,human


In [143]:
# Write the frame to the raw-data folder; the row index is not written.
raw_dataset_path = '../data/raw/raw_dataset.csv'
df.to_csv(raw_dataset_path, index=False)

# Creation of New Features for Analysis

In [88]:
# Fetch the 'punkt_tab' tokenizer data that NLTK's sent_tokenize needs
# (used by safe_mean_sent_length below).
nltk.download('punkt_tab')

### functions to extract features of descriptions
def safe_word_count(desc):
    """Return the number of whitespace-separated tokens in `desc`.

    Missing values and anything that is not a string count as 0 words.
    """
    is_unusable = pd.isna(desc) or not isinstance(desc, str)
    return 0 if is_unusable else len(desc.split())


def safe_mean_word_length(desc):
    """Return the average character length of the whitespace-separated tokens.

    Yields 0 for missing values, non-string values, and strings that
    contain no tokens.
    """
    if pd.isna(desc) or not isinstance(desc, str):
        return 0
    token_lengths = [len(token) for token in desc.split()]
    return np.mean(token_lengths) if token_lengths else 0


def safe_mean_sent_length(desc):
    """Return the average length, in characters, of the sentences in `desc`.

    Sentences are split with NLTK's `sent_tokenize`. Yields 0 for missing
    values, non-string values, and text that produces no sentences.
    """
    if pd.isna(desc) or not isinstance(desc, str):
        return 0
    sentence_lengths = [len(sentence) for sentence in sent_tokenize(desc)]
    if not sentence_lengths:
        return 0
    return np.mean(sentence_lengths)


def count_hashtags(desc):
    """Return how many '#' characters `desc` contains (0 for non-strings)."""
    return desc.count("#") if isinstance(desc, str) else 0


def count_handles(desc):
    """Return how many '@' characters `desc` contains (0 for non-strings)."""
    if not isinstance(desc, str):
        return 0
    return desc.count("@")


def count_urls(desc):
    """Count the http:// and https:// URLs that appear in a description.

    Parameters
    ----------
    desc : Any
        Description text. Anything that is not a string (None, NaN, ...)
        is treated as containing no URLs.

    Returns
    -------
    int
        Number of non-overlapping URL matches.
    """
    if not isinstance(desc, str):
        return 0

    # The URL syntax must be spelled out sequentially. Wrapping it in square
    # brackets (as the earlier pattern did) makes it a character class that
    # matches "htt" plus any single non-space character, e.g. "httpd".
    # URL schemes are case-insensitive, hence IGNORECASE.
    pattern = r"https?://\S+"
    return len(re.findall(pattern, desc, flags=re.IGNORECASE))

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\CelesteN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [None]:
# Character length comes straight from the string accessor; missing
# descriptions are counted as length 0.
description = df["description"]
df["length"] = description.str.len().fillna(0)

# Remaining features: one helper per output column, applied row by row.
feature_extractors = {
    "word_count": safe_word_count,
    "mean_word_length": safe_mean_word_length,
    "mean_sent_length": safe_mean_sent_length,
    "hashtag_count": count_hashtags,
    "handle_count": count_handles,
    'url_count': count_urls,
}
for column, extractor in feature_extractors.items():
    df[column] = description.apply(extractor)

# save to csv
feature_columns = ['id', 'length', 'word_count', 'mean_word_length', 'mean_sent_length', 'hashtag_count', 'handle_count', 'url_count']
df[feature_columns].to_csv("../data/interim/new_features.csv", index=False)

# Generation of Embeddings 

In [None]:
# functions for language detection, translation, text preprocessing, and embedding generation

def detect_language(df):
    """Predict language used in `description` using FastText model.

    Downloads `model.bin` from the `facebook/fasttext-language-identification`
    Hugging Face repository, cleans each description, and stores the top
    predicted label (with the `__label__` prefix removed) in a new
    `description_language` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `description` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with `description_language` added. Rows whose
        description is not a string, or is empty after cleaning, get
        "unknown".
    """
    # Patterns are built once per call instead of once per row.
    emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"
            u"\U0001F300-\U0001F5FF"
            u"\U0001F680-\U0001F6FF"
            u"\U0001F1E0-\U0001F1FF"
            "]+", flags=re.UNICODE)
    link_and_handle_pattern = re.compile(r"http\S+|www\S+|@\S+")
    whitespace_pattern = re.compile(r"\s+")

    def clean_text(text):
        """Remove links, handles and emoji from text, then collapse whitespace."""
        if not isinstance(text, str):
            return ""

        text = link_and_handle_pattern.sub("", text)
        text = emoji_pattern.sub(r"", text)
        # \s+ also matches "\n" and "\r", so a single substitution flattens
        # the text to one line and squeezes runs of spaces.
        text = whitespace_pattern.sub(" ", text)
        return text.strip()

    # load fastText model
    model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
    model = fasttext.load_model(model_path)

    def detect_text_language(text):
        """Return the top predicted label for one description, or "unknown"."""
        text = clean_text(text)
        if not text:
            return "unknown"
        # Only the top label is needed; the probabilities are discarded.
        labels, _ = model.predict(text)
        return labels[0].replace("__label__", "")

    df["description_language"] = df["description"].apply(detect_text_language)

    return df
    

def _load_default_translators(languages):
    """Build the default Helsinki-NLP pipelines, but only for codes in `languages`."""
    default_models = {
        "yue_Hant": "Helsinki-NLP/opus-mt-zh-en",
        "kor_Hang": "Helsinki-NLP/opus-mt-ko-en",
        "spa_Latn": "Helsinki-NLP/opus-mt-es-en",
    }
    return {
        lang: pipeline("translation", model=model_name)
        for lang, model_name in default_models.items()
        if lang in languages
    }


def _translation_text(result):
    """Extract the translated string from a translator's return value."""
    if isinstance(result, str):
        return result
    # Hugging Face translation pipelines return [{'translation_text': '...'}].
    if isinstance(result, (list, tuple)) and result:
        result = result[0]
    if isinstance(result, dict):
        text = result.get("translation_text", "")
        return text if isinstance(text, str) else ""
    return ""


def translate_language(df, translation_model_map=None, chunk_size=1000):
    """Add a `description_en` column holding an English version of `description`.

    Rules per row, based on `description_language`:
      * "eng_Latn"                      -> the description is copied unchanged
      * a key of `translation_model_map` -> the description is translated
      * anything else (incl. "unknown") -> empty string

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `description` and `description_language` columns. It is
        modified in place.
    translation_model_map : dict or None
        Maps a language code to a callable that takes a text and returns
        either a string or pipeline-style output
        (`[{'translation_text': ...}]`). When None, Helsinki-NLP pipelines
        for "yue_Hant", "kor_Hang" and "spa_Latn" are created on demand,
        and only for languages that actually occur in `df`. Building them
        in the signature would load every model as soon as the function
        is defined, even when the caller supplies its own map.
    chunk_size : int
        Kept for backward compatibility. Rows are translated one at a time,
        so this value does not affect the result.

    Returns
    -------
    pandas.DataFrame
        The same frame, with `description_en` filled with plain strings.
    """
    languages = df["description_language"].tolist()
    texts = df["description"].tolist()

    if translation_model_map is None:
        translation_model_map = _load_default_translators(set(languages))

    translations = []
    for lang, text in zip(languages, texts):
        if lang == "eng_Latn":
            translated = text
        elif lang == "unknown" or lang not in translation_model_map:
            translated = ""
        elif not isinstance(text, str):
            # Nothing to translate (e.g. a missing description).
            translated = ""
        else:
            # Store the translated string itself, not the pipeline's
            # list-of-dict wrapper.
            translated = _translation_text(translation_model_map[lang](text))
        translations.append(translated)

    # Assign by position so a non-unique or non-default index cannot
    # misroute values.
    df["description_en"] = translations

    return df
    

def custom_preprocessor(text):
    """
    Normalize a description: unwrap translation output, keep letters only,
    drop English stop words, and lemmatize.

    Steps:
      1. If the text contains a stringified translation result
         (``'translation_text': '...'``), keep only the translated text.
         Both single- and double-quoted values are recognised, because
         Python's repr switches to double quotes when the text contains
         an apostrophe.
      2. Lowercase and remove every character that is not an ASCII letter
         or whitespace. This removes digits, punctuation, '@', '#' and URL
         punctuation; nothing is replaced by a placeholder.
      3. Collapse whitespace, remove NLTK English stop words, and lemmatize
         the remaining tokens with WordNet.

    Non-string input (None, NaN, ...) yields an empty string.

    Requires the NLTK 'stopwords' and 'wordnet' resources to be available.
    """
    if not isinstance(text, str):
        return ""

    translated_pattern = (
        r"'translation_text': "
        r"(?:'((?:[^'\\]|\\.)*)'|\"((?:[^\"\\]|\\.)*)\")"
    )
    match = re.search(translated_pattern, text)
    if match:
        # Exactly one of the two alternatives captured the value.
        text = match[1] if match[1] is not None else match[2]

    # keep only ASCII letters and whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # collapse newlines, tabs, and repeated spaces into single spaces
    text = re.sub(r"\s+", " ", text).strip()

    stop_words = set(nltk.corpus.stopwords.words("english"))
    filtered = [word for word in text.split() if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in filtered]
    return " ".join(lemmatized)


def _description_text(value):
    """Return the string to embed for one `description_en` cell, or "" if none."""
    if isinstance(value, str):
        return value
    # Tolerate raw translation-pipeline output: [{'translation_text': '...'}].
    if isinstance(value, (list, tuple)) and value and isinstance(value[0], dict):
        text = value[0].get("translation_text", "")
        return text if isinstance(text, str) else ""
    return ""


def embed_descriptions(description_df, hf_api):
    """Add sentence embeddings of `description_en` to the frame.

    Parameters
    ----------
    description_df : pandas.DataFrame
        Frame with a `description_en` column. It is modified in place.
    hf_api : str or None
        Hugging Face token passed to `SentenceTransformer`.

    Returns
    -------
    pandas.DataFrame
        The same frame, with a `description_en_embeddings` column holding
        one 1-D vector per row. Rows without usable text get a zero vector
        of the model's embedding size.
    """
    model = SentenceTransformer('all-MiniLM-L6-v2', token=hf_api)
    # Ask the model for its output size instead of hard-coding it; 384 is the
    # fallback for the case where the model does not report one.
    dim = model.get_sentence_embedding_dimension() or 384

    texts = [_description_text(value) for value in description_df["description_en"]]
    non_empty_positions = [pos for pos, text in enumerate(texts) if text]

    # Start with a separate zero vector for every row, then overwrite the
    # rows that have text.
    embeddings_lst = [np.zeros(dim, dtype=float) for _ in texts]

    if non_empty_positions:
        # One batched call replaces a model call per row; the library's
        # progress bar replaces the manual counter.
        encoded = model.encode(
            [texts[pos] for pos in non_empty_positions],
            show_progress_bar=True,
        )
        for pos, vector in zip(non_empty_positions, encoded):
            embeddings_lst[pos] = vector

    description_df["description_en_embeddings"] = embeddings_lst

    return description_df

In [None]:
# Read a local .env file (if one exists) so that HF_API_KEY is available to
# os.getenv below; without this call the imported load_dotenv is never used.
load_dotenv()

# Each step adds its column(s) to the frame and returns it.
df = detect_language(df)
df = translate_language(df)
df = embed_descriptions(df, os.getenv("HF_API_KEY"))

# # preprocess translated user descriptions and impute missing values with empty string
# df["description_en"] = df["description_en"].fillna("")
# df["description_en"] = df["description_en"].apply(custom_preprocessor)

# save to csv
df[['id', 'description_language','description_en', 'description_en_embeddings']].to_csv("../data/interim/translated_embeddings.csv", index = False)