In [6]:
import pandas as pd
import re
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Make sure every NLTK resource this notebook downloads is actually present.
# Keys are lookup paths for nltk.data.find; values are the package ids passed
# to nltk.download. Each resource is probed on its own: probing only a subset
# inside a single try block would skip the download whenever that subset is
# installed, even if another required package (e.g. 'omw-1.4' or
# 'averaged_perceptron_tagger_eng') is still missing.
NLTK_RESOURCES = {
    'tokenizers/punkt': 'punkt',
    'corpora/wordnet': 'wordnet',
    'corpora/omw-1.4': 'omw-1.4',
    'taggers/averaged_perceptron_tagger': 'averaged_perceptron_tagger',
    'taggers/averaged_perceptron_tagger_eng': 'averaged_perceptron_tagger_eng',
}

missing_nltk_packages = []
for resource_path, package_id in NLTK_RESOURCES.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        missing_nltk_packages.append(package_id)

# Only touch the network when something is really missing.
if missing_nltk_packages:
    print("Downloading necessary NLTK data...")
    nltk.download(missing_nltk_packages, quiet=True)
    print("NLTK data download complete.")

Downloading necessary NLTK data...
NLTK data download complete.


In [8]:
# Module-level NLP helpers, created once and shared by the functions below.
# tweet_tokenizer is used by clean_and_tokenize to split cleaned text into tokens.
tweet_tokenizer = TweetTokenizer()
# lemmatizer is used by lemmatize_tokens to reduce each token to its lemma.
lemmatizer = WordNetLemmatizer()

In [9]:
def load_data(data_path='twitter_training.csv', text_column='Tweet'):
    """Read the headerless tweet CSV and return it as a labelled DataFrame.

    The file is read with ``latin1`` encoding and no header row. Its columns
    are named ``ID``, ``Entity``, ``Sentiment`` and ``text_column`` (so the
    file must contain exactly four columns). Rows whose text is missing are
    discarded and the index is renumbered from zero.

    Args:
        data_path: Path of the CSV file to read.
        text_column: Name to give the fourth (text) column.

    Returns:
        The cleaned DataFrame, or ``None`` when ``data_path`` does not exist
        (an error message is printed in that case).
    """
    try:
        raw_frame = pd.read_csv(data_path, header=None, encoding='latin1')
    except FileNotFoundError:
        print(f"Error: Data file not found at {data_path}.")
        return None

    raw_frame.columns = ['ID', 'Entity', 'Sentiment', text_column]
    with_text = raw_frame.dropna(subset=[text_column])
    return with_text.reset_index(drop=True)


In [10]:
def get_wordnet_pos(tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    The decision is made on the tag's leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls back
    to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [11]:
def extract_features(text):
    """Collect the hashtag and mention names that appear in a tweet.

    Args:
        text: Raw tweet text. Anything that is not a ``str`` (``None``, NaN,
            numbers, ...) is treated as containing no hashtags or mentions,
            mirroring the non-string guard in ``clean_and_tokenize``.

    Returns:
        A ``(hashtags, mentions)`` tuple of strings. Each string holds the
        matched names in order of appearance, separated by single spaces and
        without the leading ``#`` / ``@``; it is empty when nothing matched.
    """
    # re.findall raises TypeError on non-string input, so bail out early.
    if not isinstance(text, str):
        return '', ''

    hashtags = re.findall(r'#(\w+)', text)
    mentions = re.findall(r'@(\w+)', text)
    return ' '.join(hashtags), ' '.join(mentions)

In [12]:
def clean_and_tokenize(text):
    """Strip tweet noise from ``text`` and split it into tokens.

    Cleaning steps, in order:
      1. remove URLs (anything starting with ``http`` or ``www`` up to the
         next whitespace),
      2. remove ``@mentions``,
      3. remove ``#`` characters (the hashtag word itself is kept),
      4. remove every character that is not an ASCII letter, digit or
         whitespace.
    The cleaned text is then tokenized with the shared ``tweet_tokenizer``.

    Args:
        text: Raw tweet text.

    Returns:
        list[str]: The tokens, or an empty list when ``text`` is not a
        string (``None``, NaN, lists, arrays, ...).
    """
    # A plain isinstance check already covers None/NaN. Calling pd.isna first
    # would return an array for list-like input and make the boolean test
    # raise ValueError.
    if not isinstance(text, str):
        return []

    # 'https...' is already matched by the 'http\S+' alternative.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#', '', text)
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)

    return tweet_tokenizer.tokenize(text)

In [13]:
def lemmatize_tokens(tokens):
    """Return the lemma of every token, using its POS tag as context.

    The tokens are tagged with ``nltk.pos_tag``; each tag is converted with
    ``get_wordnet_pos`` and handed to the shared ``lemmatizer`` together
    with the word. Entries whose word is not a string are left out of the
    result.

    Args:
        tokens: Sequence of token strings.

    Returns:
        list[str]: The lemmas, in the same order as the input tokens.
    """
    word_tag_pairs = nltk.pos_tag(tokens)
    return [
        lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
        for word, tag in word_tag_pairs
        if isinstance(word, str)
    ]

In [14]:
def preprocess_data(df, text_column='Tweet'):
    """Run the text-preprocessing pipeline over ``df[text_column]``.

    Missing text is replaced by an empty string, then five columns are added:
      * ``hashtags`` / ``mentions`` - output of ``extract_features``,
      * ``tokens`` - output of ``clean_and_tokenize``,
      * ``lemmas`` - output of ``lemmatize_tokens`` applied to ``tokens``,
      * ``processed_text`` - the lemmas joined with single spaces.

    Args:
        df: DataFrame holding the raw text. It is modified in place; pass a
            copy if the original must stay untouched.
        text_column: Name of the column containing the raw text.

    Returns:
        The same DataFrame object, with the new columns attached.
    """
    df[text_column] = df[text_column].fillna('')

    # Collect the (hashtags, mentions) pairs first and split them afterwards.
    # Unpacking zip(*pairs) directly raises ValueError on an empty frame
    # because zip() then yields nothing to unpack.
    feature_pairs = [extract_features(text) for text in df[text_column]]
    df['hashtags'] = [hashtags for hashtags, _ in feature_pairs]
    df['mentions'] = [mentions for _, mentions in feature_pairs]

    df['tokens'] = df[text_column].apply(clean_and_tokenize)

    df['lemmas'] = df['tokens'].apply(lemmatize_tokens)

    df['processed_text'] = df['lemmas'].apply(lambda lemmas: ' '.join(lemmas))

    return df

In [15]:
def vectorize_data(df, text_column='processed_text'):
    """Fit a TF-IDF vectorizer on ``df[text_column]`` and return the weights.

    A ``TfidfVectorizer`` with ``sublinear_tf=True`` is fitted on the column
    and the resulting matrix is converted to a dense DataFrame whose columns
    are the vectorizer's feature names. Note that the result is fully dense
    (``toarray``), so its size grows with rows x vocabulary.

    Args:
        df: DataFrame containing the documents to vectorize.
        text_column: Column holding one document (string) per row.

    Returns:
        A ``(tfidf_df, tfidf_vectorizer)`` tuple: the dense weight table and
        the fitted vectorizer.
    """
    fitted_vectorizer = TfidfVectorizer(sublinear_tf=True)
    weights = fitted_vectorizer.fit_transform(df[text_column])

    vocabulary = fitted_vectorizer.get_feature_names_out()
    dense_weights = pd.DataFrame(weights.toarray(), columns=vocabulary)

    return dense_weights, fitted_vectorizer

In [1]:
if __name__ == '__main__':
    # Demo driver: load the CSV, preprocess a small sample, vectorize it and
    # print previews of each stage.
    # NOTE: load_data, preprocess_data and vectorize_data must already be
    # defined. In a notebook, run the definition cells first; executing this
    # cell alone on a fresh kernel fails with NameError.
    TEXT_COLUMN = 'Tweet'
    SAMPLE_SIZE = 100       # rows taken from the top of the dataset for the demo
    PREVIEW_ROWS = 5        # documents shown in the TF-IDF preview
    PREVIEW_FEATURES = 10   # vocabulary terms shown in the TF-IDF preview

    print("--- 1. Loading Data ---")
    df = load_data(text_column=TEXT_COLUMN)

    if df is None:
        print("Exiting script due to data loading error.")
        # SystemExit instead of exit(): exit() shuts down a Jupyter kernel,
        # and a non-zero status tells callers of the script that it failed.
        raise SystemExit(1)

    # Work on an independent copy so preprocessing never touches `df`.
    df_sample = df.head(SAMPLE_SIZE).copy()

    print(f"Successfully loaded {len(df)} rows. Processing a sample of {len(df_sample)} rows.")
    print("\n--- Original Data Sample (First 5 rows) ---")
    print(df_sample[[TEXT_COLUMN]].head().to_markdown(index=False))

    print("\n--- 2. Applying Preprocessing Pipeline ---")
    processed_df = preprocess_data(df_sample, text_column=TEXT_COLUMN)

    print("\n--- 3. Processed Data Sample (First 5 rows) ---")
    print(processed_df[[TEXT_COLUMN, 'hashtags', 'mentions', 'processed_text']].head().to_markdown(index=False))

    print("\n--- 4. Applying TF-IDF Vectorization (sublinear_tf=True) ---")
    tfidf_df, vectorizer = vectorize_data(processed_df)

    print(f"\nTotal features (vocabulary size): {len(vectorizer.get_feature_names_out())}")

    print("\n--- 5. TF-IDF Vectorization Sample (First 5 rows, first 10 features) ---")

    # Copy the slice before relabelling it so tfidf_df itself is untouched.
    # Its columns are already the vectorizer's feature names.
    tfidf_sample_output = tfidf_df.iloc[:PREVIEW_ROWS, :PREVIEW_FEATURES].copy()
    # Size the labels from the slice: a fixed range(5) raises a length
    # mismatch when fewer than five documents are available.
    tfidf_sample_output.index = [f"Doc {i+1}" for i in range(len(tfidf_sample_output))]

    print(tfidf_sample_output.to_markdown())

    print("\n--- Script Execution Complete ---")

--- 1. Loading Data ---


NameError: name 'load_data' is not defined