# Sentiment140 Text Processing Pipeline
Dataset: [Sentiment140 (Kaggle)](https://www.kaggle.com/datasets/kazanova/sentiment140).
Each section below corresponds to one assignment task.

## 1. Text Cleaning
Lowercase tweets, expand contractions, and remove emojis and other special symbols.

In [None]:
import pandas as pd
import re

DATA_PATH = "data/data1.csv"
COLUMN_NAMES = ["sentiment", "tweet_id", "timestamp", "query", "user", "text"]


def _read_csv_with_fallback(path, **kwargs):
    """Read a CSV with pandas' default encoding, retrying as ISO-8859-1.

    Extra keyword arguments are forwarded unchanged to ``pd.read_csv`` on
    both attempts.
    """
    try:
        return pd.read_csv(path, **kwargs)
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding="ISO-8859-1", **kwargs)


def _first_row_is_header(path) -> bool:
    """Return True when the first cell of the file does not parse as a number.

    NOTE(review): this assumes the first column holds numeric sentiment
    labels, as in the Sentiment140 export, so a numeric first cell means the
    file has no header row. Confirm before reusing with other files.
    """
    first_cell = _read_csv_with_fallback(path, header=None, nrows=1).iat[0, 0]
    if pd.isna(first_cell):
        # An empty first cell cannot be a sentiment label; keep the
        # historical "first row is a header" interpretation.
        return True
    try:
        float(first_cell)
    except (TypeError, ValueError):
        return True
    return False


if _first_row_is_header(DATA_PATH):
    df = _read_csv_with_fallback(DATA_PATH)
    if list(df.columns) != COLUMN_NAMES:
        df.columns = COLUMN_NAMES
else:
    # Header-less file: reading it with the default header=0 would silently
    # consume the first tweet as column labels and drop it from the data.
    df = _read_csv_with_fallback(DATA_PATH, header=None)
    # Assigning (rather than passing names=) keeps the ValueError on a
    # column-count mismatch.
    df.columns = COLUMN_NAMES

# Lowercase contraction -> expansion table. Keys must stay lowercase because
# the matched text is lowercased before it is looked up here.
CONTRACTIONS = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "she'd": "she would",
    "she'll": "she will",
    # Listed explicitly: with whole-word matching it is no longer expanded
    # through the "he's" it contains.
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what's": "what is",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
}

# Whole-word, case-insensitive alternation of every key above. The \b anchors
# stop a key from matching inside a longer word (e.g. "let's" inside
# "tablet's", which used to become "tablet us").
contractions_pattern = re.compile(
    r"\b(" + "|".join(map(re.escape, CONTRACTIONS)) + r")\b",
    flags=re.IGNORECASE,
)
# Runs of characters from four emoji-related Unicode ranges.
emoji_pattern = re.compile(
    "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]+",
    flags=re.UNICODE,
)
# Any single character that is not a lowercase ASCII letter, digit or whitespace.
special_pattern = re.compile(r"[^a-z0-9\s]")


def expand_contractions(text: str) -> str:
    """Return *text* with every match of ``contractions_pattern`` expanded.

    Each matched contraction is lowercased and looked up in ``CONTRACTIONS``;
    all other text is returned untouched.
    """

    def _expansion_for(match):
        # Table keys are lowercase, so normalise the matched text first.
        return CONTRACTIONS[match.group(0).lower()]

    return contractions_pattern.sub(_expansion_for, text)


def clean_text(text) -> str:
    """Normalize one raw tweet into lowercase words separated by single spaces.

    Steps, in order:
      1. Coerce non-string input to ``str`` (missing values become "").
      2. Decode well-formed HTML entities such as ``&amp;`` or ``&quot;`` so
         they do not survive as tokens like "amp" or "quot".
      3. Lowercase the text.
      4. Map apostrophe look-alikes to the ASCII apostrophe so contractions
         typed with them are still expanded.
      5. Expand contractions via ``expand_contractions``.
      6. Replace matches of ``emoji_pattern`` and ``special_pattern`` with
         spaces, then collapse whitespace runs and strip the ends.

    Args:
        text: A tweet; any non-string value is coerced as described above.

    Returns:
        The cleaned string (possibly empty).
    """
    import html  # function-scope so the file's top-level imports stay untouched

    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)

    # Only semicolon-terminated entities are decoded. A bare html.unescape
    # also accepts legacy forms without ';' (e.g. "&not"), which could
    # swallow real words such as the "not" in "me&not you".
    decoded = re.sub(
        r"&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);",
        lambda match: html.unescape(match.group(0)),
        text,
    )
    lowered = decoded.lower()
    # Curly quotes, modifier apostrophe, backtick and acute accent are all
    # used as apostrophes in tweets.
    lowered = re.sub("[\u2018\u2019\u02bc`\u00b4]", "'", lowered)
    expanded = expand_contractions(lowered)
    no_emoji = emoji_pattern.sub(" ", expanded)
    cleaned = special_pattern.sub(" ", no_emoji)
    return re.sub(r"\s+", " ", cleaned).strip()


# Clean every raw tweet into a new column, then report the frame's shape and
# preview the raw and cleaned text side by side.
df["clean_text"] = df["text"].map(clean_text)
print("Dataset shape:", df.shape)
df[["text", "clean_text"]].head()

## 2. Tokenization
Split each cleaned tweet into alphanumeric tokens to prepare for downstream processing.

In [None]:
# A token is a run of lowercase ASCII letters/digits with a word boundary on
# each side.
token_pattern = re.compile(r"\b[a-z0-9]+\b")


def tokenize(text: str) -> list[str]:
    """Return the tokens that ``token_pattern`` finds in the lowercased *text*.

    Non-string input is tolerated: missing values (per ``pd.isna``) yield an
    empty list and anything else is converted with ``str`` first.
    """
    if isinstance(text, str):
        source = text
    elif pd.isna(text):
        source = ""
    else:
        source = str(text)
    return token_pattern.findall(source.lower())


# Tokenize each cleaned tweet, report how many rows were processed, and
# preview the result next to its source text.
df["tokens"] = df["clean_text"].map(tokenize)
print("Tokenized rows:", len(df["tokens"]))
df[["clean_text", "tokens"]].head()

## 3. Stop Words Removal
Filter common function words to focus on content-bearing terms.

In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus (quietly) and keep the English list as a
# set for fast membership tests.
nltk.download("stopwords", quiet=True)
STOP_WORDS = set(stopwords.words("english"))


def _drop_stop_words(tokens):
    """Return the tokens of one tweet that are not in ``STOP_WORDS``."""
    return [token for token in tokens if token not in STOP_WORDS]


df["tokens_no_stop"] = df["tokens"].map(_drop_stop_words)
df[["clean_text", "tokens_no_stop"]].head()

## 4. Stemming vs. Lemmatization
Build a vocabulary to compare how Porter stemming and WordNet lemmatization transform words.

In [None]:
from itertools import chain
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Corpora requested before building the WordNet lemmatizer.
for resource in ("wordnet", "omw-1.4"):
    nltk.download(resource, quiet=True)

# Sorted vocabulary: every distinct token that survived stop-word removal.
vocabulary = set()
for tweet_tokens in df["tokens_no_stop"]:
    vocabulary.update(tweet_tokens)
unique_words = sorted(vocabulary)
print(f"Unique vocabulary size: {len(unique_words):,}")

porter = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# One row per vocabulary word with its stem and lemma alongside. The
# lemmatizer receives the word only, so its default part-of-speech applies.
vocab_df = pd.DataFrame({"word": unique_words})
vocab_df["stem"] = vocab_df["word"].map(porter.stem)
vocab_df["lemma"] = vocab_df["word"].map(lemmatizer.lemmatize)
vocab_df.head()

In [None]:
from IPython.display import display

# Row-wise masks: where each normaliser changed the word, and where the two
# normalisers disagree with each other.
stem_altered_mask = vocab_df["stem"] != vocab_df["word"]
lemma_altered_mask = vocab_df["lemma"] != vocab_df["word"]
disagreement_mask = vocab_df["stem"] != vocab_df["lemma"]

total_words = len(vocab_df)
stem_changed = stem_altered_mask.sum()
lemma_changed = lemma_altered_mask.sum()
stem_vs_lemma_diff = disagreement_mask.sum()

summary_rows = (
    ("Total unique words", total_words),
    ("Words altered by stemming", stem_changed),
    ("Words altered by lemmatization", lemma_changed),
    ("Stem vs lemma disagree", stem_vs_lemma_diff),
)
summary_df = pd.DataFrame(
    [{"metric": label, "count": value} for label, value in summary_rows]
)
display(summary_df)

# First ten vocabulary entries whose stem and lemma differ.
diff_examples = vocab_df.loc[disagreement_mask, ["word", "stem", "lemma"]].head(10)
display(diff_examples)

## 5. Feature Extraction (BoW & TF-IDF)
Vectorize the cleaned tweets with a capped vocabulary to create sparse Bag-of-Words and TF-IDF representations.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

MAX_FEATURES = 5000  # cap vocabulary to keep matrix size manageable
MIN_DF = 5  # forwarded to CountVectorizer as min_df

# Bag-of-Words counts over the cleaned tweets.
count_vectorizer = CountVectorizer(min_df=MIN_DF, max_features=MAX_FEATURES)
bow_matrix = count_vectorizer.fit_transform(df["clean_text"])

# Re-weight the raw counts into TF-IDF scores.
tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(bow_matrix)

for label, value in (
    ("BoW matrix shape:", bow_matrix.shape),
    ("TF-IDF matrix shape:", tfidf_matrix.shape),
    ("Vocabulary size:", len(count_vectorizer.get_feature_names_out())),
):
    print(label, value)

In [None]:
# Densify only the first five rows of each sparse matrix for inspection,
# labelling the columns with the vectorizer's feature names.
feature_names = count_vectorizer.get_feature_names_out()

bow_sample = pd.DataFrame(bow_matrix[:5].toarray(), columns=feature_names, dtype=int)
tfidf_sample = pd.DataFrame(tfidf_matrix[:5].toarray(), columns=feature_names)

for sample in (bow_sample, tfidf_sample):
    display(sample.head())

## 6. Visualization
Highlight the most common terms using a word cloud and bar chart to interpret the cleaned corpus.

In [None]:
from collections import Counter
from itertools import chain
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Corpus-wide frequency of every token that survived stop-word removal.
token_counts = Counter()
for tweet_tokens in df["tokens_no_stop"]:
    token_counts.update(tweet_tokens)

wordcloud = WordCloud(
    width=800,
    height=400,
    background_color="white",
).generate_from_frequencies(token_counts)

# Render the cloud as an image on a single axes with the frame hidden.
fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
ax.set_title("Word Cloud of Filtered Tokens")
plt.show()

In [None]:
# Ten most frequent filtered tokens, shown as a table and as a bar chart.
top_words = token_counts.most_common(10)
top_df = pd.DataFrame(top_words, columns=["word", "count"])
display(top_df)

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(top_df["word"], top_df["count"], color="steelblue")
ax.set_title("Top 10 Words by Frequency")
ax.set_xlabel("Word")
ax.set_ylabel("Count")
# Slant the labels and right-align them under their bars.
plt.xticks(rotation=45, ha="right")
fig.tight_layout()
plt.show()

## 7. N-grams
Extract bigrams (pairs of tokens that are adjacent once stop words have been removed) to surface short phrases.

In [None]:
from collections import Counter

def extract_bigrams(tokens):
    """Return the adjacent token pairs of *tokens* as a list of 2-tuples.

    Sequences with fewer than two tokens yield an empty list.
    """
    # Pairing the sequence with itself shifted by one stops at the shorter
    # side, i.e. at the last adjacent pair.
    return list(zip(tokens, tokens[1:])) if len(tokens) >= 2 else []

# Count every bigram across all tweets, then tabulate the five most frequent.
bigram_counter = Counter(
    pair
    for tweet_tokens in df["tokens_no_stop"]
    for pair in extract_bigrams(tweet_tokens)
)

top_bigrams = bigram_counter.most_common(5)
top_bigrams_df = pd.DataFrame(top_bigrams, columns=["bigram", "count"])
top_bigrams_df