In [7]:
import pandas as pd

# loading the dataset
df = pd.read_csv(r"C:\Users\asad0\OneDrive\Documents\Fall 2025\Natural Language Processing\midterm\IMDB Dataset.csv")

print("Shape (rows, columns):", df.shape)
print("\nColumns:", df.columns.tolist())

print("\nfirst 3 rows:")
display(df.head(3))


Shape (rows, columns): (50000, 2)

Columns: ['review', 'sentiment']

first 3 rows:


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive


In [9]:
# counting how many reviews per sentiment
sentiment_counts = df["sentiment"].value_counts()

print("sentiment distribution:")
print(sentiment_counts)

sentiment distribution:
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [17]:
# picking one positive and one negative review
positive_review = df[df["sentiment"] == "positive"]["review"].iloc[0]
negative_review = df[df["sentiment"] == "negative"]["review"].iloc[0]

print("POSITIVE REVIEW:\n")
print(positive_review[:1000]) # print first 1000 characters

print("\n" + "-"*80 + "\n")

print("NEGATIVE REVIEW: \n")
print(negative_review[:1000])

POSITIVE REVIEW:

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show

In [27]:
import re

def basic_clean(text):
    # convert to lower case
    text = text.lower()

    # remove HTML tags like <br />
    text = re.sub(r"<.*?>", "", text)
    return text

# take one raw review
raw_review = df["review"].iloc[0]

cleaned_review = basic_clean(raw_review)

print("BEFORE CLEANING:\n")
print(raw_review[:500])

print("\n" +"-"*80 + "\n")

print("After Cleaning: \n")
print(cleaned_review[:500])

BEFORE CLEANING:

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ

--------------------------------------------------------------------------------

After Cleaning: 

one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked. they are right, as this is exactly what happened with me.the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go. trust me, this is not a show for the faint hearted or timid. this show pulls no punches with re

In [35]:
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# customize stopwords: remove negation words from stopword list
custom_stopwords = ENGLISH_STOP_WORDS - {"not", "no", "never"}

def clean_text(text):
    # lowercasing
    text = text.lower()

    # remove HTML
    text = re.sub(r"<.*>", "", text)

    # remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))


    # remove stopwords
    words = text.split()
    words = [word for word in words if word not in custom_stopwords]

    return " ".join(words)


# test on one review
raw_review = df["review"].iloc[0]
cleaned_review = clean_text(raw_review)

print("BEFORE: \n")
print(raw_review[:400])

print("\n" + "-"*80 + "\n")

print("AFTER:\n")
print(cleaned_review[:400])

BEFORE: 

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to

--------------------------------------------------------------------------------

AFTER:

reviewers mentioned watching just 1 oz episode youll hooked right exactly happened mei say main appeal fact goes shows wouldnt dare forget pretty pictures painted mainstream audiences forget charm forget romanceoz doesnt mess episode saw struck nasty surreal say ready watched developed taste oz got accustomed high levels graphic violence not just violence injustice crooked guards wholl sold nickel


In [42]:
from sklearn.feature_extraction.text import CountVectorizer

# take 2 cleaned reviews 
reviews = [
    clean_text(df["review"].iloc[0]),
    clean_text(df["review"].iloc[1])
]

vectorizer = CountVectorizer()

X = vectorizer.fit_transform(reviews)

print("Vocabulary:")
print(vectorizer.get_feature_names_out())

print("\nNumerical reprsentation (Bag of Words):")
print(X.toarray())

Vocabulary:
['accustomed' 'appeal' 'audiences' 'away' 'bitches' 'charm' 'class'
 'comes' 'comfortable' 'concerning' 'crooked' 'dare' 'darker' 'decorating'
 'developed' 'disappears' 'doesnt' 'dream' 'episode' 'exactly'
 'experience' 'fact' 'fantasy' 'flat' 'forget' 'goes' 'got' 'graphic'
 'guard' 'guards' 'halliwell' 'halliwells' 'happened' 'high' 'home'
 'hooked' 'injustice' 'inmates' 'just' 'kill' 'knowledge' 'lack' 'levels'
 'little' 'main' 'mainstream' 'mannered' 'mei' 'mentioned' 'mess' 'middle'
 'murals' 'nasty' 'nickel' 'not' 'order' 'orton' 'oz' 'painted'
 'particularly' 'pictures' 'plays' 'pretty' 'prison' 'production' 'ready'
 'realism' 'really' 'remains' 'reviewers' 'right' 'romanceoz' 'saw' 'say'
 'scenes' 'senses' 'sets' 'shows' 'skills' 'sold' 'solid' 'street'
 'struck' 'surface' 'surreal' 'taste' 'techniques' 'terribly' 'things'
 'touch' 'traditional' 'turned' 'uncomfortable' 'use' 'viewingthats'
 'violence' 'watched' 'watching' 'wholl' 'wonderful' 'wouldnt' 'youll']

Num

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

X_tfidf = tfidf_vectorizer.fit_transform(reviews)

print("TF-IDF Vocabulary:")
print(tfidf_vectorizer.get_feature_names_out())

print("\nTF-IDF numerical representation:")
print(X_tfidf.toarray())



TF-IDF Vocabulary:
['accustomed' 'appeal' 'audiences' 'away' 'bitches' 'charm' 'class'
 'comes' 'comfortable' 'concerning' 'crooked' 'dare' 'darker' 'decorating'
 'developed' 'disappears' 'doesnt' 'dream' 'episode' 'exactly'
 'experience' 'fact' 'fantasy' 'flat' 'forget' 'goes' 'got' 'graphic'
 'guard' 'guards' 'halliwell' 'halliwells' 'happened' 'high' 'home'
 'hooked' 'injustice' 'inmates' 'just' 'kill' 'knowledge' 'lack' 'levels'
 'little' 'main' 'mainstream' 'mannered' 'mei' 'mentioned' 'mess' 'middle'
 'murals' 'nasty' 'nickel' 'not' 'order' 'orton' 'oz' 'painted'
 'particularly' 'pictures' 'plays' 'pretty' 'prison' 'production' 'ready'
 'realism' 'really' 'remains' 'reviewers' 'right' 'romanceoz' 'saw' 'say'
 'scenes' 'senses' 'sets' 'shows' 'skills' 'sold' 'solid' 'street'
 'struck' 'surface' 'surreal' 'taste' 'techniques' 'terribly' 'things'
 'touch' 'traditional' 'turned' 'uncomfortable' 'use' 'viewingthats'
 'violence' 'watched' 'watching' 'wholl' 'wonderful' 'wouldnt' 'youll

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# 1) X = raw text raw text - I will vectorize it. 
X_text = df["review"]

# 2) y = labels (positive/negative)
y = df["sentiment"]

# 3) Train/test split (random_state=42 for reproducibility
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

# 4) TF_IDF vectorization (fit on train, transform train+test)
tfidf = TfidfVectorizer(preprocessor=clean_text)
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)

# 5) train a logistic regression classifier
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Quick accuracy check
accuracy = model.score(X_test, y_test)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.8539


In [53]:
from sklearn.metrics import classification_report, confusion_matrix

# 1) Make predictions on test data
y_pred = model.predict(X_test)

# 2) Print precision, recall, F1-score
print("Classification Report: \n")
print(classification_report(y_test, y_pred))

# 3 Print Confusion Matrix
print("Confusion Matrix: \n")
print(confusion_matrix(y_test, y_pred))

Classification Report: 

              precision    recall  f1-score   support

    negative       0.86      0.84      0.85      5000
    positive       0.85      0.87      0.86      5000

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

Confusion Matrix: 

[[4212  788]
 [ 673 4327]]


In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# TF-IDF with Unigrams +  Bigrams
tfidf_bigram = TfidfVectorizer(
    preprocessor=clean_text,
    ngram_range=(1,2), # (1,2) = unigrams + bigrams
    min_df=2  # ignore words/phrases that appear in only one document
)

X_train_bigram = tfidf_bigram.fit_transform(X_train_text)
X_test_bigram = tfidf_bigram.transform(X_test_text)

model_bigram = LogisticRegression(max_iter=1000, random_state=42)
model_bigram.fit(X_train_bigram, y_train)

y_pred_bigram = model_bigram.predict(X_test_bigram)

print("Bigram Model Accuracy:", model_bigram.score(X_test_bigram, y_test))
print("\nClassification Report: \n")
print(classification_report(y_test, y_pred_bigram))
print("Confusion Matrix: \n")
print(confusion_matrix(y_test, y_pred_bigram))

Bigram Model Accuracy: 0.8599

Classification Report: 

              precision    recall  f1-score   support

    negative       0.87      0.84      0.86      5000
    positive       0.85      0.88      0.86      5000

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

Confusion Matrix: 

[[4221  779]
 [ 622 4378]]


In [63]:
# take a subset for topic modelling (for speed and clarity)
topic_texts = df["review"].sample(n=5000, random_state=42)

# apply cleaning
topic_texts_cleaned = topic_texts.apply(clean_text)

# Quick sanity check
topic_texts_cleaned.head()

33553    really liked summerslam look arena curtains ju...
9427     not television shows appeal quite different ki...
199      film quickly gets major chase scene increasing...
12447    jane austen definitely approve onehighly recom...
39489    expectations somewhat high went movie thought ...
Name: review, dtype: object

In [67]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# 1) Convert text to word counts
count_vectorizer = CountVectorizer(
    max_df=0.9,   # ignore very common words
    min_df=10,    # ignore very rare words
)

X_counts = count_vectorizer.fit_transform(topic_texts_cleaned)

# 2) Train LDA model
num_topics = 5

lda_model = LatentDirichletAllocation(
    n_components=num_topics, 
    random_state=42,
    learning_method="batch"

) 

lda_model.fit(X_counts)

# 3) Function to display top words per topic
def display_topics(model, feature_names, num_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        print(f"\nTopic {topic_idx + 1}:")
        top_words = topic.argsort()[-num_top_words:][::-1]
        print([feature_names[i] for i in top_words])

# 4) Show topics
feature_names = count_vectorizer.get_feature_names_out()
display_topics(lda_model, feature_names)






Topic 1:
['film', 'not', 'movie', 'just', 'time', 'no', 'like', 'dont', 'good', 'really']

Topic 2:
['movie', 'not', 'film', 'just', 'like', 'no', 'horror', 'good', 'watch', 'really']

Topic 3:
['movie', 'not', 'like', 'just', 'bad', 'good', 'seen', 'time', 'movies', 'really']

Topic 4:
['movie', 'not', 'good', 'just', 'like', 'great', 'really', 'film', 'story', 'bad']

Topic 5:
['film', 'not', 'films', 'story', 'best', 'movie', 'like', 'life', 'great', 'good']


In [69]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Extended stopwords for topic modeling
topic_stopwords = ENGLISH_STOP_WORDS.union({
    "movie", "film", "films", "movie",
    "good", "bad", "great", "really", "just",
    "like", "dont", "didnt", "doesnt", "not", 
    "watch", "watched", "watching", "seen", 
    "time", "story"
})

def clean_text_topic(text):
    text =  text.lower()
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[^a-z\s]", "", text) # keep letters only
    words = text.split()
    words = [w for w in words if w not in topic_stopwords and len(w) > 2 ]
    return " ".join(words)

In [71]:
topic_texts_cleaned = topic_texts.apply(clean_text_topic)

In [73]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

count_vectorizer = CountVectorizer(
    min_df=20,     # stronger filtering
    max_df=0.8
)

X_counts = count_vectorizer.fit_transform(topic_texts_cleaned)
lda_model = LatentDirichletAllocation(
    n_components=6,
    random_state=42,
    learning_method="batch"
)

lda_model.fit(X_counts)

feature_names = count_vectorizer.get_feature_names_out()
display_topics(lda_model, feature_names, num_top_words=10)



Topic 1:
['movies', 'make', 'acting', 'people', 'horror', 'think', 'plot', 'effects', 'characters', 'scene']

Topic 2:
['characters', 'character', 'series', 'plot', 'original', 'does', 'episode', 'acting', 'new', 'game']

Topic 3:
['love', 'best', 'comedy', 'funny', 'years', 'old', 'role', 'man', 'young', 'cast']

Topic 4:
['life', 'people', 'world', 'love', 'war', 'young', 'american', 'way', 'best', 'man']

Topic 5:
['horror', 'man', 'little', 'end', 'way', 'scene', 'killer', 'gets', 'does', 'character']

Topic 6:
['people', 'movies', 'think', 'know', 'did', 'say', 'better', 'make', 'acting', 'thing']


In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# TF-IDF for topic modeling
tfidf_vectorizer = TfidfVectorizer(
    min_df=20,
    max_df=0.8
)

X_tfidf_topics = tfidf_vectorizer.fit_transform(topic_texts_cleaned)

# Train NMF
num_topics = 6

nmf_model = NMF(
    n_components=num_topics,
    random_state=42,
    init="nndsvd"
)

nmf_model.fit(X_tfidf_topics)

# display topics
feature_names = tfidf_vectorizer.get_feature_names_out()

def display_topics_nmf(model, feature_names, num_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        print(f"\nTopic {topic_idx + 1}:")
        top_words =  topic.argsort()[-num_top_words:][::-1]
        print([feature_names[i] for i in top_words])

display_topics_nmf(nmf_model, feature_names)




Topic 1:
['man', 'character', 'role', 'scene', 'does', 'best', 'performance', 'young', 'characters', 'director']

Topic 2:
['acting', 'plot', 'worst', 'better', 'movies', 'make', 'people', 'did', 'say', 'know']

Topic 3:
['love', 'life', 'people', 'think', 'saw', 'did', 'family', 'book', 'years', 'loved']

Topic 4:
['funny', 'comedy', 'laugh', 'jokes', 'humor', 'hilarious', 'best', 'fun', 'funniest', 'lot']

Topic 5:
['horror', 'movies', 'gore', 'little', 'dead', 'scary', 'fun', 'house', 'killer', 'effects']

Topic 6:
['series', 'episode', 'characters', 'episodes', 'season', 'new', 'original', 'television', 'character', 'shows']




In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# TF-IDF for topic modeling
tfidf_vectorizer = TfidfVectorizer(
    min_df=20,
    max_df=0.8
)

X_tfidf_topics = tfidf_vectorizer.fit_transform(topic_texts_cleaned)

# Train NMF
num_topics = 6

nmf_model = NMF(
    n_components=num_topics,
    random_state=42,
    init="nndsvd",
    max_iter=500  # increased from default 200
)

nmf_model.fit(X_tfidf_topics)

# display topics
feature_names = tfidf_vectorizer.get_feature_names_out()

def display_topics_nmf(model, feature_names, num_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        print(f"\nTopic {topic_idx + 1}:")
        top_words =  topic.argsort()[-num_top_words:][::-1]
        print([feature_names[i] for i in top_words])

display_topics_nmf(nmf_model, feature_names)


Topic 1:
['man', 'character', 'role', 'scene', 'does', 'best', 'performance', 'young', 'characters', 'director']

Topic 2:
['acting', 'plot', 'worst', 'better', 'movies', 'make', 'did', 'people', 'say', 'know']

Topic 3:
['love', 'life', 'people', 'think', 'saw', 'did', 'family', 'book', 'years', 'loved']

Topic 4:
['funny', 'comedy', 'laugh', 'jokes', 'humor', 'hilarious', 'best', 'fun', 'funniest', 'lot']

Topic 5:
['horror', 'movies', 'gore', 'little', 'dead', 'scary', 'fun', 'house', 'killer', 'effects']

Topic 6:
['series', 'episode', 'characters', 'episodes', 'season', 'new', 'original', 'television', 'character', 'shows']


In [81]:
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

#... 1) Base cleaning
# we will define variants below and swap them in

def clean_base(text: str) -> str:
    text = text.lower()
    text = re.sub(r"<.*?>", ".", text)          # removing HTML
    text = re.sub(r"[^a-z\s]", " ", text)       # keep letters / spaces only
    text = re.sub(r"\s+", " ", text).strip()
    return text
    
#... 2) Variant: base + remove stopwords (but keep "not")
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
STOPWORDS_KEEP_NOT = set(ENGLISH_STOP_WORDS) - {"not", "no", "nor"}

def clean_no_stopwords_keep_not(text: str) -> str:
    text = clean_base(text)
    words = [w for w in text.split() if w not in STOPWORDS_KEEP_NOT]
    return " ".join(words)


#.... 3 Train / evaluate helper ---
def run_sentiment_pipeline(clean_fn, ngram_range=(1,1), min_df=2, max_df=0.9):
    X_text = df["review"]
    y = df["sentiment"]


    X_train_text, X_test_text, y_train, y_test = train_test_split(
        X_text, y, test_size=0.2, random_state=42, stratify=y
    )

    vectorizer = TfidfVectorizer(
        preprocessor=clean_fn,
        ngram_range=ngram_range,
        min_df=min_df,
        max_df=max_df
    )


    X_train = vectorizer.fit_transform(X_train_text)
    X_test = vectorizer.transform(X_test_text)

    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = model.score(X_test, y_test)
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    return acc, cm, report




In [83]:
# pipeline 1 Stopwords kept, Unigrams

acc1, cm1, rep1 = run_sentiment_pipeline(clean_base, ngram_range=(1,1))
print("Pipeline 1 (base cleaning, unigrams) accuracy:", acc1)
print("Confusion matrix:\n", cm1)

Pipeline 1 (base cleaning, unigrams) accuracy: 0.9005
Confusion matrix:
 [[4481  519]
 [ 476 4524]]


In [87]:
# pipeline 2, base cleaning, bigrams

acc2, cm2, rep2 = run_sentiment_pipeline(clean_base, ngram_range=(1,2))
print("Pipeline 2 (base cleaning, unigrams+bigrams) accuracy:", acc2)
print("Confusion matrix:\n", cm2)

Pipeline 2 (base cleaning, unigrams+bigrams) accuracy: 0.9062
Confusion matrix:
 [[4494  506]
 [ 432 4568]]


In [91]:
# pipeline 3, remove stopwords, keep NOT, bigrams

acc3, cm3, rep3 = run_sentiment_pipeline(clean_no_stopwords_keep_not, ngram_range=(1,2))
print("Pipeline 3 (no stopwords but keep NOT, unigrams+bigrams) accuracy:", acc3)
print("Confusion matrix:\n", cm3)

Pipeline 3 (no stopwords but keep NOT, unigrams+bigrams) accuracy: 0.9007
Confusion matrix:
 [[4455  545]
 [ 448 4552]]


In [95]:
# ensuring SpaCy + English model are available

try: 
    import spacy
    try:
        nlp = spacy.load("en_core_web_sm")
        print("spacy is installed and en_core_web_sm is loaded")
    except OSError:
        print("SpaCy is installed but en_core_web_sm model is missing. Installing model...")
        import sys
        import subprocess
        subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
        nlp = spacy.load("en_core_web_sm")
        print("en_core_web_sm installed and loaded.")

except ImportError:
    print("SpaCy is not installed. Installing SpaCy + model...")
    import sys
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "spacy"])
    subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
    import spacy
    nlp = spacy.load("en_core_web_sm")
    print("spaCy + en_core_web_sm installed and loaded.")
    

spacy is installed and en_core_web_sm is loaded


In [97]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Keep negations (important for sentiment)
STOPWORDS_KEEP_NOT = set(ENGLISH_STOP_WORDS) - {"not", "no", "nor"}

def clean_lemmatize(text: str) -> str:
    # 1) lowercase + remove HTML
    text = text.lower()
    text = re.sub(r"<.*?>", " ", text)

    # 2) spaCy lemmatization
    doc = nlp(text)

    # 3) keep useful lemmas only
    tokens = []
    for tok in doc:
        # skip spaces/punct/numbers
        if tok.is_space or tok.is_punct or tok.like_num:
            continue

        lemma = tok.lemma_.strip()

        # remove very short tokens and stopwords (but keep not/no/nor)
        if len(lemma) <= 2:
            continue
        if lemma in STOPWORDS_KEEP_NOT:
            continue

        tokens.append(lemma)

    return " ".join(tokens)

# Vectorize with TF-IDF (unigrams + bigrams)
tfidf_lemma = TfidfVectorizer(
    preprocessor=clean_lemmatize,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9
)

X_train_lemma = tfidf_lemma.fit_transform(X_train_text)
X_test_lemma = tfidf_lemma.transform(X_test_text)

model_lemma = LogisticRegression(max_iter=1000, random_state=42)
model_lemma.fit(X_train_lemma, y_train)

y_pred_lemma = model_lemma.predict(X_test_lemma)

print("Pipeline 4 (spaCy lemmatization + TF-IDF bigrams) Accuracy:", model_lemma.score(X_test_lemma, y_test))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_lemma))
print("Confusion Matrix:\n")
print(confusion_matrix(y_test, y_pred_lemma))

Pipeline 4 (spaCy lemmatization + TF-IDF bigrams) Accuracy: 0.8964

Classification Report:

              precision    recall  f1-score   support

    negative       0.91      0.88      0.89      5000
    positive       0.89      0.91      0.90      5000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Confusion Matrix:

[[4409  591]
 [ 445 4555]]
