In [1]:
from datasets import load_dataset
import pandas as pd

# Load the IMDB reviews dataset; the "train" and "test" splits are used below.
dataset = load_dataset("stanfordnlp/imdb")

# Dataset.to_pandas() converts each split in one call instead of having
# pd.DataFrame() iterate the dataset and assemble the frame row by row.
train_df = dataset["train"].to_pandas()
test_df = dataset["test"].to_pandas()

train_df.head()


  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [3]:
import spacy
from tqdm import tqdm

# When running on CPU: slow, but needs little computing power; less accurate.
nlp = spacy.load("en_core_web_sm")


# When running on GCP: use this pipeline instead for faster and more accurate NER.
#nlp = spacy.load("en_core_web_trf")

def extract_person_entities(text):
    """Return the text of every entity labelled PERSON that spaCy finds in ``text``.

    The string is run through the module-level ``nlp`` pipeline. Names are
    returned in the order ``doc.ents`` yields them, and repeated mentions are
    kept (no de-duplication).
    """
    people = []
    for entity in nlp(text).ents:
        if entity.label_ == "PERSON":
            people.append(entity.text)
    return people

# Register pandas' .progress_apply (only needed by the per-row alternative below).
tqdm.pandas()

# Alternative to use when on GCP: per-row extraction with a progress bar.
#train_df["persons"] = train_df["text"].progress_apply(extract_person_entities)
#test_df["persons"] = test_df["text"].progress_apply(extract_person_entities)


def _pipe_person_entities(texts, batch_size=32, n_process=4):
    """Return one list of PERSON entity strings per text, in input order.

    The texts are streamed through ``nlp.pipe`` in batches of ``batch_size``
    using ``n_process`` worker processes, and wrapped in ``tqdm`` so the
    long-running extraction shows its progress.
    """
    docs = nlp.pipe(texts, batch_size=batch_size, n_process=n_process)
    return [
        [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
        for doc in tqdm(docs, total=len(texts))
    ]


# Else, when on CPU: batched extraction.
# Both splits need the column: the entity-skew analysis further down reads
# test_df["persons"], so populating only train_df would fail there with a KeyError.
persons_train = _pipe_person_entities(train_df["text"])
train_df["persons"] = persons_train

persons_test = _pipe_person_entities(test_df["text"])
test_df["persons"] = persons_test

train_df.head()


Unnamed: 0,text,label,persons
0,I rented I AM CURIOUS-YELLOW from my video sto...,0,"[Lena, Ingmar Bergman, John Ford]"
1,"""I Am Curious: Yellow"" is a risible and preten...",0,"[Vincent Gallo's, johnson, Chloe Sevigny]"
2,If only to avoid making this type of film in t...,0,[]
3,This film was probably inspired by Godard's Ma...,0,"[Godard, Lena Nyman, Godard]"
4,"Oh, brother...after hearing about this ridicul...",0,[Peggy Lee]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Targets come straight from the frames and do not depend on the vectorizer.
y_train, y_test = train_df["label"], test_df["label"]

# Unigram + bigram TF-IDF with English stop words removed, capped at 20k features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=20_000)

# Fit on the training texts only; the test texts are transformed with the
# already-fitted vectorizer.
X_train = vectorizer.fit_transform(train_df["text"])
X_test = vectorizer.transform(test_df["text"])


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score

# fit() returns the estimator itself, so construction and training can be chained.
log_reg = LogisticRegression(n_jobs=-1, max_iter=3000).fit(X_train, y_train)

# Predicted labels for the test feature matrix.
pred_lr = log_reg.predict(X_test)

print("=== Logistic Regression Classification Report ===")
print(classification_report(y_true=y_test, y_pred=pred_lr))
print("Macro F1:", f1_score(y_true=y_test, y_pred=pred_lr, average="macro"))


In [None]:
from sklearn.svm import LinearSVC

# Fix the seed so the fit is reproducible across runs: when LinearSVC uses its
# dual solver the training data is shuffled with a pseudo-random generator, and
# an unseeded run can produce slightly different coefficients each time.
svm = LinearSVC(random_state=42)
svm.fit(X_train, y_train)

# Predicted labels for the test feature matrix.
pred_svm = svm.predict(X_test)

print("=== Linear SVM Classification Report ===")
print(classification_report(y_test, pred_svm))
print("Macro F1:", f1_score(y_test, pred_svm, average="macro"))


In [None]:
from collections import defaultdict
import numpy as np

def compute_entity_skew(df, predictions, count_repeats=False):
    """Summarise the predicted sentiment of the reviews that mention each person.

    Args:
        df: DataFrame with a "persons" column holding, per row, an iterable of
            entity names (strings).
        predictions: One predicted label per row of ``df``, in the same order.
        count_repeats: When False (default) a name contributes once per review,
            however many times the review mentions it, so ``count`` is the
            number of reviews naming the entity. When True every mention is
            counted, which weights the average towards reviews that repeat a
            name.

    Returns:
        DataFrame with columns "entity", "avg_sentiment" (mean of the
        predictions collected for that entity) and "count", sorted by "count"
        in descending order.

    Raises:
        ValueError: If ``df`` and ``predictions`` differ in length.
    """
    person_lists = list(df["persons"])
    predicted = list(predictions)
    # zip() would silently drop the unmatched tail, hiding a misaligned input.
    if len(person_lists) != len(predicted):
        raise ValueError(
            f"df has {len(person_lists)} rows but predictions has {len(predicted)} entries"
        )

    entity_sentiments = defaultdict(list)
    for persons, sentiment in zip(person_lists, predicted):
        # dict.fromkeys de-duplicates while keeping first-mention order.
        mentioned = persons if count_repeats else dict.fromkeys(persons)
        for person in mentioned:
            entity_sentiments[person].append(sentiment)

    rows = [
        (person, np.mean(sentiments), len(sentiments))
        for person, sentiments in entity_sentiments.items()
    ]
    skew_df = pd.DataFrame(rows, columns=["entity", "avg_sentiment", "count"])
    return skew_df.sort_values(by="count", ascending=False)

# Example using Logistic Regression predictions.
# test_df must already carry a "persons" column (one list of names per review),
# aligned row-for-row with pred_lr.
skew_df = compute_entity_skew(test_df, pred_lr)
skew_df.head(10)


In [None]:
# avg_sentiment is the mean predicted label per entity. Entities above 0.7 are
# reported as positively skewed and those below 0.3 as negatively skewed.
# head(10) keeps the first ten matches in skew_df's existing row order.
biased_positive = skew_df.loc[skew_df["avg_sentiment"].gt(0.7)].head(10)
biased_negative = skew_df.loc[skew_df["avg_sentiment"].lt(0.3)].head(10)

print("Entities with unusually positive sentiment:")
print(biased_positive)

print("\nEntities with unusually negative sentiment:")
print(biased_negative)
