# Snorkel on IMDb

In [1]:
pip install snorkel datasets scikit-learn pandas



# 1. Load and Explore the IMDb Dataset

In [2]:
from datasets import load_dataset
import pandas as pd
# Load 2000 training and 500 test examples for speed
imdb = load_dataset("imdb")
# Shuffle the data, then select a smaller sample. This ensures a balanced mix
train = pd.DataFrame(imdb["train"].shuffle(seed=42).select(range(2000)))
test = pd.DataFrame(imdb["test"].shuffle(seed=42).select(range(500)))
print("Train size:", len(train), "Test size:", len(test))
train.head()

Train size: 2000 Test size: 500


Unnamed: 0,text,label
0,There is no relation at all between Fortier an...,1
1,This movie is a great. The plot is very true t...,1
2,"George P. Cosmatos' ""Rambo: First Blood Part I...",0
3,In the process of trying to establish the audi...,1
4,"Yeh, I know -- you're quivering with excitemen...",0


# 2. Preprocess Text

In [3]:
import re
def clean_text(text):
  text = re.sub(r"<br\s*/?>", " ", text)
  text = re.sub(r"[^\w\s']", "", text)
  return text.lower()
train["text"] = train["text"].apply(clean_text)
test["text"] = test["text"].apply(clean_text)

# 3. Define Labeling Functions (LFs)
Create simple heuristics:
- LF_positive: labels text as positive if it contains strong positive words
- LF_negative: labels text as negative if it contains strong negative words
- LF_exclaim: positive if contains “!” more than 2 times

In [4]:
from snorkel.labeling import labeling_function, LFAnalysis
from snorkel.labeling.model.label_model import LabelModel

ABSTAIN, NEG, POS = -1, 0, 1
positive_words = {"great","excellent","amazing","wonderful","best","fantastic"}
negative_words = {"bad","terrible","awful","worst","boring","poor"}

@labeling_function()
def lf_positive(x):
  return POS if any(w in x.text.split() for w in positive_words) else ABSTAIN

@labeling_function()
def lf_negative(x):
  return NEG if any(w in x.text.split() for w in negative_words) else ABSTAIN

@labeling_function()
def lf_exclaim(x):
  return POS if x.text.count("!") > 2 else ABSTAIN
lfs = [lf_positive, lf_negative, lf_exclaim]

# Analyze LF Coverage & Conflicts

In [5]:
from snorkel.labeling import PandasLFApplier
applier = PandasLFApplier(lfs)
L_train = applier.apply(train)
LFAnalysis(L_train, lfs).lf_summary()

100%|██████████| 2000/2000 [00:01<00:00, 1998.69it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_positive,0,[1],0.45,0.1385,0.1385
lf_negative,1,[0],0.3755,0.1385,0.1385
lf_exclaim,2,[],0.0,0.0,0.0


# 4. Train the LabelModel

In [6]:
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=42)

# Get probabilistic labels
train_probs = label_model.predict_proba(L_train)
train_preds = label_model.predict(L_train)

100%|██████████| 500/500 [00:02<00:00, 235.20epoch/s]


# 5. Train an End-to-End Classifier
Use a simple logistic regression on TF-IDF features:

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Vectorize

vectorizer = TfidfVectorizer(max_features=5_000)
X_train = vectorizer.fit_transform(train["text"])
y_train = train_preds


# Create a mask to select only the data points that were NOT abstained on

mask = y_train != ABSTAIN
X_train_filtered = X_train[mask]
y_train_filtered = y_train[mask]

print(f"Original training data points: {len(y_train)}")
print(f"Training data points after filtering abstains: {len(y_train_filtered)}")

# Fit classifier

clf = LogisticRegression(max_iter=200)
clf.fit(X_train_filtered, y_train_filtered)

# Evaluate on test set

X_test = vectorizer.transform(test["text"])
y_test = test["label"]
preds = clf.predict(X_test)
print(classification_report(y_test, preds, target_names=["neg", "pos"]))

Original training data points: 2000
Training data points after filtering abstains: 1374
              precision    recall  f1-score   support

         neg       0.73      0.81      0.77       254
         pos       0.78      0.70      0.73       246

    accuracy                           0.75       500
   macro avg       0.75      0.75      0.75       500
weighted avg       0.75      0.75      0.75       500



# 6. Evaluate Weak Supervision vs. Fully Supervised
For comparison, train the same classifier on 2,000 true labels:

In [8]:
clf_fs = LogisticRegression(max_iter=200)
clf_fs.fit(X_train, train["label"])
fs_preds = clf_fs.predict(X_test)
print("Fully supervised performance:")
print(classification_report(y_test, fs_preds, target_names=["neg", "pos"]))

Fully supervised performance:
              precision    recall  f1-score   support

         neg       0.84      0.81      0.82       254
         pos       0.81      0.84      0.82       246

    accuracy                           0.82       500
   macro avg       0.82      0.82      0.82       500
weighted avg       0.82      0.82      0.82       500

