In [None]:
!pip -q install datasets scikit-learn pandas joblib

In [1]:
from datasets import load_dataset
import pandas as pd
import os, random

# 1) Load IMDB through Hugging Face `datasets`.
#    Labelled splits: 25k train + 25k test (labels: 0=neg, 1=pos).
#    The download also contains a 50k "unsupervised" split, which is not used below.
ds = load_dataset("imdb")

# 2) Make a smaller, quick subset first (you can increase later)
def to_df(split, n=8000, seed=42):  # ~8k rows quick training; change n if you want
    """Return a shuffled, class-balanced sample of ``split`` as a DataFrame.

    Parameters
    ----------
    split
        Anything ``pd.DataFrame(...)`` accepts that yields ``text`` and
        ``label`` columns, with ``label`` equal to 1 (positive) or 0 (negative).
    n : int
        Total number of rows requested. ``n // 2`` rows are drawn per class,
        so an odd ``n`` yields ``n - 1`` rows.
    seed : int
        ``random_state`` used for both the per-class sampling and the final
        shuffle. The default (42) reproduces the previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``label``, rows shuffled, index reset to 0..k-1.

    Raises
    ------
    ValueError
        If either class has fewer than ``n // 2`` rows.
    """
    frame = pd.DataFrame(split)[["text", "label"]]
    per_class = n // 2

    sampled = []
    # Positive rows first, then negative: the concat order feeds the seeded shuffle below.
    for label_value in (1, 0):
        pool = frame[frame["label"] == label_value]
        if len(pool) < per_class:
            raise ValueError(
                f"Cannot draw {per_class} rows with label={label_value}: "
                f"only {len(pool)} available. Lower n."
            )
        sampled.append(pool.sample(per_class, random_state=seed))

    return pd.concat(sampled).sample(frac=1, random_state=seed).reset_index(drop=True)

# Draw the balanced working subsets: 8k rows from the train split, 2k from the test split
train_df, test_df = (
    to_df(ds[split_name], n=size)
    for split_name, size in (("train", 8000), ("test", 2000))
)

# 3) Make sure the output folder exists, then write one CSV per subset
os.makedirs("/content/data", exist_ok=True)
for frame, csv_path in (
    (train_df, "/content/data/train.csv"),
    (test_df, "/content/data/test.csv"),
):
    frame.to_csv(csv_path, index=False)

# 4) Quick peek: first rows, then the per-class row counts of the training subset
print(train_df.head(3))
print("\nLabel counts (train):")
print(train_df["label"].value_counts())
print("\nFiles saved at /content/data/")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

                                                text  label
0  A movie made for contemporary audience. The ma...      1
1  Moonwalker is a Fantasy Music film staring Mic...      1
2  One measurement for the greatness of a movie i...      1

Label counts (train):
label
1    4000
0    4000
Name: count, dtype: int64

Files saved at /content/data/


In [3]:
import pandas as pd, os

# Create the output folder if it is not there yet
os.makedirs("/content/data", exist_ok=True)

# Demo rows: you can replace these with your own data later
demo_texts = [
    "The movie was fantastic and I loved the acting",
    "Worst product ever, totally disappointed",
    "Service was quick and helpful",
    "The app keeps crashing. Very bad experience",
    "Absolutely brilliant design and performance",
    "Not worth the money",
]
demo_labels = ["pos", "neg", "pos", "neg", "pos", "neg"]
rows = list(zip(demo_texts, demo_labels))

# Build the two-column frame and write it out without the index column
df = pd.DataFrame.from_records(rows, columns=["text", "label"])
df.to_csv("/content/data/reviews.csv", index=False)

print("Saved at /content/data/reviews.csv")
# Read the file back to confirm what was written (last expression is displayed by the notebook)
pd.read_csv("/content/data/reviews.csv").head()


Saved at /content/data/reviews.csv


Unnamed: 0,text,label
0,The movie was fantastic and I loved the acting,pos
1,"Worst product ever, totally disappointed",neg
2,Service was quick and helpful,pos
3,The app keeps crashing. Very bad experience,neg
4,Absolutely brilliant design and performance,pos


In [4]:
# ==== Install (usually already in Colab) ====
!pip -q install scikit-learn joblib pandas

# ==== Imports ====
import os, joblib, pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# ==== 0) Sanity checks ====
# This cell trains on whatever frame is currently bound to the global name `df`.
assert 'df' in globals(), "Dataframe 'df' not found. Make sure you created df earlier."
print("Columns:", df.columns.tolist())
assert 'text' in df.columns and 'label' in df.columns, "Your df must have 'text' and 'label' columns."

# ==== 1) Clean + normalize labels to {'pos', 'neg'} ====
# Work on a copy so `df` itself is never modified; rows missing text or label are dropped.
work = df[['text','label']].dropna().copy()

# Numeric labels (a numeric dtype, or strings made only of digits) come in two flavours:
#   * binary 0/1 flags -> 0 = neg, 1 = pos
#   * 1-5 star ratings -> 1,2 = neg; 4,5 = pos; 3 is dropped as neutral
# Booleans are excluded here on purpose; they are handled by the text mapping further down.
label_is_numeric = (
    pd.api.types.is_numeric_dtype(work['label'])
    and not pd.api.types.is_bool_dtype(work['label'])
)
if label_is_numeric or work['label'].astype(str).str.isnumeric().all():
    label_values = pd.to_numeric(work['label'], errors='coerce').to_numpy(dtype=float)
    if np.isin(label_values, [0, 1]).all():
        # Binary encoding: without this branch the star-rating rule below would
        # discard every 0 and turn every 1 into 'neg'.
        work['label'] = np.where(label_values == 1, 'pos', 'neg')
    else:
        # keep only 1,2,4,5; drop 3 (and anything else) as neutral
        keep = np.isin(label_values, [1, 2, 4, 5])
        # .copy() so the assignment below writes to an independent frame, not a slice view
        work = work.loc[keep].copy()
        work['label'] = np.where(label_values[keep] <= 2, 'neg', 'pos')

# Text labels such as "Positive"/"negative"/"pos"/"neg": normalize case and whitespace,
# then fold the known spellings onto {'pos','neg'}. '1'/'true' are positive, '0'/'false' negative.
work['label'] = work['label'].astype(str).str.lower().str.strip()
label_map = {'positive':'pos','negative':'neg','pos':'pos','neg':'neg','1':'pos','0':'neg','true':'pos','false':'neg'}
work['label'] = work['label'].map(lambda x: label_map.get(x, x))
# Anything that still is not 'pos'/'neg' is an unknown label and is discarded.
work = work[work['label'].isin(['pos','neg'])]

print("Class counts:\n", work['label'].value_counts())
# A binary classifier cannot be trained on a single class; fail here with a readable message.
assert work['label'].nunique() == 2, "Need both 'pos' and 'neg' rows after label normalization."

# ==== 2) Split ====
# Hold out 20% of the rows for evaluation, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    work['text'],
    work['label'],
    stratify=work['label'],
    test_size=0.2,
    random_state=42,
)

# ==== 3) Pipeline (TF-IDF + Logistic Regression) ====
# Step 1: lower-cased unigram + bigram TF-IDF features (max_features=50000, min_df=2).
tfidf_step = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1, 2),
    max_features=50000,
    min_df=2,
)
# Step 2: logistic regression on top of the TF-IDF features.
clf_step = LogisticRegression(max_iter=1000, n_jobs=-1)

pipe = Pipeline(steps=[("tfidf", tfidf_step), ("clf", clf_step)])

# Fit vectorizer and classifier together on the training part only.
pipe.fit(X_train, y_train)

# ==== 4) Evaluate ====
# Score the held-out rows only; X_test / y_test were never seen during fit.
y_pred = pipe.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {acc:.4f}\n")
print(classification_report(y_test, y_pred))
# Pin the label order so the matrix is always 2x2 in neg/pos order, even if a class
# happens to be missing from a tiny test split (rows = true label, columns = predicted).
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred, labels=['neg', 'pos']))

# Show a few example predictions
# A seeded generator keeps the displayed examples identical across re-runs.
sample_rng = np.random.default_rng(42)
sample_idx = sample_rng.choice(len(X_test), size=min(5, len(X_test)), replace=False)
print("\nSample predictions:")
for i in sample_idx:
    # Positional indexing throughout: y_pred is a plain array aligned with X_test's row order.
    txt = X_test.iloc[i][:200].replace("\n"," ")
    print(f"- {txt!r} → {y_pred[i]} (true: {y_test.iloc[i]})")

# ==== 5) Save model ====
# Persist the whole fitted pipeline (vectorizer + classifier) as one file,
# so the loader gets both steps from a single artifact.
os.makedirs("models", exist_ok=True)
joblib.dump(pipe, "models/tfidf_logreg.joblib")
print("\nSaved model to: models/tfidf_logreg.joblib")


Columns: ['text', 'label']
Class counts:
 label
pos    3
neg    3
Name: count, dtype: int64

Accuracy: 1.0000

              precision    recall  f1-score   support

         neg       1.00      1.00      1.00         1
         pos       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2

Confusion matrix:
 [[1 0]
 [0 1]]

Sample predictions:
- 'The movie was fantastic and I loved the acting' → pos (true: pos)
- 'Not worth the money' → neg (true: neg)

Saved model to: models/tfidf_logreg.joblib


In [5]:
import joblib

# Load the saved pipeline once at cell level so predict() below reuses the same object
# instead of re-reading the file on every call.
# NOTE(review): joblib.load unpickles the file; only load model files you trust.
PIPE_PATH = "models/tfidf_logreg.joblib"
pipe = joblib.load(PIPE_PATH)

def predict(text: str, model=None):
    """Classify one piece of text and report the winning class probability.

    Parameters
    ----------
    text : str
        The text to classify. ``None``, an empty string, or whitespace only
        is treated as "no input".
    model : optional
        Any fitted classifier exposing ``predict_proba`` and ``classes_``.
        Defaults to the module-level ``pipe`` loaded above.

    Returns
    -------
    tuple
        ``(label, probability)``: the most probable class and its probability
        as a float. ``("empty", 0.0)`` when there is no input.
    """
    if text is None or not text.strip():
        return "empty", 0.0

    if model is None:
        model = pipe

    # One forward pass only: the label is read from the same probability row,
    # instead of calling predict() and predict_proba() separately.
    probs = model.predict_proba([text])[0]
    best = max(range(len(probs)), key=lambda idx: probs[idx])
    return model.classes_[best], float(probs[best])

# quick tests: three hand-written sentences run through predict()
tests = [
    "The movie was fantastic and I loved it!",
    "Worst product, waste of money.",
    "It was okay, not great not terrible.",
]
for sentence in tests:
    label, confidence = predict(sentence)
    # confidence is printed as a percentage with two decimals
    print(f"{sentence} -> {label} ({confidence:.2%})")


The movie was fantastic and I loved it! -> pos (59.90%)
Worst product, waste of money. -> neg (59.89%)
It was okay, not great not terrible. -> neg (59.89%)
