In [32]:
import os, json, random, joblib, re
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from collections import Counter
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

# Reproducibility: seed both Python's and NumPy's global generators.
SEED = 42
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(SEED)

# Dataset location and the directory that receives the notebook's artifacts.
INPUT_PATH = "/mnt/data/amazonreviews.tsv"
OUT_DIR = "/mnt/data/amazon_sentiment_outputs"
os.makedirs(OUT_DIR, exist_ok=True)

In [9]:
# Ensure NLTK assets are present, downloading only what is missing.
# nltk.data.find() expects a resource *path* (e.g. "corpora/stopwords"), not a
# bare package id: looking up the bare id always raises LookupError, which
# made every run re-download all packages.
NLTK_RESOURCES = {
    "stopwords": "corpora/stopwords",
    "punkt": "tokenizers/punkt",
    "wordnet": "corpora/wordnet",
    "omw-1.4": "corpora/omw-1.4",
}
nltk_packages = list(NLTK_RESOURCES)
for pkg in nltk_packages:
    try:
        nltk.data.find(NLTK_RESOURCES[pkg])
    except LookupError:
        # Only a missing resource should trigger a download; any other
        # error is allowed to surface instead of being swallowed.
        nltk.download(pkg)

# Shared by every tokenising / cleaning step further down.
STOPWORDS = set(stopwords.words("english"))
LEMMA = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [14]:
print("STEP 1: Loading dataset...")
# Prefer the configured INPUT_PATH; fall back to a copy in the working
# directory so the notebook still runs when that file is absent.
data_path = INPUT_PATH if os.path.exists(INPUT_PATH) else 'amazonreviews.tsv'
df = pd.read_csv(data_path, sep='\t')
print("Rows:", len(df), "Columns:", df.columns.tolist())


Rows: 10000 Columns: ['label', 'review']


In [16]:
print("STEP 2: Quick cleaning...")

# Fail fast when the columns the rest of the notebook reads are missing.
expected = {'label', 'review'}
if not expected <= set(df.columns):
    raise ValueError(f"Input must contain columns: {expected}")

# Drop exact duplicate rows, then rows without a label, and renumber the index.
df = (
    df.drop_duplicates()
      .dropna(subset=['label'])
      .reset_index(drop=True)
)
# A missing review becomes an empty string rather than NaN.
df['review'] = df['review'].fillna("")


STEP 2: Quick cleaning...


In [17]:
# normalize labels to 0/1
label_map = {'pos':1,'positive':1,'neg':0,'negative':0,'1':1,'0':0}
normalized_labels = df['label'].astype(str).str.strip().str.lower()
df['label'] = normalized_labels.map(label_map)
# Values absent from label_map were mapped to NaN above; drop those rows.
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)
label_counts = df['label'].value_counts()
print("After cleaning rows:", len(df), "Label counts:\n", label_counts)


After cleaning rows: 10000 Label counts:
 label
0    5097
1    4903
Name: count, dtype: int64


In [18]:
print("STEP 3: EDA...")
# Character count of every review, stored alongside the text.
df['review_len'] = df['review'].astype(str).map(len)
length_stats = df['review_len'].describe()
print("Review length summary:\n", length_stats)

# top tokens sample

STEP 3: EDA...
Review length summary:
 count    10000.000000
mean       438.695400
std        239.241132
min        101.000000
25%        238.000000
50%        391.000000
75%        605.000000
max       1015.000000
Name: review_len, dtype: float64


In [21]:
# 'punkt_tab' tokenizer data (presumably what nltk.word_tokenize needs in the
# installed NLTK release - confirm). Fetch it only when it is not already on
# disk; nltk itself is imported in the notebook's first cell.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [22]:
# top tokens sample: most frequent non-stopword tokens in the first 5000 reviews
first_reviews = df['review'].astype(str).values[:5000]
sample_text = " ".join(first_reviews).lower()
sample_text = re.sub(r"[^a-z0-9\s']", " ", sample_text)
tokens = []
for tok in nltk.word_tokenize(sample_text):
    if tok not in STOPWORDS:
        tokens.append(tok)
token_counts = Counter(tokens)
print("Top tokens:", token_counts.most_common(15))

Top tokens: [('book', 2952), ("'s", 2836), ("n't", 2763), ('one', 2046), ('like', 1377), ('great', 1352), ('good', 1348), ('would', 1319), ('read', 1224), ('time', 1003), ('get', 974), ('movie', 969), ('really', 784), ('first', 766), ('much', 709)]


In [23]:
# wordclouds (save to OUT_DIR)
def light_prep(s):
    """Lowercase ``s``, blank out every character other than a-z, 0-9,
    whitespace and apostrophes, then return the NLTK tokens that are not in
    STOPWORDS, joined by single spaces."""
    lowered = str(s).lower()
    stripped = re.sub(r"[^a-z0-9\s']", " ", lowered)
    kept = (w for w in nltk.word_tokenize(stripped) if w not in STOPWORDS)
    return " ".join(kept)

def _class_text(label_value, limit=5000):
    """Join the light_prep-cleaned text of the first ``limit`` reviews whose
    label equals ``label_value``."""
    # Slice *before* cleaning so light_prep never runs on rows that would be
    # thrown away by the limit.
    reviews = df.loc[df['label'] == label_value, 'review'].astype(str).iloc[:limit]
    return " ".join(reviews.map(light_prep))

pos_text = _class_text(1)
neg_text = _class_text(0)

# One image per sentiment class. random_state pins the layout so reruns
# produce the same picture; the loop also avoids echoing a WordCloud repr.
for tag, text in (("pos", pos_text), ("neg", neg_text)):
    cloud = WordCloud(width=600, height=300, random_state=SEED).generate(text)
    cloud.to_file(os.path.join(OUT_DIR, f"wordcloud_{tag}.png"))


<wordcloud.wordcloud.WordCloud at 0x7f4264b4cb90>

In [24]:
print("STEP 4: Preprocessing (tokenize, remove stopwords, lemmatize)...")
def clean_text(s):
    """Normalise one raw review string.

    Steps, in order: lowercase; replace ``<...>`` tag-like spans with a space;
    replace URL-like runs (starting with ``http`` or with a literal ``www.``)
    with a space; replace every character other than a-z, 0-9, whitespace and
    apostrophes with a space; collapse whitespace and strip the ends.

    Parameters
    ----------
    s : scalar
        The review text; any non-missing value is converted with ``str``.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``s`` is missing (``pd.isna``).
    """
    if pd.isna(s):
        return ""
    s = str(s).lower()
    s = re.sub(r"<.*?>", " ", s)
    # The dot after "www" must be escaped: unescaped it matches any character,
    # so ordinary words containing "www" (e.g. "awwwesome") were deleted.
    s = re.sub(r"http\S+|www\.\S+", " ", s)
    s = re.sub(r"[^a-z0-9\s']", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def preprocess_text(s):
    """Run clean_text on ``s``, tokenize the result with NLTK, discard tokens
    found in STOPWORDS and lemmatize the rest; the lemmas are returned joined
    by single spaces."""
    lemmas = []
    for token in nltk.word_tokenize(clean_text(s)):
        if token in STOPWORDS:
            continue
        lemmas.append(LEMMA.lemmatize(token))
    return " ".join(lemmas)

# Preprocess every review (this can be slow on large datasets).
raw_reviews = df['review'].astype(str)
df['review_clean'] = raw_reviews.apply(preprocess_text)

STEP 4: Preprocessing (tokenize, remove stopwords, lemmatize)...


In [25]:
print("STEP 5: Train/test split...")
X = df['review_clean'].values
y = df['label'].values
# 80/20 hold-out, stratified on the label and seeded with SEED.
split_parts = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=SEED,
)
X_train, X_test, y_train, y_test = split_parts
print("Train size:", len(X_train), "Test size:", len(X_test))

STEP 5: Train/test split...
Train size: 8000 Test size: 2000


In [26]:
print("STEP 6: TF-IDF vectorizer...")
# Vectorizer settings kept in one place: 1- and 2-grams, min_df of 5,
# and at most 20000 features.
tfidf_params = dict(max_features=20000, ngram_range=(1, 2), min_df=5)
tfidf = TfidfVectorizer(**tfidf_params)

STEP 6: TF-IDF vectorizer...


In [27]:
print("STEP 7: Build pipelines (Logistic, SVM, MLP)...")

def _fresh_tfidf():
    """Return a new, unfitted TfidfVectorizer configured exactly like ``tfidf``."""
    return TfidfVectorizer(**tfidf.get_params())

# Each pipeline owns its vectorizer. Placing the single `tfidf` object in all
# three meant that fitting any one pipeline re-fitted the vectorizer step
# inside the other two as well.
pipelines = {
    "logistic": Pipeline([
        ("tfidf", _fresh_tfidf()),
        ("clf", LogisticRegression(max_iter=1000, random_state=SEED)),
    ]),
    "svm": Pipeline([
        ("tfidf", _fresh_tfidf()),
        ("clf", LinearSVC(max_iter=10000, random_state=SEED)),
    ]),
    "mlp": Pipeline([
        ("tfidf", _fresh_tfidf()),
        ("clf", MLPClassifier(hidden_layer_sizes=(100,), max_iter=50, random_state=SEED)),
    ]),
}

STEP 7: Build pipelines (Logistic, SVM, MLP)...


In [29]:
# ----------------- STEP 8: Train & evaluate -----------------
print("STEP 8: Train & evaluate models...")
results = {}
metric_fns = (accuracy_score, f1_score, precision_score, recall_score)
for name, pipe in pipelines.items():
    print(f"Training {name}...")
    # Fit on the training split, then predict the held-out test split.
    y_pred = pipe.fit(X_train, y_train).predict(X_test)
    acc, f1, prec, rec = (float(fn(y_test, y_pred)) for fn in metric_fns)
    cm = confusion_matrix(y_test, y_pred)
    print(f"{name} -> acc:{acc:.4f} f1:{f1:.4f} prec:{prec:.4f} rec:{rec:.4f}")
    # Plain floats / lists so the summary can be written as JSON later.
    results[name] = {
        "accuracy": acc,
        "f1": f1,
        "precision": prec,
        "recall": rec,
        "confusion_matrix": cm.tolist(),
    }
    # Persist the fitted pipeline under OUT_DIR.
    model_path = os.path.join(OUT_DIR, f"{name}_pipeline.joblib")
    joblib.dump(pipe, model_path)

STEP 8: Train & evaluate models...
Training logistic...
logistic -> acc:0.8510 f1:0.8480 prec:0.8488 rec:0.8471
Training svm...
svm -> acc:0.8430 f1:0.8403 prec:0.8386 rec:0.8420
Training mlp...




mlp -> acc:0.8315 f1:0.8294 prec:0.8239 rec:0.8349


In [30]:
# Cross-validation: stratified, shuffled 5-fold F1 on the training split only.
print("STEP 9: 5-fold CV on training set...")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
cv_summary = {}
for name, pipe in pipelines.items():
    print("CV:", name)
    fold_f1 = cross_val_score(
        pipe, X_train, y_train, cv=cv, scoring='f1', n_jobs=-1
    )
    cv_summary[name] = {
        "f1_mean": float(np.mean(fold_f1)),
        "f1_std": float(np.std(fold_f1)),
        "folds": list(map(float, fold_f1)),
    }


STEP 9: 5-fold CV on training set...
CV: logistic
CV: svm
CV: mlp


In [31]:
# ----------------- STEP 10: Save metrics & outputs -----------------
print("STEP 10: Save metrics & artifacts...")
def make_json_serializable(o):
    """Recursively convert ``o`` into plain Python objects that ``json`` can encode.

    - dict: values are converted recursively; NumPy-scalar keys become the
      matching Python scalars, other keys are kept as they are.
    - list / tuple: returned as a list of converted items.
    - numpy.ndarray: converted via ``tolist()`` (then walked again, so object
      arrays holding NumPy scalars are handled too).
    - NumPy scalar (``np.generic``): converted with ``.item()``.
    - anything else: returned unchanged.
    """
    if isinstance(o, dict):
        return {
            (k.item() if isinstance(k, np.generic) else k): make_json_serializable(v)
            for k, v in o.items()
        }
    # Tuples are walked too: previously they were returned untouched, leaving
    # any NumPy scalars inside them for json.dump to choke on.
    if isinstance(o, (list, tuple)):
        return [make_json_serializable(v) for v in o]
    # ndarray is not an np.generic, so it needs its own branch.
    if isinstance(o, np.ndarray):
        return make_json_serializable(o.tolist())
    if isinstance(o, np.generic):
        return o.item()
    return o

# Collect test metrics, CV metrics and dataset sizes, then write them as JSON.
out_metrics = make_json_serializable({
    "results_test": results,
    "cv_summary": cv_summary,
    "n_rows": int(len(df)),
    "train_size": int(len(X_train)),
    "test_size": int(len(X_test)),
})
metrics_path = os.path.join(OUT_DIR, "metrics_summary.json")
with open(metrics_path, "w") as fh:
    fh.write(json.dumps(out_metrics, indent=2))

# Render each model's confusion matrix as a heatmap PNG in OUT_DIR.
class_names = ['neg', 'pos']
for name, info in results.items():
    cm = np.array(info['confusion_matrix'])
    fig, ax = plt.subplots(figsize=(4,3))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_title(f"Confusion - {name}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    fig.tight_layout()
    fig.savefig(os.path.join(OUT_DIR, f"confusion_{name}.png"))
    # Close the figure so open figures do not pile up across iterations.
    plt.close(fig)

print("Done. Artifacts saved to:", OUT_DIR)
print("Examples: models (*.joblib), wordclouds, confusion matrices, metrics_summary.json")










STEP 10: Save metrics & artifacts...
Done. Artifacts saved to: /mnt/data/amazon_sentiment_outputs
Examples: models (*.joblib), wordclouds, confusion matrices, metrics_summary.json
