In [None]:
# 📌 Install required libraries
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
# 📌 Imports and Mount Google Drive
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, accuracy_score
import spacy
from tqdm import tqdm

# Mount Google Drive first so the review CSVs under /content/drive are reachable
from google.colab import drive
drive.mount('/content/drive')

# Load the small English spaCy pipeline used later for lemmatization
nlp = spacy.load("en_core_web_sm")

In [None]:
# 📁 Read the eight review CSVs (parts 1-8) from Google Drive and stack them
paths = []
for part in range(1, 9):
    paths.append(f"/content/drive/My Drive/amazon-reviews-{part}.csv")

dfs = list(map(pd.read_csv, paths))

# ignore_index=True gives the combined frame a fresh 0..n-1 index
df = pd.concat(dfs, ignore_index=True)

# 🔍 Preview the columns and the first few rows
df.head()

## 🧹 Text Preprocessing: Lowercase and Lemmatization
We lowercase each review and lemmatize it with spaCy to normalize the vocabulary; only alphabetic tokens are kept.

In [None]:
# 🧼 Lemmatization & Lowercase
def _join_alpha_lemmas(doc):
    """Join the lemmas of a processed doc's alphabetic tokens with single spaces."""
    return ' '.join(token.lemma_ for token in doc if token.is_alpha)


def preprocess_text(text):
    """Lowercase one review and return its alphabetic lemmas as one string.

    Args:
        text: Review text. Must be a ``str`` because ``.lower()`` is called on it.

    Returns:
        The lemmas separated by single spaces. Tokens whose ``is_alpha`` flag
        is false are dropped.
    """
    return _join_alpha_lemmas(nlp(text.lower()))


# Random sample (not the first rows) of up to 10,000 reviews, to keep the
# vocabulary count fast. Capping at the number of available reviews avoids the
# ValueError that `sample` raises when asked for more rows than exist.
VOCAB_SAMPLE_SIZE = 10000
# astype(str) guards against non-string values in a mixed-type column.
reviews = df['reviewText'].dropna().astype(str)
sample_texts = reviews.sample(min(VOCAB_SAMPLE_SIZE, len(reviews)), random_state=42)

# Stream the sample through `nlp.pipe` in batches instead of calling `nlp` once
# per row. The parser and NER components are skipped because only `lemma_` and
# `is_alpha` are read from the tokens.
docs = nlp.pipe(sample_texts.str.lower(), batch_size=256, disable=["parser", "ner"])
lemmatized_texts = pd.Series(
    [_join_alpha_lemmas(doc) for doc in tqdm(docs, total=len(sample_texts), desc="Lemmatizing")],
    index=sample_texts.index,
    name=sample_texts.name,
    dtype=object,
)

## 📊 Vocabulary Size after Preprocessing
We fit a `CountVectorizer` on the lemmatized sample and count its unique terms to measure the vocabulary size after lowercasing and lemmatization.

In [None]:
# Learn a bag-of-words vocabulary from the lemmatized sample; `fit` returns the
# fitted vectorizer, so construction and fitting can be chained.
vectorizer = CountVectorizer().fit(lemmatized_texts)

# `vocabulary_` maps each distinct term to a column index, so its length is the
# number of unique terms.
vocab_size = len(vectorizer.vocabulary_)
print(f"Vocabulary size after lowercase and lemmatization: {vocab_size}")

## 🤖 Binary Classification: Predict if Review is 5-Star
We build a pipeline with:
- **TfidfVectorizer**
- **LogisticRegression**

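The grid search covers these hyperparameters:
- `C` (inverse regularization strength of the logistic regression)
- `token_pattern` (held at a single pattern: alphabetic tokens of 3+ letters)
- `use_idf`
- `ngram_range`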

In [None]:
# 🎯 Restrict to (at most) 50,000 reviews that have both text and a rating.
# The subset gets its own name so the raw `df` is left untouched; cells that
# read `df` give the same result whether they run before or after this one.
N_CLASSIFIER_SAMPLES = 50000
labelled = df[['reviewText', 'overall']].dropna()
reviews_df = (
    labelled
    # Cap the sample size: `sample` raises ValueError if asked for more rows than exist.
    .sample(min(N_CLASSIFIER_SAMPLES, len(labelled)), random_state=42)
    # 🔄 Binary classification target: 1 for 5-star reviews, 0 otherwise
    .assign(label=lambda frame: (frame['overall'] == 5).astype(int))
)
X = reviews_df['reviewText']
y = reviews_df['label']

# 📈 Split, stratified on the label so both parts keep the same class balance.
# test_size=0.25 is scikit-learn's default, written out for the reader.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

In [None]:
# 📌 Two-step pipeline: TF-IDF features feeding a logistic-regression classifier
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=1000)),
])

# 🧪 Search space. Keys use the `<step>__<parameter>` form so each value is
# routed to the matching pipeline step. Single-element lists pin a setting.
param_grid = dict([
    ('tfidf__min_df', [5]),
    ('tfidf__max_df', [0.9]),
    ('tfidf__token_pattern', [r'\b[a-zA-Z]{3,}\b']),
    ('tfidf__use_idf', [True, False]),
    ('tfidf__ngram_range', [(1, 1), (1, 2)]),
    ('clf__C', [0.01, 0.1, 1, 10]),
])

# 🔍 Exhaustive search scored by accuracy with 3-fold cross-validation,
# using all available cores (n_jobs=-1)
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train, y_train)

## ✅ Best Model Results and Evaluation
We now evaluate the best pipeline from the grid search on the test set.

In [None]:
# 📢 Best parameters and the mean cross-validated accuracy they achieved
print("Best Parameters:", grid.best_params_)
print("Best CV Accuracy:", grid.best_score_)

# 🧪 Test set performance
y_pred = grid.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# Print the report on its own: passing it as a second argument to print()
# inserts a separator space that shifts the report's header row.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))