In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report



In [2]:
# Load the dataset
file_path = "Restaurant_Reviews.tsv"
dataset = pd.read_csv(file_path, delimiter='\t', quoting=3)
dataset.head(10)


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


In [3]:
# Feature extraction
vectorizer = CountVectorizer(max_features=1500)
X = vectorizer.fit_transform(dataset['Review']).toarray()
y = dataset['Liked']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [4]:
from sklearn.linear_model import LogisticRegression

# Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))


Accuracy: 0.81
              precision    recall  f1-score   support

           0       0.77      0.88      0.82        96
           1       0.87      0.76      0.81       104

    accuracy                           0.81       200
   macro avg       0.82      0.82      0.81       200
weighted avg       0.82      0.81      0.81       200



In [9]:
# Train model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(report)


Accuracy: 0.79
              precision    recall  f1-score   support

           0       0.75      0.83      0.79        96
           1       0.83      0.75      0.79       104

    accuracy                           0.79       200
   macro avg       0.79      0.79      0.79       200
weighted avg       0.79      0.79      0.79       200



In [6]:
from sklearn.svm import SVC

# Train model
model = SVC(kernel='linear')
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))


Accuracy: 0.79
              precision    recall  f1-score   support

           0       0.75      0.85      0.80        96
           1       0.84      0.73      0.78       104

    accuracy                           0.79       200
   macro avg       0.79      0.79      0.79       200
weighted avg       0.80      0.79      0.79       200



In [7]:
from sklearn.ensemble import RandomForestClassifier

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))


Accuracy: 0.76
              precision    recall  f1-score   support

           0       0.70      0.89      0.78        96
           1       0.86      0.64      0.74       104

    accuracy                           0.76       200
   macro avg       0.78      0.76      0.76       200
weighted avg       0.78      0.76      0.76       200



In [10]:
from xgboost import XGBClassifier

# Train model
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))


ModuleNotFoundError: No module named 'xgboost'

In [None]:
#Alternative Machine Learning Models for Sentiment Analysis
#Since Multinomial Naïve Bayes (MultinomialNB) is used, exploring other models can improve accuracy and performance.

#Logistic Regression: A strong baseline for text classification. It is simple, interpretable, and effective for binary sentiment analysis but struggles with highly imbalanced datasets.

#Support Vector Machine (SVM): Works well with high-dimensional text data. It is powerful for small datasets and captures decision boundaries effectively but can be computationally expensive.

#Random Forest: A robust ensemble learning method that handles complex relationships between words well. It is less prone to overfitting compared to Naïve Bayes but may be slower with large feature sets.

#XGBoost: A boosting algorithm designed for structured text data. It provides high accuracy and handles overfitting well but requires tuning for optimal performance.

#Deep Learning Models (LSTM/BERT): These models understand word relationships and contextual meaning in sentiment analysis. They perform exceptionally well but require more data and computational resources.

#Choosing the Right Model
#If you need simplicity and speed, go for Logistic Regression.

#If you want higher accuracy, SVM is a great choice.

#If your dataset is large, XGBoost is optimal.

#If you need interpretability, Random Forest is a good option.

#If you want deep contextual understanding, LSTM/BERT will provide the best results.

In [None]:
# apart from vectorizer which method can be used


# There are several alternative methods to CountVectorizer for converting text into numerical form for machine learning models. Each has its own strengths and is suited to different use cases. Here's a detailed comparison:

# 🔹 1. TfidfVectorizer (Term Frequency - Inverse Document Frequency)
# 📌 Description:
# Unlike CountVectorizer, which counts word frequency, TfidfVectorizer measures how important a word is in a document relative to the entire corpus.

# It reduces the impact of frequently occurring but less meaningful words (like "the", "is").

# ✅ Pros:
# Gives less weight to common words, more to unique ones.

# Improves performance in many NLP tasks.

# 📦 Code Example:
# python
# Copy
# Edit
# from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer = TfidfVectorizer(max_features=1500)
# X = vectorizer.fit_transform(dataset['Review']).toarray()
# 🔹 2. HashingVectorizer
# 📌 Description:
# Uses a hashing trick to convert words to numbers without storing the vocabulary.

# Fixed-size vector space; very fast and memory-efficient.

# ✅ Pros:
# Good for large-scale applications and online learning.

# No need to keep a vocabulary dictionary.

# ❌ Cons:
# Cannot reverse-map numbers back to words.

# Possible hash collisions.

# 📦 Code Example:
# python
# Copy
# Edit
# from sklearn.feature_extraction.text import HashingVectorizer

# vectorizer = HashingVectorizer(n_features=1500)
# X = vectorizer.transform(dataset['Review']).toarray()
# 🔹 3. Word Embeddings (Word2Vec / GloVe / FastText)
# 📌 Description:
# Word embeddings represent words as dense vectors (e.g., 100 or 300 dimensions) capturing semantic meaning.

# Pre-trained embeddings like GloVe or Word2Vec are often used.

# ✅ Pros:
# Captures context and meaning (e.g., “king” - “man” + “woman” ≈ “queen”).

# Useful for deep learning and NLP tasks.

# ❌ Cons:
# More complex to implement.

# Requires more compute power.

# 📦 Code Example (with Gensim Word2Vec):
# python
# Copy
# Edit
# from gensim.models import Word2Vec
# sentences = [review.split() for review in dataset['Review']]
# model = Word2Vec(sentences, vector_size=100, window=5, min_count=1)
# Or you can use pre-trained GloVe/Word2Vec and average word vectors for each review.

# 🔹 4. Transformers / BERT Embeddings
# 📌 Description:
# State-of-the-art models like BERT, RoBERTa, etc., generate contextual embeddings.

# Words are embedded differently based on their surrounding words.

# ✅ Pros:
# Extremely powerful, captures deep context.

# Great for sentiment analysis and many NLP tasks.

# ❌ Cons:
# Requires GPU or strong CPU.

# More complex to implement.

# 📦 Example with transformers:
# python
# Copy
# Edit
# from transformers import BertTokenizer, BertModel
# import torch

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')

# inputs = tokenizer("Sample review text", return_tensors="pt")
# outputs = model(**inputs)
# embedding = outputs.last_hidden_state.mean(dim=1)
# ✅ Summary Table:
# Method	Context-Aware?	Sparse/Dense	Easy to Use	Best For
# CountVectorizer	❌ No	Sparse	✅ Yes	Basic text classification
# TfidfVectorizer	❌ No	Sparse	✅ Yes	Better weighted word importance
# HashingVectorizer	❌ No	Sparse	✅ Yes	Large datasets, fast processing
# Word2Vec / GloVe	✅ Yes (some)	Dense	⚠ Medium	Semantics and similarity
# BERT / Transformers	✅✅ Very Much	Dense	❌ Complex	Advanced NLP tasks
# Let me know your use case (e.g., sentiment analysis, spam detection, chatbot) — I can suggest the best method with example code!



# ✅ By default, CountVectorizer does convert text to lowercase automatically.
# So you don’t have to explicitly convert text to lowercase before using it.

# 🔍 Here's what happens under the hood:
# When you use:

# python
# Copy
# Edit
# vectorizer = CountVectorizer()
# Internally, it uses:

# python
# Copy
# Edit
# preprocessor = text.lower()
# This is controlled by the parameter:

# python
# Copy
# Edit
# lowercase=True  # default behavior
# 🛠️ If you want to disable it for any reason:
# python
# Copy
# Edit
# vectorizer = CountVectorizer(lowercase=False)
# This will preserve original casing (e.g., "Apple" and "apple" will be treated as different words).

# 🧪 Quick Example:
# python
# Copy
# Edit
# from sklearn.feature_extraction.text import CountVectorizer

# docs = ["I Love NLP", "i love nlp"]

# vectorizer = CountVectorizer()
# X = vectorizer.fit_transform(docs)
# print(vectorizer.get_feature_names_out())
# 📤 Output:

# css
# Copy
# Edit
# ['love' 'nlp']
# So both "Love" and "love" are treated the same due to automatic lowercasing.

# 💡 Tip:
# Even though CountVectorizer lowers the case, you might still want to do other cleaning steps manually before vectorizing:

# Removing punctuation

# Removing stopwords

# Lemmatization or stemming

# This is especially useful when you want more control over:

# Cleaning your text

# Improving model performance

# Reducing noise

# 🛠️ NLP Text Preprocessing Pipeline – Step-by-Step
# We'll cover each of these:

# Lowercasing

# Removing punctuation

# Removing stopwords

# Tokenization

# Stemming or Lemmatization

# Rejoining tokens

# We’ll use nltk and re libraries here.

# 🔹 1. Lowercasing
# 📌 Purpose:
# To make the text uniform — "Apple" and "apple" will be treated the same.

# ✅ Code:
# python
# Copy
# Edit
# text = "I Love NLP!"
# text = text.lower()
# print(text)  # Output: "i love nlp!"
# 🔹 2. Removing Punctuation
# 📌 Purpose:
# Punctuation is usually not useful for sentiment analysis, classification, etc.

# ✅ Code:
# python
# Copy
# Edit
# import re

# text = re.sub(r'[^\w\s]', '', text)
# print(text)  # Output: "i love nlp"
# \w matches words (letters/numbers)

# \s matches spaces

# ^ negates — so it removes anything that is NOT a word or space

# 🔹 3. Removing Stopwords
# 📌 Purpose:
# Stopwords like "is", "the", "and", etc., appear very frequently and usually don't carry meaning.

# ✅ Code:
# python
# Copy
# Edit
# import nltk
# from nltk.corpus import stopwords

# nltk.download('stopwords')
# stop_words = set(stopwords.words('english'))

# words = text.split()
# filtered_words = [word for word in words if word not in stop_words]
# print(filtered_words)  # Output: ['love', 'nlp']
# 🔹 4. Tokenization
# 📌 Purpose:
# Splitting a sentence into individual words (tokens).

# ✅ Code (basic):
# python
# Copy
# Edit
# from nltk.tokenize import word_tokenize
# nltk.download('punkt')

# text = "I love NLP"
# tokens = word_tokenize(text.lower())
# print(tokens)  # Output: ['i', 'love', 'nlp']
# 🔹 5. Stemming or Lemmatization
# You can choose one:

# 🟣 5A. Stemming (Porter Stemmer)
# Cuts off word suffixes.

# Less accurate, but faster.

# python
# Copy
# Edit
# from nltk.stem import PorterStemmer
# ps = PorterStemmer()

# stemmed = [ps.stem(word) for word in filtered_words]
# print(stemmed)  # Output: ['love', 'nlp'] (no change in this example)
# 🟢 5B. Lemmatization (WordNet Lemmatizer)
# Smarter – returns actual base/root word (lemma).

# Slower but more accurate.

# python
# Copy
# Edit
# from nltk.stem import WordNetLemmatizer
# nltk.download('wordnet')
# lemmatizer = WordNetLemmatizer()

# lemmatized = [lemmatizer.lemmatize(word) for word in filtered_words]
# print(lemmatized)  # Output: ['love', 'nlp']
# 🔹 6. Rejoining Tokens
# 📌 Purpose:
# After preprocessing, you might want to convert tokens back into full cleaned-up text.

# python
# Copy
# Edit
# cleaned_text = ' '.join(lemmatized)
# print(cleaned_text)  # Output: "love nlp"
# ✅ Full Pipeline Function
# Here’s everything in one function:

# python
# Copy
# Edit
# import re
# import nltk
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
# from nltk.tokenize import word_tokenize

# # Downloads (run once)
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

# def preprocess_text(text):
#     # Lowercase
#     text = text.lower()
    
#     # Remove punctuation
#     text = re.sub(r'[^\w\s]', '', text)
    
#     # Tokenize
#     tokens = word_tokenize(text)
    
#     # Remove stopwords
#     stop_words = set(stopwords.words('english'))
#     tokens = [word for word in tokens if word not in stop_words]
    
#     # Lemmatize
#     lemmatizer = WordNetLemmatizer()
#     tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
#     # Rejoin
#     cleaned_text = ' '.join(tokens)
#     return cleaned_text
# 🔄 Usage Example:
# python
# Copy
# Edit
# dataset['Cleaned_Review'] = dataset['Review'].apply(preprocess_text)
# Then you can vectorize:

# python
# Copy
# Edit
# vectorizer = CountVectorizer(max_features=1500)
# X = vectorizer.fit_transform(dataset['Cleaned_Review']).toarray()

# Let's now create the same custom text preprocessing pipeline using spaCy, which is a powerful and modern NLP library.

# 🚀 Why spaCy?
# It's faster and more accurate than NLTK.

# Automatically handles:

# Tokenization

# Lowercasing

# Lemmatization

# POS tagging (if needed)

# Named Entity Recognition

# Offers cleaner and more efficient code.

# 🧪 Installation (if needed):
# bash
# Copy
# Edit
# pip install spacy
# python -m spacy download en_core_web_sm
# 🛠️ spaCy Preprocessing Pipeline
# Here’s a full function using spaCy for:

# Lowercasing

# Removing punctuation

# Removing stopwords

# Lemmatization

# Rejoining tokens

# ✅ Code:
# python
# Copy
# Edit
# import spacy

# # Load spaCy English model
# nlp = spacy.load('en_core_web_sm')

# def spacy_preprocess(text):
#     doc = nlp(text)
#     tokens = []

#     for token in doc:
#         # Filter out stopwords and punctuation
#         if not token.is_stop and not token.is_punct:
#             # Lemmatize and lowercase
#             tokens.append(token.lemma_.lower())

#     return ' '.join(tokens)
# 🔄 Example:
# python
# Copy
# Edit
# text = "I absolutely loved this NLP course! It's very informative and engaging."
# cleaned = spacy_preprocess(text)
# print(cleaned)
# 📤 Output (approx):

# nginx
# Copy
# Edit
# absolutely love nlp course informative engaging
# 🔁 On Your Dataset:
# python
# Copy
# Edit
# dataset['Cleaned_Review'] = dataset['Review'].apply(spacy_preprocess)
# Then vectorize:

# python
# Copy
# Edit
# from sklearn.feature_extraction.text import CountVectorizer

# vectorizer = CountVectorizer(max_features=1500)
# X = vectorizer.fit_transform(dataset['Cleaned_Review']).toarray()
# ⚖️ spaCy vs NLTK – Quick Summary
# Feature	NLTK	spaCy
# Tokenization	Manual	Built-in and robust
# Lemmatization	Good (WordNet)	Very Good and faster
# POS/NER	Basic	Advanced
# Speed	Slower	Faster
# Learning Curve	Beginner-friendly	More Pythonic and modern
# Use Case	Teaching, research	Production, commercial NLP