In [1]:
# Import Common Libraries
import numpy as np
import pandas as pd
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [8]:
# Tiny hand-written review corpus used throughout the notebook.
# Labels look like 1 = positive, 0 = negative -- confirm with the author.
data = dict(
    text=[
        "This is a great product! Highly recommend it.",
        "Worst experience ever. Would not buy again!",
        "Quality is decent, but could be better.",
        "Absolutely loved it! Five stars.",
        "Terrible, I want a refund!",
    ],
    label=[1, 0, 1, 1, 0],
)

# One row per review, columns in the dict's key order.
df = pd.DataFrame.from_dict(data)
df.head()

Unnamed: 0,text,label
0,This is a great product! Highly recommend it.,1
1,Worst experience ever. Would not buy again!,0
2,"Quality is decent, but could be better.",1
3,Absolutely loved it! Five stars.,1
4,"Terrible, I want a refund!",0


In [21]:
# Import NLP Libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

# Fetch the NLTK data packages used by the preprocessing step. A package
# that is already present is reported as up-to-date in the log output.
# 'stopwords' backs the stopwords.words('english') lookup.
nltk.download('stopwords')
# 'punkt_tab' is presumably the tokenizer data word_tokenize needs --
# confirm against the installed NLTK version.
nltk.download('punkt_tab')
# 'wordnet' is the lexical database behind WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [12]:
def preprocess_text(text):
    """Normalise one raw document into a space-joined string of lemmas.

    Steps, in order: lowercase, delete every character listed in
    ``string.punctuation``, tokenize with ``word_tokenize``, drop English
    stopwords, and lemmatize each remaining token with
    ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; an empty string when
        no token survives.
    """
    text = text.lower()
    # re.escape is required here: interpolated raw, the "\]" pair inside
    # string.punctuation is parsed as an escaped "]", so backslashes were
    # silently left in the text.
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    tokens = word_tokenize(text)
    # Load the stopword list once per call (not once per token) and use a
    # set so each membership test is O(1).
    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    )

In [23]:
# Run the preprocessing function over every raw review and keep the
# result next to the original text.
df['clean_text'] = df['text'].map(preprocess_text)

In [44]:
# Feature extraction, option 1: TF-IDF weights over the cleaned corpus.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(df['clean_text'])
# Preview the first document's weights; only that row is densified.
X_tfidf[0].toarray().ravel()

array([0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0.5, 0. , 0.5, 0. ,
       0.5, 0. , 0. , 0. , 0. , 0. , 0. ])

In [57]:
# Feature Extraction using Word2Vec
# Word2Vec expects an iterable of token lists, so split each cleaned
# document on whitespace.
sentences = [sentence.split() for sentence in df['clean_text']]
# A fixed seed plus a single worker thread makes training repeatable across
# runs; multiple workers introduce thread-ordering jitter. (Reproducibility
# across interpreter launches additionally needs PYTHONHASHSEED set.)
word2vec_model = Word2Vec(
    sentences,
    vector_size=12,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)

In [59]:
def get_word2vec_embedding(text, model=None):
    """Return the mean word vector of the whitespace-separated tokens in ``text``.

    Tokens missing from the model's vocabulary are ignored. If no token is
    known (including an empty ``text``), a zero vector is returned whose
    length equals the model's vector size, so every document maps to a
    vector of the same dimensionality.

    Parameters
    ----------
    text : str
        Document whose tokens are separated by whitespace.
    model : optional
        Object exposing a ``wv`` attribute that supports ``in``, item
        lookup and ``vector_size``. Defaults to the notebook-level
        ``word2vec_model``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.wv.vector_size``.
    """
    if model is None:
        # Resolved at call time so the function follows whichever model the
        # notebook currently holds.
        model = word2vec_model
    vectors = model.wv
    embeddings = [vectors[word] for word in text.split() if word in vectors]
    if not embeddings:
        # Size the fallback from the model instead of a hard-coded 100,
        # which did not match the trained vector size.
        return np.zeros(vectors.vector_size)
    return np.mean(embeddings, axis=0)

In [60]:
# Stack one averaged embedding per cleaned document into a 2-D feature matrix.
X_word2vec = np.array(list(map(get_word2vec_embedding, df['clean_text'])))

In [26]:
# Target vector: the label column of the review frame.
y = df.loc[:, 'label']

In [64]:
# Train-Test Split
# random_state pins the shuffle so a Restart & Run All reproduces the same
# split. With the 5-row corpus and test_size=0.2 the test fold is a single
# sample, so the accuracy printed below can only be 0.0 or 1.0.
X_train, X_test, y_train, y_test = train_test_split(
    X_word2vec, y, test_size=0.2, random_state=42
)

# Train a Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict and Evaluate
y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)
# Plain string: the original f-string had no placeholders.
print('Accuracy :', score)

Accuracy : 0.0
