<a href="https://colab.research.google.com/github/anoushkagarg003/sentiment_analysis_amazon/blob/main/models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB

In [13]:
# Load the cleaned reviews and derive a binary sentiment label from the star
# rating: a rating of 3 or more is labelled 'Positive', anything else 'Negative'.
reviews_path = 'clean_text_new.csv'
df = pd.read_csv(reviews_path)
df['sentiment'] = [
    'Positive' if stars >= 3 else 'Negative'
    for stars in df['Stars']
]
# Missing review text is replaced by the empty string.
df = df.assign(clean_text_train=df['clean_text_train'].fillna(''))

In [14]:
# Baseline model: multinomial Naive Bayes on TF-IDF features.
# NOTE(review): this cell holds out 10% of the rows (test_size=0.1) while the
# later model cells hold out 20%, so this accuracy is measured on a different
# test set from theirs -- confirm that is intended.
X, y = df['clean_text_train'], df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.1
)
print(X_train)

# The vectorizer is fitted on the training text only and then reused,
# unchanged, to transform the held-out text.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

accuracy = nb_classifier.score(X_test_tfidf, y_test)
print("Accuracy:", accuracy)

6624    say water resistant not value money carry pair...
5664            inner stitching came inner stitching came
7757                              play video good product
6946                                       small not good
6686    kit bag small size zipper good strap adjustabl...
                              ...                        
5226                             good quality value money
5390                                    good product good
860                excellent bag office purpose excellent
7603    impressive ater absorption capability used tim...
7270    product received good packaging put transparen...
Name: clean_text_train, Length: 7258, dtype: object
Accuracy: 0.8004956629491945


In [15]:
# Input features are the cleaned review text; the target is the sentiment label.
X, y = df['clean_text_train'], df['sentiment']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# TF-IDF vectorization: fitted on the training text, reused for the test text.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Linear Support Vector Classifier: fit() returns the fitted estimator itself.
model = LinearSVC().fit(X_train_vec, y_train)
y_pred = model.predict(X_test_vec)

accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

print('Accuracy:', accuracy)
print('Classification Report:', classification_report_str, sep='\n')

Accuracy: 0.8629882207067576
Classification Report:
              precision    recall  f1-score   support

    Negative       0.81      0.76      0.79       532
    Positive       0.89      0.91      0.90      1081

    accuracy                           0.86      1613
   macro avg       0.85      0.84      0.84      1613
weighted avg       0.86      0.86      0.86      1613



In [16]:
from sklearn.ensemble import RandomForestClassifier
# Random forest on TF-IDF features.
# The split and the TF-IDF matrices built here (X_train_vec, X_test_vec,
# y_train, y_test) are also what the following model cells read.
X = df['clean_text_train']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the vectorizer on the training text only, then reuse it for the test text.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# A random forest draws bootstrap samples and feature subsets at random, so
# without a fixed seed the accuracy changes from run to run. The seed is
# pinned (same value as the split) so that Restart & Run All reproduces it.
model = RandomForestClassifier(random_state=42)
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.8822070675759455


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic regression, trained and scored on the TF-IDF matrices
# (X_train_vec / X_test_vec) that an earlier cell left in the kernel.
logreg = LogisticRegression().fit(X_train_vec, y_train)
y_pred = logreg.predict(X_test_vec)
logreg_accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {logreg_accuracy}")


Logistic Regression Accuracy: 0.8636081835089895


In [19]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours (k=5), trained and scored on the TF-IDF matrices
# (X_train_vec / X_test_vec) that an earlier cell left in the kernel.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_vec, y_train)
y_pred = knn.predict(X_test_vec)
knn_accuracy = accuracy_score(y_test, y_pred)
print(f"KNN Accuracy: {knn_accuracy}")

KNN Accuracy: 0.7730936143831371


In [20]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting, trained and scored on the TF-IDF matrices
# (X_train_vec / X_test_vec) that an earlier cell left in the kernel.
# random_state is pinned because the estimator uses it for the per-tree seed
# and for the random feature permutation at each split; leaving it unset can
# make the reported accuracy differ between runs.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train_vec, y_train)
y_pred = gbc.predict(X_test_vec)
print(f"Gradient Boosting Classifier Accuracy: {accuracy_score(y_test, y_pred)}")

Gradient Boosting Classifier Accuracy: 0.8412895226286423


In [22]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
# Load the tokenizer and model
# Both are created from the same pretrained checkpoint name.
# Note: this rebinds `model`, which earlier cells used for scikit-learn estimators.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint)

# Define the function to predict sentiment
def predict_sentiment(text):
    """Label ``text`` with the notebook-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    text : str or list of str
        One review, or a batch of reviews.

    Returns
    -------
    str or list of str
        The ``model.config.id2label`` entry for the highest-scoring class.
        A single string yields a single label; a list yields one label per
        element, in the same order.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Take the argmax along the class axis only. A bare ``argmax()`` flattens
    # the (batch, num_labels) tensor, which for more than one input gives an
    # index into the flattened scores rather than a class id per input.
    predicted_class_ids = logits.argmax(dim=-1).tolist()
    labels = [model.config.id2label[class_id] for class_id in predicted_class_ids]
    if isinstance(text, str):
        return labels[0]
    return labels
# Run the classifier on each cleaned review and store its label in a new column.
df['new_sentiment'] = df['clean_text_train'].map(predict_sentiment)

In [24]:
# Display the labels produced by predict_sentiment.
df.loc[:, 'new_sentiment']

0       POSITIVE
1       POSITIVE
2       POSITIVE
3       POSITIVE
4       POSITIVE
          ...   
8060    NEGATIVE
8061    POSITIVE
8062    NEGATIVE
8063    NEGATIVE
8064    NEGATIVE
Name: new_sentiment, Length: 8065, dtype: object

In [25]:
# Lower-case both label columns in place so the comparison below is
# case-insensitive ('Positive' vs 'POSITIVE').
for label_col in ('sentiment', 'new_sentiment'):
    df[label_col] = df[label_col].str.lower()

# Calculate accuracy
# Percentage of rows where the two label columns agree.
labels_agree = df['sentiment'].eq(df['new_sentiment'])
accuracy = labels_agree.mean() * 100

print(f"Accuracy: {accuracy:.2f}%")

Accuracy: 76.81%
