In [17]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# This cell imports necessary libraries for data manipulation, text vectorization,
# and visualization. Warnings are ignored to keep the output clean.

In [18]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

# This cell downloads and imports the NLTK library, specifically the 'punkt'
# tokenizer and 'stopwords' corpus, which are used for text preprocessing.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
data = pd.read_csv('AmazonReview.csv', quotechar='"')
data.head()

# This cell loads the Amazon review data from a CSV file into a pandas DataFrame
# and displays the first few rows to inspect the data structure.

Unnamed: 0,Review,Sentiment
0,Fast shipping but this product is very cheaply...,1
1,This case takes so long to ship and it's not e...,1
2,Good for not droids. Not good for iPhones. You...,1
3,The cable was not compatible between my macboo...,1
4,The case is nice but did not have a glow light...,1


In [20]:
data.info()

# This cell provides a concise summary of the DataFrame, including the number
# of non-null values in each column and the data types.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Review     24999 non-null  object
 1   Sentiment  25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 390.8+ KB


In [21]:
#1,2,3->negative(i.e 0)
data.loc[data['Sentiment']<=3,'Sentiment'] = 0

#4,5->positive(i.e 1)
data.loc[data['Sentiment']>3,'Sentiment'] = 1

# This cell converts the original sentiment ratings (1-5) into a binary
# classification: 0 for negative sentiments (1, 2, and 3) and 1 for positive
# sentiments (4 and 5).

In [22]:
stp_words=stopwords.words('english')
def clean_review(review):
  # Convert review to string before splitting
  cleanreview=" ".join(word for word in str(review).
                       split() if word not in stp_words)
  return cleanreview

data['Review']=data['Review'].apply(clean_review)
data.dropna(inplace=True)

# This cell defines a function to clean the review text by removing English
# stop words. It then applies this function to the 'Review' column and removes
# any rows with missing values.

In [23]:
data.head()

# This cell displays the first few rows of the DataFrame after cleaning the
# 'Review' column and dropping rows with missing values.

Unnamed: 0,Review,Sentiment
0,Fast shipping product cheaply made I brought g...,0
1,This case takes long ship even worth DONT BUY!!!!,0
2,Good droids. Not good iPhones. You cannot use ...,0
3,The cable compatible macbook iphone. Also conn...,0
4,The case nice glow light. I'm disappointed pro...,0


In [24]:
data['Sentiment'].value_counts()

# This cell counts the occurrences of each sentiment label (0 and 1) in the
# 'Sentiment' column to show the distribution of sentiments in the dataset.

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
0,15000
1,10000


In [25]:
cv = TfidfVectorizer(max_features=2500)
X = cv.fit_transform(data['Review'] ).toarray()

# This cell initializes a TF-IDF Vectorizer to convert the text reviews into
# numerical feature vectors. It fits the vectorizer to the cleaned reviews and
# transforms them into a matrix of TF-IDF features, limiting the features to
# the top 2500.

In [26]:
from sklearn.model_selection import train_test_split
x_train ,x_test,y_train,y_test=train_test_split(X,data['Sentiment'],
                                                test_size=0.25 ,
                                                random_state=42)

# This cell splits the data into training and testing sets. 75% of the data
# is used for training and 25% for testing, ensuring a random split for
# reproducibility.

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model=LogisticRegression()

#Model fitting
model.fit(x_train,y_train)

#testing the model
pred=model.predict(x_test)

#model accuracy
print(accuracy_score(y_test,pred))

# This cell trains a Logistic Regression model on the training data and
# evaluates its accuracy on the testing data.

0.82064


In [30]:
def predict_sentiment(review, vectorizer, model, stop_words):
  """Predicts the sentiment of a given review string."""
  # Preprocess the review (clean and remove stop words)
  cleaned_review = " ".join(word for word in str(review).split() if word not in stop_words)
  # Vectorize the cleaned review using the fitted TF-IDF vectorizer
  review_vector = vectorizer.transform([cleaned_review])
  # Predict the sentiment using the trained model
  prediction = model.predict(review_vector)
  # Return the sentiment label (0 for negative, 1 for positive)
  return "Positive" if prediction[0] == 1 else "Negative"

# Get user input
user_review = input("Enter a review to predict its sentiment: ")

# Predict and print the sentiment
sentiment = predict_sentiment(user_review, cv, model, stp_words)
print(f"The predicted sentiment for the review is: {sentiment}")

# This cell defines a function to predict the sentiment of a new review using
# the trained model. It takes a review string as input, preprocesses it,
# vectorizes it, and then uses the model to predict and print the sentiment
# (Positive or Negative).

Enter a review to predict its sentiment: its good
The predicted sentiment for the review is: Positive
