In [None]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.utils import resample

In [None]:
# Fetch the NLTK stop-word corpus that the later stop-word lookups rely on.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Look the English stop-word list up once, then show it and its size.
english_stopwords = stopwords.words('english')
print(english_stopwords)
print(len(english_stopwords))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
# Data preprocessing: load the raw review dataset.
# The path is a raw string so that backslashes are taken literally; in a normal
# string '\p' and '\D' are invalid escape sequences, which newer Python
# versions warn about.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory.
data = pd.read_csv(r'D:\project\Dataset.csv')
data.shape

(205052, 6)

In [None]:
# Preview the first rows of the loaded dataset.
data.head()

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,super!,great cooler excellent air flow and for this p...,positive
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,awesome,best budget 2 fit cooler nice cooling,positive
2,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,the quality is good but the power of air is de...,positive
3,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,1,useless product,very bad product its a only a fan,negative
4,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,ok ok product,neutral


In [None]:
# Number of missing values in each column.
missing_mask = data.isna()
missing_mask.sum()


product_name         0
product_price        0
Rate                 0
Review           24664
Summary             11
Sentiment            0
dtype: int64

In [None]:
# Replace missing reviews with a placeholder. The filled column is assigned
# back explicitly: calling fillna(inplace=True) on the `data['Review']`
# selection operates on an intermediate object and, per the pandas warning,
# will not update `data` in pandas 3.0.
data['Review'] = data['Review'].fillna('No Review')
# Rows without a summary cannot be used for training, so drop them.
data = data.dropna(subset=['Summary'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Review'].fillna('No Review', inplace=True)


In [None]:
# Count the rows carrying each sentiment label and print the three totals.
sentiment_column = data['Sentiment']
negative_count = int((sentiment_column == 'negative').sum())
print(negative_count)
positive_count = int((sentiment_column == 'positive').sum())
print(positive_count)
neutral_count = int((sentiment_column == 'neutral').sum())
print(neutral_count)

28232
166575
10234


In [None]:
# Partition the frame into one sub-frame per sentiment label.
sentiment_labels = data['Sentiment']
data_negative = data.loc[sentiment_labels == 'negative']
data_positive = data.loc[sentiment_labels == 'positive']
data_neutral = data.loc[sentiment_labels == 'neutral']

In [None]:
# Shrink the positive class to the size of the negative class by drawing
# rows without replacement (no row is repeated); the seed fixes the draw.
data_positive_undersampled = resample(
    data_positive,
    n_samples=negative_count,
    replace=False,
    random_state=42,
)

In [None]:
# Grow the neutral class to the size of the negative class by drawing rows
# with replacement (rows may repeat); the seed fixes the draw.
data_neutral_oversampled = resample(
    data_neutral,
    n_samples=negative_count,
    replace=True,
    random_state=42,
)

In [None]:
# Stack the negative, undersampled-positive and oversampled-neutral frames
# into a single table.
balanced_parts = [data_negative, data_positive_undersampled, data_neutral_oversampled]
data_balanced = pd.concat(balanced_parts)


In [None]:
# Shuffle every row with a fixed seed, renumber the index from zero, and
# print how many rows each sentiment label has.
shuffled_rows = data_balanced.sample(frac=1, random_state=42)
data_balanced = shuffled_rows.reset_index(drop=True)
print(data_balanced['Sentiment'].value_counts())

Sentiment
neutral     28232
negative    28232
positive    28232
Name: count, dtype: int64


In [None]:
# Stemming: shared Porter stemmer instance used by stemming() below.
ps=PorterStemmer()

In [None]:
# Build the stop-word lookup once. Re-reading stopwords.words('english') for
# every token (as a list membership test) is loop-invariant work that
# dominates the runtime when applied to every row of the dataset.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise a piece of text into a space-separated string of word stems.

    Steps: replace every non-letter character with a space, lower-case the
    text, split on whitespace, drop English stop words, stem the remaining
    words with the shared Porter stemmer ``ps`` and join them with single
    spaces.

    Parameters
    ----------
    content : str
        Raw text to normalise. Non-string input is not handled and makes
        ``re.sub`` raise ``TypeError``.

    Returns
    -------
    str
        The stemmed words joined by spaces; an empty string when no word
        survives the filtering.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    stems = [ps.stem(token) for token in tokens if token not in _ENGLISH_STOPWORDS]
    return ' '.join(stems)

In [None]:
# Add a column holding the stemmed form of every summary.
# NOTE(review): this operates on `data`, not on `data_balanced` built in the
# cells above, so the class balancing is not carried into the saved file or
# the model trained from it — confirm whether that is intended.
data['stemmed_content']=data['Summary'].apply(stemming)  

In [None]:
# Save the frame (including the stemmed column) to a CSV file in the current
# working directory, without writing the index.
data.to_csv("processed_data.csv", index=False)

In [None]:
# Reload the processed table from disk and show the stemmed text column.
# NOTE(review): rows whose stemmed text is empty presumably come back as NaN
# after the CSV round trip (later cells call fillna("")) — confirm.
pdata = pd.read_csv("processed_data.csv")
print(pdata['stemmed_content'])

0         great cooler excel air flow price amaz unbelie...
1                          best budget fit cooler nice cool
2                             qualiti good power air decent
3                                           bad product fan
4                                             ok ok product
                                ...                        
205036                                         good product
205037                                                 nice
205038                                   nice fast deliveri
205039                                       awesom product
205040    good mix bowl includ one disappointmentand sou...
Name: stemmed_content, Length: 205041, dtype: object


In [None]:
# Show the label column of the reloaded table.
print(pdata['Sentiment'])

0         positive
1         positive
2         positive
3         negative
4          neutral
            ...   
205036    positive
205037    positive
205038    positive
205039    positive
205040     neutral
Name: Sentiment, Length: 205041, dtype: object


In [None]:
# Separate the features (stemmed text) from the labels (sentiment).
x, y = pdata['stemmed_content'].values, pdata['Sentiment'].values

In [None]:
# Show the array of stemmed texts used as model input.
print(x)

['great cooler excel air flow price amaz unbelievablejust love'
 'best budget fit cooler nice cool' 'qualiti good power air decent' ...
 'nice fast deliveri' 'awesom product'
 'good mix bowl includ one disappointmentand soup bowl small size']


In [None]:
# Show the array of sentiment labels.
print(y)

['positive' 'positive' 'positive' ... 'positive' 'positive' 'neutral']


In [None]:
# Hold out 20% of the samples for testing. Stratifying on the labels keeps
# the class proportions the same in both parts; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
    stratify=y,
)

In [None]:
# Wrap each split in a Series and replace missing texts with empty strings.
x_train, x_test = (
    pd.Series(split_texts).fillna("") for split_texts in (x_train, x_test)
)

In [None]:
# Turn both splits into plain lists of Python strings.
x_train = list(map(str, x_train))
x_test = list(map(str, x_test))

In [None]:
# Convert text data to numeric TF-IDF features.
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer. Both names are rebound
# from lists of strings to the resulting matrices.
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [None]:
import pickle

In [None]:
# Persist the fitted vectorizer. The with-block closes the file handle (and
# flushes the data) even if pickling fails; passing a bare open() call to
# pickle.dump leaves the handle open.
with open('vector.pkl', 'wb') as vector_file:
    pickle.dump(vectorizer, vector_file)

In [None]:
# Show the vectorized training features.
print(x_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 735528 stored elements and shape (164032, 30935)>
  Coords	Values
  (0, 22585)	0.24399655707040324
  (0, 645)	0.2478824683005061
  (0, 22785)	0.40390372493358295
  (0, 15026)	0.23716937481463207
  (0, 19443)	0.27503225451456176
  (0, 30215)	0.3568090093603321
  (0, 8935)	0.32675154949906704
  (0, 8808)	0.3577196355699963
  (0, 27081)	0.21668616210434868
  (0, 26894)	0.36305172913234046
  (0, 9642)	0.20781640178754085
  (1, 17381)	1.0
  (2, 1547)	0.8871001625909581
  (2, 20922)	0.46157697248789986
  (3, 17381)	0.2569589251953826
  (3, 12002)	0.9664223252607701
  (4, 17381)	1.0
  (5, 20922)	0.3838385123356506
  (5, 23769)	0.9234002363265642
  (6, 17381)	0.762646917763201
  (6, 20922)	0.6468150267474382
  (7, 19443)	0.1550728659303979
  (7, 20756)	0.20152272201869587
  (7, 11230)	0.13764330586960818
  (7, 15079)	0.332222770893859
  :	:
  (164019, 8789)	0.9324919085919233
  (164020, 20528)	0.24061481337394672
  (164020, 22676)	0

In [None]:
# Training the model: logistic regression with the iteration cap raised to
# 1000, fitted on the vectorized training data and the training labels.
model=LogisticRegression(max_iter=1000)
model.fit(x_train,y_train)

In [None]:
# Evaluation on the training split: predict labels for the data the model
# was fitted on and compare them with the true training labels.
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(y_true=y_train, y_pred=x_train_prediction)

In [None]:
# Report the accuracy measured on the training split.
print('training data accuracy :',training_data_accuracy)

training data accuracy : 0.9240392118611003


In [None]:
# Evaluation on the held-out split: predict labels for the test data and
# compare them with the true test labels.
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_true=y_test, y_pred=x_test_prediction)

In [None]:
# Report the accuracy measured on the held-out test split.
print('test data accuracy :',test_data_accuracy)


test data accuracy : 0.9141895681435782


In [None]:
# Persist the trained model. The with-block closes the file handle (and
# flushes the data) even if pickling fails; passing a bare open() call to
# pickle.dump leaves the handle open.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Loading the model back from the file written by the save cell.
# The same relative path ('model.pkl') is used for saving and loading so the
# model read here is the one that was just written, rather than whatever
# happens to be stored under a separate absolute path. The with-block closes
# the file handle.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('model.pkl', 'rb') as model_file:
    load_model = pickle.load(model_file)

In [None]:
# Score one held-out sample with the reloaded model: print its true label,
# the raw prediction, and a sentence describing the predicted sentiment.
sample_index = 10
x_new = x_test[sample_index]
print(y_test[sample_index])
prediction = load_model.predict(x_new)
print(prediction)

# Any label other than 'positive' or 'negative' is reported as neutral.
sentiment_messages = {
    'positive': "The comment is positive.",
    'negative': "The comment is negative.",
}
print(sentiment_messages.get(prediction[0], "The comment is neutral."))


positive
['positive']
The comment is positive.
