In [1]:
import pandas as pd
import numpy as np
import gensim
import nltk
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
df = pd.read_csv('reviews.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,0
0,0,"After About 10 Days Of Use , I'll Give My Hone..."
1,1,Superb phone and camera quality. I like it.
2,2,"The design of the phone is very unique, curve ..."
3,3,Very nice mobile full waterproof and gorilla g...
4,4,I am very happy I like this phone thank you mi...


In [3]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [4]:

df.rename(columns={'0': 'Reviews'}, inplace=True) #replace with actual names


In [5]:
df.head()

Unnamed: 0,Reviews
0,"After About 10 Days Of Use , I'll Give My Hone..."
1,Superb phone and camera quality. I like it.
2,"The design of the phone is very unique, curve ..."
3,Very nice mobile full waterproof and gorilla g...
4,I am very happy I like this phone thank you mi...


In [6]:
df.shape

(352, 1)

In [7]:
# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    # Tokenize
    tokens = word_tokenize(text.lower())
    # Remove stop words and non-alphabetic tokens
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    # Lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

# Apply preprocessing to all reviews
processed_reviews = [preprocess(review) for review in df['Reviews']]


In [8]:
# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(processed_reviews)

# Create a corpus: a list of bag-of-words
corpus = [dictionary.doc2bow(text) for text in processed_reviews]


In [9]:
# Set the number of topics
num_topics = 5

# Train the LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

# Print the topics
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)


(0, '0.036*"camera" + 0.024*"battery" + 0.019*"phone" + 0.017*"awesome" + 0.016*"good"')
(1, '0.034*"camera" + 0.024*"phone" + 0.020*"best" + 0.018*"battery" + 0.016*"quality"')
(2, '0.065*"good" + 0.047*"phone" + 0.042*"camera" + 0.030*"battery" + 0.021*"performance"')
(3, '0.017*"camera" + 0.017*"issue" + 0.015*"good" + 0.015*"battery" + 0.014*"mobile"')
(4, '0.028*"excellent" + 0.025*"iphone" + 0.019*"phone" + 0.014*"display" + 0.013*"camera"')


In [10]:
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, mds="mmds", R=30)
vis



In [11]:

num_words = 10
topics = lda_model.show_topics(num_topics=num_topics, num_words=num_words, formatted=False)

# Print topics and their top words
for topic_id, topic_words in topics:
    print(f"Topic {topic_id}: {[word for word, _ in topic_words]}")


Topic 0: ['camera', 'battery', 'phone', 'awesome', 'good', 'display', 'design', 'better', 'performance', 'day']
Topic 1: ['camera', 'phone', 'best', 'battery', 'quality', 'performance', 'day', 'awesome', 'one', 'go']
Topic 2: ['good', 'phone', 'camera', 'battery', 'performance', 'display', 'nice', 'best', 'look', 'quality']
Topic 3: ['camera', 'issue', 'good', 'battery', 'mobile', 'display', 'go', 'price', 'samsung', 'back']
Topic 4: ['excellent', 'iphone', 'phone', 'display', 'camera', 'product', 'using', 'battery', 'day', 'quality']


  and should_run_async(code)


In [12]:
# Set the number of topics
num_topics = 4

# Train the LDA model
lda_model1 = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

# Print the topics
topics = lda_model1.print_topics(num_words=5)
for topic in topics:
    print(topic)

  and should_run_async(code)


(0, '0.036*"phone" + 0.031*"battery" + 0.031*"camera" + 0.020*"quality" + 0.019*"best"')
(1, '0.037*"nice" + 0.029*"camera" + 0.026*"good" + 0.019*"mobile" + 0.018*"display"')
(2, '0.041*"camera" + 0.028*"good" + 0.027*"phone" + 0.022*"iphone" + 0.018*"battery"')
(3, '0.062*"good" + 0.027*"phone" + 0.020*"camera" + 0.019*"battery" + 0.018*"performance"')


In [13]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model1, corpus, dictionary, mds="mmds", R=30)
vis

  and should_run_async(code)


In [14]:

num_words = 10
topics = lda_model1.show_topics(num_topics=num_topics, num_words=num_words, formatted=False)

# Print topics and their top words
for topic_id, topic_words in topics:
    print(f"Topic {topic_id}: {[word for word, _ in topic_words]}")


Topic 0: ['phone', 'battery', 'camera', 'quality', 'best', 'issue', 'awesome', 'day', 'performance', 'good']
Topic 1: ['nice', 'camera', 'good', 'mobile', 'display', 'love', 'phone', 'super', 'battery', 'charger']
Topic 2: ['camera', 'good', 'phone', 'iphone', 'battery', 'look', 'display', 'awesome', 'best', 'performance']
Topic 3: ['good', 'phone', 'camera', 'battery', 'performance', 'excellent', 'also', 'product', 'overall', 'display']


  and should_run_async(code)


In [15]:
def get_dominant_topic(lda_model1, corpus):
    dominant_topics = []
    for bow in corpus:
        topics = lda_model1.get_document_topics(bow)
        dominant_topic = sorted(topics, key=lambda x: x[1], reverse=True)[0][0]
        dominant_topics.append(dominant_topic)
    return dominant_topics

df['dominant_topic'] = get_dominant_topic(lda_model1, corpus)


  and should_run_async(code)


In [16]:
df.head()

  and should_run_async(code)


Unnamed: 0,Reviews,dominant_topic
0,"After About 10 Days Of Use , I'll Give My Hone...",2
1,Superb phone and camera quality. I like it.,0
2,"The design of the phone is very unique, curve ...",0
3,Very nice mobile full waterproof and gorilla g...,1
4,I am very happy I like this phone thank you mi...,2


In [17]:
df['dominant_topic'] = df['dominant_topic'].astype(object)

  and should_run_async(code)


In [18]:
df['dominant_topic'].value_counts()

  and should_run_async(code)


dominant_topic
0    112
2    106
3     70
1     64
Name: count, dtype: int64

In [19]:
df['Reviews'].value_counts()

  and should_run_async(code)


Reviews
Good                                                                                                                                                                                                                                                                                                                                          7
Nice                                                                                                                                                                                                                                                                                                                                          5
Super                                                                                                                                                                                                                                                                                                                           

In [20]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')

  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
ps = PorterStemmer() # create an instance of the PorterStemmer class
corpus1 = [] # create an empty list to store the preprocessed tweets
# Iterate over the values in the 'tweets' column directly
for tweet in df['Reviews']: # iterate over the 'tweets' column of the DataFrame
    review = re.sub('[^a-zA-Z]', ' ', tweet) # remove non-alphabetic characters
    review = review.lower() # convert to lowercase
    review = review.split() # split into words
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')] # apply stemming and remove stopwords
    review = ' '.join(review) # join the words back into a single string
    corpus1.append(review) # add the preprocessed tweet to the corpus list

  and should_run_async(code)


In [22]:
corpus1[0:5]

  and should_run_async(code)


['day use give honest review price littl bit higher side cheaper actual price camera qualiti good expect good experi front camera design uniqu look amaz design feel like premium class gadget perform smooth feather touch display notif indic light feel amaz batteri heat issu gener common heavi batteri sta',
 'superb phone camera qualiti like',
 'design phone uniqu curv shape give best contribut design camera qualiti good charg promis minut featur ip right phone work well put water let see perform futur',
 'nice mobil full waterproof gorilla glass like',
 'happi like phone thank mi thank much nice zoom x']

In [23]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout

  and should_run_async(code)


In [24]:
voc_size=500

  and should_run_async(code)


In [25]:
onehot_repr=[one_hot(words,voc_size)for words in corpus1]

  and should_run_async(code)


In [26]:
onehot_repr[:5]

  and should_run_async(code)


[[15,
  254,
  316,
  233,
  312,
  274,
  285,
  272,
  248,
  481,
  233,
  378,
  274,
  161,
  206,
  209,
  157,
  209,
  327,
  438,
  161,
  141,
  87,
  317,
  52,
  141,
  290,
  377,
  488,
  222,
  5,
  465,
  5,
  302,
  457,
  221,
  288,
  412,
  386,
  290,
  52,
  43,
  178,
  454,
  29,
  120,
  12,
  43,
  409],
 [223, 275, 161, 206, 377],
 [141,
  275,
  87,
  74,
  352,
  316,
  439,
  49,
  141,
  161,
  206,
  209,
  161,
  91,
  425,
  484,
  43,
  483,
  275,
  260,
  327,
  200,
  207,
  403,
  27,
  465,
  499],
 [141, 377, 288, 392, 91, 374, 377],
 [17, 377, 275, 297, 187, 297, 87, 141, 494, 468]]

In [27]:
len(max(onehot_repr,key=len))

  and should_run_async(code)


63

In [28]:
sent_length=63 # setting the maximum sentence length to 63.
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)

  and should_run_async(code)


In [29]:
embedded_docs

  and should_run_async(code)


array([[  0,   0,   0, ...,  12,  43, 409],
       [  0,   0,   0, ..., 161, 206, 377],
       [  0,   0,   0, ...,  27, 465, 499],
       ...,
       [  0,   0,   0, ..., 161, 130, 439],
       [  0,   0,   0, ..., 438, 438, 224],
       [  0,   0,   0, ..., 161,  87, 141]], dtype=int32)

In [30]:
y=df['dominant_topic']

  and should_run_async(code)


In [31]:
len(embedded_docs),y.shape

  and should_run_async(code)


(352, (352,))

In [32]:
X_final=np.array(embedded_docs) # converting the embedded tweets to a numpy array.
y_final=np.array(y) # converting the labels to a numpy array.

  and should_run_async(code)


In [33]:
X_final.shape,y_final.shape

  and should_run_async(code)


((352, 63), (352,))

In [34]:
from sklearn.model_selection import train_test_split # importing the train_test_split function from scikit-learn.
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

  and should_run_async(code)


In [35]:
print("X_train data type:", X_train.dtype)
print("y_train data type:", y_train.dtype)
print("X_test data type:", X_test.dtype)
print("y_test data type:", y_test.dtype)

print("Unique values in y_train:", np.unique(y_train))
print("Unique values in y_test:", np.unique(y_test))

X_train data type: int32
y_train data type: object
X_test data type: int32
y_test data type: object
Unique values in y_train: [0 1 2 3]
Unique values in y_test: [0 1 2 3]


  and should_run_async(code)


In [36]:
from sklearn.preprocessing import LabelEncoder # importing the LabelEncoder class from scikit-learn.
le = LabelEncoder() # creating an instance of the LabelEncoder class.
y_train = le.fit_transform(y_train) # fitting the LabelEncoder to the training labels and transforming them.
y_test = le.transform(y_test)

  and should_run_async(code)


In [41]:
from keras.utils import to_categorical

y_train = to_categorical(y_train, num_classes=4)
y_test = to_categorical(y_test, num_classes=4)

  and should_run_async(code)


In [65]:
## Creating model
embedding_vector_features=20 # setting the embedding vector size to 20.
model=Sequential() # creating a sequential model.
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length)) # adding an embedding layer.
model.add(LSTM(100)) # adding an LSTM layer.
model.add(Dense(4,activation='softmax')) # adding a dense layer with sigmoid activation function.
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy']) # compiling the model. # Use categorical_crossentropy for multi-class classification
print(model.summary()) # printing the model summary.

  and should_run_async(code)


Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 63, 20)            10000     
                                                                 
 lstm_8 (LSTM)               (None, 100)               48400     
                                                                 
 dense_8 (Dense)             (None, 4)                 404       
                                                                 
Total params: 58804 (229.70 KB)
Trainable params: 58804 (229.70 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [66]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=50,batch_size=100)

Epoch 1/50


  and should_run_async(code)


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7f2d28627d00>

In [67]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.2f}')

Test Accuracy: 0.56


  and should_run_async(code)


  and should_run_async(code)
