In [1]:
import pandas as pd

from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

from nltk.corpus import wordnet

from krovetzstemmer import Stemmer

# Stemming Techniques

In [2]:
# Read the review export and keep only the identifier and raw-text columns.
df = pd.read_csv("mobile reviews.csv").loc[:, ['Review_ID', 'Review_Text']]
df.head()

Unnamed: 0,Review_ID,Review_Text
0,1,The new device is sleek and fast. I love the c...
1,2,"Amazing display and battery life, but the pric..."
2,3,"I had a few issues with the initial setup, but..."
3,4,Solid performance overall; the design is very ...
4,5,The device exceeded my expectations in every way.


In [3]:
df.shape

(50, 2)

In [4]:
df['Review_Text'][4]

'The device exceeded my expectations in every way.'

In [5]:
df['Review_Text'][4]

'The device exceeded my expectations in every way.'

In [6]:
# Porter stemming: lower-case and tokenize each review, stem every token,
# then re-join the stems with single spaces.
porter = PorterStemmer()

df['Porter_Stem'] = df['Review_Text'].apply(
    lambda review: " ".join(porter.stem(token) for token in word_tokenize(review.lower()))
)

df.loc[4, 'Porter_Stem']

'the devic exceed my expect in everi way .'

In [7]:
# Snowball (English) stemming, applied token by token to the lower-cased review.
snowball = SnowballStemmer("english")

df['Snowball_Stem'] = df['Review_Text'].apply(
    lambda review: " ".join(snowball.stem(token) for token in word_tokenize(review.lower()))
)

df.loc[4, 'Snowball_Stem']

'the devic exceed my expect in everi way .'

In [8]:
# Lancaster stemming, applied token by token to the lower-cased review.
lancaster = LancasterStemmer()

df['Lancaster_Stem'] = df['Review_Text'].apply(
    lambda review: " ".join(lancaster.stem(token) for token in word_tokenize(review.lower()))
)

df.loc[4, 'Lancaster_Stem']

'the dev excess my expect in every way .'

In [9]:
# Krovetz stemming (krovetzstemmer.Stemmer), applied token by token to the
# lower-cased review.
krovetz = Stemmer()

df['Krovetz_Stem'] = df['Review_Text'].apply(
    lambda review: " ".join(krovetz.stem(token) for token in word_tokenize(review.lower()))
)

df.loc[4, 'Krovetz_Stem']

'the device exceed my expectations in every way .'

# Lemmatization (With POS Tagging)

In [10]:
lemmatizer = WordNetLemmatizer()

In [11]:
pos_tag(['eating', 'device', 'apple'])

[('eating', 'VBG'), ('device', 'NN'), ('apple', 'NN')]

In [12]:
wordnet.VERB

'v'

In [13]:
wordnet.NOUN

'n'

In [14]:
def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag falls back to ``wordnet.NOUN``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wn_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [15]:
# POS-aware lemmatization: tag the lower-cased tokens, convert each tag to its
# WordNet POS via get_wordnet_pos, lemmatize, and re-join with single spaces.
df['Wordnet_Lemma'] = df['Review_Text'].apply(
    lambda review: " ".join(
        lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        for token, tag in pos_tag(word_tokenize(review.lower()))
    )
)

df.loc[4, 'Wordnet_Lemma']

'the device exceed my expectation in every way .'

In [16]:
df['Review_Text'][5]

'Decent phone with great features, though the software could be improved.'

In [17]:
df['Wordnet_Lemma'][5]

'decent phone with great feature , though the software could be improve .'

In [18]:
df['Wordnet_Lemma']

0     the new device be sleek and fast . i love the ...
1     amazing display and battery life , but the pri...
2     i have a few issue with the initial setup , bu...
3     solid performance overall ; the design be very...
4       the device exceed my expectation in every way .
5     decent phone with great feature , though the s...
6     the battery last all day , even with heavy usa...
7     excellent build quality and performance . high...
8     have a minor glitch with the fingerprint senso...
9     a well-designed phone with fast performance . ...
10    the camera be outstanding , perfect for low-li...
11    i be impress with the process power and screen...
12    battery life be superb , though i wish there b...
13    the user interface be intuitive and easy to na...
14    great phone for everyday use , but the storage...
15    very responsive and fast . the design feels pr...
16    the device heat up a bit during heavy gaming s...
17    impress by the high-resolution display and

# NLP Preprocessing -- Convert Tokens to Numbers

In [19]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [20]:
# Two text vectorizers with default settings: term counts and TF-IDF weights.
vectorizer_bow, vectorizer_tfidf = CountVectorizer(), TfidfVectorizer()

In [21]:
# Fit the bag-of-words vocabulary on the lemmatized reviews and expose the
# document-term counts as a dense frame with one column per vocabulary term.
bow_matrix = vectorizer_bow.fit_transform(df["Wordnet_Lemma"])
df_bow = pd.DataFrame(
    bow_matrix.toarray(),
    columns=vectorizer_bow.get_feature_names_out(),
)

df_bow.head()

Unnamed: 0,additional,all,although,amazing,an,and,appreciate,apps,attention,balanced,...,ve,very,wait,way,well,when,wish,with,work,worth
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [22]:
df_bow.shape

(50, 204)

In [23]:
# Fit TF-IDF on the lemmatized reviews and expose the weights as a dense frame
# with one column per vocabulary term.
tfidf_matrix = vectorizer_tfidf.fit_transform(df["Wordnet_Lemma"])
df_tfidf = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer_tfidf.get_feature_names_out(),
)

df_tfidf.head()

Unnamed: 0,additional,all,although,amazing,an,and,appreciate,apps,attention,balanced,...,ve,very,wait,way,well,when,wish,with,work,worth
0,0.0,0.0,0.0,0.0,0.0,0.179243,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.420416,0.0,0.173949,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.253225,0.0,0.0,0.0,0.0,0.0,0.160056,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.366543,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.409696,0.0,0.0,0.0,0.0,0.0,0.0


# Word Embeddings (Word2Vec, FastText)

In [24]:
df.head()

Unnamed: 0,Review_ID,Review_Text,Porter_Stem,Snowball_Stem,Lancaster_Stem,Krovetz_Stem,Wordnet_Lemma
0,1,The new device is sleek and fast. I love the c...,the new devic is sleek and fast . i love the c...,the new devic is sleek and fast . i love the c...,the new dev is sleek and fast . i lov the came...,the new device is sleek and fast . i love the ...,the new device be sleek and fast . i love the ...
1,2,"Amazing display and battery life, but the pric...","amaz display and batteri life , but the price ...","amaz display and batteri life , but the price ...","amaz display and battery lif , but the pric is...","amazing display and battery life , but the pri...","amazing display and battery life , but the pri..."
2,3,"I had a few issues with the initial setup, but...","i had a few issu with the initi setup , but cu...","i had a few issu with the initi setup , but cu...","i had a few issu with the init setup , but cus...","i had a few issue with the initial setup , but...","i have a few issue with the initial setup , bu..."
3,4,Solid performance overall; the design is very ...,solid perform overal ; the design is veri mode...,solid perform overal ; the design is veri mode...,solid perform overal ; the design is very mode...,solid performance overall ; the design is very...,solid performance overall ; the design be very...
4,5,The device exceeded my expectations in every way.,the devic exceed my expect in everi way .,the devic exceed my expect in everi way .,the dev excess my expect in every way .,the device exceed my expectations in every way .,the device exceed my expectation in every way .


In [25]:
import gensim

from gensim.models import Word2Vec, FastText

In [26]:
# Tokenize the raw reviews (original casing kept, so the embedding vocabulary
# built from these tokens is case-sensitive).
sentences = list(map(word_tokenize, df['Review_Text']))
sentences

[['The',
  'new',
  'device',
  'is',
  'sleek',
  'and',
  'fast',
  '.',
  'I',
  'love',
  'the',
  'camera',
  'quality',
  '!'],
 ['Amazing',
  'display',
  'and',
  'battery',
  'life',
  ',',
  'but',
  'the',
  'price',
  'is',
  'a',
  'bit',
  'high',
  '.'],
 ['I',
  'had',
  'a',
  'few',
  'issues',
  'with',
  'the',
  'initial',
  'setup',
  ',',
  'but',
  'customer',
  'service',
  'was',
  'very',
  'helpful',
  '.'],
 ['Solid',
  'performance',
  'overall',
  ';',
  'the',
  'design',
  'is',
  'very',
  'modern',
  '.'],
 ['The',
  'device',
  'exceeded',
  'my',
  'expectations',
  'in',
  'every',
  'way',
  '.'],
 ['Decent',
  'phone',
  'with',
  'great',
  'features',
  ',',
  'though',
  'the',
  'software',
  'could',
  'be',
  'improved',
  '.'],
 ['The',
  'battery',
  'lasts',
  'all',
  'day',
  ',',
  'even',
  'with',
  'heavy',
  'usage',
  '.'],
 ['Excellent',
  'build',
  'quality',
  'and',
  'performance',
  '.',
  'Highly',
  'recommended',
  '.']

# Word2Vec

In [27]:
# Train Word2Vec on the tokenized reviews.
# The seed is stated explicitly and training runs on a single worker thread:
# with several workers, thread scheduling reorders updates and the vectors
# differ between runs. (Reproducibility across interpreter launches also
# needs a fixed PYTHONHASHSEED, per the gensim documentation.)
word2vec_model = Word2Vec(sentences, vector_size=500, window=5, min_count=1, workers=1, seed=1)

In [28]:
word2vec_model.wv['Amazing']

array([ 1.34081754e-03, -1.39403157e-03,  9.48144647e-04, -9.33500880e-04,
        4.89454542e-04,  1.59100001e-03, -1.90944306e-03, -2.62169429e-04,
        4.35992086e-04,  1.34749862e-03, -4.52230044e-04, -1.79781788e-03,
       -2.46765383e-04,  7.34595989e-04, -1.93824735e-03,  8.19363049e-04,
       -1.91447872e-03,  9.76860872e-04, -1.43450976e-03, -1.42983918e-03,
       -1.90106302e-03, -7.45133555e-04,  1.21887797e-03,  1.12283707e-03,
        2.68539152e-04,  1.56664450e-04, -1.30921579e-03, -1.61348272e-03,
       -9.69456916e-04, -1.05853355e-03,  1.70831371e-03, -1.33249641e-03,
       -1.52933935e-03, -3.03068839e-04, -9.26394074e-04,  1.27413694e-03,
       -1.91766955e-03, -7.58774928e-04, -3.91113193e-04,  1.04209699e-03,
       -8.04539770e-04,  7.62460055e-04, -9.34003692e-05, -9.53579322e-04,
        1.06479158e-03,  7.56241730e-04,  9.15751734e-04,  2.66282063e-04,
       -1.40488404e-03, -3.82442842e-04, -1.98711036e-03,  2.37217377e-04,
        1.96842523e-03, -

In [29]:
word2vec_model.wv.most_similar('design')

[('of', 0.14227819442749023),
 ('overall', 0.11384469270706177),
 ('robust', 0.1122351586818695),
 ('though', 0.10488753020763397),
 ('’', 0.10427912324666977),
 ('phone', 0.09906166046857834),
 ('with', 0.09876316040754318),
 ('wish', 0.09725506603717804),
 ('stunning', 0.09248220920562744),
 ('power', 0.09165064245462418)]

# FastText

In [32]:
# Train FastText on the tokenized reviews; tokens seen fewer than 3 times are
# dropped from the vocabulary (min_count=3).
# The seed is stated explicitly and training runs on a single worker thread so
# that re-running the notebook yields the same vectors; multiple workers
# introduce ordering jitter from thread scheduling.
fasttext_model = FastText(sentences, vector_size=100, window=5, min_count=3, workers=1, seed=1)

In [34]:
# fasttext_model.wv['device']

In [35]:
fasttext_model.wv.most_similar('device')

[('for', 0.15945032238960266),
 ('very', 0.15767408907413483),
 ('though', 0.15703023970127106),
 ('bit', 0.15592797100543976),
 ('fast', 0.12190394103527069),
 (',', 0.1182563453912735),
 ('impressive', 0.11361101269721985),
 ('in', 0.1095050796866417),
 ('minor', 0.09932141751050949),
 ('use', 0.08936243504285812)]

# Transformers (BERT Based Embeddings)

In [36]:
import torch
from transformers import AutoTokenizer, AutoModel

In [37]:
# Load the tokenizer and the encoder from the same pretrained checkpoint name.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT)

In [38]:
def get_bert_embedding(text):
    """Return a mean-pooled embedding of ``text`` from the loaded model.

    ``text`` is tokenized with padding and truncation enabled and run through
    ``model`` without gradient tracking. The last hidden states are averaged
    over the sequence axis using the attention mask as weights, so padded
    positions (present when a batch of texts of different lengths is passed)
    do not dilute the average. For a single string every position is
    attended, so this equals the plain mean over all tokens.

    Args:
        text: Input accepted by ``tokenizer`` (a string here; a list of
            strings also works and yields one embedding per item).

    Returns:
        The pooled embedding converted with ``.squeeze().tolist()``: a flat
        list of floats for one text, a nested list for a batch of several.
    """
    tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        output = model(**tokens)

    hidden = output.last_hidden_state
    # Broadcast the (batch, seq) mask over the hidden dimension.
    mask = tokens['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    # Guard against division by zero for a row with no attended tokens.
    counts = mask.sum(dim=1).clamp(min=1)
    return (summed / counts).squeeze().tolist()

In [39]:
df.head()

Unnamed: 0,Review_ID,Review_Text,Porter_Stem,Snowball_Stem,Lancaster_Stem,Krovetz_Stem,Wordnet_Lemma
0,1,The new device is sleek and fast. I love the c...,the new devic is sleek and fast . i love the c...,the new devic is sleek and fast . i love the c...,the new dev is sleek and fast . i lov the came...,the new device is sleek and fast . i love the ...,the new device be sleek and fast . i love the ...
1,2,"Amazing display and battery life, but the pric...","amaz display and batteri life , but the price ...","amaz display and batteri life , but the price ...","amaz display and battery lif , but the pric is...","amazing display and battery life , but the pri...","amazing display and battery life , but the pri..."
2,3,"I had a few issues with the initial setup, but...","i had a few issu with the initi setup , but cu...","i had a few issu with the initi setup , but cu...","i had a few issu with the init setup , but cus...","i had a few issue with the initial setup , but...","i have a few issue with the initial setup , bu..."
3,4,Solid performance overall; the design is very ...,solid perform overal ; the design is veri mode...,solid perform overal ; the design is veri mode...,solid perform overal ; the design is very mode...,solid performance overall ; the design is very...,solid performance overall ; the design be very...
4,5,The device exceeded my expectations in every way.,the devic exceed my expect in everi way .,the devic exceed my expect in everi way .,the dev excess my expect in every way .,the device exceed my expectations in every way .,the device exceed my expectation in every way .


In [40]:
# Embed every raw review; each cell holds the list returned by get_bert_embedding.
df['BERT_Embeddings'] = df['Review_Text'].map(get_bert_embedding)

In [41]:
df['BERT_Embeddings'] 

0     [-0.033339910209178925, -0.14492446184158325, ...
1     [-0.1443636417388916, -0.09793177247047424, 0....
2     [-0.014390292577445507, -0.18319793045520782, ...
3     [-0.19074088335037231, -0.05667966976761818, 0...
4     [0.0026267035864293575, 0.1790943592786789, 0....
5     [-0.14330337941646576, -0.3835153877735138, 0....
6     [-0.022971151396632195, -0.040383193641901016,...
7     [-0.2507500946521759, 0.0016181719256564975, 0...
8     [-0.04302635416388512, -0.3294951021671295, 0....
9     [-0.10850157588720322, -0.5656262636184692, 0....
10    [-0.23262397944927216, 0.31259673833847046, 0....
11    [0.1099279373884201, 0.06603480875492096, 0.21...
12    [0.011915333569049835, -0.12445836514234543, 0...
13    [-0.3084065020084381, -0.17804761230945587, 0....
14    [-0.018770933151245117, -0.179813951253891, 0....
15    [-0.22769014537334442, -0.5014808773994446, 0....
16    [0.13212786614894867, -0.3040282726287842, 0.4...
17    [-0.6009488701820374, -0.3065846562385559,