In [25]:
import pandas as pd

from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

from nltk.corpus import wordnet

from krovetzstemmer import Stemmer

# Stemming Techniques

In [3]:
# Load the reviews and keep only the identifier and text columns.
# The explicit .copy() makes the two-column selection an independent frame,
# so the later `df['...'] = ...` assignments in this notebook add columns to
# it directly instead of triggering pandas' SettingWithCopyWarning.
df = pd.read_csv("mobile reviews.csv")[['Review_ID', 'Review_Text']].copy()
df.head()

Unnamed: 0,Review_ID,Review_Text
0,1,The new device is sleek and fast. I love the c...
1,2,"Amazing display and battery life, but the pric..."
2,3,"I had a few issues with the initial setup, but..."
3,4,Solid performance overall; the design is very ...
4,5,The device exceeded my expectations in every way.


In [4]:
# (rows, columns) of the trimmed review frame.
df.shape

(50, 2)

In [17]:
# Raw text of the review at index label 4; used as the running example below.
df.loc[4, 'Review_Text']

'The device exceeded my expectations in every way.'

In [18]:
# NOTE(review): same expression as the previous cell -- looks like a leftover re-run.
df['Review_Text'][4]

'The device exceeded my expectations in every way.'

In [14]:
porter = PorterStemmer()

# Lower-case each review, tokenize it, stem every token with the Porter
# stemmer, and re-join the stems with single spaces.
df['Porter_Stem'] = df['Review_Text'].apply(
    lambda review: " ".join(map(porter.stem, word_tokenize(review.lower())))
)

# Stemmed form of the sample review (index label 4).
df.loc[4, 'Porter_Stem']

'the devic exceed my expect in everi way .'

In [15]:
snowball = SnowballStemmer("english")

# Same pipeline as the other stemmer cells (lower-case -> tokenize -> stem ->
# join), using the English Snowball stemmer.
df['Snowball_Stem'] = df['Review_Text'].apply(
    lambda review: " ".join(
        snowball.stem(token) for token in word_tokenize(review.lower())
    )
)

# Stemmed form of the sample review (index label 4).
df.loc[4, 'Snowball_Stem']

'the devic exceed my expect in everi way .'

In [19]:
lancaster = LancasterStemmer()

# Lower-case -> tokenize -> Lancaster-stem each token -> join with spaces.
df['Lancaster_Stem'] = df['Review_Text'].apply(
    lambda review: " ".join(map(lancaster.stem, word_tokenize(review.lower())))
)

# Stemmed form of the sample review (index label 4).
df.loc[4, 'Lancaster_Stem']

'the dev excess my expect in every way .'

In [21]:
krovetz = Stemmer()

# Lower-case -> tokenize -> Krovetz-stem each token -> join with spaces.
df['Krovetz_Stem'] = df['Review_Text'].apply(
    lambda review: " ".join(
        krovetz.stem(token) for token in word_tokenize(review.lower())
    )
)

# Stemmed form of the sample review (index label 4).
df.loc[4, 'Krovetz_Stem']

'the device exceed my expectations in every way .'

# Lemmatization (With POS Tagging)

In [22]:
# Shared lemmatizer instance used by the lemmatization cell further down.
lemmatizer = WordNetLemmatizer()

In [24]:
# Demo: pos_tag takes a list of tokens and returns (token, tag) pairs.
pos_tag(['eating', 'device', 'apple'])

[('eating', 'VBG'), ('device', 'NN'), ('apple', 'NN')]

In [26]:
# WordNet's POS constant for verbs (a one-letter code).
wordnet.VERB

'v'

In [28]:
# WordNet's POS constant for nouns (a one-letter code).
wordnet.NOUN

'n'

In [32]:
def get_wordnet_pos(tag):
    """Translate a POS tag (as produced by ``pos_tag``) into a WordNet POS constant.

    Tags beginning with 'J', 'V', 'N' or 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag, including the empty string, falls back to ``wordnet.NOUN``.
    """
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Only the first character decides the category; unknown prefixes default to noun.
    return prefix_to_pos.get(tag[:1], wordnet.NOUN)

In [33]:
# Lower-case and tokenize each review, POS-tag the tokens, then lemmatize
# every token using the WordNet POS derived from its tag.
df['Wordnet_Lemma'] = df['Review_Text'].apply(
    lambda review: " ".join(
        lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        for token, tag in pos_tag(word_tokenize(review.lower()))
    )
)

# Lemmatized form of the sample review (index label 4).
df.loc[4, 'Wordnet_Lemma']

'the device exceed my expectation in every way .'

In [34]:
# Raw text of a second example review (index label 5).
df.loc[5, 'Review_Text']

'Decent phone with great features, though the software could be improved.'

In [35]:
# Lemmatized counterpart of the review at index label 5.
df.loc[5, 'Wordnet_Lemma']

'decent phone with great feature , though the software could be improve .'

In [40]:
# NOTE(review): this displays the whole column; `.head()` would keep the output short.
df['Wordnet_Lemma']

0     the new device be sleek and fast . i love the ...
1     amazing display and battery life , but the pri...
2     i have a few issue with the initial setup , bu...
3     solid performance overall ; the design be very...
4       the device exceed my expectation in every way .
5     decent phone with great feature , though the s...
6     the battery last all day , even with heavy usa...
7     excellent build quality and performance . high...
8     have a minor glitch with the fingerprint senso...
9     a well-designed phone with fast performance . ...
10    the camera be outstanding , perfect for low-li...
11    i be impress with the process power and screen...
12    battery life be superb , though i wish there b...
13    the user interface be intuitive and easy to na...
14    great phone for everyday use , but the storage...
15    very responsive and fast . the design feels pr...
16    the device heat up a bit during heavy gaming s...
17    impress by the high-resolution display and

# NLP - Preprocessing -- Convert Tokens to Numbers

In [36]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [37]:
# Two vectorizers, both with default settings: one for the bag-of-words
# frame and one for the TF-IDF frame built in the cells below.
vectorizer_bow = CountVectorizer()
vectorizer_tfidf = TfidfVectorizer()

In [38]:
# Fit the CountVectorizer on the lemmatized reviews, then expose the result
# as a dense frame with one column per vocabulary term.
bow_matrix = vectorizer_bow.fit_transform(df["Wordnet_Lemma"])
df_bow = pd.DataFrame(
    bow_matrix.toarray(),
    columns=vectorizer_bow.get_feature_names_out(),
)

df_bow.head()

Unnamed: 0,additional,all,although,amazing,an,and,appreciate,apps,attention,balanced,...,ve,very,wait,way,well,when,wish,with,work,worth
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [41]:
# (number of reviews, vocabulary size) of the bag-of-words frame.
df_bow.shape

(50, 204)

In [39]:
# Fit the TfidfVectorizer on the lemmatized reviews, then expose the result
# as a dense frame with one column per vocabulary term.
tfidf_matrix = vectorizer_tfidf.fit_transform(df["Wordnet_Lemma"])
df_tfidf = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer_tfidf.get_feature_names_out(),
)

df_tfidf.head()

Unnamed: 0,additional,all,although,amazing,an,and,appreciate,apps,attention,balanced,...,ve,very,wait,way,well,when,wish,with,work,worth
0,0.0,0.0,0.0,0.0,0.0,0.179243,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.420416,0.0,0.173949,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.253225,0.0,0.0,0.0,0.0,0.0,0.160056,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.366543,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.409696,0.0,0.0,0.0,0.0,0.0,0.0
