In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
df=pd.read_csv('/content/Alexa-Dataset - Alexa-Dataset.csv')

In [5]:
df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [6]:
df.isnull().sum()

rating               0
date                 0
variation            0
verified_reviews    80
feedback             0
dtype: int64

In [7]:
df['verified_reviews']=df['verified_reviews'].fillna("missing")
df.isnull().sum()

rating              0
date                0
variation           0
verified_reviews    0
feedback            0
dtype: int64

# (I) Remove all punctuations from review text.

In [8]:
import string
df['cleaned_text'] = df['verified_reviews'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback,cleaned_text
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1,Love my Echo
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1,Loved it
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1,Sometimes while playing a game you can answer ...
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1,I have had a lot of fun with this thing My 4 y...
4,5,31-Jul-18,Charcoal Fabric,Music,1,Music


# (II) Tokenize the review text into words.

In [9]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
from nltk.tokenize import word_tokenize
df['tokenized_text'] = df['cleaned_text'].apply(lambda x:word_tokenize(x))
df['tokenized_text']

0                                        [Love, my, Echo]
1                                             [Loved, it]
2       [Sometimes, while, playing, a, game, you, can,...
3       [I, have, had, a, lot, of, fun, with, this, th...
4                                                 [Music]
                              ...                        
3145    [Perfect, for, kids, adults, and, everyone, in...
3146    [Listening, to, music, searching, locations, c...
3147    [I, do, love, these, things, i, have, them, ru...
3148    [Only, complaint, I, have, is, that, the, soun...
3149                                               [Good]
Name: tokenized_text, Length: 3150, dtype: object

# (III) Remove the Stopwords from the tokenized text.

In [11]:
from nltk.corpus import stopwords

In [12]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
sw = set(stopwords.words('english'))

In [14]:
df['filtered_text'] = df['tokenized_text'].apply(lambda x: [word for word in x if word not in sw])

In [15]:
df['filtered_text']


0                                            [Love, Echo]
1                                                 [Loved]
2       [Sometimes, playing, game, answer, question, c...
3       [I, lot, fun, thing, My, 4, yr, old, learns, d...
4                                                 [Music]
                              ...                        
3145                    [Perfect, kids, adults, everyone]
3146    [Listening, music, searching, locations, check...
3147    [I, love, things, running, entire, home, TV, l...
3148    [Only, complaint, I, sound, quality, isnt, gre...
3149                                               [Good]
Name: filtered_text, Length: 3150, dtype: object

# (IV) Perform stemming & lemmatization on the review text.

In [16]:
from nltk.stem import PorterStemmer

In [17]:
ps = PorterStemmer()
df['stemmed_text'] = df['filtered_text'].apply(lambda x: [ps.stem(word) for word in x])
df['stemmed_text']

0                                            [love, echo]
1                                                  [love]
2       [sometim, play, game, answer, question, correc...
3       [i, lot, fun, thing, my, 4, yr, old, learn, di...
4                                                 [music]
                              ...                        
3145                       [perfect, kid, adult, everyon]
3146    [listen, music, search, locat, check, time, lo...
3147    [i, love, thing, run, entir, home, tv, light, ...
3148    [onli, complaint, i, sound, qualiti, isnt, gre...
3149                                               [good]
Name: stemmed_text, Length: 3150, dtype: object

In [18]:
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [19]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
df['lemmatized_text'] = df['filtered_text'].apply(lambda x: [wordnet_lemmatizer.lemmatize(word) for word in x])

In [20]:
df['lemmatized_text']

0                                            [Love, Echo]
1                                                 [Loved]
2       [Sometimes, playing, game, answer, question, c...
3       [I, lot, fun, thing, My, 4, yr, old, learns, d...
4                                                 [Music]
                              ...                        
3145                      [Perfect, kid, adult, everyone]
3146    [Listening, music, searching, location, checki...
3147    [I, love, thing, running, entire, home, TV, li...
3148    [Only, complaint, I, sound, quality, isnt, gre...
3149                                               [Good]
Name: lemmatized_text, Length: 3150, dtype: object

# (V) Perform the word vectorization on review text using Bag of Words technique.

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
# Initialize CountVectorizer with stop words removed
vectorizer = CountVectorizer(stop_words='english')

X = vectorizer.fit_transform(df['cleaned_text'])


df_bow = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names_out())



# OR

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

lemmatized_texts = df['lemmatized_text']
lemmatized_texts_joined = [' '.join(text) for text in lemmatized_texts]
vectorizer = CountVectorizer(stop_words='english')


In [33]:

df_bow


Unnamed: 0,072318,10,100,1000,100x,1010,1030pm,11,1100sf,1220,...,yr,yrs,yup,zero,zigbee,zonkedout,zwave,zzzz,zzzzzzz,útil
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3146,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3147,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3148,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# (VI) Create representation of Review Text by calculating Term Frequency and Inverse Document Frequency (TF-IDF)

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
df_tfidf = tfidf.fit_transform(df['cleaned_text'])

In [37]:
df_tfidf = df_tfidf.toarray()

In [38]:
df_tfidf = pd.DataFrame(df_tfidf,columns=tfidf.get_feature_names_out())
df_tfidf.head()


Unnamed: 0,072318,10,100,1000,100x,1010,1030pm,11,1100sf,1220,...,yr,yrs,yup,zero,zigbee,zonkedout,zwave,zzzz,zzzzzzz,útil
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.307379,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
