In [1]:
import pandas as pd
import numpy as np

from nltk.corpus import stopwords 
from gensim.parsing.preprocessing import PorterStemmer, remove_stopwords
import string 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import GlobalMaxPool1D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model

In [2]:
df_senti = pd.read_csv('data/NLP1.csv')
df_senti.head()

Unnamed: 0,Review,Star
0,This is my 1st ever Echo 🥺 I'm amazed by its c...,4.4
1,"Excellent product, super quality. It capable t...",5.0
2,So it is my first Echo Dot.It definitely looks...,2.0
3,I was expecting more as it's 4th gen... But di...,5.0
4,Love the new revamped version of Alexa. All y...,5.0


In [3]:
df_senti['Emotion'] = df_senti['Star'].apply(lambda x: 'positive' if x>3 else 'negative')
df_senti.sample(5)

Unnamed: 0,Review,Star,Emotion
284,Recently started using this shampoo after rese...,5.0,positive
339,Searching for the shampoo which is best for ou...,5.0,positive
32,"Friends, got this Alexa model from Amazon onli...",4.4,positive
327,Indulekha bringha hairfall shampoo contains 9f...,1.0,negative
312,I have tried a number of. expensive shampoos t...,5.0,positive


### Clearning of Data

##### Tokenization

In [5]:
from nltk.tokenize import RegexpTokenizer

In [6]:
token_reviews = list()
Reviews = df_senti['Review']
for review in Reviews:
    token_reviews.append(RegexpTokenizer('\w+').tokenize(review))
    
print(token_reviews[:2])

[['This', 'is', 'my', '1st', 'ever', 'Echo', 'I', 'm', 'amazed', 'by', 'its', 'capabilities', 'however', 'I', 'still', 'do', 'not', 'feel', 'any', 'necessities', 'of', 'it', 'The', 'more', 'you', 'want', 'to', 'be', 'dependent', 'you', 'become', 'dependent', 'It', 's', 'really', 'an', 'impulsive', 'purchase', 'But', 'still', 'you', 'should', 'experience', 'echo', 'devices', 'at', 'least', 'once', 'Just', 'to', 'experience', 'the', 'technology', 'In', 'many', 'things', 'it', 'is', 'useful', 'like', 'asking', 'questions', 'weather', 'alarm', 'news', 'etc', 'Now', 'its', 'pros', '1', 'Love', 'the', 'design', '2', 'Has', 'a', 'good', 'weight', '3', 'Sturdy', 'built', '4', 'Good', 'bass', 'for', 'such', 'a', '1', '6', 'driver', '5', 'Highs', 'with', 'just', '0', '8', 'tweeter', 'is', 'adding', 'unbelievable', 'clarity', 'to', 'the', 'sound', '6', 'Can', 'easily', 'pair', 'to', 'the', 'Alexa', 'app', '7', 'Priced', 'moderately', '8', 'You', 'can', 'make', 'connections', 'to', 'smart', 'plug'

##### Lower Case

In [7]:
lower_reviews = []
for review in token_reviews:
    lower_reviews.append([word.lower() for word in review])
    
print(lower_reviews[:2])

[['this', 'is', 'my', '1st', 'ever', 'echo', 'i', 'm', 'amazed', 'by', 'its', 'capabilities', 'however', 'i', 'still', 'do', 'not', 'feel', 'any', 'necessities', 'of', 'it', 'the', 'more', 'you', 'want', 'to', 'be', 'dependent', 'you', 'become', 'dependent', 'it', 's', 'really', 'an', 'impulsive', 'purchase', 'but', 'still', 'you', 'should', 'experience', 'echo', 'devices', 'at', 'least', 'once', 'just', 'to', 'experience', 'the', 'technology', 'in', 'many', 'things', 'it', 'is', 'useful', 'like', 'asking', 'questions', 'weather', 'alarm', 'news', 'etc', 'now', 'its', 'pros', '1', 'love', 'the', 'design', '2', 'has', 'a', 'good', 'weight', '3', 'sturdy', 'built', '4', 'good', 'bass', 'for', 'such', 'a', '1', '6', 'driver', '5', 'highs', 'with', 'just', '0', '8', 'tweeter', 'is', 'adding', 'unbelievable', 'clarity', 'to', 'the', 'sound', '6', 'can', 'easily', 'pair', 'to', 'the', 'alexa', 'app', '7', 'priced', 'moderately', '8', 'you', 'can', 'make', 'connections', 'to', 'smart', 'plug'

##### Remove Punctuation

In [8]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\ProgramData\Anaconda3\envs
[nltk_data]     \tensorflow_gpu\lib\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
punct_list = [";",":","!","?","/","\\",",","#","@","$","&",")","(","\""]

In [12]:
punct_reviews = []
for review in lower_reviews:
    punct_reviews.append([word for word in review if word not in punct_list])

print(punct_reviews[:2])

[['this', 'is', 'my', '1st', 'ever', 'echo', 'i', 'm', 'amazed', 'by', 'its', 'capabilities', 'however', 'i', 'still', 'do', 'not', 'feel', 'any', 'necessities', 'of', 'it', 'the', 'more', 'you', 'want', 'to', 'be', 'dependent', 'you', 'become', 'dependent', 'it', 's', 'really', 'an', 'impulsive', 'purchase', 'but', 'still', 'you', 'should', 'experience', 'echo', 'devices', 'at', 'least', 'once', 'just', 'to', 'experience', 'the', 'technology', 'in', 'many', 'things', 'it', 'is', 'useful', 'like', 'asking', 'questions', 'weather', 'alarm', 'news', 'etc', 'now', 'its', 'pros', '1', 'love', 'the', 'design', '2', 'has', 'a', 'good', 'weight', '3', 'sturdy', 'built', '4', 'good', 'bass', 'for', 'such', 'a', '1', '6', 'driver', '5', 'highs', 'with', 'just', '0', '8', 'tweeter', 'is', 'adding', 'unbelievable', 'clarity', 'to', 'the', 'sound', '6', 'can', 'easily', 'pair', 'to', 'the', 'alexa', 'app', '7', 'priced', 'moderately', '8', 'you', 'can', 'make', 'connections', 'to', 'smart', 'plug'

##### Removing Stop Words

In [23]:
punct_reviews[0].apply(remove_stopwords)

AttributeError: 'list' object has no attribute 'apply'

In [13]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\ProgramData\Anaconda3\
[nltk_data]     envs\tensorflow_gpu\lib\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [15]:
stop_words = set(stopwords.words('english'))

In [17]:
stopword_reviews = []
for review in punct_reviews:
    stopword_reviews.append([word for word in review if word not in stop_words])

print(stopword_reviews[:2])

[['1st', 'ever', 'echo', 'amazed', 'capabilities', 'however', 'still', 'feel', 'necessities', 'want', 'dependent', 'become', 'dependent', 'really', 'impulsive', 'purchase', 'still', 'experience', 'echo', 'devices', 'least', 'experience', 'technology', 'many', 'things', 'useful', 'like', 'asking', 'questions', 'weather', 'alarm', 'news', 'etc', 'pros', '1', 'love', 'design', '2', 'good', 'weight', '3', 'sturdy', 'built', '4', 'good', 'bass', '1', '6', 'driver', '5', 'highs', '0', '8', 'tweeter', 'adding', 'unbelievable', 'clarity', 'sound', '6', 'easily', 'pair', 'alexa', 'app', '7', 'priced', 'moderately', '8', 'make', 'connections', 'smart', 'plug', 'smart', 'bulbs', 'easily', '9', 'play', 'apple', 'music', 'u', 'hv', 'subscription', '10', 'auto', 'pairing', 'light', 'cuts', '11', 'excellent', 'sound', 'quality', 'wize', '12', '5', 'feet', 'power', 'adaptor', 'cable13', 'line', '3', '5mm', 'connecting', 'external', 'speakers', 'ht', 'cons', '1', 'need', 'continuous', 'power', '2', 'ne

##### Stemming

In [18]:
p_stemmer = PorterStemmer()

In [20]:
p_stemmer.stem_documents(stopword_reviews[0])

['1st',
 'ever',
 'echo',
 'amaz',
 'capabl',
 'howev',
 'still',
 'feel',
 'necess',
 'want',
 'depend',
 'becom',
 'depend',
 'realli',
 'impuls',
 'purchas',
 'still',
 'experi',
 'echo',
 'devic',
 'least',
 'experi',
 'technolog',
 'mani',
 'thing',
 'us',
 'like',
 'ask',
 'question',
 'weather',
 'alarm',
 'new',
 'etc',
 'pro',
 '1',
 'love',
 'design',
 '2',
 'good',
 'weight',
 '3',
 'sturdi',
 'built',
 '4',
 'good',
 'bass',
 '1',
 '6',
 'driver',
 '5',
 'high',
 '0',
 '8',
 'tweeter',
 'ad',
 'unbeliev',
 'clariti',
 'sound',
 '6',
 'easili',
 'pair',
 'alexa',
 'app',
 '7',
 'price',
 'moder',
 '8',
 'make',
 'connect',
 'smart',
 'plug',
 'smart',
 'bulb',
 'easili',
 '9',
 'plai',
 'appl',
 'music',
 'u',
 'hv',
 'subscript',
 '10',
 'auto',
 'pair',
 'light',
 'cut',
 '11',
 'excel',
 'sound',
 'qualiti',
 'wize',
 '12',
 '5',
 'feet',
 'power',
 'adaptor',
 'cable13',
 'line',
 '3',
 '5mm',
 'connect',
 'extern',
 'speaker',
 'ht',
 'con',
 '1',
 'need',
 'continu',
 

In [None]:
docs = imdb['review'].str.lower().str.replace('[^a-z\s]', '')
docs = pd.Series(stemmer.stem_documents(docs))
docs = docs.apply(remove_stopwords)