### <center> Text Analysis</center>
1. import packages
2. sentiment analysis
3. topic modeling

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# sentiment analysis packages
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from textblob import TextBlob

# topic modeling packages
import gensim
from gensim import corpora

# Load the review data and keep only the star rating and the review text.
reviews_sample = pd.read_csv('product_reviews.csv').loc[:, ['Rating', 'Content']]



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### <center>Sentiment Analysis</center>

In [3]:
# 1.1 text preprocessing

# remove punctuation and numbers, lower case the text
def clean(text):
    """Normalise a raw review to lowercase letters separated by single spaces.

    Every run of characters other than A-Z / a-z (digits, punctuation,
    whitespace, ...) is collapsed into one space, then the result is
    lowercased. Leading/trailing spaces produced by that substitution are kept.

    Parameters
    ----------
    text : str or missing value
        Raw review text. Non-string input (e.g. the float ``NaN`` that
        ``pandas.read_csv`` produces for an empty cell, or ``None``) is treated
        as an empty review instead of raising ``TypeError`` inside ``re.sub``.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''
    letters_only = re.sub('[^A-Za-z]+', ' ', text) # remove non-letters
    return letters_only.lower()

# store the normalised text next to the original review
reviews_sample['Cleaned Reviews'] = reviews_sample['Content'].map(clean)

# select the reviews we care about: those whose cleaned text mentions "kindle"
mentions_kindle = reviews_sample['Cleaned Reviews'].str.contains('kindle')
reviews_sample = reviews_sample.loc[mentions_kindle]

# tokenize, remove stop words, stem
ps = PorterStemmer()
def token_stop_stem(text):
    """Tokenize a cleaned review, drop English stop words and stem the rest.

    Parameters
    ----------
    text : str
        Review text (the notebook passes the output of ``clean``).

    Returns
    -------
    str
        The stemmed, non-stop-word tokens in their original order, each one
        preceded by a single space (so a non-empty result starts with a
        space); ``""`` when no token survives.
    """
    # Build the stop-word set once per call; the previous version rebuilt it
    # for every single token, which is loop-invariant work.
    stop_words = set(stopwords.words('english'))
    stems = [
        ps.stem(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    # Same output format as before: " stem1 stem2 ..." (leading space kept).
    return "".join(" " + stem for stem in stems)

# stemmed, stop-word-free version of every cleaned review
reviews_sample['Final Reviews'] = reviews_sample['Cleaned Reviews'].map(token_stop_stem)
# preview the first two rows
reviews_sample.iloc[:2]


Unnamed: 0,Rating,Content,Cleaned Reviews,Final Reviews
1,2,The operating system is is an early android. Y...,the operating system is is an early android yo...,oper system earli android cant use download k...
2,2,I have been reading on Kindle since the Kindle...,i have been reading on kindle since the kindle...,read kindl sinc kindl releas pretti heavili i...


In [4]:
# 1.2 analyzing polarity

def getPolarityScore(review):
    """Return the TextBlob sentiment polarity of ``review``.

    ``review`` is wrapped in a ``TextBlob`` and the ``polarity`` field of its
    ``sentiment`` is returned unchanged.
    """
    blob = TextBlob(review)
    return blob.sentiment.polarity

def getPolarity(score):
    """Map a numeric polarity score to a sentiment label.

    Returns 'Negative' for scores below zero, 'Neutral' for a score equal to
    zero and 'Positive' for everything else.
    """
    if score < 0:
        return 'Negative'
    if score == 0:
        return 'Neutral'
    return 'Positive'

# Score the original review text rather than 'Final Reviews'.
# 'Final Reviews' has had NLTK's English stop words removed, and that list
# contains negations such as "not", "no" and "nor" -- so "not good" was being
# scored as "good". It is also Porter-stemmed, and stems like "terribl" are
# not dictionary words, so a lexicon-based scorer can no longer look them up.
# TextBlob is meant to be given natural text and does its own tokenization.
reviews_sample['Score'] = reviews_sample['Content'].apply(getPolarityScore)
reviews_sample['Polarity'] = reviews_sample['Score'].apply(getPolarity)

#### <center>Topic Modeling</center>

In [5]:
# 2.1 preprocessing

# tokenize, remove stop words, return tokens
def token_stop(text):
    """Tokenize a cleaned review and drop English stop words.

    Parameters
    ----------
    text : str
        Review text (the notebook passes the output of ``clean``).

    Returns
    -------
    list of str
        The tokens that are not English stop words, in their original order.
    """
    # Build the stop-word set once per call; the previous version rebuilt it
    # for every single token, which is loop-invariant work.
    stop_words = set(stopwords.words('english'))
    return [word for word in word_tokenize(text) if word not in stop_words]

# token lists (stop words removed, no stemming) used as input for topic modeling
reviews_sample['LDA Reviews'] = reviews_sample['Cleaned Reviews'].map(token_stop)

In [6]:
# 2.2 feeding the model

# build the vocabulary (token <-> integer id) from the tokenized reviews
tokenized_reviews = reviews_sample['LDA Reviews']
dict_ = corpora.Dictionary(tokenized_reviews)

# encode every review as a bag of words using the vocabulary ids
doc_term_matrix = list(map(dict_.doc2bow, tokenized_reviews))

# fit the LDA topic model
lda = gensim.models.LdaModel
lda_settings = {
    'num_topics': 5,
    'id2word': dict_,
    'iterations': 400,
    'passes': 20,
    'eval_every': None,  # don't evaluate model perplexity, takes too much time
    'random_state': 9651,  # fixed seed so the topics are reproducible
}
lda_model = lda(doc_term_matrix, **lda_settings)

# show the five highest-weighted words of every topic
lda_model.print_topics(num_words=5)


[(0,
  '0.025*"screen" + 0.012*"kindle" + 0.012*"reading" + 0.012*"battery" + 0.011*"touch"'),
 (1,
  '0.050*"kindle" + 0.046*"fire" + 0.017*"books" + 0.015*"love" + 0.015*"amazon"'),
 (2,
  '0.015*"tablet" + 0.013*"android" + 0.012*"device" + 0.012*"app" + 0.012*"apps"'),
 (3,
  '0.030*"kindle" + 0.019*"fire" + 0.018*"amazon" + 0.011*"one" + 0.010*"get"'),
 (4,
  '0.067*"ipad" + 0.023*"fire" + 0.023*"kindle" + 0.015*"great" + 0.014*"tablet"')]