In [1]:
# Import Dependencies and modules
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from string import punctuation
from collections import Counter
from io import StringIO
from nltk.corpus import stopwords
import nltk
import glob
import errno
import os
import json

In [2]:
# Load the training data from train.csv (path is relative to the working
# directory); later cells read its 'rating' and 'reviews' columns.
train_df = pd.read_csv('train.csv')

In [3]:
# Derive sentiment labels from the ratings: a rating above 3 counts as
# positive, everything else (including a missing rating) as negative.
rated_above_three = train_df['rating'] > 3
train_df['sentiment'] = np.where(rated_above_three, 'positive', 'negative')
# Same split encoded as an integer flag: 1 = positive, 0 = negative.
train_df['numeric_sentiment'] = np.where(rated_above_three, 1, 0)

In [4]:
# Remove null rows: drop every row that has a missing value in any column
train_df = train_df.dropna()

In [5]:
# Number of rows left in train_df
len(train_df)

123907

In [6]:
from spacy.lang.en import English

# Only tokenization is needed, so a blank English pipeline is enough.
# The former `spacy.load('en')` call loaded a full model and discarded the
# result; it also fails on spaCy 3+, where the 'en' shortcut no longer exists.
parser = English()

def _normalize_token(tok):
    """Map one parsed token to the string kept for topic modelling."""
    if tok.like_url:
        return 'URL'
    if tok.orth_.startswith('@'):
        return 'SCREEN_NAME'
    return tok.lower_


def tokenize(text):
    """Split `text` into normalized tokens using the module-level `parser`.

    Whitespace-only tokens are dropped, URL-like tokens are replaced by the
    placeholder 'URL', tokens starting with '@' are replaced by
    'SCREEN_NAME', and every other token is lower-cased.

    Returns a list of strings.
    """
    return [
        _normalize_token(tok)
        for tok in parser(text)
        if not tok.orth_.isspace()
    ]

In [7]:
# Download the WordNet corpus; get_lemma uses it (through wn.morphy) to look
# up the base form of a word
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ayankarim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
from nltk.corpus import wordnet as wn

def get_lemma(word):
    """Return the WordNet base form of `word`.

    Falls back to `word` itself when `wn.morphy` has no base form for it
    (i.e. returns None).
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

In [9]:
# Download NLTK's stopword lists and keep the English one as a set, so the
# membership test in prepare_text_for_lda is a fast set lookup
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ayankarim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
def prepare_text_for_lda(text, min_token_length=5):
    """Turn one raw document into the list of terms used for topic modelling.

    The text is tokenized with `tokenize`, short tokens and English stopwords
    (`en_stop`) are removed, and the remaining tokens are lemmatized with
    `get_lemma`.

    Parameters
    ----------
    text : str
        Raw document text.
    min_token_length : int, optional
        Minimum number of characters a token must have to be kept. The
        default of 5 matches the previous hard-coded `len(token) > 4` filter.

    Returns
    -------
    list of str
        Lemmatized tokens, in document order.
    """
    # Note: with the default threshold the 3-character 'URL' placeholder that
    # tokenize emits is filtered out, while 'SCREEN_NAME' is kept.
    # The stopword check runs on the token before lemmatization, as before.
    return [
        get_lemma(token)
        for token in tokenize(text)
        if len(token) >= min_token_length and token not in en_stop
    ]

In [11]:
# Prepare training set for LDA: tokenize, filter and lemmatize every review
tokens = train_df['reviews'].apply(prepare_text_for_lda)

# Collect the per-review token lists into a plain list of documents.
# (This used to call list.append through Series.apply, which also built a
# throwaway Series of None values.)
text_data = tokens.tolist()

In [12]:
from gensim import corpora

# Build a gensim Dictionary from the tokenized documents; it assigns an
# integer id to every distinct token
dictionary = corpora.Dictionary(text_data)

In [13]:
# Create the bag-of-words corpus: each document becomes a list of
# (token_id, count) pairs
corpus = [dictionary.doc2bow(text) for text in text_data]

In [21]:
from gensim import models

# Fit a TF-IDF model on the bag-of-words corpus, then wrap the corpus with it
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

from pprint import pprint

# Sanity check: show the TF-IDF vector of the first document only
first_doc = next(iter(corpus_tfidf), None)
if first_doc is not None:
    pprint(first_doc)

[(0, 0.2971015349024831),
 (1, 0.2829494137811665),
 (2, 0.2321433031173297),
 (3, 0.19882151854791358),
 (4, 0.21382809584578655),
 (5, 0.3589515305482139),
 (6, 0.2250311418544782),
 (7, 0.31047777115186354),
 (8, 0.10265436888018073),
 (9, 0.13924146238697405),
 (10, 0.14800153997554158),
 (11, 0.28094425063675976),
 (12, 0.21890549088517897),
 (13, 0.1721957881901213),
 (14, 0.3891339875656897),
 (15, 0.11756723195577365),
 (16, 0.20752621743387425)]


In [22]:
import pickle

# Persist both corpora and the dictionary. Each file is opened in a `with`
# block so its handle is flushed and closed as soon as the dump finishes
# (the bare open() calls used before never closed their files).
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
with open('corpus_tfidf.pkl', 'wb') as tfidf_file:
    pickle.dump(corpus_tfidf, tfidf_file)
dictionary.save('dictionary.gensim')


# 3 Topics

In [24]:
import gensim

NUM_TOPICS = 3

# Extract topics. LDA training is randomized, so random_state is fixed;
# without a seed every run yields different topics and a different saved model.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
# File name is derived from NUM_TOPICS ('model3.gensim') so it cannot drift
# from the model that is actually being saved
ldamodel.save('model{}.gensim'.format(NUM_TOPICS))

# Print the 10 highest-weighted terms of each topic
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.100*"phone" + 0.017*"unlock" + 0.015*"samsung" + 0.013*"would" + 0.012*"return" + 0.011*"mobile" + 0.011*"amazon" + 0.011*"buy" + 0.010*"charger" + 0.010*"problem"')
(1, '0.080*"phone" + 0.022*"battery" + 0.019*"screen" + 0.014*"charge" + 0.012*"samsung" + 0.011*"camera" + 0.009*"feature" + 0.008*"really" + 0.007*"would" + 0.007*"galaxy"')
(2, '0.126*"phone" + 0.095*"great" + 0.058*"works" + 0.032*"product" + 0.032*"excellent" + 0.024*"perfect" + 0.022*"happy" + 0.018*"recommend" + 0.017*"everything" + 0.017*"price"')


# 5 Topics

In [17]:
import gensim

NUM_TOPICS = 5

# Extract topics. LDA training is randomized, so random_state is fixed;
# without a seed every run yields different topics and a different saved model.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
# File name is derived from NUM_TOPICS ('model5.gensim') so it cannot drift
# from the model that is actually being saved
ldamodel.save('model{}.gensim'.format(NUM_TOPICS))

# Print the 10 highest-weighted terms of each topic
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.181*"phone" + 0.117*"great" + 0.067*"works" + 0.027*"happy" + 0.021*"everything" + 0.020*"expect" + 0.020*"price" + 0.016*"arrive" + 0.016*"purchase" + 0.015*"love"')
(1, '0.075*"phone" + 0.033*"samsung" + 0.015*"camera" + 0.014*"galaxy" + 0.013*"feature" + 0.012*"mobile" + 0.011*"screen" + 0.009*"android" + 0.008*"better" + 0.007*"model"')
(2, '0.120*"phone" + 0.029*"battery" + 0.018*"charge" + 0.018*"would" + 0.015*"screen" + 0.015*"problem" + 0.012*"month" + 0.012*"buy" + 0.012*"return" + 0.009*"issue"')
(3, '0.081*"unlock" + 0.032*"international" + 0.025*"version" + 0.022*"excelente" + 0.014*"factory" + 0.014*"language" + 0.013*"amaze" + 0.013*"excelent" + 0.013*"english" + 0.013*"lock"')
(4, '0.103*"product" + 0.075*"excellent" + 0.052*"recommend" + 0.038*"charger" + 0.027*"perfect" + 0.019*"seller" + 0.019*"would" + 0.017*"cellphone" + 0.016*"original" + 0.015*"adapter"')


# 7 Topics

In [18]:
import gensim

NUM_TOPICS = 7

# Extract topics. LDA training is randomized, so random_state is fixed;
# without a seed every run yields different topics and a different saved model.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
# File name is derived from NUM_TOPICS ('model7.gensim') so it cannot drift
# from the model that is actually being saved
ldamodel.save('model{}.gensim'.format(NUM_TOPICS))

# Print the 10 highest-weighted terms of each topic
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.098*"unlock" + 0.066*"phone" + 0.039*"love" + 0.032*"buy" + 0.032*"version" + 0.023*"mobile" + 0.017*"factory" + 0.017*"language" + 0.016*"country" + 0.015*"english"')
(1, '0.119*"battery" + 0.074*"phone" + 0.073*"charge" + 0.050*"charger" + 0.019*"excelente" + 0.015*"screen" + 0.013*"adapter" + 0.012*"last" + 0.012*"expectation" + 0.011*"black"')
(2, '0.179*"phone" + 0.150*"great" + 0.054*"works" + 0.047*"excellent" + 0.043*"product" + 0.028*"price" + 0.023*"expect" + 0.018*"awesome" + 0.015*"thanks" + 0.013*"quality"')
(3, '0.098*"perfect" + 0.087*"works" + 0.041*"perfectly" + 0.030*"everything" + 0.030*"exactly" + 0.029*"describe" + 0.027*"condition" + 0.025*"beautiful" + 0.021*"thank" + 0.019*"manual"')
(4, '0.105*"phone" + 0.037*"purchase" + 0.031*"recommend" + 0.029*"would" + 0.028*"happy" + 0.023*"seller" + 0.023*"product" + 0.021*"receive" + 0.021*"order" + 0.018*"brand"')
(5, '0.035*"camera" + 0.031*"screen" + 0.030*"samsung" + 0.029*"feature" + 0.024*"galaxy" + 0.020*"

# 10 Topics

In [19]:
import gensim

NUM_TOPICS = 10

# Extract topics. LDA training is randomized, so random_state is fixed;
# without a seed every run yields different topics and a different saved model.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
# File name is derived from NUM_TOPICS ('model10.gensim') so it cannot drift
# from the model that is actually being saved
ldamodel.save('model{}.gensim'.format(NUM_TOPICS))

# Print the 10 highest-weighted terms of each topic
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.235*"great" + 0.231*"phone" + 0.100*"works" + 0.035*"price" + 0.031*"expect" + 0.027*"awesome" + 0.018*"look" + 0.013*"everything" + 0.011*"brand" + 0.011*"problem"')
(1, '0.092*"phone" + 0.027*"camera" + 0.024*"feature" + 0.020*"galaxy" + 0.019*"really" + 0.017*"samsung" + 0.016*"android" + 0.014*"better" + 0.013*"picture" + 0.013*"still"')
(2, '0.185*"screen" + 0.039*"button" + 0.035*"touch" + 0.027*"cellphone" + 0.020*"speaker" + 0.020*"drop" + 0.018*"sound" + 0.016*"scratch" + 0.015*"volume" + 0.015*"small"')
(3, '0.112*"phone" + 0.061*"battery" + 0.046*"charge" + 0.033*"problem" + 0.032*"charger" + 0.023*"working" + 0.020*"month" + 0.018*"issue" + 0.016*"would" + 0.013*"start"')
(4, '0.063*"samsung" + 0.021*"version" + 0.019*"international" + 0.017*"warranty" + 0.016*"device" + 0.013*"model" + 0.009*"update" + 0.009*"galaxy" + 0.009*"google" + 0.008*"country"')
(5, '0.135*"phone" + 0.039*"unlock" + 0.030*"service" + 0.030*"mobile" + 0.021*"verizon" + 0.019*"network" + 0.019

In [20]:
# Create Visualizations of topic clusters using pyLDAvis



# TODO: Consider running the topic modelling on the scraped dataset instead.
# Then label each record in that dataset with a topic, choosing the topic for which its terms have the highest probability.

# Plan: use the train dataset to train a model on the sentiment labels.
# Once the model reaches an appropriate accuracy (target: above 98%),
# use the trained model to assign sentiments to the scraped data.

# TODO: May need to scrape separate test data, perhaps for two different phones.