In [1]:
# Import Dependencies and modules
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from string import punctuation
from collections import Counter
from io import StringIO
from nltk.corpus import stopwords
import nltk
import glob
import errno
import os
import json

In [2]:
# Load the training reviews; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests the CSV
# was written with its index — passing index_col=0 may be intended; confirm.
train_df = pd.read_csv('train.csv')

In [3]:
# Derive sentiment labels from the star rating: strictly above 3 is 'positive',
# everything else (including exactly 3) is 'negative'. A 0/1 copy is kept for modelling.
train_df['sentiment'] = np.where(train_df['rating'].gt(3), 'positive', 'negative')
train_df['numeric_sentiment'] = train_df['sentiment'].eq('positive').astype(int)

In [4]:
# Remove every row that has a missing value in any column (dropna defaults).
# NOTE(review): this also discards rows whose only gap is in a column other than
# 'rating'/'reviews' (e.g. brand_name) — confirm that is intended.
train_df = train_df.dropna()

In [5]:
# Preview the first rows to sanity-check the derived label columns.
train_df.head()

Unnamed: 0,product_name,brand_name,rating,reviews,sentiment,numeric_sentiment
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,5,I feel so LUCKY to have found this used (phone...,positive,1
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,4,"nice phone, nice up grade from my pantach revu...",positive,1
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,5,Very pleased,positive,1
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,4,It works good but it goes slow sometimes but i...,positive,1
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,4,Great phone to replace my lost phone. The only...,positive,1


In [6]:
from spacy.lang.en import English

# A blank English pipeline is sufficient: tokenize() only reads tokenizer-level
# attributes (orth_, lower_, like_url). No spacy.load('en') call is made here:
# its result was never used, and the 'en' shortcut name is not available
# in spaCy v3+, where it raises instead of loading a model.
parser = English()

# Function to tokenize text
def tokenize(text):
    """Split ``text`` into normalised tokens using the module-level ``parser``.

    Whitespace-only tokens are skipped. URL-like tokens are replaced by the
    placeholder 'URL', tokens starting with '@' by 'SCREEN_NAME', and every
    other token is emitted in lower case.

    Returns:
        list of str, in the order the parser produced the tokens.
    """
    def _normalise(tok):
        # The URL check takes precedence over the '@' check.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [7]:
# Download the WordNet corpus (synonyms, antonyms, word meanings); it is used
# below through wn.morphy to reduce words to their base form.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ayankarim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
from nltk.corpus import wordnet as wn

# Function to lemmatize words, i.e. move them to their root form
def get_lemma(word):
    """Return the WordNet base form of ``word``.

    Looks the word up with ``wn.morphy``; when that yields ``None`` the word
    is returned unchanged.
    """
    root = wn.morphy(word)
    return word if root is None else root

In [9]:
# Build the set of English stop words used to filter tokens.
# The corpus has to be present locally before it can be read.
nltk.download('stopwords')
en_stop = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ayankarim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
def prepare_text_for_lda(text, min_token_length=5):
    """Turn raw text into the list of lemmatized tokens fed to LDA.

    The text is tokenized with ``tokenize``; tokens shorter than
    ``min_token_length`` or present in ``en_stop`` are discarded, and each
    surviving token is mapped through ``get_lemma``.

    Args:
        text: the raw document to process.
        min_token_length: minimum number of characters a token must have to
            be kept. The default of 5 keeps the previous behaviour
            (``len(token) > 4``).

    Returns:
        list of str: the filtered, lemmatized tokens in their original order.
    """
    # Filtering happens before lemmatization, so both the length and the
    # stop-word checks apply to the token as produced by tokenize().
    # Note: with the default threshold the 3-character 'URL' placeholder that
    # tokenize() emits is filtered out as well.
    return [
        get_lemma(token)
        for token in tokenize(text)
        if len(token) >= min_token_length and token not in en_stop
    ]

In [11]:
# Prepare training set for LDA: one list of cleaned tokens per review.
tokens = train_df['reviews'].apply(prepare_text_for_lda)

# Plain Python list of the per-review token lists, in row order.
# Series.tolist() is used here; apply() is not needed just to append to a list,
# and it would build a throwaway Series of None values.
text_data = tokens.tolist()