In [1]:
import json
from langdetect import detect

import nltk

from nltk.corpus import wordnet as wn

from gensim import corpora, models

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import re
import sys

Slow version of gensim.models.doc2vec is being used


In [2]:
# Input and output locations used by this notebook.
TEXT_FILE = "review_copy.txt"        # source file that the reviews are read from
OUTPUT_FILE = "review_normal.json"   # destination for the processed result

In [3]:
# Load the raw reviews: every non-blank line of TEXT_FILE is parsed as one
# JSON document and appended to `reviews`.
reviews = []
with open(TEXT_FILE, 'rb') as f:
    for line in f:
        # A blank or whitespace-only line (e.g. a stray newline at the end of
        # the file) is not valid JSON and would abort the whole load, so skip it.
        if not line.strip():
            continue
        reviews.append(json.loads(line))

In [4]:
# Strip the metadata fields that are not used further down, lower-case the
# text, and keep only the reviews that are detected as English and carry
# exactly seven rating entries.
#
# The survivors are collected into a new list instead of being removed from
# `reviews` while it is being iterated. Removing during iteration shifts the
# remaining elements, so the review following every removed one was silently
# skipped (never cleaned, never checked), and a review that failed both checks
# was removed twice, raising ValueError or deleting an equal duplicate.
#
# NOTE(review): langdetect's detection is documented as non-deterministic
# unless DetectorFactory.seed is set, and detect() can raise on text without
# usable features (e.g. empty text) -- confirm whether either matters here.
UNUSED_KEYS = ('title', 'author', 'date_stayed', 'offering_id',
               'num_helpful_votes', 'date', 'id', 'via_mobile')
EXPECTED_RATINGS_COUNT = 7

kept_reviews = []
for review in reviews:
    for key in UNUSED_KEYS:
        review.pop(key, None)  # tolerate reviews that lack the field
    review['text'] = review['text'].lower()

    # Cheap structural check first, so the comparatively expensive language
    # detection only runs on reviews that can still be kept.
    if len(review['ratings']) != EXPECTED_RATINGS_COUNT:
        continue
    if detect(review['text']) != 'en':
        continue
    kept_reviews.append(review)

reviews = kept_reviews

In [5]:
# Gather the text of every review into one flat list of strings.
all_reviews = [entry['text'] for entry in reviews]

In [6]:
# Start from NLTK's English stop-word list, but keep a hand-picked set of
# words out of it: based on the literature consulted, dropping them could
# distort the emotion analysis, so they must survive the filtering step.
stop_words = set(stopwords.words('english')).difference(
    ('no', 'not', 'didn', 'doesn', 'don', 'down', 'hasn', 'haven')
)

# Tokenize every review text into a list of word tokens.
word_tokens = [word_tokenize(text) for text in all_reviews]

In [7]:
# Drop the stop words from every tokenized review.
filtered_sentences = [
    [token for token in tokens if token not in stop_words]
    for tokens in word_tokens
]

In [8]:
# Keep only the tokens that consist purely of alphabetic characters; any token
# containing a digit, punctuation mark or other symbol is dropped as a whole.
isalpha_filtered_sentences = [
    [token for token in tokens if token.isalpha()]
    for tokens in filtered_sentences
]

In [9]:
# From here on `all_reviews` refers to the cleaned token lists (one list of
# tokens per review) rather than to the raw review strings.
all_reviews = isalpha_filtered_sentences

In [10]:
# POS-tagging and lemmatization.
#
# Each review (a list of tokens) is tagged with the universal POS tagset and
# every token is reduced to its WordNet lemma; `docs` ends up holding one list
# of lemmas per review.

# Universal POS tag -> WordNet POS constant accepted by the lemmatizer.
un2wn_mapping = {"VERB": wn.VERB, "NOUN": wn.NOUN, "ADJ": wn.ADJ, "ADV": wn.ADV}

# Build the lemmatizer once; constructing a new one for every single token is
# pure loop-invariant overhead.
lemmatizer = nltk.WordNetLemmatizer()

docs = []
for review in all_reviews:
    lemmatized_tokens = []
    for word, tag in nltk.pos_tag(review, tagset="universal"):
        # Tags without a WordNet counterpart fall back to the noun POS, which
        # is also what lemmatize() uses when no `pos` is given.
        lemma = lemmatizer.lemmatize(word, pos=un2wn_mapping.get(tag, wn.NOUN))
        lemmatized_tokens.append(lemma.lower())  # case insensitive
    docs.append(lemmatized_tokens)

In [11]:
# Persist the lemmatized documents (a list of token lists) as JSON.
with open(OUTPUT_FILE, 'w') as out_file:
    json.dump(docs, out_file)