In [24]:
import json
import re
import sys

import nltk
from gensim import corpora, models
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize

In [2]:
# Input / output file paths
TEXT_FILE = "review_copy.txt"            # raw reviews, read line by line as JSON
OUTPUT_FILE = "review_sentences.json"    # receives the processed reviews as JSON

In [3]:
# Parses the data: every non-blank line of TEXT_FILE is decoded as one JSON
# document and collected in `reviews`.
reviews = []
with open(TEXT_FILE, 'rb') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line at the end of the file);
        # json.loads would reject them with a JSONDecodeError.
        if not line.strip():
            continue
        reviews.append(json.loads(line))

In [4]:
# Cleans every review and keeps only the English ones with the expected number
# of ratings.
#
# The survivors are collected in a new list instead of calling
# reviews.remove() inside the loop: removing from a list while iterating over
# it makes the loop skip the element that follows each removal, and a review
# failing both checks used to be removed twice (ValueError on the second call).
UNUSED_KEYS = ('title', 'author', 'date_stayed', 'offering_id',
               'num_helpful_votes', 'date', 'id', 'via_mobile')
EXPECTED_NUM_RATINGS = 7


def _is_english(text):
    """Return True when langdetect classifies `text` as English.

    Text that langdetect cannot analyse (it raises LangDetectException, e.g.
    for text without usable features) is treated as not English.
    """
    # NOTE(review): per the langdetect README, detection can vary between runs
    # on short/ambiguous text unless DetectorFactory.seed is set - consider
    # seeding it for reproducible filtering.
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False


kept_reviews = []
for review in reviews:
    # Drop the metadata fields that are not used afterwards
    for key in UNUSED_KEYS:
        review.pop(key, None)
    review['text'] = review['text'].lower()
    # Cheap length check first so language detection only runs when needed
    if len(review['ratings']) == EXPECTED_NUM_RATINGS and _is_english(review['text']):
        kept_reviews.append(review)
reviews = kept_reviews

In [11]:
# Splits the text of every review into sentences: all_reviews holds one list
# of sentence strings per review.
sentence_boundary = re.compile(r' *[\.\?!][\'"\)\]]* *')
all_reviews = [sentence_boundary.split(review['text']) for review in reviews]

In [16]:
# Tokenizes every sentence into words, keeping the review -> sentence nesting
word_tokens = [
    [word_tokenize(sentence) for sentence in review]
    for review in all_reviews
]

In [19]:
# Based on some papers we looked at, a few English stopwords that could affect
# our emotion analysis are deliberately kept instead of being filtered out.
kept_stopwords = {'no', 'not', 'didn', 'doesn', 'don', 'down', 'hasn', 'haven'}
stop_words = set(stopwords.words('english')) - kept_stopwords

# Drops the remaining stopwords from every tokenized sentence
filtered_reviews = [
    [[token for token in sentence if token not in stop_words] for sentence in review]
    for review in word_tokens
]

In [21]:
# Keeps only the tokens made up entirely of alphabetic characters
isalpha_filtered_reviews = [
    [[token for token in sentence if token.isalpha()] for sentence in review]
    for review in filtered_reviews
]

In [22]:
# Rebinds all_reviews to the cleaned tokens (review -> sentence -> tokens);
# from here on it no longer holds the raw sentence strings.
all_reviews = isalpha_filtered_reviews

In [26]:
# POS-tagging and lemmatization
# Maps the universal POS tags onto the WordNet POS constants the lemmatizer accepts.
un2wn_mapping = {"VERB" : wn.VERB, "NOUN" : wn.NOUN, "ADJ" : wn.ADJ, "ADV" : wn.ADV}

# A single lemmatizer instance is reused for every token instead of building a
# new one per word.
lemmatizer = nltk.WordNetLemmatizer()

# docs mirrors all_reviews: review -> sentence -> lower-cased lemmas
docs = []
for review in all_reviews:
    lemmatized_review = []
    for sentence in review:
        lemmatized_review.append([
            # Tags without a WordNet counterpart fall back to the noun POS,
            # which is the lemmatizer's default when no pos is given.
            lemmatizer.lemmatize(word, pos=un2wn_mapping.get(tag, wn.NOUN)).lower()  # case insensitive
            for word, tag in nltk.pos_tag(sentence, tagset="universal")
        ])
    docs.append(lemmatized_review)

In [27]:
# Serializes the lemmatized reviews to OUTPUT_FILE as JSON
with open(OUTPUT_FILE, mode='w') as out_file:
    json.dump(docs, fp=out_file)