#### Data from [Yelp Dataset JSON](https://www.yelp.com/dataset)

In [1]:
# Import dependencies
import pandas as pd
import json

# Import NLTK
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 


In [2]:
# Load the Yelp review dump, which is read here as one JSON object per line
# The `with` block guarantees the file handle is closed even if a line fails to parse
with open('../../Data/archive/yelp_academic_dataset_review.json', encoding = 'utf8') as data_file:
    # Skip blank lines (e.g. a trailing newline) so json.loads does not raise on them
    data = [json.loads(line) for line in data_file if line.strip()]

# Build the DataFrame once from the complete list of records
review_df = pd.DataFrame(data)

In [None]:
# review_df.head()

In [22]:
# Inspect which columns the loaded review records provide
review_df.columns

Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'date'],
      dtype='object')

In [3]:
# Keep only reviews dated 1 June 2019 00:00:00 or later
# (the cutoff is a string, so this comparison relies on the 'YYYY-MM-DD HH:MM:SS' layout sorting chronologically)
review_df = review_df.loc[review_df['date'].ge('2019-06-01 00:00:00')]

In [4]:
# Order the reviews chronologically (ascending by date)
review_df = review_df.sort_values('date')

In [5]:
# Preview the first rows of the filtered, date-sorted reviews
review_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
5220767,YNt_oiBLUSbmVgrEzgVhqQ,m6pbhQsplRN__dWGfSDiuw,wP2ok4O0GsR9td7Iiv1zKA,5.0,0,0,0,My friend and I were searching for a quick and...,2019-06-01 00:00:27
1878123,_AnR2n91AwsB0XPlkVNuFQ,qDmEz_StNWi9ZF17h3olRg,VQcCL9PiNL_wkGf-uF3fjg,5.0,0,0,0,This place was awesome! From the wait staff to...,2019-06-01 00:00:37
4166158,pO6s7ZVlCAGGRBQwPkTD5Q,vzymudSlj6Gpk_jjHrz1gA,DpgizymrlpkAc-dAnFBWFQ,5.0,1,0,2,I never write a review but wanted to make sure...,2019-06-01 00:00:40
3201198,l2Gx9_IttTrkIH370QdjEQ,YSW26aHwfMTy2KscDq-ODQ,gP_oWJykA2RocIs_GurKWQ,5.0,0,0,0,This place is so good. We had never been there...,2019-06-01 00:00:44
3896993,rq7PtoOMU2ELcsJtYLmjDw,9gZ4R3EHF__2S9gSRHdr3A,sKDqswbR_bwxLueSOHqqNA,1.0,0,0,0,Horrible service. We ordered at 6 pm through t...,2019-06-01 00:01:00


In [6]:
# Build the set of English stop words from the NLTK stopwords corpus
stop_words = {*stopwords.words('english')}

In [7]:
# Keep only negative reviews (fewer than 3 stars, so 3-star reviews are dropped too), then renumber the rows
# reset_index() without drop=True keeps the previous row labels in a new 'index' column
bad_review_df = review_df.loc[review_df['stars'].lt(3.0)].reset_index()

In [None]:
# new_text = nltk.Text(test)

In [None]:
# new_text.concordance('artichoke')

In [18]:
# Drop unnecessary columns
# Reassigning (instead of inplace = True) together with errors = 'ignore' makes this cell safe to re-run:
# a second run finds the columns already gone and leaves the frame unchanged instead of raising KeyError
bad_review_df = bad_review_df.drop(
    columns = ['review_id', 'user_id', 'useful', 'funny', 'cool'],
    errors = 'ignore',
)

446108

In [15]:
# Show the row at integer position 10000; passing a list to iloc keeps the result a one-row DataFrame
bad_review_df.iloc[[10000]]

10000    Everything went great with the oil change up u...
Name: text, dtype: object

In [None]:
# Take only the review text column
reviews = bad_review_df.loc[:, 'text']


In [None]:
# Spot-check one review: the integer key selects by index label (the index was renumbered by reset_index())
reviews[12]

In [None]:
# Define function to ensure each word is lemmatized as the proper part of speech
def lemma_pos(text):
    """Tokenize ``text`` and lemmatize every token according to its part of speech.

    Each token is tagged with ``nltk.pos_tag``. The first letter of the tag
    selects the WordNet category handed to the lemmatizer:
    ``J`` -> adjective, ``V`` -> verb, ``R`` -> adverb, anything else -> noun.

    Parameters
    ----------
    text
        Text to process; it is passed straight to ``nltk.word_tokenize``.

    Returns
    -------
    list
        One lemma per token, in token order.
    """
    # Dependencies
    from nltk import pos_tag, word_tokenize
    from nltk.corpus import wordnet as wn
    from nltk.stem.wordnet import WordNetLemmatizer

    # First letter of the POS tag -> WordNet part of speech
    pos_by_initial = {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV}

    lemmatizer = WordNetLemmatizer()
    tagged_tokens = pos_tag(word_tokenize(text))

    # Tags whose first letter is not in the table are lemmatized as nouns
    return [
        lemmatizer.lemmatize(word, pos_by_initial.get(tag[0], wn.NOUN))
        for word, tag in tagged_tokens
    ]

In [None]:
# Defining function to word tokenize and lemmatize each review, then check for sense words

def create_word_list(review):
    """Tokenize one review, drop English stop words, and lemmatize what remains.

    Parameters
    ----------
    review
        Text of a single review; it is passed to ``word_tokenize``.

    Notes
    -----
    Work in progress: the sense-word check is not written yet, so as it
    stands the function computes the lemmas but does not return anything.
    """
    # Word tokenize review
    review_words = word_tokenize(review)

    # Filter out stop words (case-insensitive via casefold()).
    # Uses the notebook-level `stop_words` set instead of re-reading the corpus on every call.
    filtered_words = [
        word for word in review_words if word.casefold() not in stop_words
    ]

    # Lemmatize filtered words.
    # lemma_pos() tokenizes a string itself, so the kept tokens are re-joined before the call.
    # NOTE(review): POS tags are inferred from the stop-word-free sequence — confirm that is intended.
    lemmas = lemma_pos(' '.join(filtered_words))

    # Check for sense words
    
    
    