In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import plotly.graph_objects as go
import plotly.express as px
from itertools import islice
import matplotlib.lines as mlines
import seaborn as sns
import matplotlib.pyplot as plt

import datetime as dt
import re

#### Loading the dataset

In [None]:
# The reviews are split across nine numbered JSON files. Shard k is read into a
# module-level DataFrame named df<k> (df1 ... df9).
review_file_stem = '/Users/prajwalkhot/Documents/IDS/Project/Yelp/yelp_dataset/Review_files/yelp_academic_dataset_review'
for i in tqdm(range(1, 10)):
    url = f'{review_file_stem}{i}.json'
    globals()[f'df{i}'] = pd.read_json(url)

In [None]:
# Stack the nine shard frames into a single reviews table.
# ignore_index=True renumbers the rows 0..n-1. Without it each shard keeps its own
# row labels, so labels can repeat across shards and a label lookup such as
# series[0] may return several rows instead of one.
frames = [df1, df2, df3, df4, df5, df6, df7, df8, df9]
reviews = pd.concat(frames, ignore_index=True)
reviews

In [None]:
# Keep only the columns used in the analysis and derive a monthly period column.
review_columns = ['review_id', 'date', 'stars', 'text', 'cool', 'funny', 'useful', 'business_id', 'user_id']

# Parse the dates explicitly so the cell can be re-run: after the first run 'date'
# holds plain date objects and the .dt accessor would no longer be available.
review_dates = pd.to_datetime(reviews['date'])

# assign() returns a new frame, so nothing is written into a slice of the original
# (avoids SettingWithCopyWarning). 'date' is replaced in place; 'month' is appended last.
reviews = reviews[review_columns].assign(
    month=review_dates.dt.to_period('M'),
    date=review_dates.dt.date,
)
reviews

In [None]:
# Number of reviews per star rating, then the same counts expressed as a
# percentage of all rows in the reviews table.
review_stars = reviews['stars'].value_counts()
review_stars_percent = review_stars / len(reviews) * 100
review_stars_percent

In [None]:
# Bar chart of the percentage of reviews that received each star rating.
star_chart_layout = dict(
    width=500,
    xaxis_title="Stars",
    yaxis_title="Percentage of reviews",
)
fig = px.bar(x=review_stars_percent.index, y=review_stars_percent)
fig.update_layout(**star_chart_layout)
fig.show()

#### Counting reviews per month (month and year extracted from the date)

In [None]:
# Count the non-null entries of every column per month, move the month out of the
# index into a regular column, and render it as text so it can be used as an axis label.
review_monthly = (
    reviews
    .groupby(['month'])
    .count()
    .reset_index()
    .assign(month=lambda monthly_counts: monthly_counts['month'].astype(str))
)
review_monthly

In [None]:
# Line chart of how many reviews were written in each month.
# The axis titles describe what is actually plotted (month vs. review count);
# the previous titles were carried over from the star-rating bar chart.
fig = px.line(x=review_monthly['month'], y=review_monthly['review_id'])
fig.update_layout(
    xaxis_title="Month",
    yaxis_title="Number of reviews",
)
fig.show()

## 3 Text Mining
#### Removing Stopwords

In [None]:
from nltk.corpus import stopwords
# ENGLISH_STOP_WORDS is publicly exported from sklearn.feature_extraction.text.
# The underscore-prefixed _stop_words module is private and its path may change
# between scikit-learn releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Union of NLTK's and scikit-learn's English stop words, plus a few extra words
# that are excluded from this analysis.
my_stop_words = (
    set(stopwords.words('english'))
    | set(ENGLISH_STOP_WORDS)
    | {'super', 'duper', 'place'}
)

In [None]:
from wordcloud import WordCloud

# Concatenate all the reviews into one single string.
full_text = ' '.join(reviews['text'])

# Build the cloud with the custom stop words removed; generate() returns the
# WordCloud object itself, which is what gets drawn below.
word_cloud = WordCloud(background_color='white', stopwords=my_stop_words)
cloud_no_stopword = word_cloud.generate(full_text)

# Render the cloud as an image without axes.
plt.imshow(cloud_no_stopword, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Work on an independent (deep) copy so the text-mining steps do not modify `reviews`.
reviews_sample = reviews.copy(deep=True)

#### Tokenizing

In [None]:
from nltk import FreqDist
from nltk.tokenize import word_tokenize

# Join every review into one corpus string, lower-case it, and split it into word tokens.
full_text = ' '.join(reviews_sample['text'])
lower_full_text = full_text.lower()
word_tokens = word_tokenize(lower_full_text)

# Container for the cleaned tokens.
tokens = []

In [None]:
# Keep only purely alphabetic tokens that are not stop words.
# The list is rebuilt from scratch, so re-running this cell does not append duplicates.
tokens = [
    word
    for word in tqdm(word_tokens)
    if word.isalpha() and word not in my_stop_words
]

# Build the frequency distribution once, after filtering. Rebuilding it for every
# accepted token made the loop quadratic in the number of tokens.
token_dist = FreqDist(tokens)

# The 20 most common tokens with their counts.
dist = pd.DataFrame(token_dist.most_common(20), columns=['Word', 'Frequency'])

In [None]:
# Preview the first few cleaned tokens; echoing the whole list would flood the cell output.
tokens[:20]

In [None]:
# Display the table of the 20 most common filtered tokens and their counts.
dist

#### Stemming

In [None]:
from nltk.stem import PorterStemmer

# Reduce every cleaned token to its Porter stem.
porter = PorterStemmer()
stemmed_tokens = list(map(porter.stem, tokens))

# Frequency distribution of the stems and a table of the 20 most common ones.
stemmed_token_dist = FreqDist(stemmed_tokens)
top_stems = stemmed_token_dist.most_common(20)
stemmed_dist = pd.DataFrame(top_stems, columns=['Word', 'Frequency'])

In [None]:
# Display the frequency distribution of the stemmed tokens.
stemmed_token_dist

#### Creating bi-grams

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# scikit-learn validates stop_words as 'english', a list, or None, so the
# stop-word set is converted to a list before it is passed in.
vect = CountVectorizer(stop_words=list(my_stop_words), ngram_range=(2, 2))
bigrams = vect.fit_transform(reviews_sample['text'])

# get_feature_names() was removed in scikit-learn 1.2; use the replacement when it
# exists and fall back to the old method only on releases that predate it.
if hasattr(vect, 'get_feature_names_out'):
    bigram_names = vect.get_feature_names_out()
else:
    bigram_names = vect.get_feature_names()

# Sum each bigram column directly on the sparse matrix. Converting it to a dense
# array first allocates documents x bigrams cells and can exhaust memory.
bigram_counts = np.asarray(bigrams.sum(axis=0)).ravel()

# The 20 most frequent bigrams, most frequent first.
bigram_frequency = (
    pd.DataFrame({'bigram': bigram_names, 'frequency': bigram_counts})
    .sort_values(by='frequency', ascending=False)
    .head(20)
)

In [None]:
# Display the 20 most frequent bigrams.
bigram_frequency

In [None]:
from nltk.tokenize import sent_tokenize

# Join all review texts into one long string and split it into sentences.
good_reviews = ' '.join(reviews_sample.text)
sentences_good = sent_tokenize(good_reviews)

# For each sentence, pull out runs of ASCII letters/hyphens, lower-case them,
# and drop the stop words. The result is one token list per sentence.
good_token_clean = [
    [
        lowered
        for lowered in (raw_word.lower() for raw_word in re.findall(r'[A-Za-z\-]+', sentence))
        if lowered not in my_stop_words
    ]
    for sentence in tqdm(sentences_good)
]

In [None]:
# Preview the start of the concatenated review text; echoing the whole string
# would write the entire corpus into the cell output.
good_reviews[:1000]

In [None]:
# Preview the token lists of the first few sentences instead of echoing all of them.
good_token_clean[:5]

#### Word2Vec method

In [None]:
from gensim.models import Word2Vec

# Train word embeddings on the per-sentence token lists
# (sg=0, window of 10 words, every word kept via min_count=1, 4 worker threads).
w2v_settings = dict(window=10, min_count=1, workers=4, sg=0)
model_ted = Word2Vec(sentences=good_token_clean, **w2v_settings)

# Ten most probable output words given 'service' as the context word.
model_ted.predict_output_word(['service'], topn=10)

In [None]:
# Load SentimentIntensityAnalyzer
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the VADER lexicon used by the analyzer.
nltk.download('vader_lexicon')

# Instantiate new SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Generate sentiment scores: one score dictionary per review, then keep its
# 'compound' entry as the review's sentiment value.
sentiment_scores = reviews_sample['text'].apply(sid.polarity_scores)
sentiment = sentiment_scores.map(lambda scores: scores['compound'])
# monthly_sentiment = sentiment.resample('M').mean()

In [None]:
# Display the per-review score dictionaries returned by the sentiment analyzer.
sentiment_scores

In [None]:
# Display the 'compound' score extracted for each review.
sentiment

In [None]:
# Display the working copy of the reviews table.
reviews_sample

In [None]:
# Inspect the type of the first sentiment value. .iloc selects by position;
# sentiment[0] is a label lookup and returns a Series instead of a single score
# when the row label 0 occurs more than once.
type(sentiment.iloc[0])

In [None]:
# Label every review by the sign of its compound sentiment score:
# 1 for scores >= 0, -1 for scores < 0.
# The column is created in one vectorised, positional assignment. The previous
# row-by-row loop wrote through reviews_sample['label'][i], which fails when the
# 'label' column does not exist yet and relies on chained assignment otherwise.
reviews_sample['label'] = np.where(sentiment >= 0, 1, -1)

In [None]:
# Display the reviews table after the sentiment labelling step.
reviews_sample