**yelp_academic_dataset_review.json**
Contains full review text data including the user_id that wrote the review and the business_id the review is written for.


In [None]:
!conda update -y tqdm 

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/ahfeng/anaconda3/envs/bigdata

  added / updated specs:
    - tqdm


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    attrs-21.2.0               |     pyhd3eb1b0_0          46 KB
    bleach-4.0.0               |     pyhd3eb1b0_0         113 KB
    importlib-metadata-4.8.2   |   py39h06a4308_0          39 KB
    importlib_metadata-4.8.2   |       hd3eb1b0_0          12 KB
    jinja2-3.0.2               |     pyhd3eb1b0_0         110 KB
    jsonschema-3.2.0           |     pyhd3eb1b0_2          47 KB
    jupyter_client-7.0.6       |     pyhd3eb1b0_0          90 KB
    jupyter_core-4.9.1         |   py39h06a4308_0          75 KB
    markupsafe-2.0.1           |   py39h27cfd23_0          22 KB
    mistune-0.8.4              |py39h27cfd23_1000          57 KB
  

```
{
    // string, 22 character unique review id
    "review_id": "zdSx_SD6obEhz9VrW9uAWA",

    // string, 22 character unique user id, maps to the user in user.json
    "user_id": "Ha3iJu77CxlrFm-vQRs_8g",

    // string, 22 character business id, maps to business in business.json
    "business_id": "tnhfDv5Il8EaGSXZGiuQGg",

    // integer, star rating
    "stars": 4,

    // string, date formatted YYYY-MM-DD
    "date": "2016-03-09",

    // string, the review itself
    "text": "Great place to hang out after work: the prices are decent, and the ambience is fun. It's a bit loud, but very lively. The staff is friendly, and the food is good. They have a good selection of drinks.",

    // integer, number of useful votes received
    "useful": 0,

    // integer, number of funny votes received
    "funny": 0,

    // integer, number of cool votes received
    "cool": 0
}
```

# Spark Setup 

The following blocks of code will set up our Spark connection. 

In [None]:
# Spark setup
%%capture
!apt install libkrb5-dev
!wget https://downloads.apache.org/spark/spark-3.0.1/spark-3.0.1-bin-hadoop3.2.tgz
!tar xf spark-3.0.1-bin-hadoop3.2.tgz
!pip install findspark
!pip install sparkmagic
!pip install pyspark
!pip install spacy
! pip install pyspark --user
! pip install seaborn --user
! pip install plotly --user
! pip install imageio --user
! pip install folium --user

In [None]:
%%capture
!apt update
!apt install gcc python-dev libkrb5-dev

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

import os

spark = SparkSession.builder.appName('bigdata').getOrCreate()


21/12/12 14:34:28 WARN Utils: Your hostname, lambda-quad resolves to a loopback address: 127.0.1.1; using 10.103.79.83 instead (on interface wlp69s0)
21/12/12 14:34:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/12 14:34:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
%load_ext sparkmagic.magics

In [None]:
!pip install pymongo

Collecting pymongo
  Downloading pymongo-4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (459 kB)
[K     |████████████████████████████████| 459 kB 7.0 MB/s eta 0:00:01
[?25hInstalling collected packages: pymongo
Successfully installed pymongo-4.0


In [None]:
import numpy as np
import pandas as pd
import matplotlib

#misc
import gc
import time
import warnings
from IPython.display import Image as I

#NLP
import spacy

#graph section
import pandas as pd
import numpy as np

# JSON parsing
import json

# Pandas SQL
!pip install pandasql
import pandasql as ps

# HTML parsing

import urllib

# SQLite RDBMS
import sqlite3

# SparkFiles
from pyspark import SparkFiles

# NoSQL DB
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError, OperationFailure

import os
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop3.2"
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
import pyspark
from pyspark.sql import SQLContext

# Reuse the SparkSession created in an earlier cell when there is one;
# otherwise (fresh kernel, or the name was reset to None) start a new session.
try:
    spark
except NameError:
    spark = None

if spark is None:
    spark = SparkSession.builder.appName('Initial').getOrCreate()

# Bind sqlContext unconditionally: previously it was only defined on the path that
# created a new session, so it stayed undefined whenever `spark` already existed.
# SQLContext expects the SparkContext first; the session is passed separately.
sqlContext = SQLContext(spark.sparkContext, sparkSession=spark)




# Downloading Yelp Data
We stored the Yelp review dataset in Google Drive. The following 2 cells will download and unzip the data for use in Colab. 

In [None]:
from google_drive_downloader import GoogleDriveDownloader as gdd

gdd.download_file_from_google_drive(file_id='1wKAbVYj_OUImwzcYPYrdenVekBpkaZzY',
                                    dest_path='/content/yelp_dataset.tgz')

Downloading 1wKAbVYj_OUImwzcYPYrdenVekBpkaZzY into /content/yelp_dataset.tgz... 

In [None]:
!tar -xf yelp_dataset.tgz

#Preprocessing and Loading the Yelp Review Dataset into Spark


In [None]:
#Create spark dataframes
yelp_review_sdf = spark.read.json('yelp_academic_dataset_review.json')
business_sdf = spark.read.json('yelp_academic_dataset_business.json')
user_sdf = spark.read.json('yelp_academic_dataset_user.json')

21/12/12 14:34:42 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [None]:
#Check types of data in the spark dataframe
print(yelp_review_sdf.dtypes)
print(business_sdf.dtypes)
print(user_sdf.dtypes)

[('business_id', 'string'), ('cool', 'bigint'), ('date', 'string'), ('funny', 'bigint'), ('review_id', 'string'), ('stars', 'double'), ('text', 'string'), ('useful', 'bigint'), ('user_id', 'string')]
[('address', 'string'), ('attributes', 'struct<AcceptsInsurance:string,AgesAllowed:string,Alcohol:string,Ambience:string,BYOB:string,BYOBCorkage:string,BestNights:string,BikeParking:string,BusinessAcceptsBitcoin:string,BusinessAcceptsCreditCards:string,BusinessParking:string,ByAppointmentOnly:string,Caters:string,CoatCheck:string,Corkage:string,DietaryRestrictions:string,DogsAllowed:string,DriveThru:string,GoodForDancing:string,GoodForKids:string,GoodForMeal:string,HairSpecializesIn:string,HappyHour:string,HasTV:string,Music:string,NoiseLevel:string,Open24Hours:string,OutdoorSeating:string,RestaurantsAttire:string,RestaurantsCounterService:string,RestaurantsDelivery:string,RestaurantsGoodForGroups:string,RestaurantsPriceRange2:string,RestaurantsReservations:string,RestaurantsTableService:s

In [None]:
#Check sizes spark dataframes
review_size = yelp_review_sdf.count()
business_size = business_sdf.count()
user_size = user_sdf.count()

                                                                                

In [None]:
#Print sizes 
print(review_size)
print(business_size)
print(user_size)

8635403
160585
2189457


In [None]:
#Create views of the dataframes

#business_id and user_id will be used to join with other dataframes
#text and stars will be used for sentiment analysis
#date will be used in EDA
yelp_review_sdf.createOrReplaceTempView('yelp')
query = '''SELECT text, stars, business_id, date, user_id
FROM yelp
'''
#Keep only the review columns selected above
filtered_yelp_review_sdf = spark.sql(query)

business_sdf.createOrReplaceTempView('business')
query = '''SELECT business_id, name, state, review_count, stars as business_stars, categories
FROM business
'''
#Keep only the business columns selected above (stars is renamed to business_stars)
filtered_business_sdf = spark.sql(query)

user_sdf.createOrReplaceTempView('user')
query = '''SELECT user_id, review_count, average_stars
FROM user
'''
#Keep only the user columns selected above
filtered_user_sdf = spark.sql(query)

In [None]:
#Drop NA's from dataframes 
filtered_yelp_review_sdf = filtered_yelp_review_sdf.dropna()
filtered_business_sdf = filtered_business_sdf.dropna()
filtered_user_sdf = filtered_user_sdf.dropna()

#Visualization 1 (Review Dataset)
Let's check the distribution of stars in the dataset

In [None]:
#Group by stars
# Uses filtered_yelp_review_sdf (the NA-dropped review frame built above);
# the previously referenced cleaned_yelp_review_sdf is never defined in this notebook.
star_distribution = filtered_yelp_review_sdf.groupBy('stars').count()
star_distribution.show()

NameError: name 'cleaned_yelp_review_sdf' is not defined

In [None]:
#Convert to pandas to create barplot
star_distribution_df = star_distribution.toPandas()

In [None]:
#Scale counts to millions
star_distribution_df['count']  = star_distribution_df['count'] / 1000000

In [None]:
#Check mean value of stars 
from pyspark.sql.functions import mean as _mean, col

# Aggregate over filtered_yelp_review_sdf (the NA-dropped review frame);
# cleaned_yelp_review_sdf is never defined in this notebook.
stars_mean = filtered_yelp_review_sdf.select(
    _mean(col('stars')).alias('mean')).collect()

# collect() returns a list of Rows; the single aggregate lives in the first one.
mean = stars_mean[0]['mean']
print("Mean stars for a review: ", mean)

##Distribution of Stars in Yelp Review

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

dist = sns.barplot(x = star_distribution_df['stars'], y = star_distribution_df['count'], color="salmon", saturation=.5)
dist.set(xlabel='Star Ratings', ylabel='Count (in millions)', title="Distribution of Stars in Yelp Reviews")
plt.show()

Above we see that users are almost twice as likely to leave a 5-star review as they are to leave the next most popular rating (4 stars).

#Visualization 2 (Review Dataset)

Let's check the age of the data to get a sense of when these reviews were written


###Reviews Per Month


In [None]:
filtered_yelp_review_sdf.createOrReplaceTempView("table")
query = """
SELECT stars, TO_DATE(date) as date
FROM table
"""

review_date_sdf = spark.sql(query)

review_date_sdf.createOrReplaceTempView("dates")
query = """
SELECT MONTH(date) as month, YEAR(date) as year, COUNT(stars) as num_reviews
FROM dates
GROUP BY MONTH(date), YEAR(date)
"""

dates_sdf = spark.sql(query)

In [None]:
dates_sdf.show()

In [None]:
#Convert to pandas
dates_df = dates_sdf.toPandas()

In [None]:
#Convert to datetime
dates_df['date'] = pd.to_datetime(dates_df[['year', 'month']].assign(DAY=1))

Let's see the date of the first and last review in the dataset

In [None]:
first_review_date = min(dates_df['date'])
last_review_date = max(dates_df['date'])
print(first_review_date)
print(last_review_date)

In [None]:
# PLOTTING reviews per month 
import matplotlib.pyplot as plt
import seaborn as sns

# The GROUP BY result comes back in no particular order, and the bars are drawn in
# the order the rows appear. Sort chronologically first so that the (sorted) tick
# labels below actually line up with the bars they describe.
monthly_df = dates_df.sort_values('date')
x_dates = monthly_df['date'].dt.strftime('%Y-%m-%d').unique()

fig, ax = plt.subplots(figsize = (20,12))
sns.barplot(x = 'date', y = "num_reviews", data = monthly_df,
            estimator = sum, ci = None, ax=ax)

# Bars sit at integer positions 0..n-1; label every 6th month to keep the axis readable.
ax.set_xticks(range(0, len(x_dates), 6))
ax.set_xticklabels(x_dates[::6], rotation=45, ha='right')
ax.set(xlabel='Month', ylabel='Number of reviews', title='Reviews per Month')
plt.show()

Hard to see trends in the data when plotting per month. Let's check out the reviews per year.

###Reviews Per Year

In [None]:
review_date_sdf.createOrReplaceTempView("dates")
query = """
SELECT YEAR(date) as year, COUNT(stars) as num_reviews
FROM dates
GROUP BY YEAR(date)
"""

date_year_sdf = spark.sql(query)

date_year_df = date_year_sdf.toPandas()

In [None]:
#Convert to datetime
# Build the date from date_year_df's own 'year' column. The monthly dates_df has a
# different row order and length, so assigning from it would align by index and
# attach the wrong year (or NaT) to each row.
date_year_df['date'] = pd.to_datetime(date_year_df[['year']].assign(MONTH=1,DAY=1))

In [None]:
#Generate plot 
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(figsize = (20,12))    
fig = sns.barplot(x = 'year', y = "num_reviews", data = date_year_df, 
                   ax=ax)

Here we get a better idea of the timeframe the reviews are coming from. 2020 is an outlier because of the Covid-19 outbreak, and data was only partially recorded for 2021.

**TODO:** Add another graph from the user data (e.g., check the distribution of `yelping_since`).


#Visualization 3 (User Dataset)

Plotting this distribution ended up not working because of extreme outliers: most users left fewer than 100 reviews, while some left as many as 15,000. We show summary statistics instead.

In [None]:
#Check the distribution of users's contributions to the dataset
review_count_df = filtered_user_sdf.select('review_count').toPandas()

In [None]:
#Summary of the df
review_count_df.describe()

#Visualization 4 (Business Dataset)
Let's see where our data is coming from 

##Reviews per State

In [None]:
filtered_business_sdf.createOrReplaceTempView('bus')

# Each row of `bus` is one business carrying its own review_count, so the number of
# reviews per state is the SUM of those counts. COUNT(review_count) would instead
# return the number of businesses per state.
query = """
SELECT state, SUM(review_count) as review_count
FROM bus
GROUP BY state
"""

state_sdf = spark.sql(query)
state_df = state_sdf.toPandas()

In [None]:
import plotly.express as px  # Be sure to import express
fig = px.choropleth(state_df,  # Input Pandas DataFrame
                    locations="state",  # DataFrame column with locations
                    color="review_count",  # DataFrame column with color values
                    hover_name="state", # DataFrame column hover info
                    locationmode = 'USA-states') # Set to plot as US States
fig.update_layout(
    title_text = 'Reviews per State', # Create a Title
    geo_scope='usa',  # Plot only the USA instead of globe
)
fig.show()  # Output the plot to the screen

##Distribution of Star Counts for Businesses


In [None]:
filtered_business_sdf.createOrReplaceTempView('bus')

query = """
SELECT COUNT(business_id) as count, business_stars
FROM bus
GROUP BY business_stars
"""

business_stars_df = spark.sql(query).toPandas()

In [None]:
sns.barplot(data=business_stars_df, x = 'business_stars', y = 'count')

#Good Business

##20 Most Reviewed Businesses



In [None]:
filtered_business_sdf.createOrReplaceTempView('bus')

query = """
SELECT name, review_count as num_reviews, business_stars
FROM bus
ORDER BY review_count DESC
LIMIT 20
"""
most_reviewed_business_df = spark.sql(query).toPandas()

In [None]:
most_reviewed_business_df

##20 Most Reviewed Businesses with a 5 star rating

In [None]:
filtered_business_sdf.createOrReplaceTempView('bus')

query = """
SELECT name, review_count as num_reviews, business_stars, categories
FROM bus
WHERE business_stars = 5
ORDER BY review_count DESC
LIMIT 20
"""
most_popular_business_df = spark.sql(query).toPandas()

In [None]:
most_popular_business_df

##Popular Categories of the Top Rated Businesses in the Dataset

###Cleaning and Processing Categories 

In [None]:
#Explode the comma-separated categories column into one row per (business name, category)
mpb_df = pd.DataFrame(most_popular_business_df.categories.str.split(',').tolist(), index=most_popular_business_df.name).stack()
mpb_df = mpb_df.reset_index()[[0, 'name']] # after reset_index the stacked category values sit in the column labeled 0
mpb_df.columns = ['category', 'name'] # give the two columns descriptive names
mpb_df = mpb_df.astype(str)
mpb_df

In [None]:
#Merge categories and popular businesses
most_popular_categories_df = mpb_df.merge(most_popular_business_df, on = 'name')

In [None]:
#Drop Unnecessary column 
most_popular_categories_df = most_popular_categories_df.drop(['categories'], axis = 1)

In [None]:
#Strip Whitespace
most_popular_categories_df['category'] = most_popular_categories_df['category'].str.strip()

###10 Most Popular Categories Among the 20 Most Reviewed Businesses with 5 Star Ratings:

In [None]:
#get top 10 most popular categories
n = 10
most_popular_categories_df['category'].value_counts()[:n]

#Bad for Business
Let's see what makes a business bad

##20 Most Reviewed Businesses with 1.5 Stars or Fewer

In [None]:
filtered_business_sdf.createOrReplaceTempView('bus')

query = """
SELECT name, review_count as num_reviews, business_stars, categories
FROM bus
WHERE business_stars <= 1.5
ORDER BY review_count DESC
LIMIT 20
"""
least_popular_business_df = spark.sql(query).toPandas()

In [None]:
least_popular_business_df

##Popular Categories of the Worst Rated Businesses

###Cleaning and Processing Categories


In [None]:
#Explode the comma-separated categories column into one row per (business name, category)
lpb_df = pd.DataFrame(least_popular_business_df.categories.str.split(',').tolist(), index=least_popular_business_df.name).stack()
lpb_df = lpb_df.reset_index()[[0, 'name']] # after reset_index the stacked category values sit in the column labeled 0
lpb_df.columns = ['category', 'name'] # give the two columns descriptive names
lpb_df = lpb_df.astype(str)
lpb_df

In [None]:
#Merge each exploded category row with the details of the worst-rated business it belongs to
#NOTE(review): this reassigns least_popular_business_df, so the cell is not idempotent when re-run;
#businesses that share a name will also match each other's rows in a merge keyed on 'name'
least_popular_business_df = lpb_df.merge(least_popular_business_df, on = 'name')

In [None]:
#Drop columns 
least_popular_business_df = least_popular_business_df.drop(['categories'], axis = 1)

In [None]:
#Strip whitespace 
least_popular_business_df['category'] = least_popular_business_df['category'].str.strip()

In [None]:
least_popular_business_df

###10 Most Common Categories Among the 20 Most Reviewed Businesses with 1.5 Stars or Fewer:

In [None]:
# get the 10 most common categories among the worst-rated businesses
n = 10
least_popular_business_df['category'].value_counts()[:n]

Looks like these are the categories of larger corporations. Also seems like travel related businesses tend to get the worst ratings. 

In [None]:
sentiment_query = '''SELECT text, stars
FROM yelp
WHERE stars<>3
'''
yelp_sdf = spark.sql(sentiment_query)

In [None]:
#Added
#Sample here to work in pandas for rest of notebook
sampled_sdf = yelp_sdf.sample(fraction=.05,seed=69)
yelp_df = sampled_sdf.toPandas()

                                                                                

In [None]:
#Added
#Lower-case every review; splitting and re-joining also collapses whitespace runs to single spaces
yelp_df['cleaned_reviews'] = yelp_df['text'].apply(
    lambda review: " ".join(word.lower() for word in review.split()))

In [None]:
#Added
#Remove punctuation from all reviews 
# regex=True is passed explicitly: the pattern is a regular expression, and pandas
# versions whose default is regex=False would treat it as a literal string and
# silently strip nothing. The raw string keeps \w and \s from being read as
# string escapes.
yelp_df['cleaned_reviews'] = yelp_df['cleaned_reviews'].str.replace(r'[^\w\s]', '', regex=True)

  yelp_df['cleaned_reviews'] = yelp_df['cleaned_reviews'].str.replace('[^\w\s]','')


In [None]:
#use spacy to tokenize
# !python -m spacy download en_core_web_sm
import spacy
nlp = spacy.load("en_core_web_sm")
# yelp_df['tokens'] = yelp_df['text'].apply(nlp)

In [None]:
yelp_df

Unnamed: 0,text,stars,cleaned_reviews
0,Was not that great. We sat for like 15 mins be...,2.0,was not that great we sat for like 15 mins bef...
1,"My brother in law suggested this place, and we...",1.0,my brother in law suggested this place and we ...
2,We were seated immediately but unfortunately i...,1.0,we were seated immediately but unfortunately i...
3,Here's a breakdown of what my boyfriend and I ...,2.0,heres a breakdown of what my boyfriend and i s...
4,Eating healthy I had the turkey burger it was ...,4.0,eating healthy i had the turkey burger it was ...
...,...,...,...
385393,King burrito is my go to Burritos when I'm nea...,5.0,king burrito is my go to burritos when im near...
385394,"Wow, such a lovely spot right off the highway ...",5.0,wow such a lovely spot right off the highway t...
385395,Where do I start with Buranko? Everything I tr...,5.0,where do i start with buranko everything i tri...
385396,Wouldn't even let a pregnant woman use the bat...,1.0,wouldnt even let a pregnant woman use the bath...


In [None]:
# text_and_stars_sdf = spark.sql(query)
# text_and_stars_sdf.dtypes

In [None]:
#View the length of our dataframe 
# text_and_stars_sdf.count()

In [None]:

# text_and_stars_sdf.show()

#Cleaning the Yelp Review Dataset

For our analysis we are only interested in the text of each review and the star rating the user gave with it.


In [None]:
#Cleaning text in reviews
#Step 1: Convert all text in each review to lower case

# from pyspark.sql import functions as F

# columnName = "text";
# cleaned_sdf = text_and_stars_sdf.withColumn("lowerCaseText", F.lower(F.col("text")));
# cleaned_sdf.show();

In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover

# #Tokenize the text 
# tokenizer = Tokenizer(inputCol="lowerCaseText", outputCol="tokens")
# tokenized = tokenizer.transform(cleaned_sdf)

# #Remove stop words 
# remover = StopWordsRemover(inputCol="tokens", outputCol="cleanTokens")
# filtered_sdf = remover.transform(tokenized)

In [None]:
# filtered_sdf.show()

***Extract text and star count columns.***

***We are only interested in positive and negative reviews for this project. We classify negative reviews as those that receive 1 or 2 star ratings and we classify positive reviews as those that receive 4 or 5 star reviews. Here we will remove reviews with 3 star reviews.***

# Feature Engineering


##Use regex to clean up the text body of each review. 

#Exploratory Data Analysis 



#Modeling

##Load the data into pandas
We load a sample of the dataset into pandas to use for modeling.

In [None]:
# sampled_sdf = filtered_sdf.sample(fraction=.05,seed=69)
# review_df = sampled_sdf.toPandas()

In [None]:
review_df = yelp_df

In [None]:
# Sentiment is the sign of (stars - 3): -1 for 1-2 stars, +1 for 4-5 stars.
review_df['sentiment'] = np.sign(review_df['stars'] - 3)

In [None]:
review_df

Unnamed: 0,text,stars,cleaned_reviews,sentiment
0,Was not that great. We sat for like 15 mins be...,2.0,was not that great we sat for like 15 mins bef...,-1.0
1,"My brother in law suggested this place, and we...",1.0,my brother in law suggested this place and we ...,-1.0
2,We were seated immediately but unfortunately i...,1.0,we were seated immediately but unfortunately i...,-1.0
3,Here's a breakdown of what my boyfriend and I ...,2.0,heres a breakdown of what my boyfriend and i s...,-1.0
4,Eating healthy I had the turkey burger it was ...,4.0,eating healthy i had the turkey burger it was ...,1.0
...,...,...,...,...
385393,King burrito is my go to Burritos when I'm nea...,5.0,king burrito is my go to burritos when im near...,1.0
385394,"Wow, such a lovely spot right off the highway ...",5.0,wow such a lovely spot right off the highway t...,1.0
385395,Where do I start with Buranko? Everything I tr...,5.0,where do i start with buranko everything i tri...,1.0
385396,Wouldn't even let a pregnant woman use the bat...,1.0,wouldnt even let a pregnant woman use the bath...,-1.0


##Afinn Baseline

In [None]:
!pip install afinn
from afinn import Afinn

Collecting afinn
  Using cached afinn-0.1.tar.gz (52 kB)
Building wheels for collected packages: afinn
  Building wheel for afinn (setup.py) ... [?25ldone
[?25h  Created wheel for afinn: filename=afinn-0.1-py3-none-any.whl size=53448 sha256=3bff2cfb4107660c9f304ae4436fd235fb2d04c9d36330cd6cb4fdf4a6948df3
  Stored in directory: /home/ahfeng/.cache/pip/wheels/79/91/ee/8374d9bc8c6c0896a2db75afdfd63d43653902407a0e76cd94
Successfully built afinn
Installing collected packages: afinn
Successfully installed afinn-0.1


In [None]:
afinn = Afinn(language = 'en')
review_df['afinn'] = review_df['text'].apply(afinn.score)
review_df

Unnamed: 0,text,stars,cleaned_reviews,sentiment,afinn
0,Was not that great. We sat for like 15 mins be...,2.0,was not that great we sat for like 15 mins bef...,-1.0,13.0
1,"My brother in law suggested this place, and we...",1.0,my brother in law suggested this place and we ...,-1.0,0.0
2,We were seated immediately but unfortunately i...,1.0,we were seated immediately but unfortunately i...,-1.0,7.0
3,Here's a breakdown of what my boyfriend and I ...,2.0,heres a breakdown of what my boyfriend and i s...,-1.0,0.0
4,Eating healthy I had the turkey burger it was ...,4.0,eating healthy i had the turkey burger it was ...,1.0,7.0
...,...,...,...,...,...
385393,King burrito is my go to Burritos when I'm nea...,5.0,king burrito is my go to burritos when im near...,1.0,15.0
385394,"Wow, such a lovely spot right off the highway ...",5.0,wow such a lovely spot right off the highway t...,1.0,22.0
385395,Where do I start with Buranko? Everything I tr...,5.0,where do i start with buranko everything i tri...,1.0,7.0
385396,Wouldn't even let a pregnant woman use the bat...,1.0,wouldnt even let a pregnant woman use the bath...,-1.0,10.0


In [None]:
review_df['afinn_clean'] = review_df['cleaned_reviews'].apply(afinn.score)

In [None]:

# Turn each AFINN score into a predicted label via its sign (-1, 0 or +1).
review_df['afinn_pred'] = np.sign(review_df['afinn'])
review_df['afinn_clean_pred'] = np.sign(review_df['afinn_clean'])
review_df

Unnamed: 0,text,stars,cleaned_reviews,sentiment,afinn,afinn_clean,afinn_pred,afinn_clean_pred
0,Was not that great. We sat for like 15 mins be...,2.0,was not that great we sat for like 15 mins bef...,-1.0,13.0,13.0,1.0,1.0
1,"My brother in law suggested this place, and we...",1.0,my brother in law suggested this place and we ...,-1.0,0.0,0.0,0.0,0.0
2,We were seated immediately but unfortunately i...,1.0,we were seated immediately but unfortunately i...,-1.0,7.0,7.0,1.0,1.0
3,Here's a breakdown of what my boyfriend and I ...,2.0,heres a breakdown of what my boyfriend and i s...,-1.0,0.0,-2.0,0.0,-1.0
4,Eating healthy I had the turkey burger it was ...,4.0,eating healthy i had the turkey burger it was ...,1.0,7.0,7.0,1.0,1.0
...,...,...,...,...,...,...,...,...
385393,King burrito is my go to Burritos when I'm nea...,5.0,king burrito is my go to burritos when im near...,1.0,15.0,15.0,1.0,1.0
385394,"Wow, such a lovely spot right off the highway ...",5.0,wow such a lovely spot right off the highway t...,1.0,22.0,22.0,1.0,1.0
385395,Where do I start with Buranko? Everything I tr...,5.0,where do i start with buranko everything i tri...,1.0,7.0,7.0,1.0,1.0
385396,Wouldn't even let a pregnant woman use the bat...,1.0,wouldnt even let a pregnant woman use the bath...,-1.0,10.0,1.0,1.0,1.0


In [None]:
print(f"uncleaned accuracy: {len(review_df[review_df['sentiment']==review_df['afinn_pred']])/len(review_df)}")
print(f"cleaned accuracy: {len(review_df[review_df['sentiment']==review_df['afinn_clean_pred']])/len(review_df)}")

uncleaned accuracy: 0.8344853891301979
cleaned accuracy: 0.836989294184194


##Word-level RNN

In [None]:
# Shuffle once with a fixed seed, then cut into 60% train / 20% validation / 20% test.
# Plain iloc slices give the same three frames as np.split did, without relying on
# NumPy's array-splitting of a DataFrame (which goes through DataFrame.swapaxes and
# triggers deprecation warnings on newer pandas).
shuffled_df = review_df.sample(frac=1, random_state=42)
train_end = int(.6*len(review_df))
val_end = int(.8*len(review_df))

train_df = shuffled_df.iloc[:train_end]
val_df = shuffled_df.iloc[train_end:val_end]
test_df = shuffled_df.iloc[val_end:]

In [None]:
!which python

/home/ahfeng/anaconda3/envs/bigdata/bin/python


In [None]:
!pip install torchtext==0.11.0
!pip install sklearn



In [None]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[K     |████████████████████████████████| 13.9 MB 5.5 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
import torch.nn as nn
from torch.autograd import Variable
import torch
import torch.nn.functional as F
from torchtext.legacy import data

In [None]:
# from https://github.com/bentrevett/pytorch-sentiment-analysis
class RNN(nn.Module):
    """Embedding -> single vanilla RNN -> linear head, applied to token-index batches."""

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        # Layers are created in the order embedding, rnn, fc.
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Map `text` ([sent len, batch size]) to one output row per batch element."""
        # embedding output: [sent len, batch size, emb dim]
        outputs, last_hidden = self.rnn(self.embedding(text))

        # outputs:     [sent len, batch size, hid dim]
        # last_hidden: [1, batch size, hid dim]
        final_state = last_hidden.squeeze(0)

        # Sanity check: the last time step of `outputs` is the final hidden state.
        assert torch.equal(outputs[-1, :, :], final_state)

        return self.fc(final_state)

In [None]:
# from https://gist.github.com/nissan/ccb0553edb6abafd20c3dec34ee8099d
class DataFrameDataset(data.Dataset):
    """torchtext dataset built from a DataFrame's `text` and `sentiment` columns."""

    def __init__(self, df, text_field, label_field, is_test=False, **kwargs):
        """Create one Example per row; the label is None when `is_test` is true."""
        fields = [('text', text_field), ('label', label_field)]
        examples = [
            data.Example.fromlist(
                [row.text, None if is_test else row.sentiment], fields)
            for _, row in df.iterrows()
        ]

        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        """Sort examples by the length of their text."""
        return len(ex.text)

    @classmethod
    def splits(cls, text_field, label_field, train_df, val_df=None, test_df=None, **kwargs):
        """Return a tuple of datasets, one per non-None frame, in train/val/test order."""
        # is_test is left at its default (False) for every split, since all of
        # them carry labels. Each dataset is built from a copy of its frame.
        return tuple(
            cls(frame.copy(), text_field, label_field, **kwargs)
            for frame in (train_df, val_df, test_df)
            if frame is not None
        )

In [None]:


# Seed PyTorch's RNG and request deterministic cuDNN kernels.
SEED = 42069

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# TEXT tokenizes reviews with spaCy's en_core_web_sm model; LABEL stores the target as a float tensor.
TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en_core_web_sm')
LABEL = data.LabelField(dtype = torch.float)

In [None]:
train_data, val_data, test_data = DataFrameDataset.splits(text_field=TEXT, label_field=LABEL, train_df=train_df, val_df=val_df, test_df=test_df)

In [None]:
print(vars(train_data.examples[0]))

{'text': ['As', 'far', 'as', 'Carl', "'s", 'Jr', 'goes', ',', 'I', "'ve", 'been', 'to', 'better', 'locations', '.', 'The', 'service', 'is', 'pretty', 'bad', 'here', 'and', 'there', "'s", 'about', 'a', '50', '%', 'chance', 'your', 'food', 'will', 'be', 'hot', 'and', 'fresh', '.', 'The', 'food', 'here', 'tastes', 'fine', ',', 'it', "'s", 'pretty', 'consistent', 'with', 'the', 'rest', 'of', 'the', 'Carl', "'s", 'Jr', 'in', 'the', 'area', '.'], 'label': -1.0}


In [None]:
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [None]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [None]:
print(TEXT.vocab.stoi['<unk>'])

0


In [None]:
print(LABEL.vocab.freqs)

Counter({1.0: 171804, -1.0: 59434})


In [None]:
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One BucketIterator per split, all with the same batch size and target device.
train_iterator, val_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, val_data, test_data), 
    batch_size = BATCH_SIZE,
    device = device)

In [None]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [None]:
def count_parameters(model):
    """Return the total element count of the model's parameters that require gradients."""
    trainable = (param for param in model.parameters() if param.requires_grad)
    return sum(param.numel() for param in trainable)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,592,105 trainable parameters


In [None]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [None]:
criterion = nn.BCEWithLogitsLoss()

In [None]:
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def binary_accuracy(preds, y):
    """
    Fraction of predictions in the batch that match `y` (e.g. 8 of 10 correct -> 0.8, not 8).

    `preds` are raw logits: they are passed through a sigmoid and rounded to the
    nearest integer before being compared with `y`.
    """
    predicted_labels = torch.sigmoid(preds).round()
    hits = predicted_labels.eq(y).float()  # float so the division below is fractional
    return hits.sum() / len(hits)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    Returns a (loss, accuracy) pair, each being the sum of the per-batch values
    divided by the number of batches.
    """
    model.train()

    batch_losses = []
    batch_accs = []

    for batch in iterator:
        optimizer.zero_grad()

        # The model's output has a size-1 dimension at index 1; drop it so the
        # predictions line up with batch.label.
        logits = model(batch.text).squeeze(1)
        loss = criterion(logits, batch.label)
        acc = binary_accuracy(logits, batch.label)

        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        batch_accs.append(acc.item())

    n_batches = len(iterator)
    return sum(batch_losses) / n_batches, sum(batch_accs) / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Score the model on `iterator` without updating its weights.

    Returns a (loss, accuracy) pair, each being the sum of the per-batch values
    divided by the number of batches.
    """
    model.eval()

    batch_losses = []
    batch_accs = []

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            batch_losses.append(criterion(logits, batch.label).item())
            batch_accs.append(binary_accuracy(logits, batch.label).item())

    n_batches = len(iterator)
    return sum(batch_losses) / n_batches, sum(batch_accs) / n_batches

In [None]:
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and whole seconds."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    # Seconds left over once the whole minutes are removed, truncated to an int.
    leftover_seconds = int(duration - whole_minutes * 60)
    return whole_minutes, leftover_seconds

In [None]:
# Number of full passes over the training data.
N_EPOCHS = 5

# Lowest validation loss seen so far; starting at +inf means any finite loss is an improvement.
best_val_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    val_loss, val_acc = evaluate(model, val_iterator, criterion)
    
    end_time = time.time()

    # The measured interval covers both the training pass and the validation pass.
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only when validation loss improves, so the file holds the best-scoring weights.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {val_loss:.3f} |  Val. Acc: {val_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 1m 32s
	Train Loss: 0.568 | Train Acc: 74.37%
	 Val. Loss: 0.675 |  Val. Acc: 57.03%
Epoch: 02 | Epoch Time: 1m 30s
	Train Loss: 0.567 | Train Acc: 74.43%
	 Val. Loss: 0.681 |  Val. Acc: 52.66%
Epoch: 03 | Epoch Time: 1m 29s
	Train Loss: 0.567 | Train Acc: 74.43%
	 Val. Loss: 0.684 |  Val. Acc: 51.20%
Epoch: 04 | Epoch Time: 1m 30s
	Train Loss: 0.567 | Train Acc: 74.43%
	 Val. Loss: 0.683 |  Val. Acc: 51.02%
Epoch: 05 | Epoch Time: 1m 30s
	Train Loss: 0.568 | Train Acc: 74.42%
	 Val. Loss: 0.683 |  Val. Acc: 50.85%


In [None]:
# Restore the checkpoint written at the lowest validation loss, then score the test split.
model.load_state_dict(torch.load('tut1-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.676 | Test Acc: 56.72%


## Improved RNN

In [None]:
import torch.nn as nn

class RNN(nn.Module):
    """
    LSTM classifier over packed, padded token sequences.

    Token indices are embedded, run through a multi-layer LSTM (optionally
    bidirectional), and the final hidden state of the top layer is projected
    by a linear layer to ``output_dim`` values (raw logits, no activation).

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: LSTM hidden size per direction.
        output_dim: size of the final linear projection.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability applied to the embeddings, between LSTM
            layers (only when ``n_layers > 1``), and to the final hidden state.
        pad_idx: vocabulary index of the padding token, passed to the
            embedding layer as ``padding_idx``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        # Inter-layer dropout only exists for stacked layers; passing a non-zero
        # value with a single layer has no effect and makes PyTorch emit a warning.
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout if n_layers > 1 else 0)
        
        # The classifier input is one hidden state per direction.
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        """
        Args:
            text: token indices, [sent len, batch size].
            text_lengths: unpadded length of each sequence in the batch. With
                the default ``enforce_sorted=True`` of ``pack_padded_sequence``
                the batch must be sorted by decreasing length.

        Returns:
            Tensor of shape [batch size, output_dim] holding raw logits.
        """
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence so the LSTM skips the padded positions
        # lengths need to be on CPU!
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
        
        # Only the final hidden state is used; the per-step outputs and the
        # cell state are discarded, so the packed output is never unpacked.
        _, (hidden, _) = self.rnn(packed_embedded)
        
        #hidden = [num layers * num directions, batch size, hid dim]
        
        if self.rnn.bidirectional:
            #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            #single direction: the last entry is the top layer's final hidden state
            hidden = hidden[-1,:,:]
        
        hidden = self.dropout(hidden)
                
        #hidden = [batch size, hid dim * num directions]
            
        return self.fc(hidden)

In [None]:
from torchtext.legacy import data

SEED = 69

# Seed PyTorch's RNG and request deterministic cuDNN kernels.
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Text field tokenized with spaCy. include_lengths makes batch.text a
# (tokens, lengths) pair, which the train/evaluate functions of this section unpack.
TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en_core_web_sm',
                  include_lengths = True)

# Labels are kept as floats, the target dtype used with BCEWithLogitsLoss.
LABEL = data.LabelField(dtype = torch.float)

In [None]:
# Build the three datasets from the train/val/test DataFrames using the fields defined for this model.
# DataFrameDataset is defined outside this section of the notebook.
train_data, val_data, test_data = DataFrameDataset.splits(text_field=TEXT, label_field=LABEL, train_df=train_df, val_df=val_df, test_df=test_df)

In [None]:
# Upper bound on the number of tokens kept in the vocabulary.
MAX_VOCAB_SIZE = 25_000

# The vocabulary is built from the training split only and loaded with 100-d GloVe vectors;
# unk_init supplies the initializer for tokens that have no pretrained vector.
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

In [None]:
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# sort_within_batch orders the examples of each batch by length; the model packs its
# input with pack_padded_sequence, which by default expects length-sorted batches.
train_iterator, val_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, val_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

In [None]:
# Hyperparameters of the LSTM classifier.
INPUT_DIM = len(TEXT.vocab)  # number of embedding rows = vocabulary size
EMBEDDING_DIM = 100  # must equal the width of the pretrained vectors copied in later
HIDDEN_DIM = 256
OUTPUT_DIM = 1  # a single logit per example
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]  # vocabulary index of the padding token

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

In [None]:
def count_parameters(model):
    """Return the total element count across the model's gradient-tracking parameters."""
    sizes = [weight.numel() for weight in model.parameters() if weight.requires_grad]
    return sum(sizes)

# Report the trainable-parameter count, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,810,857 trainable parameters


In [None]:
# Pretrained vectors attached to the vocabulary; the shape is printed as a sanity check.
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [None]:
# Overwrite the embedding layer's initial weights in place with the pretrained vectors.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.5300, -1.3035,  0.4438,  ...,  0.6058,  1.5191,  0.4225],
        [-1.0078,  0.8051,  0.7826,  ..., -0.7693, -0.1897, -0.3624],
        [-0.3398,  0.2094,  0.4635,  ..., -0.2339,  0.4730, -0.0288],
        ...,
        [ 0.8651,  1.3278, -0.7483,  ..., -2.5835,  0.2068,  0.8006],
        [ 0.4914,  0.7875,  2.1520,  ...,  1.1039, -2.2060,  0.4389],
        [ 0.4178,  1.0698, -1.3060,  ...,  1.0009,  0.9013,  0.2731]])

In [None]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Replace the embedding rows of the unknown and padding tokens with zeros.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.3398,  0.2094,  0.4635,  ..., -0.2339,  0.4730, -0.0288],
        ...,
        [ 0.8651,  1.3278, -0.7483,  ..., -2.5835,  0.2068,  0.8006],
        [ 0.4914,  0.7875,  2.1520,  ...,  1.1039, -2.2060,  0.4389],
        [ 0.4178,  1.0698, -1.3060,  ...,  1.0009,  0.9013,  0.2731]])


In [None]:
import torch.optim as optim

# Adam with its default hyperparameters over all model parameters.
optimizer = optim.Adam(model.parameters())

In [None]:
# Binary cross-entropy computed on raw logits (the sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss()

# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def train(model, iterator, optimizer, criterion):
    """
    Run one optimisation pass over ``iterator`` and return (mean loss, mean accuracy).

    ``batch.text`` is expected to be a (tokens, lengths) pair; both are passed
    to the model. Returned values are unweighted means of the per-batch loss
    and per-batch accuracy.
    """
    model.train()

    loss_total = 0
    acc_total = 0

    for batch in iterator:
        optimizer.zero_grad()

        tokens, lengths = batch.text
        logits = model(tokens, lengths).squeeze(1)

        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """
    Score ``model`` on ``iterator`` without updating it; return (mean loss, mean accuracy).

    ``batch.text`` is expected to be a (tokens, lengths) pair. The model is
    put in eval mode and gradients are disabled for the whole pass. Returned
    values are unweighted means over batches.
    """
    model.eval()

    loss_total = 0
    acc_total = 0

    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text
            logits = model(tokens, lengths).squeeze(1)

            loss_total += criterion(logits, batch.label).item()
            acc_total += binary_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [None]:
import time

def epoch_time(start_time, end_time):
    """Return the span between two second-based timestamps as (whole minutes, remaining whole seconds)."""
    span = end_time - start_time
    minutes = int(span / 60)
    return minutes, int(span - minutes * 60)

In [None]:
# Number of full passes over the training data.
N_EPOCHS = 5

# Lowest validation loss seen so far; starting at +inf means any finite loss is an improvement.
best_val_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    val_loss, val_acc = evaluate(model, val_iterator, criterion)
    
    end_time = time.time()

    # The measured interval covers both the training pass and the validation pass.
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only when validation loss improves, so the file holds the best-scoring weights.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {val_loss:.3f} |  Val. Acc: {val_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 1m 24s
	Train Loss: 0.222 | Train Acc: 90.72%
	 Val. Loss: 0.112 |  Val. Acc: 95.91%
Epoch: 02 | Epoch Time: 1m 25s
	Train Loss: 0.107 | Train Acc: 96.00%
	 Val. Loss: 0.089 |  Val. Acc: 96.86%
Epoch: 03 | Epoch Time: 1m 26s
	Train Loss: 0.085 | Train Acc: 96.84%
	 Val. Loss: 0.079 |  Val. Acc: 97.07%
Epoch: 04 | Epoch Time: 1m 24s
	Train Loss: 0.074 | Train Acc: 97.30%
	 Val. Loss: 0.083 |  Val. Acc: 97.03%
Epoch: 05 | Epoch Time: 1m 24s
	Train Loss: 0.066 | Train Acc: 97.62%
	 Val. Loss: 0.074 |  Val. Acc: 97.33%


In [None]:
# Restore the checkpoint written at the lowest validation loss, then score the test split.
model.load_state_dict(torch.load('tut2-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.074 | Test Acc: 97.35%


In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

def predict_sentiment(model, sentence):
    """
    Score a single raw sentence with ``model`` and return the sigmoid of its output.

    The sentence is tokenized with the module-level spaCy pipeline ``nlp``,
    mapped to vocabulary indices through ``TEXT.vocab.stoi`` and fed to the
    model as a batch of one.

    Args:
        model: a model whose forward takes (tokens, lengths).
        sentence: the raw text to score.

    Returns:
        A Python float in [0, 1]. Which class a value near 1 denotes depends
        on the label vocabulary built for training.
    """
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    # Add a batch dimension: [sent len] -> [sent len, 1].
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

In [None]:
# NOTE(review): the recorded output for this negative-sounding sentence is ~0.998; which
# class a score near 1 denotes depends on LABEL.vocab — confirm the label mapping.
predict_sentiment(model, "This film is terrible")

0.9976223111152649

In [None]:
predict_sentiment(model, "What the fuck did you just fucking say about me, you little bitch? I'll have you know I graduated top of my class in the Navy Seals, and I've been involved in numerous secret raids on Al-Quaeda, and I have over 300 confirmed kills. ")

0.6374624371528625

## BERT Embedding RNN

In [None]:
!pip install transformers



In [None]:
!pip install ipywidgets widgetsnbextension pandas-profiling






In [None]:
# Enable the ipywidgets notebook extension so widget output can render.
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [None]:
import torch

import random
import numpy as np

SEED = 1234

# Seed Python's, NumPy's and PyTorch's RNGs, and request deterministic cuDNN kernels.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [None]:
!pip install IProgress



In [None]:
from transformers import BertTokenizer
from ipywidgets import FloatProgress

# Load the pretrained tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Special-token strings defined by the tokenizer.
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [None]:
# Vocabulary ids of the special tokens; these are what the torchtext Field is configured with.
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [None]:
# Maximum input length registered for 'bert-base-uncased'; used to truncate tokenized reviews.
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

print(max_input_length)

512


In [None]:
def tokenize_and_cut(sentence):
    """
    Tokenize ``sentence`` with the module-level BERT tokenizer and keep at most
    ``max_input_length - 2`` tokens.

    The two reserved slots presumably leave room for the start and end tokens
    that the Field adds — confirm against the Field configuration.
    """
    keep = max_input_length - 2
    return tokenizer.tokenize(sentence)[:keep]

In [None]:
from torchtext.legacy import data

# The BERT tokenizer supplies the vocabulary, so the Field's own vocab is disabled
# (use_vocab = False) and tokens are converted to ids in the preprocessing step.
# For the same reason the init/eos/pad/unk tokens are given as ids, not strings.
# batch_first = True yields batches shaped [batch size, sent len].
TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = init_token_idx,
                  eos_token = eos_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)

# Labels are kept as floats, the target dtype used with BCEWithLogitsLoss.
LABEL = data.LabelField(dtype = torch.float)

In [None]:
# Rebuild the three datasets from the same DataFrames, now processed with the BERT-based fields.
# DataFrameDataset is defined outside this section of the notebook.
train_data, val_data, test_data = DataFrameDataset.splits(text_field=TEXT, label_field=LABEL, train_df=train_df, val_df=val_df, test_df=test_df)

In [None]:
# Sizes of the three splits.
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(val_data)}")
print(f"Number of testing examples: {len(test_data)}")

Number of training examples: 231238
Number of validation examples: 77080
Number of testing examples: 77080


In [None]:
# Inspect one processed example: its 'text' entry holds token ids rather than strings.
print(vars(train_data.examples[6]))

{'text': [2307, 3962, 2000, 6865, 2041, 2007, 2814, 1998, 2131, 1037, 4658, 6700, 3334, 11529, 3347, 21209, 1012, 1996, 25545, 2003, 2428, 4658, 1011, 1996, 3681, 2031, 13297, 3645, 2007, 22243, 3221, 1012, 1996, 27612, 2024, 6625, 1012, 2256, 8241, 2001, 2200, 5379, 1012, 1045, 2347, 1005, 1056, 2205, 10326, 2006, 1996, 4777, 1006, 1996, 28774, 8717, 3211, 14894, 2001, 2205, 11259, 1007, 2021, 1996, 8974, 2020, 9805, 18879, 1012], 'label': 1.0}


In [None]:
# Map the ids of that example back to token strings to check the tokenization.
tokens = tokenizer.convert_ids_to_tokens(vars(train_data.examples[6])['text'])

print(tokens)

['great', 'spot', 'to', 'hang', 'out', 'with', 'friends', 'and', 'get', 'a', 'cool', 'hips', '##ter', 'dive', 'bar', 'vibe', '.', 'the', 'decor', 'is', 'really', 'cool', '-', 'the', 'walls', 'have', 'airplane', 'windows', 'with', 'mirrored', 'glass', '.', 'the', 'booths', 'are', 'comfortable', '.', 'our', 'server', 'was', 'very', 'friendly', '.', 'i', 'wasn', "'", 't', 'too', 'keen', 'on', 'the', 'wings', '(', 'the', 'ter', '##iya', '##ki', 'flavor', 'was', 'too', 'subtle', ')', 'but', 'the', 'drinks', 'were', 'yu', '##mmy', '.']


In [None]:
# Build the label vocabulary from the training split.
LABEL.build_vocab(train_data)

In [None]:
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Batch iterators for the three splits, placing tensors on the selected device.
train_iterator, val_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, val_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

In [None]:
from transformers import BertTokenizer, BertModel

# Load the pretrained 'bert-base-uncased' encoder that provides the token representations.
bert = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
    """
    Classifier that feeds BERT token representations into a GRU.

    BERT is run under ``torch.no_grad()`` as a fixed feature extractor; its
    first output is consumed by a (possibly bidirectional, possibly stacked)
    GRU whose final hidden state is projected to ``output_dim`` raw logits.

    Args:
        bert: encoder whose ``config`` exposes ``hidden_size`` and whose call
            returns the per-token representations as its first element.
        hidden_dim: GRU hidden size per direction.
        output_dim: size of the final linear projection.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU runs in both directions.
        dropout: dropout probability for the final hidden state and, when
            ``n_layers`` is at least 2, between GRU layers.
    """

    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):

        super().__init__()

        self.bert = bert

        # Width of BERT's output vectors becomes the GRU input size.
        bert_width = bert.config.to_dict()['hidden_size']

        # Inter-layer dropout is only meaningful when layers are stacked.
        gru_dropout = dropout if n_layers >= 2 else 0

        self.rnn = nn.GRU(bert_width,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = gru_dropout)

        # One final hidden state per direction feeds the output layer.
        n_directions = 2 if bidirectional else 1
        self.out = nn.Linear(hidden_dim * n_directions, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Map token ids of shape [batch size, sent len] to logits of shape [batch size, out dim]."""

        # No gradients flow into BERT.
        with torch.no_grad():
            embedded = self.bert(text)[0]

        #embedded = [batch size, sent len, emb dim]

        # Keep only the GRU's final hidden states; per-step outputs are unused.
        hidden = self.rnn(embedded)[1]

        #hidden = [n layers * n directions, batch size, hid dim]

        if self.rnn.bidirectional:
            # Last two entries: top layer's forward and backward final states.
            final_state = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            final_state = hidden[-1,:,:]

        #final_state = [batch size, hid dim * n directions]

        return self.out(self.dropout(final_state))

In [None]:
# Hyperparameters of the GRU head placed on top of BERT.
HIDDEN_DIM = 256
OUTPUT_DIM = 1  # a single logit per example
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

model = BERTGRUSentiment(bert,
                         HIDDEN_DIM,
                         OUTPUT_DIM,
                         N_LAYERS,
                         BIDIRECTIONAL,
                         DROPOUT)

In [None]:
def count_parameters(model):
    """Sum ``numel()`` over the parameters of ``model`` whose ``requires_grad`` is true."""
    learnable = filter(lambda tensor: tensor.requires_grad, model.parameters())
    return sum(map(lambda tensor: tensor.numel(), learnable))

# Parameter count before freezing anything: BERT's weights are still included.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 112,241,409 trainable parameters


In [None]:
# Freeze every parameter whose name starts with 'bert' so that only the remaining ones are trained.
for name, param in model.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False

In [None]:
# Parameter count again, now excluding the frozen BERT weights.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,759,169 trainable parameters


In [None]:
# List the names of the parameters that remain trainable.
for name, param in model.named_parameters():                
    if param.requires_grad:
        print(name)

rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
rnn.weight_ih_l1_reverse
rnn.weight_hh_l1_reverse
rnn.bias_ih_l1_reverse
rnn.bias_hh_l1_reverse
out.weight
out.bias


In [None]:
import torch.optim as optim

# Adam with its default hyperparameters; every model parameter is handed over, frozen ones included.
optimizer = optim.Adam(model.parameters())

In [None]:
# Binary cross-entropy computed on raw logits (the sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def binary_accuracy(preds, y):
    """
    Share of correct predictions in a batch, as a tensor (0.8 for 8 out of 10, not 8).

    ``preds`` are raw logits; they are squashed with a sigmoid and rounded to
    the nearest integer before the element-wise comparison with ``y``.
    """
    matches = torch.round(torch.sigmoid(preds)) == y
    # Booleans are cast to float so they can be summed and divided.
    n_correct = matches.float().sum()
    return n_correct / len(matches)

In [None]:
def train(model, iterator, optimizer, criterion):
    """
    Train ``model`` for one pass over ``iterator``; return (mean loss, mean accuracy).

    Each batch must expose ``.text`` (model input) and ``.label`` (targets).
    The two results are per-batch values averaged over the number of batches.
    """
    model.train()

    summed_loss, summed_acc = 0, 0

    for batch in iterator:
        targets = batch.label

        optimizer.zero_grad()
        logits = model(batch.text).squeeze(1)

        step_loss = criterion(logits, targets)
        step_acc = binary_accuracy(logits, targets)

        step_loss.backward()
        optimizer.step()

        summed_loss += step_loss.item()
        summed_acc += step_acc.item()

    batch_count = len(iterator)
    return summed_loss / batch_count, summed_acc / batch_count

In [None]:
def evaluate(model, iterator, criterion):
    """
    Evaluate ``model`` over ``iterator`` with gradients disabled; return (mean loss, mean accuracy).

    The model is put in eval mode first. The two results are per-batch values
    averaged over the number of batches.
    """
    model.eval()

    summed_loss, summed_acc = 0, 0

    with torch.no_grad():
        for batch in iterator:
            targets = batch.label
            logits = model(batch.text).squeeze(1)

            summed_loss += criterion(logits, targets).item()
            summed_acc += binary_accuracy(logits, targets).item()

    batch_count = len(iterator)
    return summed_loss / batch_count, summed_acc / batch_count

In [None]:
import time

def epoch_time(start_time, end_time):
    """Express ``end_time - start_time`` (seconds) as a (minutes, seconds) pair of ints."""
    total_seconds = end_time - start_time
    mins = int(total_seconds / 60)
    # Remainder after removing the whole minutes, truncated to an int.
    secs = int(total_seconds - mins * 60)
    return mins, secs

In [None]:
# Release cached, unoccupied GPU memory held by PyTorch before the training run.
torch.cuda.empty_cache()

In [None]:
# Number of full passes over the training data.
N_EPOCHS = 5

# Lowest validation loss seen so far; starting at +inf means any finite loss is an improvement.
best_val_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    val_loss, val_acc = evaluate(model, val_iterator, criterion)
        
    end_time = time.time()
        
    # The measured interval covers both the training pass and the validation pass.
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
    # Checkpoint only when validation loss improves, so the file holds the best-scoring weights.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'tut6-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {val_loss:.3f} |  Val. Acc: {val_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 47m 44s
	Train Loss: 0.083 | Train Acc: 96.91%
	 Val. Loss: 0.076 |  Val. Acc: 97.17%
Epoch: 02 | Epoch Time: 47m 59s
	Train Loss: 0.074 | Train Acc: 97.27%
	 Val. Loss: 0.098 |  Val. Acc: 96.31%
Epoch: 03 | Epoch Time: 48m 46s
	Train Loss: 0.068 | Train Acc: 97.48%
	 Val. Loss: 0.070 |  Val. Acc: 97.49%
Epoch: 04 | Epoch Time: 48m 20s
	Train Loss: 0.064 | Train Acc: 97.66%
	 Val. Loss: 0.069 |  Val. Acc: 97.42%
Epoch: 05 | Epoch Time: 47m 35s
	Train Loss: 0.062 | Train Acc: 97.73%
	 Val. Loss: 0.074 |  Val. Acc: 97.40%


In [None]:
# Restore the checkpoint written at the lowest validation loss, then score the test split.
model.load_state_dict(torch.load('tut6-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.069 | Test Acc: 97.42%


In [None]:
# NOTE(review): this cell repeats the preceding test-evaluation cell verbatim.
# Restore the checkpoint written at the lowest validation loss, then score the test split.
model.load_state_dict(torch.load('tut6-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.069 | Test Acc: 97.42%
