In [None]:
# Load Packages # 

import os 
import pandas as pd
import numpy as np 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from tqdm import tqdm_notebook

from bs4 import BeautifulSoup
from urllib.request import urlopen
import time

In [None]:
# Returns a single number in [-1.0, +1.0]: the mean VADER compound score of the user reviews found on the TV show's IMDb reviews page (0 if none are found)

def getSentiment(imdbTag):
    """Return the mean VADER compound score of a title's IMDb user reviews.

    Fetches ``https://www.imdb.com/title/<imdbTag>/reviews``, collects every
    ``<div class="text show-more__control">`` element on that page and averages
    the ``compound`` score of each element's text.

    Args:
        imdbTag: IMDb title identifier inserted into the reviews URL.

    Returns:
        The mean compound score, or 0 when the page contains no review
        elements.

    Raises:
        Whatever ``urlopen`` raises on network or HTTP failure; callers rely on
        ``ConnectionResetError`` propagating from here.
    """
    # Parse inside the ``with`` so the HTTP response is closed as soon as the
    # page has been read, instead of leaking one open connection per title.
    with urlopen('https://www.imdb.com/title/' + imdbTag + '/reviews') as html:
        bs = BeautifulSoup(html, 'html.parser')

    reviews = bs.find_all('div', {'class': 'text show-more__control'})
    if not reviews:
        return 0

    # Build the analyzer once per call rather than once per review.
    analyzer = SentimentIntensityAnalyzer()

    # Score the visible review text, not the tag's HTML source; the separator
    # keeps words on either side of inline markup (e.g. <br/>) from fusing.
    compounds = [
        analyzer.polarity_scores(review.get_text(separator=' '))['compound']
        for review in reviews
    ]
    return sum(compounds) / len(compounds)

In [None]:
# Runs the sentiment analysis for every row
# counter is the row position to start from: 0 for a fresh run, or a later row when resuming after the connection was severed

def getSentimentForAll(counter):
    """Fill ``linear['imdbSentiment']`` for every row from ``counter`` onward.

    For each row position of the module-level ``linear`` DataFrame, reads the
    IMDb tag from the ``tconst`` column, scores it with ``getSentiment`` and
    stores the result in the ``imdbSentiment`` column of that same row.

    Args:
        counter: Zero-based row position to start from (0 for a fresh run).

    A ``ConnectionResetError`` pauses the run for five minutes and then resumes
    at the row that failed; any other exception propagates to the caller.
    """
    tag_col = linear.columns.get_loc('tconst')
    sentiment_col = linear.columns.get_loc('imdbSentiment')
    total = len(linear.index)

    # Retry with a loop instead of recursion so a long run with many dropped
    # connections cannot exhaust the call stack.
    while counter < total:
        try:
            for i in tqdm_notebook(range(counter, total)):
                # One positional .iloc[row, col] access for both read and write.
                # The chained form .iloc[i]['imdbSentiment'] = ... assigns into
                # a temporary row copy and never reaches the DataFrame.
                linear.iloc[i, sentiment_col] = getSentiment(linear.iloc[i, tag_col])
                counter = i + 1
        # Catch the severed connection, wait 5 minutes, then resume where it stopped
        except ConnectionResetError:
            time.sleep(300)

In [None]:
# Read the linear ratings extract (gzip-compressed CSV), keeping only the three columns used below

linear = pd.read_csv(
    '/filepath/to/the/data',
    usecols=['SPT Program Name', 'Rtg', 'Daypart'],
    compression='gzip',
)

In [None]:
# Read the tab-separated IMDb title data, keeping only the identifier and the two title columns

imdbData = pd.read_csv(
    '/filepath/to/the/data',
    sep='\t',
    usecols=['tconst', 'primaryTitle', 'originalTitle'],
    low_memory=True,
)

In [None]:
# Cast both title columns to str; they are the keys the two tables are joined on later

linear = linear.astype({'SPT Program Name': 'str'})
imdbData = imdbData.astype({'primaryTitle': 'str'})

In [None]:
# Preview the per-program mean of the numeric columns. numeric_only=True keeps
# non-numeric columns (presumably Daypart — confirm) out of the aggregation;
# without it pandas >= 2.0 raises TypeError instead of silently dropping them.
linear.groupby(['SPT Program Name'], as_index=False).mean(numeric_only=True).head(5)

In [None]:
# Show the first rows of every primaryTitle group.
# NOTE(review): GroupBy.head(5) returns up to five rows *per group*, not a five-row preview of the table — confirm this is intended
imdbData.groupby(['primaryTitle'],as_index=False).head(5)

In [None]:
# Keep a single row per program name: the first occurrence wins, later ones are dropped
linear = linear.drop_duplicates(subset=['SPT Program Name'], keep='first')

In [None]:
# Keep a single row per primaryTitle: the first occurrence wins, later ones are dropped
imdbData = imdbData.drop_duplicates(subset=['primaryTitle'], keep='first')

In [None]:
# Inner join of the linear TV rows with the IMDb rows on the program title, so every
# surviving row carries an IMDb tag (tconst)

linear = pd.merge(
    linear,
    imdbData,
    how='inner',
    left_on='SPT Program Name',
    right_on='primaryTitle',
)

In [None]:
# Add a new column for IMDb review sentiment to be filled in later.
# NaN (rather than an empty string) marks "not scored yet", so the column is
# float-typed from the start and can take the numeric scores directly.

linear['imdbSentiment'] = np.nan
cols = ['SPT Program Name','tconst','Rtg','imdbSentiment','Daypart','primaryTitle','originalTitle']
linear = linear[cols]

In [None]:
# Run the getSentiment function for every TV show in the dataset. IMDb.com will sever the connection at some point due to the volume of requests;
# getSentimentForAll waits five minutes and resumes when that happens.

counter = 0  # row position to start from
getSentimentForAll(counter)    

In [None]:
# Display the (rows, columns) dimensions of the merged table
linear.shape

In [None]:
# Write the result to a new .csv
# NOTE(review): to_csv is called with its defaults, so the row index is written as an unnamed first column — confirm that is wanted
linear.to_csv('/filepath/to/put/the/file')