In [1]:
# Load Packages # 

import os 
import pandas as pd
import numpy as np 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from tqdm import tqdm_notebook

from bs4 import BeautifulSoup
from urllib.request import urlopen
import time

In [2]:
# Returns a single number ([-1.0,+1.0]) representing the sentiment of every IMDb user review of the TV show

def getSentiment(imdbTag):
    html = urlopen('https://www.imdb.com/title/' + imdbTag + '/reviews')
    bs = BeautifulSoup(html, 'html.parser')
    reviews = bs.find_all('div',{'class':'text show-more__control'})
    compounds = []
    for review in reviews:
        score = SentimentIntensityAnalyzer().polarity_scores(str(review))['compound']
        compounds.append(score)
        
    if(len(compounds) == 0):
        return 0
    return sum(compounds)/len(compounds)

In [3]:
# Runs the sentiment analysis for every row
# counter parameter is the row number to start on, 0 to start, but higher later on when the connection is severed

def getSentimentForAll(counter):
    try:
        for i in tqdm_notebook(range(counter,len(linear.index))):
            
            imdbSentiment = getSentiment(linear['tconst'][i])
            linear.iloc[i]['imdbSentiment'] = imdbSentiment
            counter += 1
    # Catch the severed connection exception, wait 5 minutes, and then resume at the place where it stopped
    except ConnectionResetError:
        time.sleep(300)
        getSentimentForAll(counter)                        

In [4]:
# Load in Linear Ratings Data

linear = pd.read_csv("/Users/andy/Documents/USC/DSO 574 Using Big Data/Sony Streaming Project/Linear TV Data + Code/Linear_Data.csv.gz", compression = 'gzip', usecols=['SPT Program Name','Rtg','Daypart'])

In [5]:
# Load in IMDB data

imdbData = pd.read_csv('/Volumes/DATASETS/DSO574Proj/Sony/title.basics.tsv', sep='\t',usecols=['tconst','primaryTitle','originalTitle'],low_memory=True)

In [6]:
# These need to be strings for later

linear['SPT Program Name'] = linear['SPT Program Name'].astype('str')
imdbData['primaryTitle'] = imdbData['primaryTitle'].astype('str')

In [7]:
linear.groupby(['SPT Program Name'],as_index=False).mean().head(5)

Unnamed: 0,SPT Program Name,Rtg
0,#KillerPost,0.034337
1,#RichKids of Beverly Hills,0.011944
2,100 Code,0.028528
3,12 Monkeys,0.095381
4,1969,0.464267


In [8]:
imdbData.groupby(['primaryTitle'],as_index=False).head(5)

Unnamed: 0,tconst,primaryTitle,originalTitle
0,tt0000001,Carmencita,Carmencita
1,tt0000002,Le clown et ses chiens,Le clown et ses chiens
2,tt0000003,Pauvre Pierrot,Pauvre Pierrot
3,tt0000004,Un bon bock,Un bon bock
4,tt0000005,Blacksmith Scene,Blacksmith Scene
5,tt0000006,Chinese Opium Den,Chinese Opium Den
6,tt0000007,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph
7,tt0000008,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze
8,tt0000009,Miss Jerry,Miss Jerry
9,tt0000010,Exiting the Factory,La sortie de l'usine Lumière à Lyon


In [9]:
linear = linear.drop_duplicates(subset='SPT Program Name')

In [10]:
imdbData = imdbData.drop_duplicates(subset='primaryTitle')

In [11]:
# Inner join the IMDb dataset with the linear TV dataset, every row has an IMDb tag now

linear = linear.merge(imdbData, left_on='SPT Program Name', right_on='primaryTitle',how='inner')

In [12]:
# Add a new column for IMDb review sentiment to be filled in later

linear['imdbSentiment'] = ''
cols = ['SPT Program Name','tconst','Rtg','imdbSentiment','Daypart','primaryTitle','originalTitle']
linear = linear[cols]

In [13]:
# Run the getSentiment function for every TV show in the dataset. IMDb.com will sever the connection at some point due to the volume of requests

counter = 0
getSentimentForAll(counter)    

HBox(children=(IntProgress(value=0, max=910), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':





In [14]:
linear.shape

(910, 7)

In [15]:
# Write to a new .csv
linear.to_csv('/Users/andy/Desktop/ratingsWithSentiment.csv')