In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
import json
import statsmodels.api as sm
from statsmodels.formula.api import glm, ols

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import math

In [2]:
dictionary=pd.read_csv("dictionary.csv")

with open("IMDB_dftouse_dict.json", "r") as fd:
    IMDB = json.load(fd)
IMDB_df = pd.DataFrame(IMDB)

In [3]:
valences = {word: valence for word, valence in zip(dictionary.words.values,dictionary.valences.values)}

In [4]:
dictionary.head()

Unnamed: 0,valences,words
0,-2,abandon
1,-2,abandoned
2,-2,abandons
3,-2,abducted
4,-2,abduction


In [5]:
IMDB_df.head()

Unnamed: 0,movie_id,movie_name,positive,stars,text,url
0,10027,Titanic,True,7,"Sure, Titanic was a good movie, the first time...",http://www.imdb.com/title/tt0120338/usercommen...
1,10028,Titanic,True,10,When I saw this movie I was stunned by what a ...,http://www.imdb.com/title/tt0120338/usercommen...
2,10029,Titanic,True,10,Why do people bitch about this movie and not a...,http://www.imdb.com/title/tt0120338/usercommen...
3,10030,Titanic,True,10,"What's inexplicable? Firstly, the hatred towar...",http://www.imdb.com/title/tt0120338/usercommen...
4,10031,Titanic,True,10,"Previously, I wrote that I loved ""Titanic"", cr...",http://www.imdb.com/title/tt0120338/usercommen...


In [6]:
from sklearn.feature_extraction import text
stopwords = text.ENGLISH_STOP_WORDS
punctuation = list('.,;:!?()[]{}`''\"@#$%^&*+-|-=~_')

In [95]:
def getValenceInfo(text, valenceDict):
    total_words = len(text.split())
    total, count_relevant = 0, 0
    for word in text.split():
        if word in valenceDict.keys():
            count_relevant += 1
            total += valenceDict[word]
    # return total valence, average valence, length of review in words, fraction of contributing words
    if count_relevant != 0: 
        avg_valence = 1.*total/count_relevant
    else: 
        avg_valence = 0
    return total, avg_valence, total_words, 1.*count_relevant / total_words

def removeStopWords(text, stopwords = stopwords):
    new_text = ""
    for word in text.split():
        if word not in stopwords:
            while len(word) != 0 and word[-1] in punctuation:
                word = word[:len(word)-1]
            new_text += word + ' '
    return new_text

In [96]:
def getAllInfo(df, valenceDict, stopwords): 
    valence_suml = []
    valence_avgl = []
    review_lenl = []
    review_fractionl = []
    for i, row in df.iterrows():
        cleaned_review = removeStopWords(row['text'], stopwords)
        valence_sum, valence_avg, review_len, review_fraction = getValenceInfo(cleaned_review, valenceDict)
        valence_suml.append(valence_sum)
        valence_avgl.append(valence_avg)
        review_lenl.append(review_len)
        review_fractionl.append(review_fraction)
    return pd.DataFrame({'valence_sum': valence_suml, 'valence_avg':valence_avgl ,'review_len': review_lenl, 
                         'review_fraction': review_fractionl})

In [97]:
word = ""
len(word)

0

In [98]:
IMDB_df.shape

(4111, 6)

In [99]:
%%time
new_df = getAllInfo(IMDB_df, valences, stopwords)

CPU times: user 2min 55s, sys: 1.39 s, total: 2min 56s
Wall time: 3min 2s




In [100]:
new_df

Unnamed: 0,review_fraction,review_len,valence_avg,valence_sum
0,0.083333,120,1.400000,14
1,0.123077,65,1.375000,11
2,0.173333,75,0.384615,5
3,0.106383,235,-0.080000,-2
4,0.052980,302,0.312500,5
5,0.088235,34,2.666667,8
6,0.169811,159,1.185185,32
7,0.115702,121,2.000000,28
8,0.102151,186,1.368421,26
9,0.111702,188,1.142857,24


In [70]:
test_df = IMDB_df[IMDB_df.movie_id == 10027]

In [72]:
test_df

Unnamed: 0,movie_id,movie_name,positive,stars,text,url
0,10027,Titanic,True,7,"Sure, Titanic was a good movie, the first time...",http://www.imdb.com/title/tt0120338/usercommen...


In [108]:
nostop = removeStopWords(IMDB_df[IMDB_df.movie_id == 10027].text[0], stopwords).lower()
nostop

u"sure titanic good movie time it really second time opinion film definetly change the time movie underlying love-story think ooh romantic the second time (and i think this just annoying just sit watching movie thinking when d**n ship going sink and impressive times the acting film bad definetly great either was i glad dicaprio did win oscar film i mean does think is anthony hopkins denzel washington he does 1 half-good movie won't film $20 million and suprised hardly films it but about eyes worst character film kate winslet's performance hand wonderful i tink director talented film magnitude together there lesson learned movie love-stories is filmmakers shouldn't try add crummy romance single movie out possible 100 i film mere 71 "

In [109]:
getValenceInfo(nostop, happiness)

(38.020792408535023, 0.37644348919341608, 120, 0.8416666666666667)

In [13]:
# Let's use the labMT dictionary
url = 'http://www.plosone.org/article/fetchSingleRepresentation.action?uri=info:doi/10.1371/journal.pone.0026752.s001'
labmt = pd.read_csv(url, skiprows=2, sep='\t', index_col=0)
 
average = labmt.happiness_average.mean()
happiness = (labmt.happiness_average - average).to_dict()
 
def score(text):
    words = text.split()
    return sum([happiness.get(word.lower(), 0.0) for word in words]) / len(words)


IndexError: list index out of range

In [65]:
labmt.head()

Unnamed: 0_level_0,happiness_rank,happiness_average,happiness_standard_deviation,twitter_rank,google_rank,nyt_rank,lyrics_rank
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
laughter,1,8.5,0.9313,3600,--,--,1728
happiness,2,8.44,0.9723,1853,2458,--,1230
love,3,8.42,1.1082,25,317,328,23
happy,4,8.3,0.9949,65,1372,1313,375
laughed,5,8.26,1.1572,3334,3542,--,2332


In [66]:
IMDB_df[IMDB_df.movie_id == 10027]['text'][0]

u"Sure, Titanic was a good movie, the first time you see it, but you really should see it a second time and your opinion of the film will definetly change. The first time you see the movie you see the underlying love-story and think: ooh, how romantic. The second time (and I am not the only one to think this) it is just annoying and you just sit there watching the movie thinking, When is this d**n ship going to sink??? And even this is not as impressive when you see it several times. The acting in this film is not bad, but definetly not great either. Was I glad DiCaprio did not win an oscar for that film, I mean who does he think he is, Anthony Hopkins or Denzel Washington? He does 1 half-good movie and won't do a film for less than $20 million. And then everyone is suprised that there are hardly any films with him in it. But enough about, in my eyes, the worst character of the film. Kate Winslet's performance on the other hand was wonderful. I also tink that the director is very talen

In [69]:
getValenceInfo(IMDB_df[IMDB_df.movie_id == 10027]['text'][0],happiness)

(10.386857757785162, 0.057704765321028674, 244, 0.7377049180327869)