* Cory Clayton (acc2ds@virginia.edu)
- DS5001
- 6 May 2021


# VOCAB table creation for movie transcript project

## setup

In [1]:
import pandas as pd
import numpy as np

import re
import nltk

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

In [2]:
data_in = './data_in/'
data_out = './data_out/'
data_prefix = 'movie'

In [3]:
OHCO = ['movie_id', 'Action_number', 'Dialogue_num','sent_num']
DIALOGUE = OHCO[:3]
ACTION = OHCO[:2]
MOVIE = OHCO[:1]


In [4]:
TOKENS = pd.read_csv('{}/{}-TOKENS.csv'.format(data_in, data_prefix))

In [5]:
TOKENS.set_index(OHCO,inplace=True)

In [6]:
TOKENS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_num,pos_tuple,pos,token_str,term_str
movie_id,Action_number,Dialogue_num,sent_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
m1,1.0,0,0,0,"('Mama', 'NN')",NN,Mama,mama
m1,1.0,1,0,0,"('We', 'PRP')",PRP,We,we
m1,1.0,1,0,1,"('know', 'VBP')",VBP,know,know
m1,1.0,1,1,0,"('Jyn,', 'NNP')",NNP,"Jyn,",jyn
m1,1.0,1,1,1,"('gather', 'CC')",CC,gather,gather
...,...,...,...,...,...,...,...,...
m24,279.0,10,0,1,"('did', 'VBD')",VBD,did,did
m24,279.0,10,0,2,"('it', 'PRP')",PRP,it,it
m24,279.0,11,0,0,"('Yes', 'UH')",UH,Yes,yes
m24,279.0,12,0,0,"('Now', 'RB')",RB,Now,now


## create vocab from tokens

In [7]:
VOCAB = TOKENS.term_str.value_counts().to_frame()\
    .rename(columns={'index':'term_str', 'term_str':'n'})\
    .sort_index().reset_index().rename(columns={'index':'term_str'})
VOCAB.index.name = 'term_id'

VOCAB['num'] = VOCAB.term_str.str.match("\d+").astype('int')

### add stop words

In [8]:
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1

VOCAB['stop'] = VOCAB.term_str.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')

### add pos _ max

In [9]:
M = TOKENS.groupby(['term_str','pos']).pos.count().unstack(fill_value=0)

In [10]:
VOCAB = VOCAB.reset_index().set_index('term_str')
VOCAB['pos_max'] = M.idxmax(1)
VOCAB = VOCAB.reset_index().set_index('term_id')

## create vocab stats

In [11]:
N_vocab = VOCAB.shape[0]
U_vocab = 1/N_vocab

In [12]:
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()  # Probability
VOCAB['s'] = 1 / VOCAB.p              # Surprise
VOCAB['i'] = np.log2(VOCAB.s)         # Information
VOCAB['h'] = VOCAB.p * VOCAB.i        # Entropy

In [13]:
VOCAB.set_index('term_str',inplace=True)

In [14]:
VOCAB['wlen'] = VOCAB.index.str.len()

## add TF

In [15]:
DOC = OHCO[:2] # Chapter
DOC

['movie_id', 'Action_number']

In [16]:
BOW = TOKENS.groupby(DOC+['term_str']).term_str.count().to_frame('tf_n')

In [17]:
D = BOW.groupby(DOC).tf_n
BOW['tf_jp'] = D.apply(lambda x: x / x.sum().sum()) # jp = P(w,d)
BOW['tf_cp'] = D.apply(lambda x: x / x.sum()) # cp = P(w|d)
BOW['tf_l2'] = D.apply(lambda x: x / np.sqrt((x**2).sum()))
BOW['tf_logn'] = D.apply(lambda x: np.log2(1 + x))
BOW['tf_sub'] = D.apply(lambda x: 1 + np.log2(x)) # Sublinear scaling; from Manning, et al.
BOW['tf_max'] = D.apply(lambda x: .4 + .6 * (x / x.max())) # See Manning, et al. for choice of α
BOW['tf_bool'] = D.apply(lambda x: x.astype('bool') / x.astype('bool').sum())

In [18]:
tf_types = [col.split('_')[1] for col in BOW.columns.to_list() if 'tf_' in col]

In [19]:
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tf_n,tf_jp,tf_cp,tf_l2,tf_logn,tf_sub,tf_max,tf_bool
movie_id,Action_number,term_str,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
m1,1.0,gather,1,0.111111,0.111111,0.333333,1.0,1.0,1.0,0.111111
m1,1.0,its,1,0.111111,0.111111,0.333333,1.0,1.0,1.0,0.111111
m1,1.0,jyn,1,0.111111,0.111111,0.333333,1.0,1.0,1.0,0.111111
m1,1.0,know,1,0.111111,0.111111,0.333333,1.0,1.0,1.0,0.111111
m1,1.0,mama,1,0.111111,0.111111,0.333333,1.0,1.0,1.0,0.111111
...,...,...,...,...,...,...,...,...,...,...
m9,277.0,red,1,0.200000,0.200000,0.447214,1.0,1.0,1.0,0.200000
m9,278.0,hill,1,1.000000,1.000000,1.000000,1.0,1.0,1.0,1.000000
m9,279.0,motherf,1,0.333333,0.333333,0.577350,1.0,1.0,1.0,0.333333
m9,279.0,no,1,0.333333,0.333333,0.577350,1.0,1.0,1.0,0.333333


### add df and idf

In [20]:
VOCAB['df'] = BOW.groupby('term_str').tf_n.count()
N_docs = len(D.groups)
VOCAB['idf'] = np.log2(N_docs/VOCAB.df)

In [21]:
for tf_type in tf_types:
    BOW[f'tfidf_{tf_type}'] = BOW[f'tf_{tf_type}'] * VOCAB.idf

In [22]:
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tf_n,tf_jp,tf_cp,tf_l2,tf_logn,tf_sub,tf_max,tf_bool,tfidf_n,tfidf_jp,tfidf_cp,tfidf_l2,tfidf_logn,tfidf_sub,tfidf_max,tfidf_bool
movie_id,Action_number,term_str,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
m1,1.0,gather,1,0.111111,0.111111,0.333333,1.0,1.0,1.0,0.111111,10.014299,1.112700,1.112700,3.338100,10.014299,10.014299,10.014299,1.112700
m1,1.0,its,1,0.111111,0.111111,0.333333,1.0,1.0,1.0,0.111111,2.601518,0.289058,0.289058,0.867173,2.601518,2.601518,2.601518,0.289058
m1,1.0,jyn,1,0.111111,0.111111,0.333333,1.0,1.0,1.0,0.111111,8.014299,0.890478,0.890478,2.671433,8.014299,8.014299,8.014299,0.890478
m1,1.0,know,1,0.111111,0.111111,0.333333,1.0,1.0,1.0,0.111111,2.777807,0.308645,0.308645,0.925936,2.777807,2.777807,2.777807,0.308645
m1,1.0,mama,1,0.111111,0.111111,0.333333,1.0,1.0,1.0,0.111111,8.336228,0.926248,0.926248,2.778743,8.336228,8.336228,8.336228,0.926248
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m9,277.0,red,1,0.200000,0.200000,0.447214,1.0,1.0,1.0,0.200000,7.291833,1.458367,1.458367,3.261007,7.291833,7.291833,7.291833,1.458367
m9,278.0,hill,1,1.000000,1.000000,1.000000,1.0,1.0,1.0,1.000000,8.429337,8.429337,8.429337,8.429337,8.429337,8.429337,8.429337,8.429337
m9,279.0,motherf,1,0.333333,0.333333,0.577350,1.0,1.0,1.0,0.333333,12.336228,4.112076,4.112076,7.122324,12.336228,12.336228,12.336228,4.112076
m9,279.0,no,1,0.333333,0.333333,0.577350,1.0,1.0,1.0,0.333333,2.432346,0.810782,0.810782,1.404315,2.432346,2.432346,2.432346,0.810782


### get the tfidf_sums

In [23]:
for tf_type in tf_types:
    col = f"tfidf_{tf_type}"
    VOCAB[col + "_sum"] = BOW.groupby('term_str')[col].sum()
    VOCAB[col + "_sum"] = (VOCAB[col + "_sum"] - VOCAB[col + "_sum"].mean()) / VOCAB[col + "_sum"].std()
    VOCAB[col + "_sum"] = VOCAB[col + "_sum"] - VOCAB[col + "_sum"].min() 
    VOCAB[col + "_sum"] = VOCAB[col + "_sum"] / N_docs

In [24]:
tfidf_sum_cols = [f"tfidf_{type}_sum" for type in tf_types]

In [25]:
VOCAB.sample(5)

Unnamed: 0_level_0,n,num,stop,pos_max,p,s,i,h,wlen,df,idf,tfidf_n_sum,tfidf_jp_sum,tfidf_cp_sum,tfidf_l2_sum,tfidf_logn_sum,tfidf_sub_sum,tfidf_max_sum,tfidf_bool_sum
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
rings,3,0,0,NNS,1.5e-05,67824.666667,16.049522,0.000237,5,3,10.751265,1.3e-05,3.953253e-05,3.953253e-05,3.823994e-05,1.6e-05,1.5e-05,2.548305e-05,4.568841e-05
fractals,1,0,0,NNS,5e-06,203474.0,17.634485,8.7e-05,8,1,12.336228,0.0,4.261793e-07,4.261793e-07,8.301335e-07,0.0,0.0,2.638225e-07,6.960198e-07
aunt,10,0,0,NNP,4.9e-05,20347.4,14.312557,0.000703,4,7,9.528873,5.3e-05,2.658967e-05,2.658967e-05,3.967516e-05,5.5e-05,5.4e-05,5.065491e-05,3.081192e-05
d,12,0,1,NN,5.9e-05,16956.166667,14.049522,0.000829,1,11,8.876796,6.1e-05,0.0001003222,0.0001003222,0.0001046657,7.4e-05,6.9e-05,9.84425e-05,0.0001113262
child,43,0,0,NN,0.000211,4731.953488,12.20822,0.00258,5,40,7.014299,0.000186,0.0003220232,0.0003220232,0.0002792639,0.00023,0.000212,0.0002741263,0.0003644853


## write out

In [26]:
VOCAB.to_csv('{}/{}-VOCAB.csv'.format(data_out,data_prefix))