* Cory Clayton (acc2ds@virginia.edu)
- DS5001
- 6 May 2021


# Topic Modeling Using LDA on movie scripts

## Set up

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

### LDA Params

In [2]:
# Vectorizer / LDA settings read as notebook-level names by generate_lda_tables.
# ngram_range is a tuple because CountVectorizer documents it as
# (min_n, max_n); newer scikit-learn releases validate parameter types.
ngram_range = (1, 2)   # unigrams and bigrams
n_terms = 4000         # passed to CountVectorizer as max_features
n_topics = 40          # number of LDA topics used throughout this notebook
max_iter = 20          # passed to LDA as max_iter
n_top_terms = 10       # terms kept per topic when building topic labels

### OHCO

In [3]:
# Index columns ordered from coarsest to finest; each shorter prefix below
# names one level that can serve as the "bag" (document unit) for the model.
OHCO = ['movie_id', 'Action_number', 'Dialogue_num', 'sent_num']
MOVIE, ACTION, DIALOGUE = OHCO[:1], OHCO[:2], OHCO[:3]

In [4]:
# Relative locations of the input tables and of the exported results, plus
# the shared file-name prefix used when building paths in the cells below.
data_prefix = 'movie'
data_in = 'data_in/'
data_out = 'data_out/'

In [5]:
# Token table; generate_lda_tables reads its term_str and pos columns
# together with the OHCO key columns.
TOKENS = pd.read_csv(
    f'{data_in}{data_prefix}-TOKENS.csv'
)

In [5]:
# Library (per-movie metadata) table, indexed by movie_id so it can be
# joined onto the document-topic tables later on.
LIB = (
    pd.read_csv(f'{data_in}{data_prefix}-LIB.csv')
    .set_index('movie_id')
)

### Creating a column for Live Action and Animation

In [7]:
# Flag each movie as "Animation" when its genre string contains that word,
# otherwise "Live-Action". The vectorized test treats a missing genre as
# "Live-Action" instead of raising TypeError on a NaN value.
LIB["Animation"] = np.where(
    LIB["genre"].str.contains("Animation", regex=False, na=False),
    "Animation",
    "Live-Action",
)

In [8]:
def generate_lda_tables(TOKENS,BAG,LIB,n_topics=40):
    """Fit an LDA topic model on noun-only documents built from TOKENS.

    Tokens whose ``pos`` tag is ``NN`` or ``NNS`` are grouped by the ``BAG``
    columns and joined into one space-separated string per group. Those
    strings are vectorized with ``CountVectorizer`` and fed to LDA.

    Besides its arguments, the function reads the notebook-level settings
    ``n_terms``, ``ngram_range``, ``max_iter`` and ``n_top_terms``.

    Parameters
    ----------
    TOKENS : pandas.DataFrame
        Needs ``term_str``, ``pos`` and every column named in ``BAG``.
        The frame passed in is left unmodified.
    BAG : list of str
        Columns that together identify one document.
    LIB : pandas.DataFrame
        Not used by this function; kept so existing calls keep working.
    n_topics : int, default 40
        Number of LDA components.

    Returns
    -------
    TOPICS : pandas.DataFrame
        One row per topic: its top ``n_top_terms`` terms in columns
        ``0..n_top_terms-1``, a text ``label`` and ``doc_weight_sum``
        (the column sums of THETA).
    PHI : pandas.DataFrame
        Topic-by-term weights (``lda_engine.components_``).
    THETA : pandas.DataFrame
        Document-by-topic weights, indexed like the grouped documents.

    Raises
    ------
    ValueError
        If no noun tokens remain to build documents from.
    """
    # Build the cleaned token frame with assign() so the caller's TOKENS is
    # not altered; rows whose term became the string 'nan' are dropped.
    tokens = TOKENS.assign(term_str=TOKENS.term_str.astype("str"))\
        .query("term_str != 'nan'")

    # na=False keeps rows with a missing pos tag out of the mask instead of
    # producing a NaN-containing boolean indexer.
    is_noun = tokens.pos.str.match(r'^NNS?$', na=False)
    DOCS = tokens[is_noun]\
        .groupby(BAG).term_str\
        .apply(' '.join)\
        .to_frame()\
        .rename(columns={'term_str':'doc_str'})
    if DOCS.empty:
        raise ValueError("No noun tokens found; cannot build documents for LDA.")

    # CountVectorizer documents ngram_range as a tuple, so coerce in case
    # the notebook-level setting is a list.
    count_engine = CountVectorizer(max_features=n_terms, ngram_range=tuple(ngram_range), stop_words='english')
    count_model = count_engine.fit_transform(DOCS.doc_str)
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on older installs that lack get_feature_names_out().
    if hasattr(count_engine, 'get_feature_names_out'):
        TERMS = list(count_engine.get_feature_names_out())
    else:
        TERMS = count_engine.get_feature_names()

    lda_engine = LDA(n_components=n_topics, max_iter=max_iter, learning_offset=50., random_state=0)
    THETA = pd.DataFrame(lda_engine.fit_transform(count_model), index=DOCS.index)
    THETA.columns.name = 'topic_id'

    PHI = pd.DataFrame(lda_engine.components_, columns=TERMS)
    PHI.index.name = 'topic_id'
    PHI.columns.name  = 'term_str'

    # Highest-weighted terms per topic, in descending weight order.
    top_terms = {
        topic_id: weights.sort_values(ascending=False).head(n_top_terms).index.tolist()
        for topic_id, weights in PHI.iterrows()
    }
    TOPICS = pd.DataFrame.from_dict(top_terms, orient='index')
    TOPICS.index.name = 'topic_id'
    TOPICS.columns.name = 'term_str'
    # Label format: "<topic_id> term1, term2, ..."
    TOPICS['label'] = [
        str(topic_id) + ' ' + ', '.join(terms)
        for topic_id, terms in top_terms.items()
    ]
    TOPICS['doc_weight_sum'] = THETA.sum()
    return TOPICS,PHI,THETA
    

## Action As The BAG

In [9]:
# Show the movie-level key for reference; note that the model in the
# following cell is fit with ACTION (movie_id + Action_number) as the bag.
MOVIE

['movie_id']

In [10]:
# Fit the model with one document per (movie_id, Action_number) group.
# n_topics is passed explicitly so the value from the LDA params cell is
# used rather than the function's own default.
movie_TOPICS, movie_PHI, movie_THETA = generate_lda_tables(TOKENS, ACTION, LIB, n_topics=n_topics)

In [11]:
# Export the topic table (top terms, label, doc_weight_sum per topic).
movie_TOPICS.to_csv(
    f"{data_out}{data_prefix}-topic_table.csv"
)

In [12]:
# Export the document-by-topic weights (THETA).
movie_THETA.to_csv(
    f"{data_out}{data_prefix}-document_topic_table.csv"
)

In [13]:
# Export the topic-by-term weights (PHI), transposed so terms are rows.
term_topic_weights = movie_PHI.T
term_topic_weights.to_csv(f"{data_out}{data_prefix}-topic_word_table.csv")

In [14]:
# Total topic weight per movie, summed over that movie's documents.
movie_THETA.groupby(level="movie_id").sum()

topic_id,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
m1,3.473895,6.098428,4.501763,4.223159,3.153466,2.506629,4.667605,3.903915,3.299591,4.230455,...,3.489962,3.980992,5.62678,5.666833,5.509963,4.514962,6.162247,7.234203,6.094633,4.977941
m10,6.257079,9.469607,7.694788,4.957601,6.915142,6.233301,4.649762,6.11861,10.580896,4.559107,...,4.372211,7.301868,5.678044,4.238573,10.092234,9.623051,3.793043,7.500814,2.832521,6.747606
m11,2.943892,3.227141,3.27766,5.400165,5.524163,4.239893,5.108667,4.221138,4.713862,6.2357,...,3.454164,9.835696,5.527763,5.213196,5.338216,4.934516,6.217775,6.028157,2.924557,2.565321
m12,4.322333,8.130005,2.25784,2.016228,1.033769,1.526512,3.211753,2.178826,3.808031,3.367539,...,2.651223,4.887893,3.120714,3.271865,3.533224,1.811661,4.477396,1.388962,2.53343,2.597008
m13,8.010382,6.687542,3.947439,5.434693,9.567407,3.808192,8.283617,6.66394,6.193444,3.357746,...,8.459338,4.858901,6.99446,5.317359,4.268528,7.297903,5.558525,3.513714,4.790582,4.828544
m14,2.037845,1.740815,1.908848,2.558253,2.589125,5.225584,2.706136,2.456053,0.673608,0.912433,...,1.635832,2.436313,1.230182,8.90447,0.806473,2.067305,2.225493,6.609361,2.787256,2.47314
m15,13.884431,7.330794,7.019665,20.611962,10.547262,20.075651,8.800943,13.918856,5.98127,7.767341,...,14.270143,9.488102,13.223267,21.927751,7.953314,10.393873,8.623608,12.802951,15.153167,13.457693
m16,2.198556,2.977401,9.574363,3.356443,1.840791,5.368993,3.743993,3.602126,3.273953,4.296856,...,2.215052,4.678153,2.357974,3.16066,3.430159,3.465376,12.449257,1.895385,4.536007,2.496622
m17,4.286408,1.485215,4.574014,5.255911,1.900699,4.795028,0.880678,2.133836,0.839097,4.325142,...,2.705902,6.765045,2.396296,0.923678,3.040582,2.38722,2.563324,3.99909,3.771689,1.844138
m18,0.907374,2.72249,3.923985,2.079476,3.738274,2.185899,3.356797,3.526349,1.75837,3.751189,...,3.024564,2.651838,2.300341,0.711053,2.914975,2.094084,0.686766,1.861032,1.041641,1.619023


In [15]:
def get_topic_dists(THETA, TOPICS, LIB,lib_col,n_topics,BAG,join_col="movie_id",studio=None):
    """Average document-topic weights for each value of a LIB column.

    Parameters
    ----------
    THETA : pandas.DataFrame
        Document-by-topic weights with columns ``0..n_topics-1``; its index
        levels must include ``join_col`` and the names listed in ``BAG``.
    TOPICS : pandas.DataFrame
        Topic table providing a ``label`` column indexed by topic id.
    LIB : pandas.DataFrame
        Metadata table whose index matches ``join_col``.
    lib_col : str
        LIB column to group the documents by.
    n_topics : int
        Number of topic columns to read from THETA.
    BAG : list of str
        Index level names of THETA.
    join_col : str, default "movie_id"
        Key used to join THETA onto LIB.
    studio : optional
        When given, LIB is first restricted to rows with this ``studio``
        value; documents from other movies then carry no ``lib_col`` value.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``topic_id``, one column per ``lib_col`` value holding
        the mean topic weight, plus a ``label`` column copied from TOPICS.
    """
    if studio is not None:
        LIB = LIB.query("studio == @studio")

    topic_cols = list(range(n_topics))

    # Attach the metadata to every document row, then re-key the rows by
    # the grouping column followed by the bag columns.
    docs_with_meta = THETA.join(LIB, on=join_col).reset_index()
    docs_with_meta = docs_with_meta.set_index([lib_col] + BAG)

    # Mean weight of each topic within each group; transpose so topics
    # become rows.
    group_means = docs_with_meta.groupby(lib_col)[topic_cols].mean()
    TOPIC_TABLE = group_means.T
    TOPIC_TABLE.index.name = 'topic_id'
    TOPIC_TABLE['label'] = TOPICS['label']
    return TOPIC_TABLE
    

## n_topics 40

In [16]:
# Mean topic weight per studio. The topic count is taken from the THETA
# table itself so it cannot drift from the fitted model.
STUDIO_TABLE = get_topic_dists(movie_THETA, movie_TOPICS, LIB, "studio", movie_THETA.shape[1], ACTION)

### Top Lucasfilm topics

In [17]:
# Rank topics by their mean weight in Lucasfilm documents.
STUDIO_TABLE.sort_values(by="Lucasfilm Ltd.", ascending=False).style.background_gradient()

studio,Lucasfilm Ltd.,Marvel Studios,Pixar,Walt Disney Animation Studios,label
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
26,0.054242,0.030177,0.017305,0.016819,"26 guy, ship, fleet, shield, right, space, yeah, gate, everybody, plans"
14,0.037424,0.035538,0.042236,0.021845,"14 sir, yes, yes sir, body, nemo, im, trade, youre, men, gonna"
22,0.034989,0.025152,0.030525,0.029017,"22 youre, minute, time, morning, droids, fine, oh, way, right, position"
27,0.032916,0.024152,0.024003,0.019539,"27 time, lady, memory, way, brother, isnt, dont, youre, car, core"
32,0.032659,0.025353,0.023868,0.024696,"32 power, wait, uh, people, wait wait, come, weapons, jump, cube, things"
11,0.032451,0.028648,0.031558,0.023984,"11 way, home, point, view, daughter, time, thats, turn, sorry, right"
31,0.03027,0.028683,0.016174,0.026901,"31 master, time, vers, night, youre, case, hero, shes, thing, guy"
29,0.028768,0.030765,0.025082,0.028018,"29 help, dont, control, work, memories, engine, core, youve, theyre, things"
18,0.028018,0.031012,0.022548,0.043879,"18 girl, time, father, head, plan, guys, sorry, youre, beast, village"
34,0.027949,0.030194,0.020323,0.023739,"34 people, guest, time, right, soul, family, guest guest, world, war, power"


### Top Marvel Studios topics

In [18]:
# Rank topics by their mean weight in Marvel Studios documents.
STUDIO_TABLE.sort_values(by="Marvel Studios", ascending=False).style.background_gradient()

studio,Lucasfilm Ltd.,Marvel Studios,Pixar,Walt Disney Animation Studios,label
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8,0.020669,0.036182,0.017306,0.018231,"8 moment, hmm, thor, kind, target, dont, hes, way, eyes, boss"
19,0.02065,0.035546,0.021615,0.018796,"19 yeah, hell, dude, time, man, years, hours, sorry, yeah yeah, way"
14,0.037424,0.035538,0.042236,0.021845,"14 sir, yes, yes sir, body, nemo, im, trade, youre, men, gonna"
25,0.019884,0.034772,0.020092,0.017835,"25 gonna, kids, talk, powers, mission, fear, time, fight, area, people"
21,0.021122,0.034035,0.048336,0.035318,"21 okay, hey, hey hey, yeah, thing, spark, train, okay okay, mask, left"
1,0.024417,0.032765,0.022527,0.028476,"1 sorry, course, time, day, shh, stark, weapon, drop, captain, yeah"
18,0.028018,0.031012,0.022548,0.043879,"18 girl, time, father, head, plan, guys, sorry, youre, beast, village"
29,0.028768,0.030765,0.025082,0.028018,"29 help, dont, control, work, memories, engine, core, youve, theyre, things"
34,0.027949,0.030194,0.020323,0.023739,"34 people, guest, time, right, soul, family, guest guest, world, war, power"
26,0.054242,0.030177,0.017305,0.016819,"26 guy, ship, fleet, shield, right, space, yeah, gate, everybody, plans"


### Top Walt Disney Animation Studios topics

In [19]:
# Rank topics by their mean weight in Walt Disney Animation Studios documents.
STUDIO_TABLE.sort_values(by="Walt Disney Animation Studios", ascending=False).style.background_gradient()

studio,Lucasfilm Ltd.,Marvel Studios,Pixar,Walt Disney Animation Studios,label
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
18,0.028018,0.031012,0.022548,0.043879,"18 girl, time, father, head, plan, guys, sorry, youre, beast, village"
13,0.020844,0.019057,0.016701,0.043597,"13 hook, way, bit, maui, love, ooh, time, thing, deal, beast"
15,0.023415,0.025509,0.027675,0.0369,"15 yeah, time, dream, school, today, gonna, day, life, attack, hands"
2,0.019009,0.02058,0.015458,0.036001,"2 heart, gonna, future, people, congratulations, right, island, honor, way, time"
21,0.021122,0.034035,0.048336,0.035318,"21 okay, hey, hey hey, yeah, thing, spark, train, okay okay, mask, left"
23,0.022185,0.01799,0.016742,0.031624,"23 elsa, sister, love, way, uh, doesnt, years, kingdom, world, books"
28,0.024787,0.019856,0.021256,0.030322,"28 ah, son, blood, man, right, bye, hold, drive, death, day"
24,0.026194,0.02315,0.022713,0.029493,"24 right, looks, strength, hand, thing, cold, birthday, head, storm, way"
22,0.034989,0.025152,0.030525,0.029017,"22 youre, minute, time, morning, droids, fine, oh, way, right, position"
36,0.025068,0.018515,0.027467,0.028856,"36 hello, boat, yeah, place, sea, anybody, road, heart, bubbles, board"


### Top Pixar topics

In [20]:
# Rank topics by their mean weight in Pixar documents.
STUDIO_TABLE.sort_values(by="Pixar", ascending=False).style.background_gradient()

studio,Lucasfilm Ltd.,Marvel Studios,Pixar,Walt Disney Animation Studios,label
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
21,0.021122,0.034035,0.048336,0.035318,"21 okay, hey, hey hey, yeah, thing, spark, train, okay okay, mask, left"
14,0.037424,0.035538,0.042236,0.021845,"14 sir, yes, yes sir, body, nemo, im, trade, youre, men, gonna"
11,0.032451,0.028648,0.031558,0.023984,"11 way, home, point, view, daughter, time, thats, turn, sorry, right"
22,0.034989,0.025152,0.030525,0.029017,"22 youre, minute, time, morning, droids, fine, oh, way, right, position"
33,0.020748,0.023117,0.030481,0.020026,"33 door, door door, youre, monsters, hey, chance, guys, tower, ships, field"
7,0.021557,0.022586,0.030313,0.022933,"7 whoa, energy, whoa whoa, luck, wow, life, attention, rule, place, people"
38,0.018635,0.012383,0.029772,0.018648,"38 race, ocean, pilot, thing, world, car, yes, race car, hey, sulley"
4,0.018815,0.02378,0.029615,0.013011,"4 friend, boy, time, speed, joy, ones, life, years, race, rocket"
5,0.014654,0.026355,0.029374,0.024751,"5 kid, yeah, look, time, scare, right, youre, huh, good, car"
0,0.018176,0.017507,0.029099,0.019734,"0 time, hat, dead, butt, idea, look, customers, watch, hat hat, news"


In [21]:
# Mean topic weight per director. The topic count is taken from the THETA
# table itself so it cannot drift from the fitted model.
DIRECTOR_TABLE = get_topic_dists(movie_THETA, movie_TOPICS, LIB, "director", movie_THETA.shape[1], ACTION)

In [22]:
# Rank topics by their mean weight in documents directed by the Russo brothers.
DIRECTOR_TABLE.sort_values(by="Anthony Russo Joe Russo", ascending=False).style.background_gradient()

director,Andrew Stanton,Anthony Russo Joe Russo,Byron Howard Nathan Greno,Byron Howard Rich Moore,Chris Buck Jennifer Lee,Dan Scanlon,Gareth Edwards,Gary Trousdale Kirk Wise,George Lucas,J.J. Abrams,Joe Johnston,John Lasseter,Joss Whedon,Pete Docter,Ron Clements John Musker,Ryan Fleck Anna Boden,Tony Bancroft Barry Cook,label
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
21,0.054798,0.039528,0.01768,0.070502,0.047392,0.036333,0.031946,0.014381,0.017094,0.016896,0.014864,0.013054,0.022031,0.051953,0.026303,0.041071,0.016654,"21 okay, hey, hey hey, yeah, thing, spark, train, okay okay, mask, left"
19,0.028277,0.037274,0.005915,0.033389,0.014807,0.018957,0.011995,0.01196,0.018433,0.031186,0.048524,0.0237,0.035277,0.02006,0.02118,0.023024,0.013918,"19 yeah, hell, dude, time, man, years, hours, sorry, yeah yeah, way"
8,0.024926,0.035,0.008546,0.005345,0.025352,0.006669,0.014998,0.0272,0.020465,0.025926,0.030227,0.007712,0.049604,0.017609,0.020086,0.022235,0.018907,"8 moment, hmm, thor, kind, target, dont, hes, way, eyes, boss"
26,0.016731,0.034537,0.010453,0.023766,0.008063,0.014112,0.082583,0.011933,0.032524,0.057885,0.021488,0.013046,0.025075,0.018228,0.027845,0.025568,0.009636,"26 guy, ship, fleet, shield, right, space, yeah, gate, everybody, plans"
1,0.028243,0.034516,0.027015,0.00946,0.032324,0.017236,0.02772,0.058071,0.021338,0.025561,0.06049,0.032505,0.031138,0.020551,0.018266,0.015222,0.029274,"1 sorry, course, time, day, shh, stark, weapon, drop, captain, yeah"
25,0.012501,0.034477,0.008559,0.026148,0.019605,0.019257,0.024117,0.012813,0.021541,0.013979,0.036235,0.011327,0.039734,0.022957,0.017933,0.0273,0.014327,"25 gonna, kids, talk, powers, mission, fear, time, fight, area, people"
35,0.01838,0.034043,0.085836,0.015205,0.018532,0.020468,0.020523,0.01294,0.020912,0.029783,0.019804,0.021523,0.018731,0.027072,0.02126,0.023276,0.022517,"35 hair, stone, life, time, summer, yeah, day, way, face, gotta"
29,0.025993,0.031349,0.004173,0.02271,0.010846,0.023858,0.023905,0.044409,0.032292,0.028411,0.019183,0.011174,0.02336,0.026559,0.021276,0.045874,0.063474,"29 help, dont, control, work, memories, engine, core, youve, theyre, things"
34,0.014833,0.03091,0.044007,0.019367,0.016675,0.007985,0.025045,0.025237,0.027186,0.031509,0.016924,0.009307,0.0356,0.024044,0.021044,0.02518,0.031344,"34 people, guest, time, right, soul, family, guest guest, world, war, power"
18,0.019579,0.030871,0.020367,0.016408,0.038224,0.039806,0.029457,0.070855,0.032647,0.020662,0.033755,0.015346,0.032574,0.022419,0.059476,0.027748,0.044865,"18 girl, time, father, head, plan, guys, sorry, youre, beast, village"


In [23]:
# Marvel-only view: mean topic weight per release year, ranked by 2019.
# The topic count is taken from the THETA table itself so it cannot drift
# from the fitted model.
MARVEL_TABLE = get_topic_dists(movie_THETA, movie_TOPICS, LIB, "year", movie_THETA.shape[1], ACTION, studio="Marvel Studios")
MARVEL_TABLE.sort_values(2019, ascending=False).style.background_gradient()

year,2011.0,2012.0,2014.0,2015.0,2016.0,2018.0,2019.0,label
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
21,0.014864,0.024073,0.051173,0.018461,0.030537,0.029162,0.04543,"21 okay, hey, hey hey, yeah, thing, spark, train, okay okay, mask, left"
29,0.019183,0.031141,0.019809,0.009759,0.04243,0.029399,0.037304,"29 help, dont, control, work, memories, engine, core, youve, theyre, things"
11,0.012079,0.019881,0.005804,0.02809,0.022283,0.042572,0.036853,"11 way, home, point, view, daughter, time, thats, turn, sorry, right"
31,0.033215,0.025604,0.009604,0.038174,0.025161,0.025463,0.034003,"31 master, time, vers, night, youre, case, hero, shes, thing, guy"
18,0.033755,0.046465,0.034089,0.008292,0.02315,0.026118,0.033701,"18 girl, time, father, head, plan, guys, sorry, youre, beast, village"
22,0.028587,0.030029,0.017708,0.006218,0.025332,0.0156,0.033221,"22 youre, minute, time, morning, droids, fine, oh, way, right, position"
26,0.021488,0.025081,0.040963,0.025064,0.027947,0.032752,0.032355,"26 guy, ship, fleet, shield, right, space, yeah, gate, everybody, plans"
15,0.023089,0.028956,0.015838,0.035252,0.015095,0.016968,0.032059,"15 yeah, time, dream, school, today, gonna, day, life, attack, hands"
34,0.016924,0.044128,0.028598,0.020692,0.026573,0.030738,0.030616,"34 people, guest, time, right, soul, family, guest guest, world, war, power"
16,0.004726,0.027395,0.025552,0.026546,0.01891,0.031182,0.030355,"16 ha, ha ha, lets, lot, stones, hammer, time, pressure, whos, gonna"


### Using Dialogue As The Bag

In [24]:
# Refit with one document per (movie_id, Action_number, Dialogue_num) group.
# n_topics is passed explicitly so the value from the LDA params cell is
# used rather than the function's own default.
dialogue_TOPICS, dialogue_PHI, dialogue_THETA = generate_lda_tables(TOKENS, DIALOGUE, LIB, n_topics=n_topics)

In [25]:
# Show each dialogue-level topic's label and its summed document weight.
dialogue_TOPICS.loc[:, ['label', 'doc_weight_sum']]

term_str,label,doc_weight_sum
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"0 wait, thanks, rest, body, space, wait wait, ...",392.191145
1,"1 power, huh, help, son, mother, monster, mons...",423.16108
2,"2 whoa, hello, problem, stone, hi, dream, memo...",380.125393
3,"3 order, signal, strength, base, congratulatio...",316.967299
4,"4 uh, ha, ow, youve, ha ha, powers, bye, great...",386.768312
5,"5 hours, fun, im, person, shot, army, sea, fal...",319.916456
6,"6 world, theyre, kids, road, hero, cause, ooh,...",371.623172
7,"7 right, look, dad, eyes, hook, know, path, fa...",419.247618
8,"8 ah, school, daughter, peace, year, story, fa...",340.354949
9,"9 sorry, nemo, mission, weapon, tell, hammer, ...",379.635606


In [26]:
# Mean dialogue-level topic weight per studio (the table is grouped by the
# "studio" column despite the variable's name). The topic count is taken
# from the THETA table itself so it cannot drift from the fitted model.
dialogue_GENRES = get_topic_dists(dialogue_THETA, dialogue_TOPICS, LIB, "studio", dialogue_THETA.shape[1], DIALOGUE)

In [27]:
# Rank dialogue-level topics by their mean weight in Lucasfilm documents.
dialogue_GENRES.sort_values(by="Lucasfilm Ltd.", ascending=False).style.background_gradient()

studio,Lucasfilm Ltd.,Marvel Studios,Pixar,Walt Disney Animation Studios,label
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
28,0.034926,0.028215,0.026443,0.027763,"28 youre, yes, sir, dude, yes sir, dear, weve, train, wife, shh"
27,0.032726,0.026028,0.018893,0.024226,"27 friends, shes, captain, stay, left, droids, security, carrots, nice, enemy"
1,0.031699,0.025571,0.025905,0.031762,"1 power, huh, help, son, mother, monster, monsters, sister, drive, magic"
32,0.031665,0.034096,0.030815,0.030261,"32 time, okay, beast, pilot, scare, floor, play, change, avengers, dreams"
20,0.030259,0.025502,0.025425,0.021209,"20 guys, planet, message, turn, energy, plans, drop, yep, brothers, ships"
37,0.030126,0.027876,0.022564,0.025745,"37 thing, war, minute, choice, boss, bear, second, nope, papa, target"
31,0.029882,0.027538,0.029135,0.025831,"31 way, idea, plan, car, race, summer, race car, form, maam, way way"
33,0.028985,0.024225,0.024719,0.022172,"33 dont, master, men, matter, eye, force, business, ready, number, souls"
7,0.028804,0.025511,0.02924,0.027712,"7 right, look, dad, eyes, hook, know, path, fathers, difference, armor"
12,0.02797,0.019111,0.021822,0.021093,"12 care, baby, um, control, looks, house, fleet, match, ugh, yeah yeah"


In [28]:
# Mean dialogue-level topic weight per value of LIB's "name" column. The
# topic count is taken from the THETA table itself so it cannot drift from
# the fitted model.
dialogue_names = get_topic_dists(dialogue_THETA, dialogue_TOPICS, LIB, "name", dialogue_THETA.shape[1], DIALOGUE)

In [7]:
#dialogue_names