In [1]:
import pandas as pd
from mlxtend.preprocessing import OnehotTransactions
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the transcript CSV, keep the rows whose `deleted` flag is False, then
# drop the columns the rest of the notebook never touches.
df = (
    pd.read_csv('../Data/the-office-lines - scripts.csv')
    .query('deleted == False')
    .drop(['id', 'line_text', 'deleted'], axis=1)
)
df.head(5)

Unnamed: 0,season,episode,scene,speaker
0,1,1,1,Michael
1,1,1,1,Jim
2,1,1,1,Michael
3,1,1,1,Jim
4,1,1,1,Michael


In [3]:
def getSeasonConvos(season, min_speakers=1):
    """Collect the distinct speakers of every scene in one season.

    Reads the module-level ``df``, using its ``season``, ``episode``,
    ``scene`` and ``speaker`` columns.

    Parameters
    ----------
    season : str or int
        Season number. It is converted to text and spliced into a
        ``DataFrame.query`` expression, so ``'1'`` and ``1`` select the
        same rows.
    min_speakers : int, optional
        Minimum number of distinct speakers a scene must have to be kept.
        The default of 1 keeps every scene.

    Returns
    -------
    list
        One array of unique speaker names per kept scene. Episodes are
        visited in ascending order; within an episode, scenes follow their
        order of first appearance, and speakers follow their order of first
        appearance within the scene.
    """
    season_lines = df.query('season == ' + str(season))

    convos = []
    # Iterate over the episode numbers that actually occur instead of assuming
    # they run 1..N; a gap in the numbering would otherwise raise a KeyError.
    for _, episode_lines in season_lines.groupby('episode'):
        # sort=False keeps scenes in the order they first appear in the data.
        for _, scene_lines in episode_lines.groupby('scene', sort=False):
            speakers = scene_lines['speaker'].unique()
            if len(speakers) >= min_speakers:
                convos.append(speakers)
    return convos


def getSeasonConvosWOInterviews(season):
    """Same as ``getSeasonConvos`` but without single-speaker scenes.

    Scenes with only one distinct speaker (what the function name calls
    interviews) are dropped, so every returned array has at least two names.

    Parameters
    ----------
    season : str or int
        Season number, passed straight through to ``getSeasonConvos``.

    Returns
    -------
    list
        One array of unique speaker names per scene with two or more speakers.
    """
    return getSeasonConvos(season, min_speakers=2)

In [4]:
def _mineConvoRules(convos, support, threshold):
    """One-hot encode speaker lists, then derive frequent itemsets and rules.

    Shared pipeline behind both public entry points below, so the encoding
    and mining steps exist in one place only.
    """
    oht = OnehotTransactions()
    oht_ary = oht.fit(convos).transform(convos)
    s_coded = pd.DataFrame(oht_ary, columns=oht.columns_)
    # frequent itemsets at the requested minimum support
    frequent_itemsets = apriori(s_coded, min_support=support, use_colnames=True)
    # association rules filtered on the confidence metric
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=threshold)
    return frequent_itemsets, rules


def getFrequentItemSetsAndRules(season, support, threshold):
    """Mine frequent speaker sets and association rules for one season.

    Uses every scene returned by ``getSeasonConvos(season)``.

    Parameters
    ----------
    season
        Season identifier, forwarded to ``getSeasonConvos``.
    support
        Passed to ``apriori`` as ``min_support``.
    threshold
        Passed to ``association_rules`` as ``min_threshold`` for the
        ``"confidence"`` metric.

    Returns
    -------
    tuple
        ``(frequent_itemsets, rules)`` as returned by ``apriori`` and
        ``association_rules`` respectively.
    """
    return _mineConvoRules(getSeasonConvos(season), support, threshold)


def getFrequentItemSetsAndRulesWOInterviews(season, support, threshold):
    """Mine frequent speaker sets and rules, ignoring single-speaker scenes.

    Identical to ``getFrequentItemSetsAndRules`` except that the scenes come
    from ``getSeasonConvosWOInterviews(season)``.

    Parameters
    ----------
    season
        Season identifier, forwarded to ``getSeasonConvosWOInterviews``.
    support
        Passed to ``apriori`` as ``min_support``.
    threshold
        Passed to ``association_rules`` as ``min_threshold`` for the
        ``"confidence"`` metric.

    Returns
    -------
    tuple
        ``(frequent_itemsets, rules)``.
    """
    return _mineConvoRules(getSeasonConvosWOInterviews(season), support, threshold)

In [33]:
# Season 1: itemsets with min_support 0.05, rules with confidence threshold 0.5.
# NOTE(review): this is the only season mined with the variant that keeps
# single-speaker scenes; seasons 2-9 use the WOInterviews variant, so the
# numbers here are not directly comparable. Confirm this is intentional.
s1 = getFrequentItemSetsAndRules(season='1', support=0.05, threshold=0.5)
s1[1]  # second element of the returned pair is the rules table

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Oscar),(Michael),0.071429,0.567227,0.054622,0.764706,1.348148,0.014106,1.839286
1,(Ryan),(Michael),0.07563,0.567227,0.063025,0.833333,1.469136,0.020126,2.596639


In [34]:
# Season 2, scenes with at least two distinct speakers:
# itemsets with min_support 0.05, rules with confidence threshold 0.5.
s2 = getFrequentItemSetsAndRulesWOInterviews(season='2', support=0.05, threshold=0.5)
s2[1]  # second element of the returned pair is the rules table

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Angela),(Dwight),0.097027,0.369327,0.050078,0.516129,1.397485,0.014244,1.303391
1,(Dwight),(Michael),0.369327,0.586854,0.233177,0.631356,1.075831,0.016436,1.120717
2,(Jan),(Michael),0.082942,0.586854,0.065728,0.792453,1.35034,0.017053,1.99061
3,(Jim),(Pam),0.338028,0.322379,0.175274,0.518519,1.608414,0.066301,1.407367
4,(Pam),(Jim),0.322379,0.338028,0.175274,0.543689,1.608414,0.066301,1.450704
5,(Kevin),(Michael),0.101721,0.586854,0.054773,0.538462,0.917538,-0.004923,0.895149
6,(Ryan),(Michael),0.106416,0.586854,0.059468,0.558824,0.952235,-0.002983,0.936463


In [35]:
# Season 3, scenes with at least two distinct speakers:
# itemsets with min_support 0.05, rules with confidence threshold 0.5.
s3 = getFrequentItemSetsAndRulesWOInterviews(season='3', support=0.05, threshold=0.5)
s3[1]  # second element of the returned pair is the rules table

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Dwight),(Michael),0.341094,0.501746,0.185099,0.542662,1.081547,0.013956,1.089465
1,(Jan),(Michael),0.074505,0.501746,0.058207,0.78125,1.557062,0.020824,2.277732
2,(Karen),(Jim),0.135041,0.263097,0.080326,0.594828,2.260871,0.044797,1.81874


In [20]:
# Shows the season 2 rules table again (second element of `s2`).
# NOTE(review): this cell's execution count (In [20]) is older than its
# neighbours and its output repeats the season 2 cell; it looks like a
# leftover from out-of-order execution. Confirm before removing.
s2[1]

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Angela),(Dwight),0.097027,0.369327,0.050078,0.516129,1.397485,0.014244,1.303391
1,(Dwight),(Michael),0.369327,0.586854,0.233177,0.631356,1.075831,0.016436,1.120717
2,(Jan),(Michael),0.082942,0.586854,0.065728,0.792453,1.35034,0.017053,1.99061
3,(Jim),(Pam),0.338028,0.322379,0.175274,0.518519,1.608414,0.066301,1.407367
4,(Pam),(Jim),0.322379,0.338028,0.175274,0.543689,1.608414,0.066301,1.450704
5,(Kevin),(Michael),0.101721,0.586854,0.054773,0.538462,0.917538,-0.004923,0.895149
6,(Ryan),(Michael),0.106416,0.586854,0.059468,0.558824,0.952235,-0.002983,0.936463


In [37]:
# Season 4, scenes with at least two distinct speakers:
# itemsets with min_support 0.05, rules with confidence threshold 0.5.
s4 = getFrequentItemSetsAndRulesWOInterviews(season='4', support=0.05, threshold=0.5)
s4[1]  # second element of the returned pair is the rules table

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Dwight),(Michael),0.277064,0.572477,0.165138,0.596026,1.041136,0.006525,1.058294
1,(Jan),(Michael),0.082569,0.572477,0.069725,0.844444,1.475071,0.022456,2.748362
2,(Jim),(Pam),0.344954,0.326606,0.187156,0.542553,1.661188,0.074492,1.472072
3,(Pam),(Jim),0.326606,0.344954,0.187156,0.573034,1.661188,0.074492,1.534186
4,(Kevin),(Michael),0.110092,0.572477,0.056881,0.516667,0.902511,-0.006144,0.88453
5,(Oscar),(Michael),0.110092,0.572477,0.06055,0.55,0.960737,-0.002475,0.950051
6,(Ryan),(Michael),0.095413,0.572477,0.066055,0.692308,1.20932,0.011433,1.38945
7,"(Jim, Dwight)",(Michael),0.100917,0.572477,0.051376,0.509091,0.889277,-0.006397,0.87088


In [38]:
# Season 5, scenes with at least two distinct speakers:
# itemsets with min_support 0.05, rules with confidence threshold 0.5.
s5 = getFrequentItemSetsAndRulesWOInterviews(season='5', support=0.05, threshold=0.5)
s5[1]  # second element of the returned pair is the rules table

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Phyllis),(Dwight),0.121495,0.333778,0.065421,0.538462,1.613231,0.024868,1.44348
1,(Holly),(Michael),0.090788,0.528705,0.077437,0.852941,1.613265,0.029437,3.204806
2,(Pam),(Jim),0.280374,0.311081,0.140187,0.5,1.607296,0.052968,1.377837
3,(Oscar),(Michael),0.11749,0.528705,0.061415,0.522727,0.988694,-0.000702,0.987475
4,(Pam),(Michael),0.280374,0.528705,0.142857,0.509524,0.963721,-0.005378,0.960893
5,(Ryan),(Michael),0.078772,0.528705,0.06008,0.762712,1.442604,0.018433,1.986172


In [39]:
# Season 6, scenes with at least two distinct speakers:
# itemsets with min_support 0.05, rules with confidence threshold 0.5.
s6 = getFrequentItemSetsAndRulesWOInterviews(season='6', support=0.05, threshold=0.5)
s6[1]  # second element of the returned pair is the rules table

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Angela),(Dwight),0.091549,0.314085,0.057746,0.630769,2.008279,0.028992,1.857688
1,(Phyllis),(Dwight),0.112676,0.314085,0.056338,0.5,1.591928,0.020948,1.371831
2,(Erin),(Michael),0.16338,0.533803,0.085915,0.525862,0.985124,-0.001297,0.983252
3,(Jim),(Michael),0.340845,0.533803,0.187324,0.549587,1.029569,0.00538,1.035043
4,(Jim),(Pam),0.340845,0.288732,0.176056,0.516529,1.788954,0.077643,1.471169
5,(Pam),(Jim),0.288732,0.340845,0.176056,0.609756,1.788954,0.077643,1.689085
6,(Kevin),(Michael),0.133803,0.533803,0.070423,0.526316,0.985974,-0.001002,0.984194
7,(Oscar),(Michael),0.115493,0.533803,0.060563,0.52439,0.982367,-0.001087,0.980209
8,(Phyllis),(Michael),0.112676,0.533803,0.070423,0.625,1.170844,0.010276,1.243192
9,"(Michael, Andy)",(Dwight),0.095775,0.314085,0.052113,0.544118,1.732393,0.022031,1.504589


In [40]:
# Season 7, scenes with at least two distinct speakers:
# itemsets with min_support 0.05, rules with confidence threshold 0.5.
s7 = getFrequentItemSetsAndRulesWOInterviews(season='7', support=0.05, threshold=0.5)
s7[1]  # second element of the returned pair is the rules table

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Darryl),(Andy),0.120664,0.257919,0.066365,0.55,2.132456,0.035244,1.64907
1,(Holly),(Michael),0.092006,0.410256,0.072398,0.786885,1.918033,0.034652,2.767258
2,(Ryan),(Michael),0.099548,0.410256,0.051282,0.515152,1.255682,0.010442,1.216346


In [41]:
# Season 8, scenes with at least two distinct speakers:
# itemsets with min_support 0.05, rules with confidence threshold 0.5.
s8 = getFrequentItemSetsAndRulesWOInterviews(season='8', support=0.05, threshold=0.5)
s8[1]  # second element of the returned pair is the rules table

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Erin),(Andy),0.225392,0.405136,0.134094,0.594937,1.468488,0.04278,1.468572
1,(Phyllis),(Andy),0.116976,0.405136,0.061341,0.52439,1.294358,0.01395,1.250741
2,(Jim),(Dwight),0.35806,0.36234,0.18117,0.505976,1.396414,0.051431,1.290748
3,(Dwight),(Jim),0.36234,0.35806,0.18117,0.5,1.396414,0.051431,1.28388
4,(Nellie),(Dwight),0.10699,0.36234,0.057061,0.533333,1.471916,0.018295,1.366415
5,(Pam),(Jim),0.226819,0.35806,0.122682,0.540881,1.510587,0.041467,1.398198


In [42]:
# Season 9, scenes with at least two distinct speakers:
# itemsets with min_support 0.05, rules with confidence threshold 0.5.
s9 = getFrequentItemSetsAndRulesWOInterviews(season='9', support=0.05, threshold=0.5)
s9[1]  # second element of the returned pair is the rules table

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Oscar),(Angela),0.157258,0.168011,0.080645,0.512821,3.052308,0.054224,1.707767
1,(Clark),(Dwight),0.114247,0.358871,0.063172,0.552941,1.54078,0.022172,1.434104
2,(Pete),(Erin),0.096774,0.18414,0.061828,0.638889,3.469586,0.044008,2.259305


In [53]:
# Whole-show input: concatenate the per-scene speaker lists of seasons 1-9
# (all scenes, single-speaker ones included) and one-hot encode them.
oht = OnehotTransactions()
convos = [
    scene_speakers
    for season_no in range(1, 10)
    for scene_speakers in getSeasonConvos(str(season_no))
]
oht_ary = oht.fit(convos).transform(convos)
s_coded = pd.DataFrame(oht_ary, columns=oht.columns_)

In [57]:
# Frequent itemsets over the combined encoding in `s_coded`, min_support 0.05
frequent_itemsets = apriori(s_coded, use_colnames=True, min_support=0.05)
# Association rules from those itemsets, confidence threshold 0.1
rules = association_rules(frequent_itemsets, min_threshold=0.1, metric="confidence")

In [58]:
# Display the association rules built from `frequent_itemsets` in the previous cell.
rules

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Jim),(Dwight),0.242965,0.267375,0.076845,0.316279,1.182905,0.011882,1.071527
1,(Dwight),(Jim),0.267375,0.242965,0.076845,0.287405,1.182905,0.011882,1.062363
2,(Michael),(Dwight),0.342299,0.267375,0.081026,0.236712,0.885318,-0.010496,0.959828
3,(Dwight),(Michael),0.267375,0.342299,0.081026,0.303043,0.885318,-0.010496,0.943676
4,(Michael),(Jim),0.342299,0.242965,0.056617,0.165401,0.68076,-0.02655,0.907064
5,(Jim),(Michael),0.242965,0.342299,0.056617,0.233023,0.68076,-0.02655,0.857525
6,(Jim),(Pam),0.242965,0.217087,0.090406,0.372093,1.71403,0.037661,1.246862
7,(Pam),(Jim),0.217087,0.242965,0.090406,0.41645,1.71403,0.037661,1.297291
8,(Michael),(Pam),0.342299,0.217087,0.060233,0.175966,0.810578,-0.014076,0.950098
9,(Pam),(Michael),0.217087,0.342299,0.060233,0.27746,0.810578,-0.014076,0.910263
