In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("../Data/data.csv")
data.head()

Unnamed: 0,line_no,character,line,gender,n_words,season,episode,season_episode,episode_no
0,1,ROYCE,What do you expect? They're savages. One lot s...,male,24,1,1,s01e01,1
1,2,WILL,I've never seen wildlings do a thing like this...,male,21,1,1,s01e01,1
2,3,ROYCE,How close did you get?,male,5,1,1,s01e01,1
3,4,WILL,Close as any man would.,male,5,1,1,s01e01,1
4,5,GARED,We should head back to the wall.,male,7,1,1,s01e01,1


# Lexical Diversity by Gender

In [3]:
gender_lines = data.groupby(['gender'])['line'].apply(lambda x: ' '.join(x)).reset_index()
gender_lines.head()

Unnamed: 0,gender,line
0,female,"Fine work, as always. Well done. Thank you. ..."
1,male,What do you expect? They're savages. One lot s...


In [4]:
gender_lines['line'] = gender_lines['line'].str.lower()
gender_lines.head()

Unnamed: 0,gender,line
0,female,"fine work, as always. well done. thank you. ..."
1,male,what do you expect? they're savages. one lot s...


In [5]:
import nltk
from nltk.tokenize import word_tokenize

In [6]:
female = gender_lines[(gender_lines.gender == 'female')]
male = gender_lines[(gender_lines.gender == 'male')]

In [7]:
female['tokenized_sents'] = female.apply(lambda row: nltk.word_tokenize(row['line']), axis=1)
male['tokenized_sents'] = male.apply(lambda row: nltk.word_tokenize(row['line']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [8]:
f_words = female['tokenized_sents'].tolist()
f_words = [item for sublist in f_words for item in sublist]
f_words[0:10]

['fine', 'work', ',', 'as', 'always', '.', 'well', 'done', '.', 'thank']

In [9]:
n_words = len(f_words)
n_distinctive_words = len(set(f_words))
lexical_diversity = 1.0 * n_distinctive_words / n_words
print("Number of words: ", n_words)
print("Number of distinctive words: ", n_distinctive_words)
print("Lexical diversity: ", round(lexical_diversity,2))

Number of words:  102887
Number of distinctive words:  5215
Lexical diversity:  0.05


In [10]:
m_words = male['tokenized_sents'].tolist()
m_words = [item for items in m_words for item in items]
m_words[0:10]

['what', 'do', 'you', 'expect', '?', 'they', "'re", 'savages', '.', 'one']

In [11]:
n_words = len(m_words)
n_distinctive_words = len(set(m_words))
lexical_diversity = 1.0 * n_distinctive_words / n_words
print("Number of words: ", n_words)
print("Number of distinctive words: ", n_distinctive_words)
print("Lexical diversity: ", round(lexical_diversity,2))

Number of words:  250631
Number of distinctive words:  8640
Lexical diversity:  0.03


# Lexical Diversity by Characters

In [12]:
character_lines = data.groupby(['character','gender'])['line'].apply(lambda x: ' '.join(x)).reset_index()
character_lines.head()

Unnamed: 0,character,gender,line
0,ADDAM MARBRAND,male,ls it true about Stannis and Renly? First we...
1,AEMON,male,"How many winters have you seen, Lord Tyrion? ..."
2,AERON,male,"Theon of the House Greyjoy, you would this day..."
3,ALL,male,What is dead may never die. Aye! Aye! Aye!...
4,ALL THREE,male,. What's dead may never die. The only night sh...


In [13]:
character_lines['line'] = character_lines['line'].str.lower()
character_lines.head()

Unnamed: 0,character,gender,line
0,ADDAM MARBRAND,male,ls it true about stannis and renly? first we...
1,AEMON,male,"how many winters have you seen, lord tyrion? ..."
2,AERON,male,"theon of the house greyjoy, you would this day..."
3,ALL,male,what is dead may never die. aye! aye! aye!...
4,ALL THREE,male,. what's dead may never die. the only night sh...


In [14]:
character_lines['tokenized_sents'] = character_lines.apply(lambda row: nltk.word_tokenize(row['line']), axis=1)

In [15]:
# top 10 vocal characters
tyrion = character_lines[(character_lines.character == 'TYRION')]
cersei = character_lines[(character_lines.character == 'CERSEI')]
jon = character_lines[(character_lines.character == 'JON')]
jaime = character_lines[(character_lines.character == 'JAIME')]
daenerys = character_lines[(character_lines.character == 'DAENERYS')]
littlefinger = character_lines[(character_lines.character == 'BAELISH')]
sansa = character_lines[(character_lines.character == 'SANSA')]
varys = character_lines[(character_lines.character == 'VARYS')]
davos = character_lines[(character_lines.character == 'DAVOS')]
arya = character_lines[(character_lines.character == 'ARYA')]

## male characters

In [16]:
tyrion_words = tyrion['tokenized_sents'].tolist()
tyrion_words = [item for items in tyrion_words for item in items]

n_words = len(tyrion_words)
n_distinctive_words = len(set(tyrion_words))
lexical_diversity = 1.0 * n_distinctive_words / n_words
print("Number of words: ", n_words)
print("Number of distinctive words: ", n_distinctive_words)
print("Lexical diversity: ", round(lexical_diversity,2))

Number of words:  29768
Number of distinctive words:  3161
Lexical diversity:  0.11


In [17]:
jon_words = jon['tokenized_sents'].tolist()
jon_words = [item for items in jon_words for item in items]

n_words = len(jon_words)
n_distinctive_words = len(set(jon_words))
lexical_diversity = 1.0 * n_distinctive_words / n_words
print("Number of words: ", n_words)
print("Number of distinctive words: ", n_distinctive_words)
print("Lexical diversity: ", round(lexical_diversity,2))

Number of words:  13901
Number of distinctive words:  1551
Lexical diversity:  0.11


In [18]:
jaime_words = jaime['tokenized_sents'].tolist()
jaime_words = [item for items in jaime_words for item in items]

n_words = len(jaime_words)
n_distinctive_words = len(set(jaime_words))
lexical_diversity = 1.0 * n_distinctive_words / n_words
print("Number of words: ", n_words)
print("Number of distinctive words: ", n_distinctive_words)
print("Lexical diversity: ", round(lexical_diversity,2))

Number of words:  13634
Number of distinctive words:  1887
Lexical diversity:  0.14


In [19]:
littlefinger_words = littlefinger['tokenized_sents'].tolist()
littlefinger_words = [item for items in littlefinger_words for item in items]

n_words = len(littlefinger_words)
n_distinctive_words = len(set(littlefinger_words))
lexical_diversity = 1.0 * n_distinctive_words / n_words
print("Number of words: ", n_words)
print("Number of distinctive words: ", n_distinctive_words)
print("Lexical diversity: ", round(lexical_diversity,2))

Number of words:  10287
Number of distinctive words:  1711
Lexical diversity:  0.17


In [20]:
varys_words = varys['tokenized_sents'].tolist()
varys_words = [item for items in varys_words for item in items]

n_words = len(varys_words)
n_distinctive_words = len(set(varys_words))
lexical_diversity = 1.0 * n_distinctive_words / n_words
print("Number of words: ", n_words)
print("Number of distinctive words: ", n_distinctive_words)
print("Lexical diversity: ", round(lexical_diversity,2))

Number of words:  7994
Number of distinctive words:  1590
Lexical diversity:  0.2


In [21]:
davos_words = davos['tokenized_sents'].tolist()
davos_words = [item for items in davos_words for item in items]

n_words = len(davos_words)
n_distinctive_words = len(set(davos_words))
lexical_diversity = 1.0 * n_distinctive_words / n_words
print("Number of words: ", n_words)
print("Number of distinctive words: ", n_distinctive_words)
print("Lexical diversity: ", round(lexical_diversity,2))

Number of words:  7684
Number of distinctive words:  1317
Lexical diversity:  0.17


## female characters

In [22]:
cersei_words = cersei['tokenized_sents'].tolist()
cersei_words = [item for items in cersei_words for item in items]

n_words = len(cersei_words)
n_distinctive_words = len(set(cersei_words))
lexical_diversity = 1.0 * n_distinctive_words / n_words
print("Number of words: ", n_words)
print("Number of distinctive words: ", n_distinctive_words)
print("Lexical diversity: ", round(lexical_diversity,2))

Number of words:  18618
Number of distinctive words:  2256
Lexical diversity:  0.12


In [23]:
daenerys_words = daenerys['tokenized_sents'].tolist()
daenerys_words = [item for items in daenerys_words for item in items]

n_words = len(daenerys_words)
n_distinctive_words = len(set(daenerys_words))
lexical_diversity = 1.0 * n_distinctive_words / n_words
print("Number of words: ", n_words)
print("Number of distinctive words: ", n_distinctive_words)
print("Lexical diversity: ", round(lexical_diversity,2))

Number of words:  12697
Number of distinctive words:  1612
Lexical diversity:  0.13


In [24]:
sansa_words = sansa['tokenized_sents'].tolist()
sansa_words = [item for items in sansa_words for item in items]

n_words = len(sansa_words)
n_distinctive_words = len(set(sansa_words))
lexical_diversity = 1.0 * n_distinctive_words / n_words
print("Number of words: ", n_words)
print("Number of distinctive words: ", n_distinctive_words)
print("Lexical diversity: ", round(lexical_diversity,2))

Number of words:  10189
Number of distinctive words:  1310
Lexical diversity:  0.13


In [25]:
arya_words = arya['tokenized_sents'].tolist()
arya_words = [item for items in arya_words for item in items]

n_words = len(arya_words)
n_distinctive_words = len(set(arya_words))
lexical_diversity = 1.0 * n_distinctive_words / n_words
print("Number of words: ", n_words)
print("Number of distinctive words: ", n_distinctive_words)
print("Lexical diversity: ", round(lexical_diversity,2))

Number of words:  7899
Number of distinctive words:  1109
Lexical diversity:  0.14


In [26]:
import statistics

avg_male_lexical_diversity = statistics.mean([0.11,0.11,0.14,0.17,0.2,0.17])
avg_female_lexical_diversity = statistics.mean([0.12,0.13,0.13,0.14])

print("average male lexical diversity: ", round(avg_male_lexical_diversity,2))
print("average female lexical diversity: ", round(avg_female_lexical_diversity,2))

average male lexical diversity:  0.15
average female lexical diversity:  0.13


# Exploring POS: Adjective Ratio

In [27]:
from nltk import word_tokenize, pos_tag, pos_tag_sents

In [28]:
f_text = female['tokenized_sents'].tolist()
f_text = [item for sublist in f_text for item in sublist]
#f_text[0:10]
tagged_f_texts = nltk.pos_tag(f_text)
nltk.FreqDist([b for (a, b) in tagged_f_texts]).most_common(20)

[('NN', 14502),
 ('.', 12190),
 ('PRP', 10333),
 ('VB', 7067),
 ('DT', 6776),
 ('IN', 6537),
 ('JJ', 5551),
 ('RB', 5516),
 ('VBP', 5047),
 ('VBD', 3358),
 ('NNS', 3228),
 (',', 3144),
 ('PRP$', 3034),
 ('VBZ', 2433),
 ('TO', 2377),
 ('MD', 2266),
 ('CC', 2121),
 ('VBN', 1429),
 ('VBG', 1256),
 ('WP', 996)]

In [29]:
m_text = male['tokenized_sents'].tolist()
m_text = [item for sublist in m_text for item in sublist]
#m_text[0:10]
tagged_m_texts = nltk.pos_tag(m_text)
nltk.FreqDist([b for (a, b) in tagged_m_texts]).most_common(20)

[('NN', 37395),
 ('.', 29039),
 ('PRP', 22087),
 ('DT', 18699),
 ('IN', 17261),
 ('VB', 15952),
 ('JJ', 13960),
 ('RB', 12903),
 ('VBP', 11368),
 ('NNS', 8761),
 (',', 8272),
 ('VBD', 7121),
 ('PRP$', 7088),
 ('VBZ', 6128),
 ('TO', 5567),
 ('MD', 5038),
 ('CC', 4971),
 ('VBN', 3564),
 ('VBG', 3275),
 ('WP', 2025)]

In [30]:
def adjnounratio(text):
    l_adjs = len([tag for (word,tag) in text if tag == 'JJ'])
    l_noun = len([tag for (word,tag) in text if tag == 'NN' or tag == 'NNS'])
    ratio = l_adjs/l_noun
    return(ratio)

In [32]:
adjnounratio(tagged_f_texts)

0.3130851663846588

In [33]:
adjnounratio(tagged_m_texts)

0.30245255221423

## filtering on main characters

In [34]:
character_lines.head()

Unnamed: 0,character,gender,line,tokenized_sents
0,ADDAM MARBRAND,male,ls it true about stannis and renly? first we...,"[ls, it, true, about, stannis, and, renly, ?, ..."
1,AEMON,male,"how many winters have you seen, lord tyrion? ...","[how, many, winters, have, you, seen, ,, lord,..."
2,AERON,male,"theon of the house greyjoy, you would this day...","[theon, of, the, house, greyjoy, ,, you, would..."
3,ALL,male,what is dead may never die. aye! aye! aye!...,"[what, is, dead, may, never, die, ., aye, !, a..."
4,ALL THREE,male,. what's dead may never die. the only night sh...,"[., what, 's, dead, may, never, die, ., the, o..."


In [35]:
character_lines_2 = character_lines[(character_lines.character == 'TYRION') |
                                    (character_lines.character == 'JON') |
                                    (character_lines.character == 'JAIME') |
                                    (character_lines.character == 'BAELISH') |
                                    (character_lines.character == 'VARYS') |
                                    (character_lines.character == 'DAVOS') |
                                    (character_lines.character == 'CERSEI') |
                                    (character_lines.character == 'DAENERYS') |
                                    (character_lines.character == 'SANSA') |
                                    (character_lines.character == 'ARYA')]

In [36]:
female = character_lines_2[(character_lines_2.gender == 'female')]
male = character_lines_2[(character_lines_2.gender == 'male')]

In [37]:
f_text = female['tokenized_sents'].tolist()
f_text = [item for sublist in f_text for item in sublist]
#f_text[0:10]
tagged_f_texts = nltk.pos_tag(f_text)
nltk.FreqDist([b for (a, b) in tagged_f_texts]).most_common(20)

[('NN', 6871),
 ('.', 5843),
 ('PRP', 4958),
 ('VB', 3553),
 ('DT', 3261),
 ('IN', 3047),
 ('RB', 2701),
 ('JJ', 2556),
 ('VBP', 2324),
 ('VBD', 1684),
 ('NNS', 1584),
 ('PRP$', 1429),
 (',', 1345),
 ('TO', 1229),
 ('MD', 1200),
 ('VBZ', 1145),
 ('CC', 1029),
 ('VBN', 666),
 ('VBG', 634),
 ('WP', 541)]

In [38]:
m_text = male['tokenized_sents'].tolist()
m_text = [item for sublist in m_text for item in sublist]
#m_text[0:10]
tagged_m_texts = nltk.pos_tag(m_text)
nltk.FreqDist([b for (a, b) in tagged_m_texts]).most_common(20)

[('NN', 12243),
 ('.', 9114),
 ('PRP', 7292),
 ('DT', 6100),
 ('IN', 6003),
 ('VB', 5321),
 ('JJ', 4848),
 ('RB', 4482),
 ('VBP', 3878),
 ('NNS', 2861),
 (',', 2661),
 ('VBD', 2399),
 ('PRP$', 2271),
 ('VBZ', 2075),
 ('TO', 2008),
 ('MD', 1652),
 ('CC', 1635),
 ('VBN', 1227),
 ('VBG', 1159),
 ('WP', 655)]

In [40]:
adjnounratio(tagged_f_texts)

0.3023063276167948

In [42]:
adjnounratio(tagged_m_texts)

0.3209745762711864