In [1]:
import string

In [2]:
contents = unicode(open('data/friends_transcripts.txt').read(), errors='ignore')
contents[:200]



First element of ``re.split`` is empty string, so skip it:

In [3]:
import re

splitted = re.split('======== (\d\d)-(\d\d)\n', contents)[1:]
print(len(splitted))

666


In [4]:
from itertools import izip

def tripletswise(t):
    it = iter(t)
    return izip(it,it,it)

def clean(txt):
    for i in string.punctuation:
        txt = txt.replace(i, '')
    return txt

matched_data = [(int(x), int(y), clean(z)) for x, y, z in tripletswise(splitted)]
N = len(matched_data)

The first element is **season**,

The second is **episode**,

And the third – actual **transcript**:

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def stem_data(data, use_stopwords=True):
    def tokenize(doc):        
        tokens = nltk.word_tokenize(doc.lower())
        if use_stopwords:
            return [t for t in tokens if not t in stopwords.words('english')]
        else:
            return tokens
    
    def stem(doc):
        stemmer = PorterStemmer()
        return [stemmer.stem(t) for t in tokenize(doc)]
        
    return [(seas, ep, stem(doc)) for seas, ep, doc in data]


stemmed_data_w_stopwords = stem_data(matched_data, True)
stemmed_data_wout_stopwords = stem_data(matched_data, False)

In [6]:
print(stemmed_data_w_stopwords[0][2][:10])
print(stemmed_data_wout_stopwords[0][2][:10])

[u'written', u'marta', u'kauffman', u'david', u'crane', u'monica', u'there', u'noth', u'tell', u'he']
[u'written', u'by', u'marta', u'kauffman', u'david', u'crane', u'monica', u'there', u'noth', u'to']


In [7]:
from collections import Counter

def count_doc_frequencies(data):
    doc_frequencies = Counter()
    for _, _, doc in data:
        for w in set(doc):
            doc_frequencies[w] += 1
    
    return doc_frequencies


docfreq_w_stopwords = count_doc_frequencies(stemmed_data_w_stopwords)
docfreq_wout_stopwords = count_doc_frequencies(stemmed_data_wout_stopwords)

In [8]:
import operator

def sortdict(d):
    return sorted(d.items(), key=operator.itemgetter(1), reverse=True)
    
print(sortdict(docfreq_w_stopwords)[:10])
print(sortdict(docfreq_wout_stopwords)[:10])

[(u'end', 216), (u'ross', 214), (u'well', 212), (u'dont', 212), (u'im', 211), (u'joey', 211), (u'oh', 211), (u'know', 210), (u'hey', 209), (u'your', 209)]
[(u'by', 217), (u'end', 216), (u'you', 214), (u'ross', 214), (u'the', 213), (u'and', 213), (u'well', 212), (u'just', 212), (u'dont', 212), (u'are', 212)]


In [21]:
import math

def calc_tfidf(word, freq, doc_freq):
    tf = freq
    idf = math.log(N / (doc_freq[word] + 1))
    return tf * idf

def calculate_tfidf(data, document_frequencies, limit=20):
    result = []
    for seas, ep, script in data:
        metricised = {}
        for word, count in Counter(script).iteritems():
            metric = calc_tfidf(word, count, document_frequencies)
            metricised[word] = metric

        tfidf_weighted = sortdict(metricised)
    
        result.append((seas, ep, tfidf_weighted[:limit]))
        
    return result
        
tfidf_w_stopwords = calculate_tfidf(stemmed_data_w_stopwords, docfreq_w_stopwords)
tfidf_wout_stopwords = calculate_tfidf(stemmed_data_wout_stopwords, docfreq_wout_stopwords)

In [22]:
def print_tfidf_data(tfidf_data):
    for seas, ep, metrics in tfidf_data:
        freq = ", ".join("{}: {:6.3f}".format(x, y) for x, y in metrics)
        print("Season {}, episode {}: {}".format(seas, ep, freq))
        print
        
print_tfidf_data(tfidf_w_stopwords[:5])

Season 1, episode 1: paul: 117.460, franni: 25.824, cut: 19.775, la: 15.390, goodnight: 12.712, spoon: 12.364, stair: 12.364, cmon:  9.730, bracketi:  9.419, bookcas:  9.419, boot:  9.273, wine:  8.959, push:  8.959, vega:  8.671, grandchildren:  8.608, aura:  8.608, shoe:  8.047, joann:  8.015, flavor:  8.015, aruba:  8.015

Season 1, episode 2: barri: 56.664, carol: 46.674, susan: 32.236, robbi: 28.257, marsha: 23.548, helen: 22.705, geller: 20.874, ew: 20.520, comedian: 18.838, oberman: 17.216, julia: 14.129, minni: 14.129, mr: 10.397, mindi:  9.940, marlon:  9.419, twirli:  9.419, doy:  9.419, chaotic:  9.419, ludwin:  9.419, fluf:  9.419

Season 1, episode 3: alan: 87.036, lizzi: 48.088, paula: 28.257, smoke: 22.874, notnotmin: 18.838, chi: 16.029, thumb: 15.890, farm: 15.137, notmin: 14.129, crook: 14.129, cell:  9.888, basealan:  9.419, schhteve:  9.419, knucklecrack:  9.419, millner:  9.419, lambchop:  9.419, damon:  9.419, coyot:  9.419, poptart:  9.419, thousand:  9.210

Seas

In [23]:
from ipy_table import make_table

make_table(tfidf_w_stopwords[0][2])

0,1
paul,117.4596
franni,25.8244
cut,19.775
la,15.3897
goodnight,12.7122
spoon,12.3642
stair,12.3642
cmon,9.7296
bracketi,9.4191
bookcas,9.4191


In [24]:
print_tfidf_data(tfidf_wout_stopwords[:5])

Season 1, episode 1: paul: 117.460, franni: 25.824, cut: 19.775, la: 15.390, goodnight: 12.712, spoon: 12.364, stair: 12.364, cmon:  9.730, bracketi:  9.419, bookcas:  9.419, boot:  9.273, wine:  8.959, push:  8.959, vega:  8.671, grandchildren:  8.608, aura:  8.608, shoe:  8.047, joann:  8.015, flavor:  8.015, aruba:  8.015

Season 1, episode 2: barri: 56.664, carol: 46.674, susan: 32.236, robbi: 28.257, marsha: 23.548, helen: 22.705, geller: 20.874, ew: 20.520, comedian: 18.838, oberman: 17.216, julia: 14.129, minni: 14.129, mr: 10.397, mindi:  9.940, twirli:  9.419, doy:  9.419, marlon:  9.419, chaotic:  9.419, ludwin:  9.419, fluf:  9.419

Season 1, episode 3: alan: 87.036, lizzi: 48.088, paula: 28.257, smoke: 22.874, notnotmin: 18.838, chi: 16.029, thumb: 15.890, farm: 15.137, notmin: 14.129, crook: 14.129, cell:  9.888, basealan:  9.419, schhteve:  9.419, knucklecrack:  9.419, millner:  9.419, lambchop:  9.419, damon:  9.419, coyot:  9.419, poptart:  9.419, thousand:  9.210

Seas

In [25]:
make_table(tfidf_wout_stopwords[0][2])

0,1
paul,117.4596
franni,25.8244
cut,19.775
la,15.3897
goodnight,12.7122
spoon,12.3642
stair,12.3642
cmon,9.7296
bracketi,9.4191
bookcas,9.4191


In [26]:
for x, y in zip(tfidf_w_stopwords, tfidf_wout_stopwords):
    if x != y:
        print(x)
        print(y)
        print

(1, 2, [(u'barri', 56.66426688112432), (u'carol', 46.67369946058891), (u'susan', 32.23619130191664), (u'robbi', 28.257181207874005), (u'marsha', 23.54765100656167), (u'helen', 22.705137803509565), (u'geller', 20.873633484694086), (u'ew', 20.519594859692294), (u'comedian', 18.838120805249336), (u'oberman', 17.21626037281668), (u'julia', 14.128590603937003), (u'minni', 14.128590603937003), (u'mr', 10.39720770839918), (u'mindi', 9.939626599152001), (u'marlon', 9.419060402624668), (u'twirli', 9.419060402624668), (u'doy', 9.419060402624668), (u'chaotic', 9.419060402624668), (u'ludwin', 9.419060402624668), (u'fluf', 9.419060402624668)])
(1, 2, [(u'barri', 56.66426688112432), (u'carol', 46.67369946058891), (u'susan', 32.23619130191664), (u'robbi', 28.257181207874005), (u'marsha', 23.54765100656167), (u'helen', 22.705137803509565), (u'geller', 20.873633484694086), (u'ew', 20.519594859692294), (u'comedian', 18.838120805249336), (u'oberman', 17.21626037281668), (u'julia', 14.128590603937003), (u

Good! Now just quickly do the same w/ stopwords for 2,3-grams:

In [27]:
from nltk.util import ngrams

bigrammed_data_w_stopwords = [(s, e, list(ngrams(d, 2))) for s, e, d in stemmed_data_w_stopwords]
trigrammed_data_w_stopwords = [(s, e, list(ngrams(d, 3))) for s, e, d in stemmed_data_w_stopwords]

In [28]:
bigram_tfidf_data = calculate_tfidf(bigrammed_data_w_stopwords, count_doc_frequencies(bigrammed_data_w_stopwords))
print_tfidf_data(bigram_tfidf_data[:5])

Season 1, episode 1: (u'cut', u'cut'): 68.865, (u'wine', u'guy'): 23.548, (u'grab', u'spoon'): 18.838, (u'push', u'stair'): 18.838, (u'paul', u'wine'): 18.838, (u'your', u'shoe'): 17.216, (u'know', u'paul'): 17.216, (u'paul', u'monica'): 17.216, (u'one', u'woman'): 14.444, (u'fifth', u'date'): 14.129, (u'shoe', u'your'): 12.912, (u'real', u'date'): 12.912, (u'la', u'vega'): 10.833, (u'woman', u'that'):  9.419, (u'bookcas', u'chandler'):  9.419, (u'great', u'butt'):  9.419, (u'stair', u'push'):  9.419, (u'ross', u'grab'):  9.419, (u'made', u'coffe'):  9.419, (u'morn', u'paul'):  9.419

Season 1, episode 2: (u'mr', u'geller'): 31.192, (u'ew', u'ew'): 28.051, (u'dr', u'oberman'): 17.216, (u'ross', u'marsha'): 14.129, (u'barri', u'yeah'): 12.022, (u'carol', u'ross'): 11.561, (u'susan', u'ross'): 10.302, (u'chaotic', u'twirli'):  9.419, (u'helen', u'geller'):  9.419, (u'give', u'barri'):  9.419, (u'good', u'shake'):  9.419, (u'um', u'how'):  9.419, (u'alreadi', u'fluf'):  9.419, (u'na', u'h

In [29]:
make_table(bigram_tfidf_data[0][2])

0,1
"(u'cut', u'cut')",68.865
"(u'wine', u'guy')",23.5477
"(u'grab', u'spoon')",18.8381
"(u'push', u'stair')",18.8381
"(u'paul', u'wine')",18.8381
"(u'your', u'shoe')",17.2163
"(u'know', u'paul')",17.2163
"(u'paul', u'monica')",17.2163
"(u'one', u'woman')",14.4437
"(u'fifth', u'date')",14.1286


In [30]:
trigram_tfidf_data = calculate_tfidf(trigrammed_data_w_stopwords, count_doc_frequencies(trigrammed_data_w_stopwords))
print_tfidf_data(trigram_tfidf_data[:5])

Season 1, episode 1: (u'cut', u'cut', u'cut'): 65.933, (u'paul', u'wine', u'guy'): 18.838, (u'shoe', u'your', u'shoe'): 12.912, (u'your', u'shoe', u'your'): 12.912, (u'im', u'la', u'vega'):  9.419, (u'push', u'stair', u'push'):  9.419, (u'one', u'woman', u'that'):  9.419, (u'know', u'paul', u'franni'):  9.419, (u'stair', u'push', u'stair'):  9.419, (u'paul', u'yeah', u'joey'):  9.419, (u'ross', u'grab', u'spoon'):  9.419, (u'get', u'go', u'work'):  9.419, (u'ill', u'never', u'grandchildren'):  9.419, (u'hat', u'im', u'say'):  8.608, (u'believ', u'im', u'hear'):  8.608, (u'isnt', u'anyth', u'cant'):  8.608, (u'rachel', u'goodnight', u'ross'):  8.608, (u'back', u'high', u'school'):  7.568, (u'monica', u'yeah', u'yeah'):  6.868, (u'ross', u'got', u'ta'):  5.666

Season 1, episode 2: (u'ew', u'ew', u'ew'): 24.044, (u'barri', u'yeah', u'well'):  9.419, (u'well', u'monica', u'ross'):  9.419, (u'gon', u'na', u'helen'):  9.419, (u'god', u'oh', u'god'):  8.959, (u'monica', u'ross', u'your'):  8

In [31]:
make_table(trigram_tfidf_data[0][2])

0,1
"(u'cut', u'cut', u'cut')",65.9334
"(u'paul', u'wine', u'guy')",18.8381
"(u'shoe', u'your', u'shoe')",12.9122
"(u'your', u'shoe', u'your')",12.9122
"(u'im', u'la', u'vega')",9.4191
"(u'push', u'stair', u'push')",9.4191
"(u'one', u'woman', u'that')",9.4191
"(u'know', u'paul', u'franni')",9.4191
"(u'stair', u'push', u'stair')",9.4191
"(u'paul', u'yeah', u'joey')",9.4191


In [33]:
tfidf_w_stopwords[0]

(1,
 1,
 [(u'paul', 117.45961322761602),
  (u'franni', 25.824390559225023),
  (u'cut', 19.775021196025975),
  (u'la', 15.389696144769221),
  (u'goodnight', 12.712215321391783),
  (u'spoon', 12.364169813433264),
  (u'stair', 12.364169813433264),
  (u'cmon', 9.729550745276565),
  (u'bracketi', 9.419060402624668),
  (u'bookcas', 9.419060402624668),
  (u'boot', 9.273127360074948),
  (u'wine', 8.958797346140274),
  (u'push', 8.958797346140274),
  (u'vega', 8.671115273688493),
  (u'grandchildren', 8.60813018640834),
  (u'aura', 8.60813018640834),
  (u'shoe', 8.047189562170502),
  (u'joann', 8.014666370464942),
  (u'flavor', 8.014666370464942),
  (u'aruba', 8.014666370464942)])

In [34]:
bigram_tfidf_data[0]

(1,
 1,
 [((u'cut', u'cut'), 68.86504149126672),
  ((u'wine', u'guy'), 23.54765100656167),
  ((u'grab', u'spoon'), 18.838120805249336),
  ((u'push', u'stair'), 18.838120805249336),
  ((u'paul', u'wine'), 18.838120805249336),
  ((u'your', u'shoe'), 17.21626037281668),
  ((u'know', u'paul'), 17.21626037281668),
  ((u'paul', u'monica'), 17.21626037281668),
  ((u'one', u'woman'), 14.443671650576897),
  ((u'fifth', u'date'), 14.128590603937003),
  ((u'shoe', u'your'), 12.912195279612511),
  ((u'real', u'date'), 12.912195279612511),
  ((u'la', u'vega'), 10.832753737932673),
  ((u'woman', u'that'), 9.419060402624668),
  ((u'bookcas', u'chandler'), 9.419060402624668),
  ((u'great', u'butt'), 9.419060402624668),
  ((u'stair', u'push'), 9.419060402624668),
  ((u'ross', u'grab'), 9.419060402624668),
  ((u'made', u'coffe'), 9.419060402624668),
  ((u'morn', u'paul'), 9.419060402624668)])

In [35]:
trigram_tfidf_data[0]

(1,
 1,
 [((u'cut', u'cut', u'cut'), 65.93342281837268),
  ((u'paul', u'wine', u'guy'), 18.838120805249336),
  ((u'shoe', u'your', u'shoe'), 12.912195279612511),
  ((u'your', u'shoe', u'your'), 12.912195279612511),
  ((u'im', u'la', u'vega'), 9.419060402624668),
  ((u'push', u'stair', u'push'), 9.419060402624668),
  ((u'one', u'woman', u'that'), 9.419060402624668),
  ((u'know', u'paul', u'franni'), 9.419060402624668),
  ((u'stair', u'push', u'stair'), 9.419060402624668),
  ((u'paul', u'yeah', u'joey'), 9.419060402624668),
  ((u'ross', u'grab', u'spoon'), 9.419060402624668),
  ((u'get', u'go', u'work'), 9.419060402624668),
  ((u'ill', u'never', u'grandchildren'), 9.419060402624668),
  ((u'hat', u'im', u'say'), 8.60813018640834),
  ((u'believ', u'im', u'hear'), 8.60813018640834),
  ((u'isnt', u'anyth', u'cant'), 8.60813018640834),
  ((u'rachel', u'goodnight', u'ross'), 8.60813018640834),
  ((u'back', u'high', u'school'), 7.568379267836522),
  ((u'monica', u'yeah', u'yeah'), 6.86797440897

In [39]:
import json
with open('ngrams.json', 'w') as f:
    to_dump = {
        'uni': tfidf_w_stopwords[0][2],
        'bi': bigram_tfidf_data[0][2],
        'tri': trigram_tfidf_data[0][2]
    }
    
    json.dump(to_dump, f)