In [1]:
import string

In [2]:
contents = unicode(open('data/friends_transcripts.txt').read(), errors='ignore')
contents[:200]



First element of ``re.split`` is empty string, so skip it:

In [3]:
import re

splitted = re.split('======== (\d\d)-(\d\d)\n', contents)[1:]
print(len(splitted))

666


In [4]:
from itertools import izip

def tripletswise(t):
    it = iter(t)
    return izip(it,it,it)

def clean(txt):
    for i in string.punctuation:
        txt = txt.replace(i, '')
    return txt

matched_data = [(int(x), int(y), clean(z)) for x, y, z in tripletswise(splitted)]
N = len(matched_data)

The first element is **season**,

The second is **episode**,

And the third – actual **transcript**:

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def stem_data(data, use_stopwords=True):
    def tokenize(doc):        
        tokens = nltk.word_tokenize(doc.lower())
        if use_stopwords:
            return [t for t in tokens if not t in stopwords.words('english')]
        else:
            return tokens
    
    def stem(doc):
        stemmer = PorterStemmer()
        return [stemmer.stem(t) for t in tokenize(doc)]
        
    return [(seas, ep, stem(doc)) for seas, ep, doc in data]


stemmed_data_w_stopwords = stem_data(matched_data, True)
stemmed_data_wout_stopwords = stem_data(matched_data, False)

In [6]:
print(stemmed_data_w_stopwords[0][2][:10])
print(stemmed_data_wout_stopwords[0][2][:10])

[u'written', u'marta', u'kauffman', u'david', u'crane', u'monica', u'there', u'noth', u'tell', u'he']
[u'written', u'by', u'marta', u'kauffman', u'david', u'crane', u'monica', u'there', u'noth', u'to']


In [7]:
from collections import Counter

def count_doc_frequencies(data):
    doc_frequencies = Counter()
    for _, _, doc in data:
        for w in set(doc):
            doc_frequencies[w] += 1
    
    return doc_frequencies


docfreq_w_stopwords = count_doc_frequencies(stemmed_data_w_stopwords)
docfreq_wout_stopwords = count_doc_frequencies(stemmed_data_wout_stopwords)

In [8]:
import operator

def sortdict(d):
    return sorted(d.items(), key=operator.itemgetter(1), reverse=True)
    
print(sortdict(docfreq_w_stopwords)[:10])
print(sortdict(docfreq_wout_stopwords)[:10])

[(u'end', 216), (u'ross', 214), (u'well', 212), (u'dont', 212), (u'im', 211), (u'joey', 211), (u'oh', 211), (u'know', 210), (u'hey', 209), (u'your', 209)]
[(u'by', 217), (u'end', 216), (u'you', 214), (u'ross', 214), (u'the', 213), (u'and', 213), (u'well', 212), (u'just', 212), (u'dont', 212), (u'are', 212)]


In [9]:
import math

def calc_tfidf(word, freq, doc_freq):
    tf = freq
    idf = math.log(N / (doc_freq[word] + 1))
    return tf * idf

def calculate_tfidf(data, document_frequencies, limit=5):
    result = []
    for seas, ep, script in data:
        metricised = {}
        for word, count in Counter(script).iteritems():
            metric = calc_tfidf(word, count, document_frequencies)
            metricised[word] = metric

        tfidf_weighted = sortdict(metricised)
    
        result.append((seas, ep, tfidf_weighted[:limit]))
        
    return result
        
tfidf_w_stopwords = calculate_tfidf(stemmed_data_w_stopwords, docfreq_w_stopwords)
tfidf_wout_stopwords = calculate_tfidf(stemmed_data_wout_stopwords, docfreq_wout_stopwords)

In [10]:
def print_tfidf_data(tfidf_data):
    for seas, ep, metrics in tfidf_data:
        freq = ", ".join("{}: {:6.3f}".format(x, y) for x, y in metrics)
        print("Season {}, episode {}: {}".format(seas, ep, freq))
        
print_tfidf_data(tfidf_w_stopwords[:5])

Season 1, episode 1: paul: 117.460, franni: 25.824, cut: 19.775, la: 15.390, goodnight: 12.712
Season 1, episode 2: barri: 56.664, carol: 46.674, susan: 32.236, robbi: 28.257, marsha: 23.548
Season 1, episode 3: alan: 87.036, lizzi: 48.088, paula: 28.257, smoke: 22.874, notnotmin: 18.838
Season 1, episode 4: joann: 24.044, omnipot: 23.548, kiki: 23.548, receptionist: 21.666, pizza: 19.709
Season 1, episode 5: angela: 64.997, bob: 39.586, janic: 33.271, laundri: 31.794, sud: 23.548


In [11]:
print_tfidf_data(tfidf_wout_stopwords[:5])

Season 1, episode 1: paul: 117.460, franni: 25.824, cut: 19.775, la: 15.390, goodnight: 12.712
Season 1, episode 2: barri: 56.664, carol: 46.674, susan: 32.236, robbi: 28.257, marsha: 23.548
Season 1, episode 3: alan: 87.036, lizzi: 48.088, paula: 28.257, smoke: 22.874, notnotmin: 18.838
Season 1, episode 4: joann: 24.044, omnipot: 23.548, kiki: 23.548, receptionist: 21.666, pizza: 19.709
Season 1, episode 5: angela: 64.997, bob: 39.586, janic: 33.271, laundri: 31.794, sud: 23.548


In [12]:
for x, y in zip(tfidf_w_stopwords, tfidf_wout_stopwords):
    if x != y:
        print(x)
        print(y)
        print

(1, 13, [(u'ronni', 122.44778523412069), (u'roger', 104.7166194666825), (u'tribbiani', 36.25420552604762), (u'boobi', 28.051332296627297), (u'mr', 22.873856958478193)])
(1, 13, [(u'ronni', 122.44778523412069), (u'roger', 104.7166194666825), (u'tribbiani', 36.25420552604762), (u'ma', 34.43252074563336), (u'boobi', 28.051332296627297)])

(2, 23, [(u'mandel', 4.30406509320417), (u'brown', 2.4849066497880004), (u'origin', 1.791759469228055), (u'written', 0.0), (u'end', 0.0)])
(2, 23, [(u'mandel', 4.30406509320417), (u'brown', 2.4849066497880004), (u'origin', 1.791759469228055), (u'end', 0.0), (u'written', 0.0)])

(7, 15, [(u'cecilia', 240.18604026692904), (u'jessica', 46.94193286437492), (u'dina', 36.06599866709224), (u'lockhart', 32.96671140918634), (u'own', 29.662531794038962)])
(7, 15, [(u'cecilia', 240.18604026692904), (u'jessica', 46.94193286437492), (u'dina', 36.06599866709224), (u'lockhart', 32.96671140918634), (u'scottish', 25.824390559225023)])

(8, 16, [(u'soul', 37.8418963391826

Good! Now just quickly do the same w/ stopwords for 2,3-grams:

In [21]:
from nltk.util import ngrams

bigrammed_data_w_stopwords = [(s, e, list(ngrams(d, 2))) for s, e, d in stemmed_data_w_stopwords]
trigrammed_data_w_stopwords = [(s, e, list(ngrams(d, 3))) for s, e, d in stemmed_data_w_stopwords]

In [22]:
print_tfidf_data(calculate_tfidf(bigrammed_data_w_stopwords, count_doc_frequencies(bigrammed_data_w_stopwords))[:5])

Season 1, episode 1: (u'cut', u'cut'): 68.865, (u'wine', u'guy'): 23.548, (u'grab', u'spoon'): 18.838, (u'push', u'stair'): 18.838, (u'paul', u'wine'): 18.838
Season 1, episode 2: (u'mr', u'geller'): 31.192, (u'ew', u'ew'): 28.051, (u'dr', u'oberman'): 17.216, (u'ross', u'marsha'): 14.129, (u'barri', u'yeah'): 12.022
Season 1, episode 3: (u'feel', u'thing'): 14.444, (u'wouldnt', u'fair'): 14.129, (u'alan', u'alan'): 14.129, (u'footbal', u'phone'): 14.129, (u'alan', u'ross'): 14.129
Season 1, episode 4: (u'pizza', u'guy'): 32.059, (u'she', u'walk'): 12.912, (u'im', u'okay'): 10.397, (u'kill', u'monica'):  9.534, (u'drop', u'towel'):  9.419
Season 1, episode 5: (u'la', u'la'): 22.705, (u'bob', u'joey'): 14.129, (u'hey', u'hey'):  9.888, (u'actor', u'bob'):  9.419, (u'he', u'sophist'):  9.419


In [23]:
print_tfidf_data(calculate_tfidf(trigrammed_data_w_stopwords, count_doc_frequencies(trigrammed_data_w_stopwords))[:5])

Season 1, episode 1: (u'cut', u'cut', u'cut'): 65.933, (u'paul', u'wine', u'guy'): 18.838, (u'shoe', u'your', u'shoe'): 12.912, (u'your', u'shoe', u'your'): 12.912, (u'im', u'la', u'vega'):  9.419
Season 1, episode 2: (u'ew', u'ew', u'ew'): 24.044, (u'barri', u'yeah', u'well'):  9.419, (u'well', u'monica', u'ross'):  9.419, (u'gon', u'na', u'helen'):  9.419, (u'god', u'oh', u'god'):  8.959
Season 1, episode 3: (u'thousand', u'dollar', u'footbal'):  9.419, (u'get', u'meet', u'guy'):  9.419, (u'seven', u'thousand', u'dollar'):  9.419, (u'realli', u'like', u'paula'):  9.419, (u'go', u'guy', u'friend'):  9.419
Season 1, episode 4: (u'im', u'okay', u'okay'): 17.216, (u'pizza', u'guy', u'yeah'):  9.419, (u'okay', u'got', u'one'):  9.419, (u'see', u'lem', u'see'):  9.419, (u'she', u'walk', u'she'):  9.419
Season 1, episode 5: (u'la', u'la', u'la'): 18.921, (u'hey', u'hey', u'hey'): 14.166, (u'real', u'job', u'go'):  9.419, (u'month', u'call', u'actor'):  9.419, (u'sophist', u'real', u'job'): 