In [1]:
import string

In [2]:
# Read the raw transcript dump. The context manager closes the file handle as
# soon as the bytes are read instead of leaving it open for the kernel's lifetime.
with open('data/friends_transcripts.txt') as transcript_file:
    # errors='ignore' silently drops any bytes that cannot be decoded.
    contents = unicode(transcript_file.read(), errors='ignore')
contents[:200]



The first element returned by ``re.split`` is an empty string (the file starts with a separator), so skip it:

In [3]:
import re

# Raw string so the regex escapes (\d, \n) reach the `re` module untouched.
# The two capture groups make re.split() return the two 2-digit numbers of each
# separator line alongside the text that follows it, so the result is a flat
# sequence of (number, number, text) runs. [1:] drops the leading element.
splitted = re.split(r'======== (\d\d)-(\d\d)\n', contents)[1:]
print(len(splitted))

666


In [None]:
from itertools import izip

def tripletswise(t):
    """Group the items of iterable ``t`` into consecutive, non-overlapping 3-tuples.

    The same iterator object is handed to ``izip`` three times, so every
    output tuple consumes the next three items of ``t``.
    """
    shared = iter(t)
    return izip(*([shared] * 3))

def clean(txt):
    """Return ``txt`` with every character listed in ``string.punctuation`` removed."""
    # One character class covering all punctuation marks; re.escape keeps
    # ']', '\\', '^' and '-' literal inside the brackets.
    punctuation_class = '[' + re.escape(string.punctuation) + ']'
    return re.sub(punctuation_class, '', txt)

# Regroup the flat output of re.split() into (season, episode, transcript)
# records, converting both numbers to int and stripping punctuation from the text.
matched_data = []
for season_str, episode_str, raw_transcript in tripletswise(splitted):
    matched_data.append((int(season_str), int(episode_str), clean(raw_transcript)))

# Number of documents (episodes); read as a global by the tf-idf code below.
N = len(matched_data)

Each entry of `matched_data` is a triple. The first element is the **season**,

the second is the **episode**,

and the third is the actual **transcript**:

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def stem_data(data, use_stopwords=True):
    """Tokenize, optionally filter, and stem every document in ``data``.

    Parameters
    ----------
    data : iterable of (season, episode, text) triples.
    use_stopwords : bool
        When True, tokens found in NLTK's English stop-word list are
        *removed* before stemming; when False, every token is kept.

    Returns
    -------
    list of (season, episode, stems) triples, where ``stems`` is the list of
    Porter-stemmed tokens of the lower-cased text.
    """
    # Load the stop-word list once and keep it in a set: the original called
    # stopwords.words('english') and scanned the resulting list for every
    # single token of every transcript.
    stop_words = frozenset(stopwords.words('english')) if use_stopwords else frozenset()
    # One stemmer instance is enough for all documents.
    stemmer = PorterStemmer()

    def tokenize(doc):
        tokens = nltk.word_tokenize(doc.lower())
        if use_stopwords:
            return [t for t in tokens if t not in stop_words]
        return tokens

    def stem(doc):
        return [stemmer.stem(t) for t in tokenize(doc)]

    return [(seas, ep, stem(doc)) for seas, ep, doc in data]


# Naming note: "_w_stopwords" means the stop-word *filter* is switched on
# (stop words are removed); "_wout_stopwords" keeps every token.
stemmed_data_w_stopwords = stem_data(matched_data, use_stopwords=True)
stemmed_data_wout_stopwords = stem_data(matched_data, use_stopwords=False)

In [None]:
# Peek at the first ten stems of the first episode for each variant.
for stemmed_variant in (stemmed_data_w_stopwords, stemmed_data_wout_stopwords):
    first_episode_stems = stemmed_variant[0][2]
    print(first_episode_stems[:10])

In [None]:
from collections import Counter

def count_doc_frequencies(data):
    """Count, for every term, the number of documents it occurs in.

    ``data`` is an iterable of 3-tuples whose last item is the document's
    token sequence. Each document contributes at most 1 per distinct term,
    because its tokens are de-duplicated with ``set`` before counting.
    Returns a ``Counter`` mapping term -> document count.
    """
    return Counter(
        term
        for _season, _episode, tokens in data
        for term in set(tokens)
    )


# Document frequencies for both variants of the stemmed corpus.
docfreq_w_stopwords, docfreq_wout_stopwords = [
    count_doc_frequencies(stemmed_variant)
    for stemmed_variant in (stemmed_data_w_stopwords, stemmed_data_wout_stopwords)
]

In [None]:
import operator

def sortdict(d):
    """Return the ``(key, value)`` pairs of ``d`` as a list ordered by value, largest first."""
    pairs = d.items()
    return sorted(pairs, key=lambda pair: pair[1], reverse=True)
    
# Ten terms with the highest document frequency in each variant.
for docfreq in (docfreq_w_stopwords, docfreq_wout_stopwords):
    top_ten = sortdict(docfreq)[:10]
    print(top_ten)

In [None]:
import math

def calc_tfidf(word, freq, doc_freq, n_docs=None):
    """Return the tf-idf weight of ``word`` within one document.

    Parameters
    ----------
    word : the term to score.
    freq : number of occurrences of ``word`` in the document (used as tf).
    doc_freq : mapping term -> number of documents containing the term;
        it is indexed with ``doc_freq[word]``.
    n_docs : total number of documents. Defaults to the module-level ``N``
        when omitted, which is what existing callers rely on.

    Returns
    -------
    float: ``freq * log(n_docs / (doc_freq[word] + 1))``.
    """
    if n_docs is None:
        n_docs = N
    # float() forces true division. On Python 2, int / int truncates, which
    # distorted the ratio and produced log(0) (ValueError) for a term that
    # occurs in every document.
    idf = math.log(float(n_docs) / (doc_freq[word] + 1))
    return freq * idf

def calculate_tfidf(data, document_frequencies, limit=5):
    """Rank the terms of every document by tf-idf and keep the best ``limit``.

    ``data`` is an iterable of (season, episode, tokens) triples and
    ``document_frequencies`` the term -> document-count mapping passed on to
    ``calc_tfidf``. Returns a list of (season, episode, top_terms) triples,
    where ``top_terms`` holds at most ``limit`` (term, score) pairs, highest
    score first.
    """
    ranked = []
    for seas, ep, script in data:
        # Term frequency is the raw occurrence count within this document.
        scores = {
            term: calc_tfidf(term, occurrences, document_frequencies)
            for term, occurrences in Counter(script).items()
        }
        ranked.append((seas, ep, sortdict(scores)[:limit]))
    return ranked
        
# Top tf-idf terms per episode, once for each variant of the stemmed corpus.
tfidf_w_stopwords = calculate_tfidf(
    stemmed_data_w_stopwords,
    docfreq_w_stopwords,
)
tfidf_wout_stopwords = calculate_tfidf(
    stemmed_data_wout_stopwords,
    docfreq_wout_stopwords,
)

In [None]:
def print_tfidf_data(tfidf_data):
    """Print one summary line per entry, followed by a blank line.

    ``tfidf_data`` is an iterable of (season, episode, metrics) triples, where
    ``metrics`` is an iterable of (term, score) pairs. Each score is printed
    with three decimals in a field six characters wide.
    """
    for seas, ep, metrics in tfidf_data:
        freq = ", ".join("{}: {:6.3f}".format(x, y) for x, y in metrics)
        print("Season {}, episode {}: {}".format(seas, ep, freq))
        # print("") emits the blank separator on both Python 2 and 3; the
        # bare `print` used before is a no-op expression on Python 3.
        print("")
        
# First five entries of the run built with the stop-word filter on.
print_tfidf_data(tfidf_w_stopwords[:5])

In [None]:
from ipy_table import make_table

# Top (term, score) pairs of the first entry, stop-word filter on.
# Presumably make_table renders them as a table in the cell output -- see ipy_table.
make_table(tfidf_w_stopwords[0][2])

In [None]:
# Same summary for the run that keeps every token (no stop-word filtering).
print_tfidf_data(tfidf_wout_stopwords[:5])

In [None]:
# Top (term, score) pairs of the first entry, no stop-word filtering.
make_table(tfidf_wout_stopwords[0][2])

In [None]:
# Show every entry whose top-ranked terms differ between the two runs.
for filtered_entry, unfiltered_entry in zip(tfidf_w_stopwords, tfidf_wout_stopwords):
    if filtered_entry != unfiltered_entry:
        print(filtered_entry)
        print(unfiltered_entry)
        # Blank separator line; a bare `print` is a no-op expression on Python 3.
        print("")

Good! Now quickly repeat the analysis for 2-grams and 3-grams, using the `_w_stopwords` data (stop words filtered out):

In [None]:
from nltk.util import ngrams

# Replace each episode's stem list with its list of n-gram tuples (n = 2 and 3).
bigrammed_data_w_stopwords = [
    (season, episode, list(ngrams(stems, 2)))
    for season, episode, stems in stemmed_data_w_stopwords
]
trigrammed_data_w_stopwords = [
    (season, episode, list(ngrams(stems, 3)))
    for season, episode, stems in stemmed_data_w_stopwords
]

In [None]:
# Document frequencies are computed over the bigrams themselves, then the
# same tf-idf ranking is applied with bigrams as the "terms".
bigram_docfreq = count_doc_frequencies(bigrammed_data_w_stopwords)
bigram_tfidf_data = calculate_tfidf(bigrammed_data_w_stopwords, bigram_docfreq)
print_tfidf_data(bigram_tfidf_data[:5])

In [None]:
# Top (bigram, score) pairs of the first entry.
make_table(bigram_tfidf_data[0][2])

In [None]:
# Same procedure with trigrams as the "terms".
trigram_docfreq = count_doc_frequencies(trigrammed_data_w_stopwords)
trigram_tfidf_data = calculate_tfidf(trigrammed_data_w_stopwords, trigram_docfreq)
print_tfidf_data(trigram_tfidf_data[:5])

In [None]:
# Top (trigram, score) pairs of the first entry.
make_table(trigram_tfidf_data[0][2])