In [None]:
%matplotlib inline
import numpy as np
from numpy import dot
import pandas as pd

In [None]:
def norm_vec(v):
    return v / np.linalg.norm(v)
np.set_printoptions(precision=3)

#### Using external modules
To make our lives simpler, and to make these notebooks a little less crowded, we can stick functions we use frequently into separate modules. (This are files with the extention .py.) Then we can import individual functions from them. Or we can import everything.

This line gives allows you do use everything from utilities as if it were in this notebook

In [None]:
from utilities import * 
corpus = load_entire_directory("corpora/seasons")

If you import like this you can access everything from utilities like so:

In [None]:
import utilities
corpus = utilities.load_entire_directory("corpora/seasons")

You can import just one thing from a module:

In [None]:
from utilities import load_entire_directory
corpus = load_entire_directory("corpora/seasons")

Finally, you can give modules shorthand names for convenience

In [None]:
from utilities import load_entire_directory as led
corpus = led("corpora/seasons")

In [None]:
from seasons_module import load_seasons_corpus

<h2>The Seasons Corpus</h2>

**Load the corpus**

In [None]:
seasons_corpus = load_seasons_corpus()

In [19]:
print(seasons_corpus["angelapre"])

[['that', 's', 'because', 'of', 'the', 'sun', 'is', 'in', 'the', 'center', 'and', 'the', 'earth', 'moves', 'around', 'the', 'sun', 'and', 'the', 'earth', 'is', 'like', 'at', 'one', 'point', 'in', 'the', 'winter', 'it', 's', 'like', 'farther', 'away', 'from', 'the', 'sun', 'and', 'towards', 'the', 'summer', 'it', 's', 'closer', 'it', 's', 'near', 'towards', 'the', 'sun', 'okay', 'the', 'sun', 's', 'in', 'the', 'middle', 'and', 'the', 'earth', 'kind', 'of', 'orbits', 'around', 'it', 'and', 'like', 'say', 'at', 'one', 'it', 's', 'probably', 'more', 'of', 'an', 'ovally', 'type', 'thing', 'in', 'the', 'winter', 'er', 'probably', 'this', 'will', 'be', 'winter', 'since', 'it', 's', 'further', 'away', 'see', 'that', 's', 'winter', 'would', 'be', 'like', 'the', 'earth', 'orbits', 'around', 'the', 'sun', 'like', 'summer', 'is', 'the', 'closest', 'to', 'the', 'sun', 'spring', 'is', 'kind', 'of', 'a', 'little', 'further', 'away', 'and', 'then', 'like', 'fall', 'is', 'further', 'away', 'then', 'spr

In [20]:
d = {}
for name, val in seasons_corpus.items():
    d[name] = val[1]
print(str(d))

{'vanessapre': 'cf', '-vanessapost': 'none', '-williampost': 'none', 'lesliepre': 'side', 'zeldapre': 'tilt', 'williampre': 'none', '-jacobpost': 'none', '-marthapost': 'tilt', '-robbiepost': 'side', 'kurtpre': 'side', 'angelapre': 'cf', 'ftcandice': 'none', '-bethpost': 'tilt', 'ftharmony': 'none', '-alipost': 'none', 'ftcaitlin': 'tilt', 'denisepre': 'side', 'randypre': 'cf', 'ovadyapre': 'cf', '-kurtpost': 'none', 'ftblake': 'tilt', 'alipre': 'side', 'jacobpre': 'side', 'ftsamantha': 'side', 'kimberleypre': 'none', '-ovadyapost': 'cf', '-stanpost': 'tilt', 'ftcassandra': 'side', 'ftlibby': 'side', '-randypost': 'side', '-sandrapost': 'tilt', 'ftlisa': 'tilt', 'ftmarcus': 'cf', 'kellypre': 'none', 'deidrapre': 'side', 'ftalex': 'none', 'ftelesha': 'tilt', 'bethpre': 'tilt', '-amandapost': 'tilt', 'jillpre': 'cf', '-kimberleypost': 'none', '-denisepost': 'side', 'robbiepre': 'side', 'edgarpre': 'none', 'ftmason': 'none', 'ftholly': 'side', 'markpre': 'tilt', 'amandapre': 'tilt', '-edg

**Compile the vocabulary**

This is every unique word in the corpus

In [21]:
set_vocab = set([])
for fname in seasons_corpus.keys():
    set_vocab = set_vocab.union(set(seasons_corpus[fname][0]))

Read in a stop list. Then remove all of these words from the vocabulary

In [22]:
f = open("lists/seasons_stop_list.txt")
stop_list = set(f.read().split("\n"))

In [23]:
pruned_vocab = set(sorted([w for w in list(set_vocab) if w not in stop_list]))

In [24]:
len(pruned_vocab)

616

**Compute the document vector for each document**

In [25]:
doc_vectors = {}
for fname in seasons_corpus.keys():
    doc_vectors[fname] = np.array([seasons_corpus[fname][0].count(word) for word in pruned_vocab])

In [26]:
print(doc_vectors["angelapre"])

[1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 2 0 0 0 0 3 0 0 0 0 2 0 0 0 0 0 0 0 0 0 1 2 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 2 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 1 0 2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 1
 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 

**Normalize the vectors**

In [27]:
# normalize the vectors
for fname, vec in doc_vectors.items():
    doc_vectors[fname] = norm_vec(vec)

In [28]:
print(doc_vectors["angelapre"])

[0.085 0.    0.    0.085 0.    0.    0.    0.    0.    0.    0.    0.
 0.    0.    0.    0.    0.    0.17  0.    0.    0.    0.    0.    0.
 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
 0.    0.    0.    0.    0.    0.    0.    0.    0.17  0.    0.    0.
 0.    0.255 0.    0.    0.    0.    0.17  0.    0.    0.    0.    0.
 0.    0.    0.    0.    0.085 0.17  0.    0.    0.    0.    0.    0.
 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.085 0.
 0.    0.    0.    0.085 0.    0.    0.    0.    0.    0.085 0.    0.
 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.085
 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
 0.085 0.    0.    0.    0.17  0.    0.    0.    0.    0.    0.    0.
 0.085 0.    0.  

**Compare some pairs of students**

In [29]:
def compare_students(s1, s2):
    return round(dot(doc_vectors[s1], doc_vectors[s2]), 3)

In [30]:
compare_students('alipre', 'jillpre')

0.301

In [31]:
from utilities import ListTable

In [32]:
tab = ListTable()
tab.append(["name", "similarity", "code"])
for name in doc_vectors.keys():
    tab.append([name, str(compare_students(name, 'angelapre')), seasons_corpus[name][1]])
tab

0,1,2
name,similarity,code
vanessapre,0.446,cf
-vanessapost,0.352,none
-williampost,0.509,none
lesliepre,0.481,side
zeldapre,0.265,tilt
williampre,0.421,none
-jacobpost,0.366,none
-marthapost,0.226,tilt
-robbiepost,0.161,side


**Compare to pre-written comparison documents**

In [33]:
from seasons_module import load_seasons_comparison_files
comparison_dict = load_seasons_comparison_files()

In [34]:
# Compute vectors for the comparison documents
comparison_vectors = {}
for fname in comparison_dict.keys():
    comparison_vectors[fname] = norm_vec(np.array([comparison_dict[fname].count(word) for word in pruned_vocab]))

In [35]:
def compare_to_compvecs(s1):
    resultdict = {}
    for cname in comparison_vectors.keys():
        resultdict[cname] = dot(doc_vectors[s1], comparison_vectors[cname])
    return resultdict

In [36]:
compare_to_compvecs("angelapre")

{'side': 0.5143281672291301,
 'tilt': 0.39569872439840387,
 'cf': 0.5837300238472753}

In [None]:
def max_from_dict(the_dict):
    key, value = max(the_dict.items(), key=lambda x:x[1])
    return key

student_codes = {}
for name in doc_vectors.keys():
    student_codes[name] = max_from_dict(compare_to_compvecs(name))

**How similar are our results to the codes assigned by human coders?**

In [None]:
def compute_accuracy():
    number_right = 0
    total_possible = 0
    for name in student_codes.keys():
        if seasons_corpus[name][1] != "none":
            total_possible += 1
            if student_codes[name] == seasons_corpus[name][1]:
                number_right += 1
    return 1.0 * number_right / total_possible

In [None]:
compute_accuracy()

In [None]:
gold_list = []
test_list = []
for name in student_codes.keys():
    if seasons_corpus[name][1] != "none":
        gold_list += [seasons_corpus[name][1]]
        test_list += [student_codes[name]]
cm = nltk.ConfusionMatrix(gold_list, test_list)
cm

In [None]:
print(cm)

## Some slightly different ways of computing document vectors

### First variant: use just a subset of the vocabulary when constructing the vectors

In [None]:
word_fdist = nltk.FreqDist()
for fname in seasons_corpus.keys():
    pruned_transcript_words = [w for w in seasons_corpus[fname][0] if w not in stop_list]
    word_fdist.update(pruned_transcript_words)
word_fdist.most_common(25)

In [None]:
new_vocab = [w[0] for w in word_fdist.most_common(50) if w not in stop_list]

In [None]:
print(new_vocab)

In [None]:
# compute the document vector for each document
doc_vectors = {}
for fname in seasons_corpus.keys():
    doc_vectors[fname] = norm_vec(np.array([seasons_corpus[fname][0].count(word) for word in new_vocab]))

In [None]:
# Compute vectors for the comparison documents
comparison_vectors = {}
for fname in comparison_dict.keys():
    comparison_vectors[fname] = norm_vec(np.array([comparison_dict[fname].count(word) for word in new_vocab]))

In [None]:
student_codes = {}
for name in doc_vectors.keys():
    student_codes[name] = max_from_dict(compare_to_compvecs(name))
compute_accuracy()

In [None]:
gold_list = []
test_list = []
for name in student_codes.keys():
    if seasons_corpus[name][1] != "none":
        gold_list += [seasons_corpus[name][1]]
        test_list += [student_codes[name]]
cm = nltk.ConfusionMatrix(gold_list, test_list)
cm
print(cm)

### Other variants: Use different weight factors when constructing the vectors

#### A weight factor function will commonly use these different quantities in some combination

* `tf = term frequency` (number of times the term appears in the present document)
* `df = document frequency` (number of documents in which the term appears)
* `cf = corpus frequency` (total number of times the term appears in the entire corpus)
* `N = number of documents`

In [None]:
def tf(tf, df, cf, N):
    return tf

def logtf(tf, df, cf, N):
    if tf == 0:
        result = 0
    else:
        result = (1 + np.log(tf))
    return result

def onehot(tf, df, cf, N):
    if tf == 0:
        return 0
    else:
        return 1

def tfidf(tf, df, cf, N):
    if tf == 0:
        result = 0
    else:
        result = (1 + np.log(tf)) * np.log(N  / df)
    return result

#### We need to construct the document frequency distribution since we don't have that yet.

In [None]:
doc_fdist = nltk.FreqDist()
for fname in seasons_corpus.keys():
    pruned_transcript_words = [w for w in seasons_corpus[fname][0] if w not in stop_list]
    doc_fdist.update(list(set(pruned_transcript_words)))

#### A little function to simplify the task of constructing vectors with different weight factors

In [None]:
def compute_vector(words, vocab, df, cf, N, weight_function):
    new_vector = []
    for w in vocab:
        tf = words.count(w)
        new_vector.append(weight_function(tf, df[w], cf[w], N))
    return norm_vec(np.array(new_vector))

In [None]:
# compute the document vector for each document
doc_vectors = {}
N = len(seasons_corpus.keys())
wf = tfidf
for fname in seasons_corpus.keys():
    doc_vectors[fname] = compute_vector(seasons_corpus[fname][0], new_vocab, doc_fdist, word_fdist, N, wf)
# Compute vectors for the comparison documents
comparison_vectors = {}
for fname in comparison_dict.keys():
    comparison_vectors[fname] = compute_vector(comparison_dict[fname], new_vocab, doc_fdist, word_fdist, N, wf)

In [None]:
student_codes = {}
for name in doc_vectors.keys():
    student_codes[name] = max_from_dict(compare_to_compvecs(name))
compute_accuracy()