In [None]:
import csv
import numpy as np

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import SGDClassifier

import matplotlib.pyplot as plt
%matplotlib inline

import re

In [None]:
# load custom dictionary
# Each CSV row maps a spelling (column 0) to its replacement (column 1);
# spellProcessor looks whole tokens up in this mapping.
custom_dictionary = dict()
row_count = 0
# newline='' is what the csv module documents for files handed to csv.reader,
# so that line breaks inside quoted fields are parsed correctly.
with open('../lexicons/custom_dictionary.csv', 'rt', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        # Blank lines come back as [] and malformed lines may have a single
        # column; skip them instead of failing with an IndexError on row[1].
        if len(row) < 2:
            continue
        custom_dictionary[row[0]] = row[1]
        row_count += 1
print("read",row_count,"words")

In [None]:
def spellProcessor(word):
    """Normalise historical spellings in a piece of text.

    The text is lower-cased, a fixed list of spelling substitutions is applied
    in order, digits are stripped, and finally every whitespace-separated
    token that appears as a key in the module-level ``custom_dictionary`` is
    replaced by its mapped value.

    Parameters
    ----------
    word : str
        Text to normalise. Despite the name this may be a whole document: the
        function is passed to CountVectorizer as its ``preprocessor``.

    Returns
    -------
    str
        The normalised tokens joined by single spaces. Tokens that consisted
        only of digits disappear, and all runs of whitespace collapse.
    """
    word = word.lower()

    # (pattern, replacement) pairs, applied in this order. Patterns are
    # regular expressions but, apart from "highnes", are plain letter
    # sequences, so they behave like substring replacements: they also fire
    # INSIDE longer words (e.g. "kynges" -> "kings").
    # NOTE(review): that substring behaviour can also damage unrelated words
    # (e.g. "acte" inside "acted"); only "enacted" is repaired below.
    substitutions = (
        ("gyftis", "gifts"),
        ("gether", "gather"),
        ("spirituall", "spiritual"),
        ("feythfull", "faith"),
        ("wytnes", "witness"),
        ("almes", "alms"),
        ("desyre", "desire"),
        ("selfe", "self"),
        ("saffely", "safely"),
        ("realme", "realm"),
        ("acte", "act"),
        ("fourme", "form"),
        ("subiectes", "subjects"),
        ("theyr", "their"),
        # "kyndes" needs no rule of its own: it contains "kynde".
        ("kynde", "kind"),
        ("kynge", "king"),
        ("vpon", "unto"),
        ("purueyours", "purveyors"),
        # The lookahead leaves an already-correct "highness" alone; a plain
        # substring replace would turn it into "highnesss".
        (r"highnes(?!s)", "highness"),
        ("euery", "every"),
        # "quenes" needs no rule of its own: it contains "quene".
        ("quene", "queen"),
        ("whiche", "which"),
        ("bloude", "blood"),
        ("soueraine", "sovereign"),
        # Repairs "enacted", which the "acte" rule above turned into "enactd".
        ("enactd", "enacted"),
        ("vs", "us"),
    )
    for pattern, replacement in substitutions:
        word = re.sub(pattern, replacement, word)

    # Strip digits (raw string: '\d' is an invalid escape in a normal string),
    # then split; tokens that were nothing but digits vanish in the split.
    tokens = re.sub(r'\d', '', word).split()

    # Swap in custom dictionary entries for tokens that match a key exactly.
    return ' '.join(custom_dictionary.get(token, token) for token in tokens)

# Tokens to ignore when building the vocabulary; passed to CountVectorizer
# as stop_words. The list mixes modern function words, older spelling
# variants and punctuation marks.
stopWords = [
    ",", "the", "and", "of", "or", "to", "in", "shall", "be", "that",
    "any", "by", ".", "such", "as", "this", "for", "same", "all", "said",
    "other", "'s", ";", "her", "is", "every", "[", "]", "they", "within",
    "our", "not", "so", "made", "no", "then", ":", "do", "from", "if",
    "it", "which", "at", "with", "thereof", "upon", "a", "because", "used", "some",
    "but", "aforesaid", "also", ")", "(", "what", "&", "may", "are", "their",
    "them", "sayde", "suche", "shalbe", "anye", "sayd", "thesaid", "/", "...", "/",
    "either", "haue", "vnto", "thy", "did", "was", "were", "have", "thee", "your",
    "thou", "unto", "hath", "had", "went",
]

In [None]:
# Read the text metadata table; the first row is a header and is not kept.
input_data = []
metadata = []

with open('../texts/textMetadata.csv', 'rt') as csvfile:
    all_rows = list(csv.reader(csvfile))

# Everything after the header row is a metadata record.
metadata = all_rows[1:]
# The reported count covers every row read, header included.
row_count = len(all_rows)
print("read",row_count,"lines")

In [None]:
# Show the first metadata record (the header row was dropped on load) to
# check the column layout: column 0 is later used as the file name and
# column 4 as the class label.
metadata[0]

In [None]:
# Column 4 of each metadata record is used as the class label and column 0
# as the file name, which is resolved relative to ../texts/.
labels = [record[4] for record in metadata]
files = ["../texts/" + record[0] for record in metadata]

# Count uni-, bi- and tri-grams, reading each document from its file path.
# NOTE: per the scikit-learn docs a custom preprocessor replaces the built-in
# lowercase / strip_accents step, so lower-casing is done by spellProcessor.
vectorizer = CountVectorizer(
    input='filename',
    preprocessor=spellProcessor,
    stop_words=stopWords,
    ngram_range=(1,3),
    lowercase=True,
    strip_accents=None,
)

In [None]:
# get document term matrix
# Fits the vocabulary on the listed files and returns the raw n-gram counts,
# one row per entry in `files`.
dtm = vectorizer.fit_transform(files)

# convert to tf-idf frequencies (to account for differences in text lengths)
# The transformer is fitted on the same counts it then re-weights.
tfidf_transformer = TfidfTransformer()
dtm_tfidf = tfidf_transformer.fit_transform(dtm)

In [None]:
# Shape of the document-term matrix: (documents, vocabulary terms).
# Shapes recorded from earlier runs:
# (45, 979638) with custom dictionary
# (45, 990188) without custom dictionary

dtm.shape

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# attempt to deal with imbalanced dataset

# Report how many documents each class has. The classes are sorted so the
# listing is the same on every run (set iteration order is not stable).
for c in sorted(set(labels)):
    print("Class: {0}, Documents: {1}".format(c,labels.count(c)))

# `classes` and `y` must be passed by keyword: current scikit-learn releases
# reject the positional form. The result is one weight per entry of
# np.unique(labels), in that order.
class_weights = compute_class_weight('balanced',
                                     classes=np.unique(labels),
                                     y=labels)

In [None]:
# create and fit model using Support Vector Machine (SVM)
# SGDClassifier shuffles the training data between epochs, so without a fixed
# random_state the learned coefficients (and every "key feature" ranking
# derived from them) differ from run to run. tol=None disables early
# stopping, so training always runs for the full max_iter epochs.
clf = SGDClassifier(
    tol=None,
    class_weight="balanced",
    max_iter=1000,
    random_state=42,
).fit(dtm_tfidf, labels)

In [None]:
def get_counts(term):
    """Return the total count of `term` across the whole corpus.

    The term's column index is looked up in the fitted vectorizer's
    vocabulary and used to read the module-level `vocab_sums` row of column
    totals. A term that is not in the vocabulary raises KeyError.
    """
    column = vectorizer.vocabulary_[term]
    total = vocab_sums[0, column]
    return total

In [None]:
# what are our top terms?
# Column totals of the document-term matrix: one corpus-wide count per
# vocabulary term, kept as a single row indexed as vocab_sums[0, column].
vocab_sums = dtm.sum(axis=0)

# (term, total count) pairs, most frequent first.
sorted_vocab = sorted(
    ((term, vocab_sums[0, column]) for term, column in vectorizer.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

# display top twenty words
# Slicing starts at the most frequent term (index 0) and simply yields fewer
# rows when the vocabulary has fewer than twenty entries.
for term, count in sorted_vocab[:20]:
    print(term,"->",count)

In [None]:
# produce key features for each class
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
feature_list=dict()

# NOTE(review): indexing coef_ by class position assumes one coefficient row
# per class (more than two classes) - TODO confirm for this label set.
for cn, cl in enumerate(clf.classes_):
    weights = clf.coef_[cn].ravel()
    # Feature indices ordered from the lowest coefficient to the highest.
    terms = np.argsort(weights)
    # reverse sign (and round to 3 decimals), in the same order as `terms`
    # NOTE(review): features are ranked from the most negative coefficient
    # upwards and then shown with the sign flipped - confirm this is the
    # intended notion of "key feature".
    flipped = -np.round(weights[terms], 3)

    print("Key features for {}:".format(cl))
    feature_list[cl] = list()
    for i, t in enumerate(terms):
        v = flipped[i]
        feature_list[cl].append((feature_names[t],v))
        # Only the first 50 entries are printed; the full ranking is kept.
        if i < 50:
            print("{0} ({1})".format(feature_names[t],v),end=", ")
    print("\n")

In [None]:
# reduce data
# For every class keep the first 100 ranked features for the word clouds and
# draw a bar chart of the first 25 plus the last 25 ranked features.
word_cloud_data = dict()
for cl, ranked in feature_list.items():
    word_cloud_data[cl] = {term: float(weight) for term, weight in ranked[:100]}

    viz_words = ranked[:25] + ranked[-25:]
    kwords = [pair[0] for pair in viz_words]
    values = [pair[1] for pair in viz_words]
    y_pos = range(len(values))

    fig = plt.figure(figsize=(35, 20), dpi=75)
    plt.barh(y_pos, values, align='center', tick_label=kwords, color="red")
    plt.title("Key Features: {0}".format(cl))
    plt.show()

In [None]:
from wordcloud import WordCloud

# show most important features for each class:
# One cloud per class, sized by the weights stored in word_cloud_data.
for cl, frequencies in word_cloud_data.items():
    print("Class: {0}".format(cl))
    fig = plt.figure(figsize=(35, 20), dpi=75)
    cloud_builder = WordCloud(
        width=900,
        height=500,
        max_words=1000,
        relative_scaling=1,
    )
    wordcloud = cloud_builder.generate_from_frequencies(frequencies)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()