In [2]:
# General Libraries Needed
import csv
import os 
import re
import glob, csv
import pandas as pd
from collections import defaultdict, Counter
from lxml import etree

# Functions for Supervised Classification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

# Libraries for Graphing
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

In [3]:
def getTexts(folder):
    '''
    Takes in plain text files and outputs a tuple of lists, with the first being the text
    within each file as a string and the second list being the IDs of each text. 
    '''
    textStrings = []
    fileNames = []
    for file in os.listdir(folder):
        path = os.path.join(folder,file)
        f = open(path,'r')
        text = f.readlines()[0]
        textStrings.append(text)
        name = file.split('.')[0]
        fileNames.append(name)
        f.close()
    return textStrings,fileNames

In [4]:

fileinfo = getTexts("/srv/data/texts")
# First we need to create an "instance" of the vectorizer, with the proper settings.
# Normalization is set to 'l2' by default
tfidf = TfidfVectorizer(min_df=2, sublinear_tf=True)
# I am choosing to turn on sublinear term frequency scaling, which takes the log of
# term frequencies and can help to de-emphasize function words like pronouns and articles. 
# You might make a different choice depending on your corpus.

# Once we've created the instance, we can "transform" our counts
results = tfidf.fit_transform(fileinfo[0])

# Make results readable using Pandas
readable_results = pd.DataFrame(results.toarray(), index=fileinfo[1], columns=tfidf.get_feature_names()) # Convert information back to a DataFrame
readable_results



Unnamed: 0,00,000,0000,03,04,05,06,08,09,10,...,zugh,zuiria,zulpher,zultan,zuma,zuny,zur,zurich,zutphen,ça
A01932,0.0,0.027438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A02495,0.004455,0.005033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010308,...,0.0,0.0,0.0,0.006427,0.0,0.007966,0.009818,0.0,0.0,0.006693
A03149,0.00399,0.007633,0.005238,0.0,0.0,0.0,0.0,0.005756,0.0,0.014328,...,0.009746,0.005756,0.0,0.0,0.0,0.0,0.011763,0.013735,0.013668,0.0
A04813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016237,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A07886,0.093886,0.0,0.0,0.0,0.07104,0.022265,0.073979,0.078064,0.0,0.016907,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10821,0.024619,0.011789,0.006527,0.0,0.011052,0.014382,0.015053,0.0,0.007173,0.019356,...,0.0,0.0,0.0,0.0,0.0,0.0,0.005618,0.018717,0.015576,0.0
A12458,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02205,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A12460,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008377,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A12466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023526,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A13290,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037746,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:

def keywords(csv):
    df = pd.read_csv(csv)
    keywords = df['keywords']
    ids = df['id']
    numFiles = len(ids)
    count = 0
    dict = {}
    while count < numFiles:
        words = set(keywords[count].split('--'))
        # removing unnecessary keywords
        words.discard(' Early works to 1800.')
        words.discard('')
        # Removing unnecessary dates  
        newWords = []
        for w in words: 
            w = w.replace('.','')
            w = re.sub(r'\([^)]*\)','',w)
            w = re.sub(r' ca|-|[0-9]{4}|,','',w)
            w = w.strip()
            newWords.append(w)
        newWords = set(newWords)
        newWords.discard('')
        newWords.discard('-')
        newWords.discard('17th century')
        newWords.discard('To')
        dict[ids[count]] = newWords
        count += 1
    return dict


def keyterms(csv):
    df = pd.read_csv(csv)
    keywords = df['keywords']
    ids = df['id']
    numFiles = len(ids)
    count = 0
    dict = {}
    while count < numFiles:
        words = set(keywords[count].split('--'))
        # removing unnecessary keywords
        words.discard(' Early works to 1800.')
        words.discard('')
        # Removing unnecessary dates  
        newWords = []
        for w in words: 
            w = w.replace('.','')
            w = re.sub(r'\([^)]*\)','',w)
            w = re.sub(r' ca|-|[0-9]{4}|,','',w)
            w = w.strip()
            newWords.append(w)
        newWords = set(newWords)
        newWords.discard('')
        newWords.discard('-')
        newWords.discard('17th century')
        newWords.discard('To')
        dict[ids[count]] = newWords
        count += 1
    keyterms = []
    for name in ids:
            keyterms.extend(dict[name])
    return keyterms

In [12]:

kterms = keyterms('/srv/data/CSVs/EPtuning.csv')
print(Counter(kterms))

targets=[]
kwdict = keywords('/srv/data/CSVs/EPtuning.csv')

df = pd.read_csv('/srv/data/CSVs/EPtuning.csv')
filekeys = df['id']
for filekey in filekeys:
    for t in kterms:
        if any(t in k for k in kwdict[filekey]):
            targets.append(t)
            break
        else:
            continue

print(targets)
print(Counter(targets))






Counter({'Great Britain': 8, 'History': 6, 'Voyages and travels': 5, 'Description and travel': 5, 'Commerce': 4, 'Colonial period': 3, 'Balance of trade': 2, 'Massachusetts': 2, 'New England': 2, 'Colonies': 2, 'Sermons English': 2, 'Customs administration': 2, 'Cape Breton Island': 1, 'Discoveries  English': 1, 'Geography': 1, 'Guyana': 1, 'Discovery and exploration': 1, 'East India Company': 1, 'East Indies': 1, 'Weights and measures': 1, 'Coinage': 1, 'Virginia': 1, 'Indians of North America': 1, 'Sermons': 1, 'OT': 1, 'Genesis XII 13': 1, 'Bible': 1, 'Newfoundland': 1, 'Colonization': 1, 'Litterateurs': 1, 'Learning and scholarship': 1, 'Law and legislation': 1, 'Civil War': 1, 'Tariff': 1, 'Import quotas': 1, 'England': 1, 'Grammar': 1, 'Mexico': 1, 'Central America': 1, 'Catholic Church': 1, 'Missions': 1, 'Pokonchi language': 1, 'Latin America': 1, 'Mercantile system': 1, 'Commercial policy': 1, 'Recipes': 1, 'Cookery': 1, 'Home economics': 1, 'Europe': 1, 'Fifth Monarchy Men': 

In [13]:
X_train, X_test, y_train, y_test = train_test_split(readable_results, targets, test_size=0.45, random_state=42)
lr = LogisticRegression(random_state=0, solver='lbfgs', penalty='none')
clf = lr.fit(X_train, y_train)

y_pred = clf.predict(X_test)
# evaluate accuracy
print("Accuracy score:", accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))
print()
print("Results of this run:\n")
print("Play Title | Actual Genre | Predicted Genre")
for title, real, predicted in zip(X_test.index, y_test, y_pred):
    print(f"{title} | {real} | {predicted}")

Accuracy score: 0.08333333333333333

Results of this run:

Play Title | Actual Genre | Predicted Genre
A12466 | History | Voyages and travels
A66847 | Recipes | Voyages and travels
A01932 | Cape Breton Island | History
B00052 | Description and travel | Voyages and travels
A31106 | Litterateurs | Voyages and travels
A13290 | Colonies | History
A41427 | Description and travel | Voyages and travels
A02495 | Voyages and travels | Voyages and travels
A83297 | Great Britain | History
A10821 | Commerce | Voyages and travels
A03149 | Geography | Voyages and travels
A37936 | Great Britain | History


In [16]:
cm = confusion_matrix(y_test,y_pred)
print(cm)
print(clf.classes_)

[[0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 2]
 [0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 2 0 0 0]
 [0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 1]]
['Commerce' 'Description and travel' 'East India Company' 'Great Britain'
 'Guyana' 'History' 'Newfoundland' 'Voyages and travels']


In [15]:
cm = confusion_matrix(y_test,y_pred)
cm_df = pd.DataFrame(cm, columns=clf.classes_, index=clf.classes_)
f, ax = plt.subplots(figsize=(15, 5))
sns.heatmap(cm_df,annot=True,cmap='Greens',linewidths=.5)

ValueError: Shape of passed values is (10, 10), indices imply (8, 8)