In [1]:
# General Libraries Needed
import csv
import os 
import re
import glob, csv
import pandas as pd
from collections import defaultdict, Counter
from lxml import etree

# Functions for Supervised Classification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

# Libraries for Graphing
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

In [2]:
def getTexts(folder):
    '''
    Read the first line of every file in a folder.

    Parameters
    ----------
    folder : str
        Path to a directory of plain text files. Every entry returned by
        os.listdir is opened, in the order os.listdir yields it.

    Returns
    -------
    tuple of (list of str, list of str)
        The first list holds the first line of each file (an empty string
        when the file is empty); the second holds the matching IDs, taken
        as the part of the file name before its first '.'.
        Both lists are in the same order, so position i in one corresponds
        to position i in the other.
    '''
    textStrings = []
    fileNames = []
    for file in os.listdir(folder):
        path = os.path.join(folder, file)
        # The context manager closes the handle even when reading raises.
        with open(path, 'r') as f:
            # readline() gives the same first line as readlines()[0] but
            # returns '' for an empty file instead of raising IndexError,
            # and does not load the rest of the file.
            text = f.readline()
        textStrings.append(text)
        fileNames.append(file.split('.')[0])
    return textStrings, fileNames

In [4]:

# fileinfo is the (texts, ids) pair returned by getTexts.
fileinfo = getTexts("/srv/data/companyTextsClean")
corpus_texts, corpus_ids = fileinfo

# Build the vectorizer instance with the settings used for this corpus.
# - norm is left at its default ('l2').
# - min_df=2 is passed explicitly.
# - sublinear_tf=True switches on sublinear term frequency scaling, which
#   takes the log of term frequencies and can help to de-emphasize function
#   words like pronouns and articles. A different corpus may call for a
#   different choice.
tfidf = TfidfVectorizer(min_df=2, sublinear_tf=True)

# Fit the vocabulary and transform the texts in a single step.
results = tfidf.fit_transform(corpus_texts)

# Convert the result to a dense, labelled DataFrame: rows are indexed by the
# text IDs, columns are the vectorizer's feature names.
readable_results = pd.DataFrame(
    results.toarray(),
    index=corpus_ids,
    columns=tfidf.get_feature_names_out(),
)
readable_results

Unnamed: 0,aaron,abandon,abas,abase,abash,abate,abatement,abbess,abbey,abbot,...,zeinel,zeinell,zemes,zermonia,zingis,zizimus,zofala,zolnock,zone,zoroam
A09209,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A03476,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A07886,0.0,0.020718,0.0,0.0,0.0,0.045048,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A03477,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A12330,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
A22250,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A68246,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A72397,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00838,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
def keywords(csv):
    '''
    Returns a dictionary in this format {id : (keywords,date)}

    Parameters
    ----------
    csv : str
        Path to a CSV file with 'id', 'keywords' and 'date' columns. Each
        'keywords' cell is a single string whose entries are separated
        by '--'.

    Returns
    -------
    dict
        Maps each value of the 'id' column to a tuple of (set of cleaned
        keyword strings, the row's 'date' value). A row whose 'keywords'
        cell is missing gets an empty set rather than raising.
    '''
    df = pd.read_csv(csv)
    ids = df['id']
    dates = df['date']
    rawKeywords = df['keywords']

    # Values dropped from every keyword set after cleaning.
    unwanted = {'', '-', '17th century', 'Early works to', 'To',
                'No Keywords', 'Great Britain'}

    result = {}
    for row in range(len(ids)):
        raw = rawKeywords[row]
        # pandas reads an empty cell as NaN (a float), which has no .split().
        if not isinstance(raw, str):
            raw = ''
        cleaned = set()
        for w in raw.split('--'):
            w = w.replace('.', '')
            # Drop parenthesised qualifiers.
            w = re.sub(r'\([^)]*\)', '', w)
            # Drop ' ca', hyphens, four-digit runs and commas.
            w = re.sub(r' ca|-|[0-9]{4}|,', '', w)
            # Collapse any keyword mentioning these names to one label each.
            if re.search('Sultan of the Turks', w):
                w = 'Sultan of the Turks'
            if re.search('Süleyman', w):
                w = 'Süleyman'
            cleaned.add(w.strip())
        result[ids[row]] = (cleaned - unwanted, dates[row])
    return result

In [27]:
# Peek at the first line of the keyword file and pull out the part before
# the first ' -- ' separator. The context manager closes the handle, which
# was previously left open.
with open('/srv/data/metadata/textCounts/eicOurTime.txt','r') as f:
    text = f.readline()
name = text.split(' -- ')[0]
name

'A09209'

In [23]:
def keywords_text(txt):
    '''
    Returns a dictionary in this format {id : keywords}

    Parameters
    ----------
    txt : str
        Path to a text file with one record per line, laid out as
        "<id> -- <keyword listing>". Braces and single quotes are stripped
        from the listing, which is then split on ', '.

    Returns
    -------
    dict
        Maps each id to a set of cleaned keyword strings. Lines that do not
        contain the ' -- ' separator (for example a trailing blank line) are
        skipped rather than raising IndexError.
    '''
    # Values dropped from every keyword set after cleaning.
    unwanted = {'', '-', '17th century', 'Early works to', 'To',
                'No Keywords', 'Great Britain'}

    result = {}
    # The context manager closes the file, which was previously left open.
    with open(txt, 'r') as f:
        for line in f:
            fields = line.split(' -- ')
            if len(fields) < 2:
                continue
            name = fields[0]
            # Only the second field is used, as before.
            listing = fields[1]
            for ch in ('{', '}', "'"):
                listing = listing.replace(ch, '')
            cleaned = set()
            for w in listing.split(', '):
                w = w.replace('.', '')
                # Drop parenthesised qualifiers.
                w = re.sub(r'\([^)]*\)', '', w)
                # Drop ' ca', hyphens, four-digit runs and commas.
                w = re.sub(r' ca|-|[0-9]{4}|,', '', w)
                # Collapse any keyword mentioning these names to one label each.
                if re.search('Sultan of the Turks', w):
                    w = 'Sultan of the Turks'
                if re.search('Süleyman', w):
                    w = 'Süleyman'
                # strip() also removes the newline left on the last entry.
                cleaned.add(w.strip())
            result[name] = cleaned - unwanted
    return result

    

In [37]:

# Keyword sets per text ID, merged from the three keyword files.
dict1 = keywords_text('/srv/data/metadata/textCounts/eicOurTime.txt')
dict2 = keywords_text('/srv/data/metadata/textCounts/levantOurTime.txt')
dict3 = keywords_text('/srv/data/metadata/textCounts/virginiaOurTime.txt')
kwdict = dict1|dict2
kwdict = kwdict|dict3
#kwdict = keywords('/srv/data/companyCSV/companyCSV.csv')

# Keywords that decide each label. They are checked in this order, so a text
# matching more than one list gets the first label that matches.
levant_terms = ['Turkey', 'Sultan of the Turks', 'Mediterranean Sea', 'Süleyman']
eastind_terms = ['East India Company', 'East Indies']
virginia_terms = ['Virginia', 'Virginia Company of London']

# IDs that never receive a label.
excluded_ids = ['A08092','B12972','A09573','A15796','B11348','B14268']

# Build the labels in the row order of readable_results. train_test_split
# pairs features and labels by position, so walking kwdict (whose order
# comes from the keyword files, not from the folder listing) could attach
# labels to the wrong rows. An ID with no keyword entry raises KeyError
# here instead of silently shifting every later label.
targets = []
for filekey in readable_results.index:
    if filekey not in excluded_ids:
        kw = kwdict[filekey]
        if any(k in levant_terms for k in kw):
            targets.append('Levant')
        elif any(k in eastind_terms for k in kw):
            targets.append('East India')
        elif any(k in virginia_terms for k in kw):
            targets.append('Virginia')
        else:
            targets.append('Neither')

In [39]:
# Number of labels built so far. targets is passed to train_test_split
# together with readable_results, so this should equal its row count.
len(targets)

108

In [40]:
# Split the TF-IDF rows and their labels: 45% is held out for testing, and
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    readable_results,
    targets,
    test_size=0.45,
    random_state=42,
)

# Fit a logistic regression with no regularisation penalty.
# NOTE(review): newer scikit-learn releases spell this option penalty=None
# and no longer accept the string 'none' - confirm against the installed
# version before upgrading.
lr = LogisticRegression(random_state=0, solver='lbfgs', penalty='none')
clf = lr.fit(X_train, y_train)

# Predict the held-out rows and report the fraction labelled correctly.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score:", accuracy)
print()
print("Results of this run:\n")
# NOTE(review): the header wording is kept unchanged; the rows printed under
# it are text IDs with their actual and predicted labels.
print("Play Title | Actual Genre | Predicted Genre")
for title, real, predicted in zip(X_test.index, y_test, y_pred):
    print(f"{title} | {real} | {predicted}")

Accuracy score: 0.8979591836734694

Results of this run:

Play Title | Actual Genre | Predicted Genre
A14518 | Virginia | Virginia
A04364 | East India | East India
A12330 | East India | East India
A16711 | Virginia | Virginia
A08440 | Virginia | Virginia
A14526 | Virginia | Virginia
A07559 | Levant | Levant
A09478 | Levant | Levant
A22537 | Virginia | Virginia
A04763 | East India | East India
A10725 | Virginia | Virginia
A12470 | Virginia | Virginia
A09209 | East India | Virginia
A13057 | Virginia | Virginia
A37552 | East India | East India
A19590 | Virginia | Virginia
A18686 | Levant | Levant
A14514 | Virginia | Virginia
B00838 | Virginia | Virginia
A15309 | Levant | Levant
A03451 | East India | Virginia
A17260 | Levant | Levant
A04581 | Virginia | Virginia
A01836 | Levant | Levant
A73966 | East India | Virginia
A04911 | Levant | Levant
A08162 | Levant | Levant
A38817 | Virginia | Virginia
A15387 | East India | Levant
A02472 | Levant | Levant
A14519 | Virginia | Virginia
A12466 | Virg