In [None]:
##### Welcome to our Hands-On Workshop! We'll mostly be exploring classification (predictive modeling) as a
# way to study and analyze corpora of literary texts, poetry in particular.

##### INSTALLATION!

#### If you haven't installed Python yet, I'd like you to download and install the Anaconda distribution, which
# can be found here: https://docs.anaconda.com/anaconda/install/
# There are options for both MAC and Windows


In [None]:
##### DATA!

# I circulated a link to the data in preparation for this workshop. Please download the data and put it somewhere
# you can find on your machine. The data is out of copyright by American standards, so you are free to share it or
# use it.

# A word or two about MASSES versus OTHERS, two important American literary journals from the Modernist period,
# i.e., 1910 to 1925 or so. Masses was an important left-wing, proletarian journal; Others was a flagship journal
# of high modernism, especially Imagism. T. S. Eliot, Ezra Pound, and Amy Lowell all published there.

In [None]:
##### OK let's get going

# First, let's just get to know our data!

# Let's read in all the metadata and data
# Some necessary imports

import pandas as pd


# Suppress warnings. Do not do this at home, but it makes the notebook look better (KLN)
import warnings
warnings.filterwarnings("ignore")

meta1 = 'MassesData.csv'
meta2 = 'OthersData.csv'


def _read_metadata(csv_path):
    """Load one journal's metadata CSV and keep only rows that name a text file.

    The CSV is decoded as latin1; any row whose FILENAME is missing is dropped,
    because such a row cannot be matched to a poem on disk.
    """
    frame = pd.read_csv(csv_path, encoding='latin1')
    return frame.dropna(axis=0, subset=['FILENAME'])


mass_meta = _read_metadata(meta1)
other_meta = _read_metadata(meta2)

# Sanity check: row/column counts and column names of both metadata tables
mass_meta.shape, other_meta.shape, mass_meta.columns, other_meta.columns
#mass_meta.head()

In [None]:
# This is a really nice data structure but we're not done. Let's read in the actual
# text files into the dataframe, and then we'll have ALL of the data (meta + text) in one place!

mass_path = 'Masses/'
other_path = 'Others/'

import codecs
import string
exclude = set(string.punctuation)

def _read_tokens(folder, filename):
    """Read one poem file and return its tokens as a list of strings.

    The file is opened in text mode with the platform default encoding
    (the same behaviour `codecs.open(path, "r")` had), lower-cased, stripped
    of every character in `exclude` (string.punctuation), and split on
    whitespace.

    Parameters
    ----------
    folder : str
        Directory prefix, including the trailing slash (e.g. 'Masses/').
    filename : object
        Value from the FILENAME column; converted with str() before use.
    """
    # `with` closes the handle; the original loop left every file open.
    with open(folder + str(filename), "r") as handle:
        raw = handle.read().lower()
    raw = ''.join(ch for ch in raw if ch not in exclude)
    return raw.split()


# DataFrame.set_value() was removed in pandas 1.0, so build each TEXT column
# in a single pass instead of writing cell by cell. Each cell holds the
# token list for that row's poem.
mass_meta['TEXT'] = mass_meta['FILENAME'].map(lambda name: _read_tokens(mass_path, name))
other_meta['TEXT'] = other_meta['FILENAME'].map(lambda name: _read_tokens(other_path, name))

In [None]:
# Sanity check
mass_meta.head()

In [None]:
# OK let's learn a bit about our two corpora; let's get a baseline of comparison to understand
# how they might be similar or different

# First, average year
mass_meta['YEAR'].mean(), other_meta['YEAR'].mean()

In [None]:
# Let's next get average length of title + length of poem

# Add two numeric columns to each metadata table:
#   TITLE_LENGTH - number of whitespace-separated words in POEM TITLE
#   LENGTH       - number of tokens in the TEXT list
# DataFrame.set_value() was removed in pandas 1.0, and pre-filling the columns
# with '' made them object-typed; vectorised assignment avoids both problems.
# A missing POEM TITLE now yields NaN instead of raising AttributeError.
for meta in (mass_meta, other_meta):
    meta['TITLE_LENGTH'] = meta['POEM TITLE'].str.split().str.len()
    meta['LENGTH'] = meta['TEXT'].map(len)


In [None]:
# Sanity check
mass_meta.head()

In [None]:
# Now, compute average title length, average text length
#mass_meta['TITLE_LENGTH'].mean(), other_meta['TITLE_LENGTH'].mean()
mass_meta['LENGTH'].mean(), other_meta['LENGTH'].mean()

In [None]:
mass_meta['TITLE_LENGTH'].mean(), other_meta['TITLE_LENGTH'].mean()

In [None]:
# Let's get some more information, working at the full corpus level

# Flatten the per-poem token lists into one long token list per journal,
# preserving row order and the order of tokens inside each poem.
all_mass = [token for poem_tokens in mass_meta['TEXT'] for token in poem_tokens]
all_other = [token for poem_tokens in other_meta['TEXT'] for token in poem_tokens]

# Sanity check: total token counts and the first five tokens of each corpus
len(all_mass), all_mass[:5], len(all_other), all_other[:5]


In [None]:
# Let's get Type-Token Ratio (TTR) for both corpora

# Type-token ratio = number of distinct words / total number of words.
mass_types, mass_tokens = len(set(all_mass)), len(all_mass)
other_types, other_tokens = len(set(all_other)), len(all_other)

TTR_mass = mass_types / mass_tokens
TTR_other = other_types / other_tokens

TTR_mass, TTR_other

In [None]:
# Next let's identify the most common words in each corpus

from collections import Counter
import operator
from operator import itemgetter

# Tally how often each word occurs in each corpus
mass_counts = Counter(all_mass)
other_counts = Counter(all_other)

# most_common() with no argument returns every (word, count) pair sorted by
# count, highest first - the same ordering as sorting items() on the count.
ranked_mass = mass_counts.most_common()
ranked_other = other_counts.most_common()

ranked_mass[:10], ranked_other[:10]


In [None]:
# After removing stopwords

# Load the stopword list (whitespace-separated words). The `with` block closes
# the file, which the original bare open() never did.
with open("jockers_stopwords.txt", "r") as text_file:
    jockers_words = text_file.read().split()

# Membership tests against a set are O(1); testing every corpus token against
# the list was needlessly O(len(corpus) * len(stopwords)).
jockers_set = set(jockers_words)

all_mass2 = [word for word in all_mass if word not in jockers_set]
all_other2 = [word for word in all_other if word not in jockers_set]

mass_counts2 = Counter(all_mass2)
other_counts2 = Counter(all_other2)

# (word, count) pairs, most frequent first
ranked_mass2 = sorted(mass_counts2.items(), key=itemgetter(1), reverse=True)
ranked_other2 = sorted(other_counts2.items(), key=itemgetter(1), reverse=True)

ranked_mass2[0:10], ranked_other2[0:10]

In [None]:
# Ok one more thing; let's determine which words are in MASS but not OTHER and vice versa!

# Vocabulary of each (stopword-filtered) corpus as a set, so each membership
# test is O(1). The original tested every token against the other corpus's
# full token *list*, which is quadratic in corpus size.
mass_vocab = set(all_mass2)
other_vocab = set(all_other2)

# Tokens (duplicates kept, original order kept) that never occur in the other corpus
mass_not_other = [word for word in all_mass2 if word not in other_vocab]
other_not_mass = [word for word in all_other2 if word not in mass_vocab]
        

In [None]:
len(mass_not_other), len(other_not_mass)

In [None]:
# Frequency tables for the words that are exclusive to one corpus
mass_counts3 = Counter(mass_not_other)
other_counts3 = Counter(other_not_mass)

# Every (word, count) pair ordered by count, highest first
ranked_mass3 = mass_counts3.most_common()
ranked_other3 = other_counts3.most_common()

ranked_mass3[:10], ranked_other3[:10]

In [None]:
##### SUMMARY
# OK that's enough of that surface analysis!
# So we know a few things now about these two corpora and how they are similar/different
# We know about their average years; their average title and poem lengths
# About their basic lexical diversity/repetitiveness, and total number of words
# Which words are most common and which words are in one corpus and not in another

## This is important to start developing a basic intuition as to these two corpora before we do
# more complex analysis, i.e. classification. Before we throw a complex model at the data and we
# develop layers of mediation, I think it's important to first know what's there more directly,
# before we run all the data through a machine learning algorithm.


In [None]:
# OK let's start classification

# First let's merge our two metadata dataframes into one

# Stack the two metadata tables. ignore_index=True gives the result a fresh
# 0..n-1 index, which is what reset_index() followed by dropping the 'index'
# column achieved; the positional form drop('index', 1) is rejected by pandas 2.x.
all_meta = pd.concat([mass_meta, other_meta], ignore_index=True)

# Sanity check: the merged row count should equal the sum of the two inputs.
# (Placed last so the notebook actually displays it.)
mass_meta.shape, other_meta.shape, all_meta.shape

In [None]:
# OK next we have to build the document-term matrix (DTM) for classification

from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): texts are read from ALL_TEXTS/ here (as utf8), whereas the
# exploration cells read them from Masses/ and Others/ - confirm that folder
# holds the same files under the same FILENAME values.
corpus_path = 'ALL_TEXTS'

# Build DTM: one row per file, one column per word that appears in at least
# three documents (min_df=3).
vectorizer = CountVectorizer(input='filename', min_df=3, encoding='utf8')
dtm = vectorizer.fit_transform(corpus_path + "/" + all_meta['FILENAME'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

# Dense array of counts
matrix = dtm.toarray()

In [None]:
# Sanity check
matrix.shape, len(vocab), all_meta.shape

In [None]:
# Things are about to get complicated so let's first reduce our metadata to just what we need
# Keep exactly these five metadata columns, in this order. Later cells index
# the merged frame by position (column 1 = JOURNAL, column 4 = FILENAME,
# columns 5+ = word counts), so the order matters.
META_COLUMNS = ['AUTHOR', 'JOURNAL', 'YEAR', 'POEM TITLE', 'FILENAME']
all_meta = all_meta.loc[:, META_COLUMNS]


In [None]:
# Next let's merge the DTM with the metadata, so we have everything we need for classification

# Wrap the raw count matrix in a DataFrame whose columns are the vocabulary words
DTM = pd.DataFrame(data=matrix, columns=vocab)


In [None]:
# Place the metadata columns and the word-count columns side by side;
# rows are matched on the index of the two frames.
final_df = pd.concat([all_meta, DTM], axis='columns')
final_df.shape


In [None]:
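# OK now we have everything we need for classification/predictive modeling
# I'm going to implement the ONE VS ALL-THE-REST method, i.e. leave-one-out: hold out one poem, train on all the
# others, predict the held-out poem, and repeat for every poem (mini lecture on that) to produce stable results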

from sklearn.linear_model import LogisticRegression

# One tuple per poem: (filename, true label, predicted label, class probabilities)
output = []

# Hold out each row in turn, train on every other row, and predict the held-out row.
# Column layout of final_df: 0-4 are metadata (1 = JOURNAL, 4 = FILENAME),
# 5 onward are word counts.
for i in range(final_df.shape[0]):
    # First grab the test case, and then all data minus the test case
    predict_row = final_df.loc[[i]]
    train_rows = final_df.drop(i)

    # Logit model with L1 penalty and C=1.0. The solver must be named explicitly:
    # the current default (lbfgs) rejects penalty='l1', while liblinear - the
    # default in older scikit-learn releases - supports it.
    model = LogisticRegression(penalty='l1', C=1, solver='liblinear')

    # Fit the model on the word counts, with the journal label as target
    X = train_rows.iloc[:, 5:]
    y = train_rows.iloc[:, 1]
    TEST_CASE = predict_row.iloc[:, 5:]
    true_label = predict_row.iloc[:, 1].values
    true_fname = predict_row.iloc[:, 4].values
    model.fit(X, y)

    # Predicted class label for the held-out poem
    predicted = model.predict(TEST_CASE)
    # Class probabilities for the held-out poem (array of shape (1, n_classes))
    probs = model.predict_proba(TEST_CASE)

    # Save output; labels are stored as the string form of one-element arrays
    output.append((str(true_fname), str(true_label), str(predicted), probs))


In [None]:
# Sanity check
output[0]


In [None]:
# OK let's parse this output so we can analyze it

# Split the list of (filename, true, predicted, probabilities) tuples
# into four parallel lists, one per field.
files = [record[0] for record in output]
trues = [record[1] for record in output]
predicts = [record[2] for record in output]
probs = [record[3] for record in output]


In [None]:
# Make a dataframe for easy viewing
# One row per poem: file name, true journal, predicted journal
df = pd.DataFrame({
    'FILENAME': files,
    'TRUE_CLASS': trues,
    'PREDICT_CLASS': predicts,
})
df.head()

In [None]:
# Prob values are still messy, we need to clean them

import re
# Each entry of `probs` is a 1 x 2 array; pull out the first and second
# class probability of its single row as plain Python floats.
probs3A = [prob.tolist()[0][0] for prob in probs]
probs3B = [prob.tolist()[0][1] for prob in probs]

In [None]:
# Put into dataframe
# Attach the two probability lists to the results frame as new columns
for column_name, values in (('PROB_MASS', probs3A), ('PROB_OTHER', probs3B)):
    df[column_name] = values

# Sanity check
df.head()

In [None]:
# Let's do some analysis. First let's determine how accurate the classifier is
# by computing the rate of misclassifieds, and let's also identify those misclassifieds

# Label every prediction 'CORRECT' or 'FALSE' by comparing true and predicted
# class. DataFrame.set_value() was removed in pandas 1.0, so the column is
# computed in one vectorised step rather than row by row.
is_correct = df['TRUE_CLASS'] == df['PREDICT_CLASS']
df['RESULT'] = is_correct.map({True: 'CORRECT', False: 'FALSE'})

# The poems the classifier got wrong
misclassifieds = df[df['RESULT'] == 'FALSE']

In [None]:
# Simple stuff, compute overall accuracy of classifier

# Accuracy = correctly classified poems / all poems
corrects = df.loc[df['RESULT'] == 'CORRECT']
accuracy = len(corrects) / len(df)
accuracy

In [None]:
# Texts most strongly predicted to belong to MASSES or OTHERS

# Rank poems by predicted probability of each class, highest first
mass_df = df.sort_values(by='PROB_MASS', ascending=False)
other_df = df.sort_values(by='PROB_OTHER', ascending=False)

# Ten poems with the highest PROB_OTHER
other_df.head(10)

In [None]:
##### Exercise! That's interesting MAS0206 is so strongly predicted to be OTHERS!
# Let's take a look at it!

In [None]:
##### Let's now look at specific feature weights based on the model
# The idea is that we want a bit more granularity as to what specific features are driving our classification

In [None]:
# Some imports

import numpy as np
from scipy.stats import pearsonr, norm

In [None]:
# We'll need some functions to help us compute these feature weights

# Simple function to compute Z-score

def Ztest(vec1, vec2):
    """Two-sample z-test on the difference of means.

    Adapted from https://stats.stackexchange.com/questions/124096/two-samples-z-test-in-python

    Each sample's standard deviation is taken with numpy's default (ddof=0),
    and the standard error of the difference is sqrt(sd1^2/n1 + sd2^2/n2).

    Parameters
    ----------
    vec1, vec2 : sequence of numbers
        The two samples to compare.

    Returns
    -------
    (z, pval) : tuple
        z is (mean(vec1) - mean(vec2)) / standard error; pval is the
        two-sided p-value under the standard normal distribution.
    """
    mean_a = np.mean(vec1)
    mean_b = np.mean(vec2)
    std_a = np.std(vec1)
    std_b = np.std(vec2)
    size_a = len(vec1)
    size_b = len(vec2)

    # Standard error of the difference between the two means
    std_error = np.sqrt(std_a**2/size_a + std_b**2/size_b)
    z = (mean_a - mean_b)/std_error

    # Two-sided: double the upper-tail probability of |z|
    pval = 2*(norm.sf(abs(z)))
    return z, pval


In [None]:
# Another function to compute logistic regression weights on each feature (also does Z-test)

canonic_c = 1.0 # value returning best f1

def feat_pval_weight(meta_df_, dtm_df_):
    """Build a per-word report of z-test p-values and logistic-regression weights.

    Parameters
    ----------
    meta_df_ : DataFrame
        Metadata with a JOURNAL column holding 'Mas' or 'Oth'; its index
        selects the rows of `dtm_df_` to use.
    dtm_df_ : DataFrame
        Document-term matrix (one row per document, one column per word).

    Returns
    -------
    DataFrame with one row per retained word and columns:
        FEAT      - the word
        P_VALUE   - two-sided p-value from Ztest comparing the word's
                    standardised values in 'Mas' rows against 'Oth' rows
        LR_WEIGHT - coefficient from an L1-penalised logistic regression whose
                    target is JOURNAL == 'Mas', so a positive weight pushes a
                    document toward 'Mas' and a negative one toward 'Oth'
    """
    # Restrict the DTM to the documents described by the metadata, standardise
    # it, and drop any word column that contains NaN.
    dtm_df_ = dtm_df_.loc[meta_df_.index.tolist()]
    dtm_df_ = normalize_model(dtm_df_, dtm_df_)[0]
    dtm_df_ = dtm_df_.dropna(axis=1, how='any')

    is_mass = meta_df_['JOURNAL'] == 'Mas'
    is_other = meta_df_['JOURNAL'] == 'Oth'

    # Plain arrays per class. `.values` replaces DataFrame.as_matrix(),
    # which was removed in pandas 1.0.
    mass_dtm = dtm_df_.loc[meta_df_[is_mass].index.tolist()].values
    other_dtm = dtm_df_.loc[meta_df_[is_other].index.tolist()].values

    # One z-test per word column
    pvals = [Ztest(mass_dtm[:, i], other_dtm[:, i])[1] for i in range(dtm_df_.shape[1])]

    # solver='liblinear' is required for penalty='l1': the current default
    # solver (lbfgs) rejects it, and liblinear was the default in older releases.
    clf = LogisticRegression(penalty='l1', C=canonic_c, class_weight='balanced',
                             solver='liblinear')
    clf.fit(dtm_df_, is_mass)
    weights = clf.coef_[0]

    feature_df = pd.DataFrame()

    feature_df['FEAT'] = dtm_df_.columns
    feature_df['P_VALUE'] = pvals
    feature_df['LR_WEIGHT'] = weights

    return feature_df

In [None]:
# It's probably a good idea to turn all the values in the DTM to standard units; that's what this function does

def normalize_model(train_df_, test_df_):
    """Convert raw counts to row-relative frequencies, then to column z-scores.

    Both frames are first divided row-wise by their own row sums. Column means
    and (population, ddof=0) standard deviations are then taken from the
    training frame only and applied to both frames. Columns that contain any
    NaN in the standardised training frame (e.g. zero-variance words) are
    dropped from both outputs.

    Note: a training row whose counts sum to zero becomes all-NaN, which makes
    the final dropna remove every column.

    Parameters
    ----------
    train_df_, test_df_ : DataFrame
        Document-term matrices with the same columns.

    Returns
    -------
    (train_df_, test_df_) : tuple of DataFrame
        The standardised frames, restricted to the surviving columns.
    """
    # Normalize each value by the sum of all values in its row.
    # (Vectorised; same result as a per-row apply, but far faster.)
    train_df_ = train_df_.div(train_df_.sum(axis=1), axis=0)
    test_df_ = test_df_.div(test_df_.sum(axis=1), axis=0)

    # Per-column mean and stdev. The axis is stated explicitly because
    # np.mean(DataFrame) collapses to a single scalar on pandas >= 2.0, which
    # silently broke the per-column standardisation. ddof=0 matches np.std.
    train_mean = train_df_.mean(axis=0)
    train_std = train_df_.std(axis=0, ddof=0)

    # Transform each value to standard units for its column
    train_df_ = (train_df_ - train_mean) / train_std
    test_df_ = (test_df_ - train_mean) / train_std

    # Keep only columns that are fully defined in the training frame
    train_df_ = train_df_.dropna(axis=1, how='any')
    test_df_ = test_df_[train_df_.columns]

    return train_df_, test_df_

In [None]:
# Let's recall our earlier dataframes

# Metadata
all_meta.head()


In [None]:
# Document Term Matrix
DTM.head()

In [None]:
# Sanity check
all_meta.shape, DTM.shape

In [None]:
# Significance threshold for the per-feature p-values (Bonferroni adjustment deliberately not applied)
sig_thresh = 0.01 # not divided by len(DTM.columns): a Bonferroni correction is usual with this many features, but
# it wasn't necessary for this dataset, probably because this set is relatively small

In [None]:
# Get feature report
import numpy as np
# Per-word p-values and logistic-regression weights for the whole corpus
feat_df = feat_pval_weight(meta_df_=all_meta, dtm_df_=DTM)
feat_df.shape

In [None]:
# Feature Report, TOP 25

# Keep only the words whose p-value passes the threshold, then order them by
# logistic-regression weight, largest first. The model's target is
# JOURNAL == 'Mas', so the largest positive weights mark MASSES words;
# sorting with ascending=True instead would list the OTHERS words first.
significant = feat_df[feat_df['P_VALUE'] <= sig_thresh]
out = significant.sort_values('LR_WEIGHT', ascending=False)

# The 20 highest-weighted words
out2 = out['FEAT'].tolist()
top_feats = out2[:20]

print("TOP MASSES TEXT FEATURES: ")
for o in top_feats:
    print(o)

In [None]:
##### Close reading!
# Let's go back to that MASSES text that was misclassified confidently to be an OTHERS text
# Now we have a foundation or a better basis to close read it; we know the specific words 
# that are important in classifying it as OTHERS rather than MASSES, its actual identity

In [None]:
# Ok so that's just some basic stuff we can do with classification/predictive modeling with a 2 class example
# Obviously there is a ton more we can do; for example, in the lecture I gave on "Race and Distant Reading,"
# I look at the feature variance between our two classes, and show how one group is far more variant than
# the other. There's a lot more one can do with this method, of course, for literary studies.


In [None]:
# I want to do one more thing though. I increasingly find it useful to approach one's data from multiple perspectives,
# to use multiple methods to see how different models understand the data.
# There's another way we can determine the semantic differences between two groups of texts that is far simpler than
# classification.
# We can simply use a most distinctive words test.
# So let's try that, just to see what kind of different results this produces versus our classification exercise.


In [None]:
# We'll use the Mann Whitney U test
# A good overview/explanation and rationale for literary texts/poems is here:
# https://tedunderwood.com/2011/11/09/identifying-the-terms-that-
# characterize-an-author-or-genre-why-dunnings-may-not-be-the-best-method/

# Key point: "In general, it gives less weight to raw frequency, and more weight to 
# the relative ubiquity of a term in different corpora."

In [None]:
# Let's first call up our data
final_df.head()

In [None]:
# We need to split our corpora back into MASSES and OTHERS again
# This is the format my Mann Whitney U test function takes

# Select each journal by exact label. The original used <= 'Mas' and >= 'Oth',
# which are lexicographic range tests: they would also sweep in any other label
# that happens to sort before 'Mas' or after 'Oth'. Equality matches the way
# feat_pval_weight selects the two classes.
corpus1 = final_df[final_df['JOURNAL'] == 'Mas']
corpus2 = final_df[final_df['JOURNAL'] == 'Oth']

corpus1.shape, corpus2.shape

In [None]:
# Let's slice both corpora now to just get the word counts

# Columns 0-4 are metadata; everything from position 5 onward is a word count
word_columns = slice(5, None)
corpus1A = corpus1.iloc[:, word_columns]
corpus2A = corpus2.iloc[:, word_columns]

corpus1A.shape, corpus2A.shape
corpus1A.head()

In [None]:
# Let's run the mann whitney utest
# NB: For a function that takes some time, I like to see its progress, thus the i counter

from scipy.stats import mannwhitneyu

# Number of (corpus1 document, corpus2 document) pairs; U ranges from 0 to this value
n_pairs = corpus1A.shape[0] * corpus2A.shape[0]

# One (word, U statistic, rho) tuple per word column
out = []
for i, column in enumerate(corpus1A):
    print(i)  # progress counter
    vals = corpus1A[column].values
    vals2 = corpus2A[column].values
    mw = mannwhitneyu(vals, vals2)
    mwStat = mw.statistic
    # rho = U / (n1 * n2). The original `mwStat / n1 * n2` parsed as
    # (U / n1) * n2 because / and * bind left to right, inflating every value
    # by n2**2. Word ranking is unaffected (constant factor), but rho now lies
    # in [0, 1].
    mwRho = mwStat / n_pairs
    out.append((column, mwStat, mwRho))

In [None]:
# Fit into a dataframe
# Pull the word and its rho out of each (word, U, rho) tuple
words = [record[0] for record in out]
rho = [record[2] for record in out]

# Two-column table: WORD and its MDW_RHO score
mdw_df = pd.DataFrame({'WORD': words, 'MDW_RHO': rho})
len(rho)

In [None]:
# Lowest rho first. Per the original note: sort with ascending=False to see the
# MASSES words, ascending=True (as here) to see the OTHERS words.
# NOTE(review): which corpus a low U/rho favours depends on scipy's
# mannwhitneyu convention for the installed version - confirm.
df2 = mdw_df.sort_values(by="MDW_RHO")
df2.head(20)

In [None]:
## What do you make of these results compared to the classification feature results?
# What's your intuition as to why we got these results based on what you know of the model?