# Problem Set 1 - Andreas Bloch
This is the problem set submission of Andreas Bloch (abloch@student.ethz.ch)
## Imports
First we'll load some libraries and scripts that will be useful throughout the entire notebook.

In [1]:
import os

%matplotlib inline
import matplotlib.pyplot as plt

from collections import Counter

import itertools

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import SnowballStemmer

import numpy as np

import pandas as pd

import re

import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

from string import punctuation

import swifter

## Reading of Data
Next we'll read the data from disk into a pandas data frame.

In [2]:
# define data locations
DATA_DIR = './data'
CASE_DIR = './data/cases'
REVERSED_CASES = 'case_reversed.csv'

# read reversal decisions from CSV
case_reversed_csv = pd.read_csv(os.path.join(DATA_DIR, REVERSED_CASES))

# create data frame to combine the data sources
# (one row per case id; the statistic columns are filled in by later cells)
reversal_case_ids = case_reversed_csv['caseid'].values
columns = ['year', 'text', 'doc', 'reversed',
           'num_sentences', 'num_words', 'num_letters',
           'num_nouns', 'num_verbs', 'num_adjectives',
           'trigrams']

data = pd.DataFrame(
    index=pd.Index(reversal_case_ids, name='case_id'),
    columns=pd.Index(columns, name='attributes')
)

# store reversal decisions into data frame
# (the CSV row is read by position: first value = case id, second = decision;
# .iloc makes the positional access explicit and the local name avoids
# shadowing the builtin `reversed`)
for idx, row in case_reversed_csv.iterrows():
    case_id = row.iloc[0]
    was_reversed = row.iloc[1]
    data.at[case_id, 'reversed'] = was_reversed

# read and store case txt files into data frame
# (file names are expected to look like '<year>_<case_id>.<extension>')
case_ids = []
for file in os.listdir(os.fsencode(CASE_DIR)):
    filename = os.fsdecode(file)

    year, _, remainder = filename.partition('_')
    case_id = remainder.partition('.')[0]

    # the context manager closes the handle again after reading
    with open(os.path.join(CASE_DIR, filename), 'r') as file_handle:
        text = file_handle.read()

    case_ids.append(case_id)

    data.at[case_id, 'year'] = year
    data.at[case_id, 'text'] = text

# data integrity check:
# check that every case has a matching reversal decision and vice-versa
if set(case_ids) != set(reversal_case_ids):
    raise Exception('case_ids not matching! check loading of data!')

# set data types
# (the statistic columns are still empty here, hence float to allow NaN)
data = data.astype({
    'year': int,
    'reversed': float,
    'num_sentences': float,
    'num_words': float,
    'num_letters': float,
    'num_nouns': float,
    'num_verbs': float,
    'num_adjectives': float,
})

# print data types
print('Dataframe Data Types:')
data.dtypes

Dataframe Data Types:


attributes
year                int64
text               object
doc                object
reversed          float64
num_sentences     float64
num_words         float64
num_letters       float64
num_nouns         float64
num_verbs         float64
num_adjectives    float64
trigrams           object
dtype: object

In [3]:
# print preview of data to show that it's loaded
print('Dataframe Data:')
data.head()

Dataframe Data:


attributes,year,text,doc,reversed,num_sentences,num_words,num_letters,num_nouns,num_verbs,num_adjectives,trigrams
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
X3JGGO,1925,"POLLOCK , District Judge.\nFor convenience, t...",,0.0,,,,,,,
X3OH3J,1924,"JOHNSON , Circuit Judge.\nThis is a patent in...",,0.0,,,,,,,
X3U0KO,1925,"WOOLLEY , Circuit Judge.\nThe indictment agai...",,0.0,,,,,,,
X53HAD,1924,"ROGERS , Circuit Judge.\nThe complainant is a...",,0.0,,,,,,,
X9VC5V,1925,"DAWKINS , District Judge.\nComplainant brough...",,0.0,,,,,,,


In [4]:
# print data frame dimensions
print('Dataframe Dimensions:')
data.shape

Dataframe Dimensions:


(5762, 11)

## Shuffling of Data

Shuffle the data to make sure we have positive and negative reversal examples.

In [5]:
data = shuffle(data, random_state=23)

## Optional: Subsampling of Data

Execute this code if you want to subsample your data.

In [6]:
SUBSAMPLE = False         # set this to True if you want to subsample your data for performance reasons
SUBSAMPLING_SIZE = 1000

In [7]:
if SUBSAMPLE: 
    data = data.head(SUBSAMPLING_SIZE)

## Exercise 1
**Q:** Use spaCy to process all cases. Split the documents into sentences and tokens. Compute number of sentences, words, and letters for each document. Report histograms for these statistics.

In [None]:
# load the English spaCy pipeline
nlp = spacy.load('en')

# parse every case text into a spaCy document
data['doc'] = data['text'].swifter.apply(nlp)

# number of sentences: walk the sentence iterator of the parsed document
data['num_sentences'] = data['doc'].swifter.apply(
    lambda parsed: sum(1 for _ in parsed.sents)
)
# number of words: length of the parsed document (i.e. its token count)
data['num_words'] = data['doc'].swifter.apply(
    lambda parsed: len(parsed)
)
# number of letters: alphabetic characters of the raw text
data['num_letters'] = data['text'].swifter.apply(
    lambda raw: len([char for char in raw if char.isalpha()])
)

In [None]:
# use wide figures across whole document
sns.set(rc={'figure.figsize': (16, 5)})

# one histogram per size statistic: (data frame column, x-axis label)
histogram_specs = [
    ('num_sentences', 'Number of Sentences'),
    ('num_words', 'Number of Words'),
    ('num_letters', 'Number of Letters'),
]

# plot the frequency distribution of each statistic in its own figure
for column, x_label in histogram_specs:
    ax = sns.distplot(data[column].values, kde=False)
    ax.set(xlabel=x_label, ylabel='Frequency')
    plt.show()

## Exercise 2

**Q:** Use the spaCy parts of speech (POS) tags to count number of nouns, verbs, and adjectives in each document. Visualize POS frequency by year.

**A:** See here if you want a more detailed explanation of spacy.parts_of_speech symbols:

https://universaldependencies.org/u/pos/all.html

I've decided not to count proper nouns (PROPN) as nouns.

In [None]:
# count number of nouns, verbs and adjectives (through POS tags)
# target column -> spaCy POS symbol whose tokens are counted
pos_tag_by_column = {
    'num_nouns': spacy.parts_of_speech.NOUN,
    'num_verbs': spacy.parts_of_speech.VERB,
    'num_adjectives': spacy.parts_of_speech.ADJ,
}
for column, pos_tag in pos_tag_by_column.items():
    # pos_tag is bound as a default argument so that each lambda keeps
    # the tag of its own loop iteration
    data[column] = data['doc'].swifter.apply(
        lambda doc, pos_tag=pos_tag: sum(token.pos == pos_tag for token in doc)
    )

In [None]:
# total number of nouns, verbs and adjectives per year
# (the columns are selected with a list: selecting several groupby columns
# with a bare tuple is deprecated in pandas)
pos_columns = ['num_nouns', 'num_verbs', 'num_adjectives']
ax = data.groupby('year')[pos_columns].sum().plot.line()
ax.set(xlabel='Year', ylabel='Number of Tokens', title='POS Frequency by Year')
plt.show()

## Exercise 3

**Q:** Follow the steps in lecture to normalize your corpus (e.g., removing punctuation) and discuss your choices about what information to exclude. Using the normalized tokens, make a feature set of all trigrams that end in a noun.

**A:** The steps are explained in the comments in the code below. The feature set will be stored in the column data['trigrams'].


In [None]:
# the list of stop words used by spaCy can be found in the imports above.
# I've decided not to add or remove any stop words from this list as they
# seem to be fine.
print('Used Stop Words:')
print(STOP_WORDS)

# create punctuation remover
punctuation_remover = str.maketrans('', '', punctuation)

# create lemmatizer
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)

# create snowball stemmer
stemmer = SnowballStemmer('english')

# function to normalize a doc
def normalize_text_and_create_trigrams(doc):
    """Normalize a parsed document and collect its trigrams that end in a noun.

    Every sentence of ``doc`` is processed on its own, so no trigram spans a
    sentence boundary. Each token is cleaned (newlines and punctuation
    removed), tokens containing digits are collapsed to '#', the token is
    lemmatized, lowercased, dropped if it is a stop word, and finally stemmed.

    Args:
        doc: parsed document exposing ``.sents``; every sentence yields tokens
            with ``.text`` and ``.pos`` (a spaCy ``Doc`` in this notebook).

    Returns:
        List of strings 'w1.w2.w3', one per trigram of normalized words whose
        third word was tagged as a noun, in document order.
    """
    # list of found normalized 3-grams in doc
    trigrams = []

    # process sent. by sent. to avoid creating n-grams that overlap a sent.
    for sentence in doc.sents:

        # (normalized word, POS tag) pairs of the tokens kept in this sentence
        words_and_poses = []

        # filter out unwanted tokens and normalize the kept tokens
        for token in sentence:
            word = token.text
            # replace newlines with spaces
            word = word.replace('\r', ' ').replace('\n', ' ')
            # remove punctuation
            word = word.translate(punctuation_remover)
            # replace multiple subsequent spaces with one space
            word = re.sub(' +', ' ', word)
            # check that word still has some text (not just one char or space)
            if len(word) <= 1:
                continue
            # normalize numbers (28, 28th, 1st, ...)
            if any(char.isdigit() for char in word):
                word = '#'
            # lemmatize the word
            # NOTE: the lemmatizer is expected to return a list of candidate
            # lemmas. Indexing the result with [0] *before* testing for a list
            # (as done previously) always failed the test, so the lemma was
            # never applied.
            lemmas = lemmatizer(word, token.pos)
            if isinstance(lemmas, str):
                # tolerate a lemmatizer that returns a single string
                lemmas = [lemmas]
            if lemmas:
                # pick the first option if several lemmas were found
                word = lemmas[0]
            # convert the word to lowercase
            word = word.lower()
            # remove stopwords
            # this check has to be done at the end because some words aren't
            # a stop-word in their unlemmatized form. Further note that
            # token.is_stop didn't use all the words from STOP_WORDS, so it's
            # better to check against the full list again.
            if word in STOP_WORDS:
                continue
            # stem the word to remove singular/plural
            word = stemmer.stem(word)
            # keep track of the token's word and pos tag
            words_and_poses.append((word, token.pos))

        # keep the candidate 3-grams of the sentence that end in a noun
        for first, second, third in _gen_n_grams(words_and_poses, 3):
            if third[1] == spacy.parts_of_speech.NOUN:
                # create 3-gram string
                trigrams.append(first[0] + "." + second[0] + "." + third[0])

    return trigrams


def _gen_n_grams(items, n):
    """Yield every run of ``n`` consecutive elements of the list ``items``.

    Yields nothing when ``items`` has fewer than ``n`` elements.
    """
    for start in range(len(items) - n + 1):
        yield items[start:start + n]

# determine trigrams (ending in a noun) occurring in each text
data['trigrams'] = data['doc'].swifter.apply(normalize_text_and_create_trigrams)

In [None]:
print('Example Trigram-Creation (Preview):')
print('')
print('Original Text (Preview):')
print(data['text'].iloc[0][0:400]+"...")
print('')
print('Created Trigrams (Preview):')
print(data['trigrams'].iloc[0][0:10])

## Exercise 4

**Q:** Make a dataframe with at least 1000 features (frequencies over trigrams ending in a noun). Standardize the features to variance one while maintaining sparsity. 
    

In [None]:
# count trigram frequencies over the whole corpus by feeding the trigram
# lists of all cases, one after the other, into a single counter
term_frequencies = Counter(itertools.chain.from_iterable(data['trigrams']))

# print 100 most common trigrams (as (trigram, count) tuples)
print('100 Most Common Trigrams:')
for trigram, count in term_frequencies.most_common(100):
    print((trigram, count))
print('')

In [None]:
# number of most common trigrams that are used as features
NUM_FEATURES = 1000

# most common trigrams, ordered from most to least frequent
top_trigrams = [
    trigram for trigram, _ in term_frequencies.most_common(NUM_FEATURES)
]

# create a vocabulary (set of trigrams) for fast membership tests
vocab = set(top_trigrams)

# create data frame to represent cases with features according to the most
# common trigrams. The columns follow the frequency ranking rather than the
# iteration order of the set, which is not stable between kernel runs, so
# the feature matrix is laid out the same way on every run.
data_featurized_columns = top_trigrams + ['reversed']
data_featurized = pd.DataFrame(
    index=pd.Index(list(data.index.values), name='case_id'),
    columns=pd.Index(data_featurized_columns, name='features')
)

# print shape of featurized data frame
print('Featurized Data Shape:')
data_featurized.shape

In [None]:
# for each list of trigrams that appear in a case, create its vector
# representation according to the most common trigrams.
# The rows are collected in a list and the frame is built once at the end;
# writing every cell individually with .at needs one pandas call per
# (case, feature) pair and is very slow for thousands of cases.
feature_rows = []
for case_id, case in data.iterrows():
    # count the occurrences of the vocabulary trigrams in this case
    # (a Counter returns 0 for trigrams that do not occur)
    case_trigram_counts = Counter(
        trigram for trigram in case['trigrams'] if trigram in vocab
    )
    # lay the values out in the column order of data_featurized; the
    # 'reversed' column holds the label (whether the case was reversed)
    feature_rows.append([
        float(case['reversed']) if col == 'reversed'
        else float(case_trigram_counts[col])
        for col in data_featurized.columns
    ])

# data_featurized was created with the index of data, so the collected rows
# line up with its index
data_featurized = pd.DataFrame(
    feature_rows,
    index=data_featurized.index,
    columns=data_featurized.columns
)

# print a preview of the data features
print('Data 1000-Most-Common-Trigrams-Featurized:')
data_featurized.head()

In [None]:
# extract data matrices: the last column holds the label ('reversed'),
# all columns before it hold the trigram features
feature_matrix = data_featurized.values.astype(float)
X, y = feature_matrix[:, :-1], feature_matrix[:, -1]

# print shapes of data matrices
print('Shapes of X and y:')
for matrix in (X, y):
    print(matrix.shape)

# print preview of X
print('X= (preview)')
X[:10, :10]

In [None]:
# standardize features (without subtracting mean to maintain sparsity)
# with_mean=False: no centering, so zero counts stay zero
# with_std=True:   every feature is divided by its standard deviation
# copy=False:      request in-place scaling instead of working on a copy
scaler = StandardScaler(copy=False, with_mean=False, with_std=True)
X = scaler.fit_transform(X)

# print rescaled data (first 10 cases, first 10 features)
print('X= (preview)')
X[0:10,0:10]

## Exercise 5

**Q:** Link the dataframe to the outcome reverse. Create a training set and test set. Train a LogisticRegression model with default parameters to predict reversal. Compute accuracy and F1 for the prediction in the training set and in the test set. 

**A:** The cases and the reversal outcomes have already been linked in the data loading stage (at the beginning of this notebook).

In [None]:
# shuffle the data (again just to make sure)
X, y = shuffle(X, y, random_state=71)

In [None]:
# create training and test set (80% / 20%)
# (fixed random_state so that the split, and therefore the reported scores,
# are the same on every run)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=71
)

# train a LogisticRegression model with the default parameters
# (specify solver to avoid warnings)
log_reg = LogisticRegression(solver='liblinear')
log_reg.fit(X_train, y_train)

# do predictions on training and test set
y_train_pred = log_reg.predict(X_train)
y_test_pred = log_reg.predict(X_test)

# print accuracy and (macro-averaged) F1 for training set and test set
evaluation_sets = (
    ('Training', y_train, y_train_pred),
    ('Test', y_test, y_test_pred),
)
for set_name, y_true, y_predicted in evaluation_sets:
    acc = accuracy_score(y_true, y_predicted, normalize=True)
    f1 = f1_score(y_true, y_predicted, average='macro')
    print('Accuracy and F1-Score on '+set_name+' Set with Default Parameters:')
    print('Accuracy: \t'+str(acc))
    print('F1: \t\t'+str(f1))
    print('')

## Exercise 6

**Q:** Use GridSearchCV() to choose hyperparameters: L1 vs L2 penalty, and regularization parameter C. Report the best model parameters and score. Report the ROC Curve and AUC.

In [None]:
# use logistic regression model
# (specify solver to avoid warnings)
# (increase max_iter to ensure convergence)
log_reg = LogisticRegression(
    solver='liblinear',
    max_iter=10000
)

# specify parameter grid
# - penalty: norm used for the regularization term (L1 vs. L2)
# - C: regularization parameter of LogisticRegression
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.5, 0.7, 1.0, 1.5, 2.0, 5.0]
}

# specify grid search
grid = GridSearchCV(
    estimator=log_reg,          # estimator to use
    param_grid=param_grid,      # parameters to do grid search over
    scoring='f1',               # use F1 score to evaluate models
    n_jobs=-1,                  # use all cores
    iid=True,                   # assume data was i.i.d. (to avoid warning)
    cv=10,                      # use stratified 10-fold CV
    refit=True,                 # re-fit best model
    verbose=0,                  # do not print training progress
    return_train_score=True     # save training scores
)

# train with grid-search
# (the search runs on the complete X and y, not only on a training split)
grid.fit(X, y)
print('')

# report best hyperparameters
print('Best Hyperparameters:')
print(grid.best_params_)
print('')

# report the best score
# (cross-validated F1 of the best parameter combination, see `scoring`)
print('Best Score:')
print(grid.best_score_)
print('')

# keep track of best model
best_model = grid.best_estimator_

# print ROC curve
# (the curve is built from decision-function scores that cross_val_predict
# collects with 10-fold cross-validation on the same X and y)
y_pred = cross_val_predict(best_model, X, y, method='decision_function', cv=10)
fpr, tpr, thresholds = roc_curve(y, y_pred)
plt.plot(fpr, tpr)
# dashed diagonal as visual reference line
plt.plot([0,1],[0,1],'k--')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.show()

# print area under the curve (AUC) score
auc = roc_auc_score(y, y_pred)
print('Area under the Curve (AUC) of Best Model:')
print('AUC: '+str(auc))
print('')

## Exercise 7

**Q:** Make a new dataframe where each sentence (of each case) is treated as a separate document. Compute vader compound sentiment scores for each sentence and report the top 10 highest- and lowest-sentiment sentences.

In [None]:
# build data frame for sentences
columns = [
    'sentence',
    'sentiment_score',
    'cluster'
]
sent_data = pd.DataFrame(columns=pd.Index(columns, name='attributes'))

# set data frame data types
sent_data[['sentiment_score']] = sent_data[['sentiment_score']].astype(float)
sent_data[['cluster']] = sent_data[['cluster']].astype(int)

# gets sentences of a document
def get_sentences(doc):
    """Return the text of every sentence in ``doc`` as a list of strings.

    Carriage returns and newlines inside a sentence are each replaced by a
    single space.
    """
    return [
        span.text.replace('\r', ' ').replace('\n', ' ')
        for span in doc.sents
    ]

# extract sentences from previously processed data
sentences = data['doc'].swifter.apply(get_sentences)

# store sentences into data frame
sent_data['sentence'] = list(itertools.chain.from_iterable(sentences.values))

# print data types
print('Dataframe Data Types:')
print(sent_data.dtypes)
print('')

# print preview of data to show that it's loaded
print('Dataframe Data:')
print(sent_data.head())
print('')

# create sentiment intensity analyzer
sid = SentimentIntensityAnalyzer()

# get the compound polarity score for a sentence
def get_compound_polarity_score(sentence):
    """Return the 'compound' entry of the analyzer's polarity scores for ``sentence``."""
    return sid.polarity_scores(sentence)['compound']

# compute the sentiment score for all sentences in data
sent_data['sentiment_score'] = \
    sent_data['sentence'].swifter.apply(get_compound_polarity_score)

# sort sentences by sentiment score
sent_data = sent_data.sort_values(by=['sentiment_score'], ascending=False)

def print_sentences(sents):
    """Print each row of ``sents`` as '(Score: <score>) <sentence>'.

    Reads the columns 'sentiment_score' and 'sentence'. Every entry is
    followed by an empty line, and one more empty line closes the listing.
    """
    for score, sentence in zip(sents['sentiment_score'], sents['sentence']):
        print(''.join(['(Score: ', str(score), ') ', str(sentence)]))
        print('')
    print('')

In [None]:
print('10 Highest Sentiment Sentences:\n')
print_sentences(sent_data.head(10))

In [None]:
print('10 Lowest Sentiment Sentences:\n')
print_sentences(sent_data.tail(10))

In [None]:
# re-sort data according to index
sent_data = sent_data.sort_index()

## Exercise 8

**Q:** Use TfidfVectorizer to compute tf-idf frequencies for each sentence, and then compute cosine similarities between all sentences. Report example pairs of very similar sentences and very dissimilar sentences. 

In [None]:
# convert the collection of raw sentences to a matrix of TF-IDF features
tfidf = TfidfVectorizer(
    strip_accents='ascii',  # only consider ascii characters
    lowercase=True,         # convert text to lowercase
    stop_words='english',   # use english stop words
    ngram_range=(2, 3),     # consider (2 to 3)-grams for embeddings
    max_df=0.5,             # ignore terms that are in more than 50% of docs
    min_df=0.001,           # ignore terms that are in less than .1% of docs
    max_features=1000,      # only consider top 1000 terms (w.r.t. term freq.)
    binary=False,           # do not use binary counts (use integer counts)
    norm='l2',              # each sentence-embedding will have unit-norm
    use_idf=True,           # enables inverse-doc-freq weighting
)

# fit TF-IDF Vectorizer
X = tfidf.fit_transform(sent_data['sentence'])

# print embedding dimensions
print('Embedding Dimensionality Chosen by TfidfVectorizer: '+str(X.shape[1]))

# compute the pairwise cosine similarities
# Only the first MAX_SIMILARITY_SENTENCES sentences are compared with each
# other: the similarity matrix is dense and quadratic in the number of
# sentences, so computing it for all sentences and cutting it down afterwards
# would need far more memory than the part that is actually inspected.
MAX_SIMILARITY_SENTENCES = 1000
X_similarity = X[:MAX_SIMILARITY_SENTENCES]
S = cosine_similarity(X_similarity, X_similarity)

def largest_indices(arr, n):
    """Return the indices of the n largest values of a numpy array.

    Args:
        arr: numpy array of any shape.
        n: number of entries to return. Values larger than the number of
            elements are clamped to it; ``n <= 0`` yields an empty result.

    Returns:
        Tuple with one index array per dimension of ``arr`` (the format of
        ``np.unravel_index``), ordered from the largest value downwards.
    """
    flat = arr.flatten()
    n = min(int(n), flat.size)
    if n <= 0:
        # without this guard, the slice [-0:] would select *all* elements
        return tuple(np.empty(0, dtype=np.intp) for _ in arr.shape)
    # partial sort: the n largest values end up in the last n positions
    indices = np.argpartition(flat, -n)[-n:]
    # order those n candidates from largest to smallest value
    indices = indices[np.argsort(-flat[indices])]
    return np.unravel_index(indices, arr.shape)

In [None]:
# reduce size of S for efficiency reasons
max_idx = min(1000, S.shape[0])
S = S[0:max_idx,0:max_idx]

In [None]:
# report an example of very similar sentences
# The diagonal of S compares every sentence with itself, so an argmax over
# the unmasked matrix would simply pair a sentence with itself. It is
# therefore excluded (on a copy, S itself stays untouched).
S_without_diagonal = S.copy()
np.fill_diagonal(S_without_diagonal, -np.inf)
most_sim_idx = \
    np.unravel_index(np.argmax(S_without_diagonal, axis=None), S.shape)
print('Two very similar sentences are ' + str(most_sim_idx) +
      ' with similarity '+str(S[most_sim_idx])+'\n')
print(str(most_sim_idx[0])+": "+sent_data.iloc[most_sim_idx[0]]['sentence']+'\n')
print(str(most_sim_idx[1])+": "+sent_data.iloc[most_sim_idx[1]]['sentence'])

In [None]:
# report an example of very dissimilar sentences
# (position of the smallest entry of S, converted back to a (row, column) pair)
flat_position = np.argmin(S, axis=None)
most_dissim_idx = np.unravel_index(flat_position, S.shape)
first_position, second_position = most_dissim_idx
first_sentence = sent_data.iloc[first_position]['sentence']
second_sentence = sent_data.iloc[second_position]['sentence']
print('Two very dissimilar sentences are ' + str(most_dissim_idx) +
      ' with similarity '+str(S[most_dissim_idx])+":"+'\n')
print(str(first_position)+": "+first_sentence+'\n')
print(str(second_position)+": "+second_sentence)

## Exercise 9

**Q:** Use k-means clustering to assign the sentences into 20 clusters. List 5 example sentences from each cluster. 

In [None]:
# build k-means model (with k=20)
# (fixed random_state: k-means starts from random centers, so without a seed
# the cluster assignments differ from run to run)
num_clusters = 20
num_examples_per_cluster = 5
km = KMeans(n_clusters=num_clusters, n_jobs=-1, random_state=23)

# fit k-means model to sentences
km.fit(X)

# get the cluster centers
V = km.cluster_centers_

# add cluster labels to data
cluster_labels = km.labels_.tolist()
sent_data['cluster'] = cluster_labels

# list example sentences from each cluster
# (iterate over num_clusters so the listing stays in sync with the model)
for cluster_idx in range(num_clusters):
    cluster_members = sent_data[sent_data['cluster'] == cluster_idx]
    print('Members of Cluster '+str(cluster_idx)+':')
    for sentence in cluster_members['sentence'].head(num_examples_per_cluster):
        print(sentence)
    print('')