# United States Case Law Test

## __Wrangling__

In [2]:
from casewrangler import CaseWrangler
import os, sys, pickle, string
import pandas as pd
import numpy as np

import lucem_illud 
import gensim #For word2vec, etc
import nltk #For stop words and stemmers
from nltk.corpus import stopwords #For stopwords
import sklearn.metrics.pairwise #For cosine similarity
import sklearn.manifold #For T-SNE
import sklearn.decomposition #For PCA

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

%matplotlib inline

import os.path
stop_words_nltk = stopwords.words('english')

In [2]:
cw = CaseWrangler()

usPath = os.path.abspath('United States-20181204-xml/data/data.jsonl.xz')
usCorpus = os.path.abspath('United States-20181204-xml/corpus')
usCases = cw.extractCases(usPath)

In [3]:
scCases = []
for case in usCases:
    if case['court']['name'] == 'Supreme Court of the United States':
        scCases.append(case)
        
scDF = pd.DataFrame.from_dict(scCases)
scDF = scDF.drop(columns=['citations', 'court', 'first_page', 'id', 'jurisdiction', 'last_page', 'name', 'reporter', 'volume'])

scDF['decision_date'] = pd.to_datetime(scDF['decision_date'])
scDF['docket_number'].replace(regex=True, inplace=True, to_replace=r'[No. ]',value=r'')
scDF['docket_number'].replace(regex=True, inplace=True, to_replace=r'[;]',value=r',')

scDF['opinions'] = [scDF.casebody[i]['data']['opinions'] for i in scDF.index]
scDF = scDF.drop(columns=['casebody'])

In [4]:
allOpinions = []
opTypes = ['majority', 'dissent', 'concurrence', 'concurring-in-part-and-dissenting-in-part']

for opinions in scDF.opinions:
    text = [opinion['text'] for opinion in opinions if opinion['type'] in opTypes]
    allOpinions.append(text)
    
scDF['opinionText'] = allOpinions
scDF['opinionText'] = [' '.join(text) for text in allOpinions]
# scDF['length'] = scDF.opinionText.str.len()

In [7]:
scdbDF = pd.read_excel('SCDB_2018_02_caseCentered_Docket.xlsx')
issueDF = scdbDF[['docket', 'issueArea']]
scDF_issue = scDF.merge(issueDF, how='inner', left_on='docket_number', right_on='docket')
scDF_issue.info()
# civCases = caseDFmerged[caseDFmerged['issueArea'] == 2]
# civCases.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 331443 entries, 0 to 331442
Data columns (total 7 columns):
decision_date        331443 non-null datetime64[ns]
docket_number        331443 non-null object
name_abbreviation    331443 non-null object
opinions             331443 non-null object
opinionText          331443 non-null object
docket               331443 non-null object
issueArea            327488 non-null float64
dtypes: datetime64[ns](1), float64(1), object(5)
memory usage: 20.2+ MB


## __Cleaning__

In [9]:
scDF_issue.opinionText = scDF_issue.opinionText.map(lambda x: x.lower())
scDF_issue.opinionText = scDF_issue.opinionText.map(lambda x: x.replace('\n', ' ')) 
scDF_issue.opinionText = scDF_issue.opinionText.map(lambda x: x.replace('§', 'section')) 
scDF_issue.opinionText = scDF_issue.opinionText.map(lambda x: x.replace('united states', 'u_s'))
scDF_issue.opinionText = scDF_issue.opinionText.map(lambda x: x.replace(' u. s. ', '_u_s_')) 
scDF_issue.opinionText = scDF_issue.opinionText.map(lambda x: x.replace(' u.s. ', 'u_s')) 
scDF_issue.opinionText = scDF_issue.opinionText.map(lambda x: x.replace(' v. ', '_v_')) 
scDF_issue.opinionText = scDF_issue.opinionText.map(lambda x: x.replace(' f. supp. ', '_f_supp_')) 
scDF_issue.opinionText = scDF_issue.opinionText.map(lambda x: x.replace(' u. s. c. ', '_u_s_c_'))
scDF_issue.opinionText = scDF_issue.opinionText.map(lambda x: x.replace('■', ''))
scDF_issue.opinionText = scDF_issue.opinionText.map(lambda x: x.replace('  ', ' '))
scDF_issue.opinionText = scDF_issue.opinionText.map(lambda x: x.translate(str.maketrans('', '', string.punctuation))) 

scDF_issue.head()

Unnamed: 0,decision_date,docket_number,name_abbreviation,opinionText,issueArea
0,1977-05-31,76-333,"United Air Lines, Inc. v. Evans",mr. justice stevens delivered the opinion of t...,2.0
1,1977-01-25,76-333,"United Air Lines, Inc. v. Evans","c. a. 7th cir. certiorari granted, ante, p. 91...",2.0
2,1977-01-17,76-333,"United Air Lines, Inc. v. Evans","c. a. 7th cir. certiorari granted, ante, p. 91...",2.0
3,1976-11-01,76-333,"United Air Lines, Inc. v. Evans",c. a. 7th cir. certiorari granted.,2.0
4,1977-05-31,76-777,Connor v. Finch,mr. justice stewart delivered the opinion of t...,2.0


In [9]:
scDF_issue = pd.read_csv('scDF_by_issue.csv')
scDF_issue.opinionText = scDF_issue.opinionText.astype(str)
scDF_issue.opinionText = scDF_issue.opinionText.map(lambda x: x.replace('  ', ' '))
scDF_issue.opinionText = scDF_issue.opinionText.map(lambda x: x.replace('cir.', 'circuit'))
scDF_issue.opinionText = scDF_issue.opinionText.map(lambda x: x.replace('c. a.', 'court of appeal'))
scDF_issue.head()

Unnamed: 0,decision_date,docket_number,name_abbreviation,opinionText,issueArea
0,1977-05-31,76-333,"United Air Lines, Inc. v. Evans",mr. justice stevens delivered the opinion of t...,2.0
1,1977-01-25,76-333,"United Air Lines, Inc. v. Evans",court of appeal 7th circuit certiorari granted...,2.0
2,1977-01-17,76-333,"United Air Lines, Inc. v. Evans",court of appeal 7th circuit certiorari granted...,2.0
3,1976-11-01,76-333,"United Air Lines, Inc. v. Evans",court of appeal 7th circuit certiorari granted.,2.0
4,1977-05-31,76-777,Connor v. Finch,mr. justice stewart delivered the opinion of t...,2.0


In [7]:
# scDF_issue.opinionText = scDF_issue.opinionText.map(lambda x: x.translate(str.maketrans('', '', string.punctuation))) 
# scDF_issue.head()

Unnamed: 0,decision_date,docket_number,name_abbreviation,opinionText,issueArea
0,1977-05-31,76-333,"United Air Lines, Inc. v. Evans",mr justice stevens delivered the opinion of th...,2.0
1,1977-01-25,76-333,"United Air Lines, Inc. v. Evans",court of appeal 7th circuit certiorari granted...,2.0
2,1977-01-17,76-333,"United Air Lines, Inc. v. Evans",court of appeal 7th circuit certiorari granted...,2.0
3,1976-11-01,76-333,"United Air Lines, Inc. v. Evans",court of appeal 7th circuit certiorari granted,2.0
4,1977-05-31,76-777,Connor v. Finch,mr justice stewart delivered the opinion of th...,2.0


In [10]:
scDF_issue['tokenized_text'] = \
    scDF_issue['opinionText'].apply(lambda x: [nltk.word_tokenize(s) for s in nltk.sent_tokenize(x)])

scDF_issue['sent_count'] = scDF_issue['tokenized_text'].apply(lambda x: len(x))

## __Word Embeddings__

In [None]:
scDF_issue_test = scDF_issue[scDF_issue.sent_count >= 5]
scDF_issue_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65216 entries, 0 to 204554
Data columns (total 7 columns):
decision_date        65216 non-null object
docket_number        65216 non-null object
name_abbreviation    65216 non-null object
opinionText          65216 non-null object
issueArea            63760 non-null float64
tokenized_text       65216 non-null object
sent_count           65216 non-null int64
dtypes: float64(1), int64(1), object(5)
memory usage: 4.0+ MB


### Word2Vec

#### __(a) CBOW__

In [None]:
scW2V = gensim.models.word2vec.Word2Vec(scDF_issue_test['normalized_tokens'].sum())

In [494]:
print("A {} dimesional vector:".format(scW2V['abortion'].shape[0]))

A 100 dimesional vector:


In [495]:
scW2V.most_similar('abortion')

[('abortions', 0.7884458303451538),
 ('postviability', 0.6882057189941406),
 ('woman', 0.6716547012329102),
 ('pregnancy', 0.6633925437927246),
 ('pregnant', 0.6534528732299805),
 ('childbirth', 0.6517119407653809),
 ('pregnancies', 0.6327844858169556),
 ('surgery', 0.6324198246002197),
 ('fetus', 0.6008843183517456),
 ('physician', 0.5988410115242004)]

In [None]:
numWords = 150
targetWords = scW2V.wv.index2word[numWords]

wordsSubMatrix = []
for word in targetWords:
    wordsSubMatrix.append(scW2V[word])
wordsSubMatrix = np.array(wordsSubMatrix)
wordsSubMatrix

pcaWords = sklearn.decomposition.PCA(n_components = 50).fit(wordsSubMatrix)
reducedPCA_data = pcaWords.transform(wordsSubMatrix)
#T-SNE is theoretically better, but you should experiment
tsneWords = sklearn.manifold.TSNE(n_components = 2, early_exaggeration = 25).fit_transform(reducedPCA_data)

In [None]:
fig = plt.figure(figsize = (15,10))
ax = fig.add_subplot(111)
ax.set_frame_on(False)
plt.scatter(tsneWords[:, 0], tsneWords[:, 1], alpha = 0)#Making the points invisible 
for i, word in enumerate(targetWords):
    ax.annotate(word, 
                (tsneWords[:, 0][i],tsneWords[:, 1][i]), 
                size =  20 * (numWords - i) / numWords, 
                alpha = .8 * (numWords - i) / numWords + .2)
plt.xticks(())
plt.yticks(())
plt.show()

#### __(b) SGNS__

In [None]:
scW2V = gensim.models.word2vec.Word2Vec(scDF_issue_test['tokenized_text'].sum(), min_count=100, workers=8, sg=1)

In [6]:
print("A {} dimesional vector:".format(scW2V['abortion'].shape[0]))

KeyError: "word 'abortion' not in vocabulary"

In [495]:
scW2V.most_similar('abortion')

[('abortions', 0.7884458303451538),
 ('postviability', 0.6882057189941406),
 ('woman', 0.6716547012329102),
 ('pregnancy', 0.6633925437927246),
 ('pregnant', 0.6534528732299805),
 ('childbirth', 0.6517119407653809),
 ('pregnancies', 0.6327844858169556),
 ('surgery', 0.6324198246002197),
 ('fetus', 0.6008843183517456),
 ('physician', 0.5988410115242004)]

In [10]:
numWords = 50
targetWords = scW2V.wv.index2word[numWords]

wordsSubMatrix = []
for word in targetWords:
    wordsSubMatrix.append(scW2V[word])
wordsSubMatrix = np.array(wordsSubMatrix)
wordsSubMatrix

pcaWords = sklearn.decomposition.PCA(n_components = 50).fit(wordsSubMatrix)
reducedPCA_data = pcaWords.transform(wordsSubMatrix)
#T-SNE is theoretically better, but you should experiment
tsneWords = sklearn.manifold.TSNE(n_components = 2, early_exaggeration = 25).fit_transform(reducedPCA_data)

  explained_variance_ = (S ** 2) / (n_samples - 1)


ValueError: Expected n_neighbors > 0. Got 0

In [None]:
fig = plt.figure(figsize = (15,10))
ax = fig.add_subplot(111)
ax.set_frame_on(False)
plt.scatter(tsneWords[:, 0], tsneWords[:, 1], alpha = 0)#Making the points invisible 
for i, word in enumerate(targetWords):
    ax.annotate(word, 
                (tsneWords[:, 0][i],tsneWords[:, 1][i]), 
                size =  20 * (numWords - i) / numWords, 
                alpha = .8 * (numWords - i) / numWords + .2)
plt.xticks(())
plt.yticks(())
plt.show()

## __Confidence Intervals__

### Boostrapping

In [None]:
estimatesB=[]
for x in range(20):
    scW2VB = gensim.models.word2vec.Word2Vec(scDF_filter['normalized_sents'].sample(frac=1.0, replace=True).sum())
    try:
        estimatesB.append(cos_difference(scW2VB, 'liberty', 'constraint')[0,0])
    except KeyError:
        #Missing one of the words from the vocab
        pass
                                                      
estimatesB.sort()         
estimatesB

In [None]:
print("The 90% confidence interval for the cosine distance between liberty and constraint is:\n",
      estimatesB[1], estimatesB[-2])

### Subsampling

In [None]:
n_samples = 10
sample_indices = np.random.randint(0,n_samples,(len(scDF_filter),))

s_k =np.array([])
tau_k=np.array([])

for i in range(n_samples):
    sample_w2v = gensim.models.word2vec.Word2Vec(scDF_filter[sample_indices == i]['normalized_sents'].sum())
    try:
        #Need to use words present in most samples
        s_k = np.append(s_k, cos_difference(sample_w2v, 'liberty', 'constraint')[0,0])
    except KeyError:
        pass
    else:
        tau_k = np.append(tau_k, len(scDF_filter[sample_indices == i]))

print(s_k)
print(tau_k)

tau = tau_k.sum()
s = s_k.mean()
B_k = np.sqrt(tau_k) * s_k-s_k.mean()

In [None]:
print("The 90% confidence interval for the cosine distance between liberty and constraint is:\n",
      s-B_k[-2]/np.sqrt(tau), s-B_k[1]/np.sqrt(tau))

In [None]:
scW2V.save("WORD2VecTest.mm")

### Loss

In [None]:
scW2V_loss = gensim.models.word2vec.Word2Vec(size = 100, alpha=0.025, window=5, 
                                             min_count=5, hs=0, compute_loss = True)

scW2V_loss.build_vocab(scDF_filter['normalized_sents'].sum())
scW2V_loss.train(scDF_filter['normalized_sents'].sum(), total_examples=scW2V.corpus_count, epochs=1)
#Using a list so we can capture every epoch
losses = [scW2V_loss.running_training_loss]
losses[0]

In [None]:
for i in range(19):
    scW2V_loss.train(scDF_filter['normalized_sents'].sum(), 
                     total_examples=senReleasesW2V.corpus_count, epochs=1)
    losses.append(scW2V_loss.running_training_loss)
    print("Done epoch {}".format(i + 2), end = '\r')

In [None]:
lossesDF = pd.DataFrame({'loss' : losses, 'epoch' : range(len(losses))})
lossesDF.plot(y = 'loss', x = 'epoch', logy=False, figsize=(15, 7))
plt.show()

In [None]:
losses_dims=[]

for d in [50,100,150,200,250,300,350,400,450,500, 550, 600, 650, 700, 750]:
    scW2V_loss_dims = gensim.models.word2vec.Word2Vec(size = d, alpha=0.025, window=5, 
                                                      min_count=5, hs=0, compute_loss = True)
    scW2V_loss_dims.build_vocab(scDF_filter['normalized_sents'].sum())
    scW2V_loss_dims.train(scDF_filter['normalized_sents'].sum(), 
                     total_examples = scW2V.corpus_count, 
                     epochs=7)
    scW2V_loss_dims.train(scDF_filter['normalized_sents'].sum(), 
                     total_examples = scW2V.corpus_count, 
                     epochs=1)
    
    losses_dims.append(scW2V_loss_dims.running_training_loss/(10+d*10))

In [None]:
losses_dimsDF = pandas.DataFrame({'loss' : losses_dims, 'dimensions' : [50,100,150,200,250,300,350,400,450,500,550,600,650,700,750]})
losses_dimsDF.plot(y = 'loss', x = 'dimensions', logy=False, figsize=(15, 7))
plt.show()

## __Linguistic Change__

In [492]:
def calcSynNorm(model):
    """since syn0norm is now depricated"""
    return (model.wv.syn0 / np.sqrt((model.wv.syn0 ** 2).sum(-1))[..., np.newaxis]).astype(np.float32)

def smartProcrustesAlignGensim(base_embed, other_embed, words=None):
    """Procrustes align two gensim word2vec models (to allow for comparison between same word across models).
    Code ported from HistWords <https://github.com/williamleif/histwords> by William Hamilton <wleif@stanford.edu>.
    (With help from William. Thank you!)
    First, intersect the vocabularies (see 'intersectionAlignGensim' documentation).
    Then do the alignment on the other_embed model.
    Replace the other_embed model's syn0 and syn0norm numpy matrices with the aligned version.
    Return other_embed.
    If `words` is set, intersect the two models' vocabulary with the vocabulary in words (see 'intersectionAlignGensim' documentation).
    """
    base_embed = copy.copy(base_embed)
    other_embed = copy.copy(other_embed)
    # make sure vocabulary and indices are aligned
    in_base_embed, in_other_embed = intersectionAlignGensim(base_embed, other_embed, words=words)

    # get the embedding matrices
    base_vecs = calcSynNorm(in_base_embed)
    other_vecs = calcSynNorm(in_other_embed)

    # just a matrix dot product with numpy
    m = other_vecs.T.dot(base_vecs) 
    # SVD method from numpy
    u, _, v = np.linalg.svd(m)
    # another matrix operation
    ortho = u.dot(v) 
    # Replace original array with modified one
    # i.e. multiplying the embedding matrix (syn0norm)by "ortho"
    other_embed.wv.syn0norm = other_embed.wv.syn0 = (calcSynNorm(other_embed)).dot(ortho)
    return other_embed
    
def intersectionAlignGensim(m1,m2, words=None):
    """
    Intersect two gensim word2vec models, m1 and m2.
    Only the shared vocabulary between them is kept.
    If 'words' is set (as list or set), then the vocabulary is intersected with this list as well.
    Indices are re-organized from 0..N in order of descending frequency (=sum of counts from both m1 and m2).
    These indices correspond to the new syn0 and syn0norm objects in both gensim models:
        -- so that Row 0 of m1.syn0 will be for the same word as Row 0 of m2.syn0
        -- you can find the index of any word on the .index2word list: model.index2word.index(word) => 2
    The .vocab dictionary is also updated for each model, preserving the count but updating the index.
    """

    # Get the vocab for each model
    vocab_m1 = set(m1.wv.vocab.keys())
    vocab_m2 = set(m2.wv.vocab.keys())

    # Find the common vocabulary
    common_vocab = vocab_m1&vocab_m2
    if words: common_vocab&=set(words)

    # If no alignment necessary because vocab is identical...
    if not vocab_m1-common_vocab and not vocab_m2-common_vocab:
        return (m1,m2)

    # Otherwise sort by frequency (summed for both)
    common_vocab = list(common_vocab)
    common_vocab.sort(key=lambda w: m1.wv.vocab[w].count + m2.wv.vocab[w].count,reverse=True)

    # Then for each model...
    for m in [m1,m2]:
        # Replace old syn0norm array with new one (with common vocab)
        indices = [m.wv.vocab[w].index for w in common_vocab]
        old_arr = calcSynNorm(m)
        new_arr = np.array([old_arr[index] for index in indices])
        m.wv.syn0norm = m.wv.syn0 = new_arr

        # Replace old vocab dictionary with new one (with common vocab)
        # and old index2word with new one
        m.index2word = common_vocab
        old_vocab = m.wv.vocab
        new_vocab = {}
        for new_index,word in enumerate(common_vocab):
            old_vocab_obj=old_vocab[word]
            new_vocab[word] = gensim.models.word2vec.Vocab(index=new_index, count=old_vocab_obj.count)
        m.wv.vocab = new_vocab

    return (m1,m2)

In [None]:
def compareModels(df, category, sort = True):
    """If you are using time as your category sorting is important"""
    embeddings_raw = {}
    cats = sorted(set(df[category]))
    for cat in cats:
        #This can take a while
        print("Embedding {}".format(cat), end = '\r')
        subsetDF = df[df[category] == cat]
        #You might want to change the W2V parameters
        embeddings_raw[cat] = gensim.models.word2vec.Word2Vec(subsetDF['normalized_sents'].sum())
    #These are much quicker
    embeddings_aligned = {}
    for catOuter in cats:
        embeddings_aligned[catOuter] = [embeddings_raw[catOuter]]
        for catInner in cats:
            embeddings_aligned[catOuter].append(smartProcrustesAlignGensim(embeddings_aligned[catOuter][-1], embeddings_raw[catInner]))
    return embeddings_raw, embeddings_aligned

In [None]:
opinionDF = copy.deepcopy(scDF_issue)
opinionDF['year'] = opinionDF.decision_date.year
rawEmbeddings, comparedEmbeddings = compareModels(opinionDF, 'year')

In [None]:
def getDivergenceDF(word, embeddingsDict):
    dists = []
    cats = sorted(set(embeddingsDict.keys()))
    dists = {}
    for cat in cats:
        dists[cat] = []
        for embed in embeddingsDict[cat][1:]:
            dists[cat].append(np.abs(1 - sklearn.metrics.pairwise.cosine_similarity(embeddingsDict[cat][0][word],
                                                                             embed[word])[0,0]))
    return pandas.DataFrame(dists, index = cats)

In [None]:
targetWord = 'abortion'

pltDF = getDivergenceDF(targetWord, comparedEmbeddings)
fig, ax = plt.subplots(figsize = (15, 7))
seaborn.heatmap(pltDF, ax = ax, annot = False) #set annot True for a lot more information
ax.set_xlabel("Starting year")
ax.set_ylabel("Final year")
ax.set_ylabel("Final year")
ax.set_title("Yearly linguistic change for: '{}'".format(targetWord))
plt.show()

In [None]:
def findDiverence(word, embeddingsDict):
    cats = sorted(set(embeddingsDict.keys()))
    
    dists = []
    for embed in embeddingsDict[cats[0]][1:]:
        dists.append(1 - sklearn.metrics.pairwise.cosine_similarity(embeddingsDict[cats[0]][0][word], embed[word])[0,0])
    return sum(dists)

def findMostDivergent(embeddingsDict):
    words = []
    for embeds in embeddingsDict.values():
        for embed in embeds:
            words += list(embed.wv.vocab.keys())
    words = set(words)
    print("Found {} words to compare".format(len(words)))
    return sorted([(w, findDiverence(w, embeddingsDict)) for w in words], key = lambda x: x[1], reverse=True)

In [None]:
wordDivergences = findMostDivergent(comparedEmbeddings)

### Most Divergent

In [None]:
wordDivergences[:10]

In [None]:
targetWord = wordDivergences[0][0]

pltDF = getDivergenceDF(targetWord, comparedEmbeddings)
fig, ax = plt.subplots(figsize = (10, 7))
seaborn.heatmap(pltDF, ax = ax, annot = False) #set annot True for a lot more information
ax.set_xlabel("Starting year")
ax.set_ylabel("Final year")
ax.set_ylabel("Final year")
ax.set_title("Yearly linguistic change for: '{}'".format(targetWord))
plt.show()

### Least Divergent

In [None]:
wordDivergences[-10:]

In [None]:
targetWord = wordDivergences[-1][0]

pltDF = getDivergenceDF(targetWord, comparedEmbeddings)
fig, ax = plt.subplots(figsize = (10, 7))
seaborn.heatmap(pltDF, ax = ax, annot = False) #set annot True for a lot more information
ax.set_xlabel("Starting year")
ax.set_ylabel("Final year")
ax.set_ylabel("Final year")
ax.set_title("Yearly linguistic change for: '{}'".format(targetWord))
plt.show()