Code for the creation and testing of our LDA model for complexity

All of the cleaned and processed data has been stored in Drive.

All you need are the Shuffled.csv, final_data.csv, and doc_term_matrix.pkl files for running the last section

In [None]:
# Use if you want to run with GPU
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
# Connect to Drive: mount Google Drive at /content/gdrive.
# Later cells read and write files under /content/gdrive/MyDrive/CS_490_DATA.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Install first or the cleantext library won't load.
# cleantext is only needed by the data-cleaning cell (cleantext.clean).
!pip install cleantext

In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from scipy.stats import entropy
# Uncomment when you need to re-clean the data
# import cleantext
import re
from time import perf_counter

From here ONLY RUN THE LAST CELL OF CODE. If you run other sections/lines you risk messing up the data files and will have to run everything over again. 

WARNING:

DO NOT RUN the Document Matrix Creation code in Colab. It exceeds 12 GB of RAM and will crash the Colab process, so I had to run it locally on my laptop.

The next 2 sections deal with data cleaning

In [None]:
# Clean every opinion (everything except lemmatization) and save the result to Drive.

# Imported here because the shared import cell keeps this import commented out;
# without it the cleantext.clean call below fails with NameError.
import cleantext

dataset = pd.read_csv('/content/gdrive/MyDrive/CS_490_DATA/final_data.csv').dropna()


def _clean_opinion(text):
    """Return one opinion with punctuation removed and the cleantext pipeline applied."""
    text = re.sub(r'[^\w\s]', '', text)
    return cleantext.clean(text, clean_all= False, # Execute all cleaning operations
      extra_spaces=True ,  # Remove extra white spaces
      stemming= False, # Stem the words
      stopwords=True ,# Remove stop words
      lowercase=True ,# Convert to lowercase
      numbers=True ,# Remove all digits
      punct=True)


t1_start = perf_counter()

# Assign the whole column at once so every cleaned text stays on the row it came from.
# Reading row k by position (.iloc) and writing it back by label (.at) only matches
# while the index is exactly 0..n-1, which no longer holds once dropna() removes a row.
# No tf.device block: none of this work runs through TensorFlow.
dataset["Opinion"] = [_clean_opinion(text) for text in dataset["Opinion"]]
print("cleaning done")

t1_end = perf_counter()

print("Elapsed time:", t1_end - t1_start)

dataset.to_csv("/content/gdrive/MyDrive/CS_490_DATA/Cleaned_data.csv", index=False)


In [None]:
# This section lemmatizes the cleaned text using the spacy library and saves it to Drive.
import spacy

dataset_clean = pd.read_csv('/content/gdrive/MyDrive/CS_490_DATA/Cleaned_data.csv').dropna()

nlp = spacy.load('en_core_web_sm')


def _lemmatize(text):
    """Return `text` with each token replaced by its lemma, joined by single spaces."""
    # disable removes unnecessary pipeline components that increase run time
    doc = nlp(text, disable=["tagger", "parser", "ner","attribute_ruler"])
    # '-PRON-' is a placeholder lemma; fall back to the lower-cased token text for it
    return " ".join([token.lemma_ if token.lemma_ != '-PRON-' else token.lower_ for token in doc])


cleaned_data = dataset_clean
# Assign the whole column at once so each lemmatized text stays on its own row.
# Reading by position (.iloc) and writing by label (.at) diverge as soon as dropna()
# removes a row, which puts text on the wrong row and appends a stray one.
# No tf.device block: spaCy does not run through TensorFlow.
cleaned_data["Opinion"] = [_lemmatize(str(text)) for text in cleaned_data["Opinion"]]
print("lemma done")
# The index is written on purpose: the shuffle cell drops the resulting "Unnamed: 0" column.
cleaned_data.to_csv('/content/gdrive/MyDrive/CS_490_DATA/Cleaned_2_data.csv')

In [None]:
# Shuffle the lemmatized opinions once and store the shuffled table on Drive.
SHUFFLE_SEED = 0  # fixed so a re-run reproduces the same row order

dataset_to_be_shuffled = pd.read_csv('/content/gdrive/MyDrive/CS_490_DATA/Cleaned_2_data.csv').dropna()

# The document-term matrix is built from this row order, so the order must be
# reproducible; an unseeded shuffle would silently misalign the saved matrix.
shuffled = dataset_to_be_shuffled.sample(frac=1, random_state=SHUFFLE_SEED)
shuffled = shuffled.drop(columns = ["Unnamed: 0","Unnamed: 0.1"])

# Written next to the other data files, where the results section reads it back.
# The pre-shuffle row label is kept as "Former_Index" because later cells sort by
# that column to restore the original order.
shuffled.to_csv("/content/gdrive/MyDrive/CS_490_DATA/Shuffled.csv", index_label="Former_Index")

In [None]:
# Build the document-term matrix from the shuffled opinions and persist it
# together with the fitted vectorizer.
from sklearn.feature_extraction.text import CountVectorizer
import joblib

# Keep only terms that appear in at least 5% and at most 90% of the documents.
count_vect = CountVectorizer(max_df=0.9, min_df=.05, stop_words='english')

# astype(str) keeps ordinary Python strings. .values.astype('U') instead builds a
# fixed-width unicode array in which EVERY document is padded to the length of the
# longest one (4 bytes per character), which needs far more memory.
doc_term_matrix = count_vect.fit_transform(shuffled['Opinion'].astype(str))
print(doc_term_matrix.shape)
print("doc_term_done")

# Save the Document Term Matrix as a pickle file
joblib.dump(doc_term_matrix, "doc_term_matrix.pkl")
# Save the fitted vectorizer too: the pyLDAvis cells load count_vect.pkl.
joblib.dump(count_vect, "count_vect.pkl")

In [None]:
# Fit the LDA model, report held-out perplexity, and score every opinion by the
# entropy of its topic distribution.
from sklearn.decomposition import LatentDirichletAllocation
import joblib

N_TOPICS = 20        # number of LDA topics; also the entropy base so scores fall in 0-100
N_TRAIN_DOCS = 4800  # rows used for fitting; the remaining rows are the test set
LDA_SEED = 0         # fixed so perplexity and scores are reproducible

# The document-term matrix was built from the SHUFFLED opinions, so the scores must be
# attached to the shuffled table. Attaching them to final_data.csv (original order)
# would put each score next to the wrong Case_id.
final_dataset = pd.read_csv('/content/gdrive/MyDrive/CS_490_DATA/Shuffled.csv').dropna()

# Load in the Document Term Matrix
# NOTE(review): joblib.load unpickles the file; only load files you created yourself.
doc_term_matrix = joblib.load("/content/gdrive/MyDrive/CS_490_DATA/doc_term_matrix.pkl")

# Split into training and testing set
training = doc_term_matrix[:N_TRAIN_DOCS]
test = doc_term_matrix[N_TRAIN_DOCS:]

t3_start = perf_counter()

# No tf.device block: scikit-learn does not run through TensorFlow, and wrapping the
# fit in one only made this cell depend on the optional GPU cell.
LDA = LatentDirichletAllocation(n_components=N_TOPICS, random_state=LDA_SEED)
LDA.fit(training)
print(LDA.perplexity(test))

# One row of topic weights per document; entropy is taken along each row.
topic_values = LDA.transform(doc_term_matrix)
calc = entropy(topic_values, base=N_TOPICS, axis=1) * 100

final_dataset['Topic_Score'] = calc
print(final_dataset[['Case_id','Topic_Score']])

t3_end = perf_counter()
print("LDA Elapsed time:", t3_end - t3_start)

From here on down is code for experimental results: two topic visualizations, one for the initial model and one for the best model, plus some basic statistical results such as the mean and max entropy scores for each court.

In [None]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[?25l[K     |▏                               | 10 kB 23.5 MB/s eta 0:00:01[K     |▍                               | 20 kB 19.4 MB/s eta 0:00:01[K     |▋                               | 30 kB 10.4 MB/s eta 0:00:01[K     |▉                               | 40 kB 8.6 MB/s eta 0:00:01[K     |█                               | 51 kB 4.7 MB/s eta 0:00:01[K     |█▏                              | 61 kB 5.5 MB/s eta 0:00:01[K     |█▍                              | 71 kB 5.6 MB/s eta 0:00:01[K     |█▋                              | 81 kB 5.6 MB/s eta 0:00:01[K     |█▉                              | 92 kB 6.2 MB/s eta 0:00:01[K     |██                              | 102 kB 5.3 MB/s eta 0:00:01[K     |██▏                             | 112 kB 5.3 MB/s eta 0:00:01[K     |██▍                             | 122 kB 5.3 MB/s eta 0:00:01[K     |██▋                             | 133 kB 5.3 MB/s eta 0:00:01[K     |██

In [None]:
# Load the saved vectorizer, document-term matrix and the two fitted LDA models,
# then switch pyLDAvis to inline notebook rendering.
import joblib
import pyLDAvis
import pyLDAvis.sklearn

# NOTE(review): joblib.load unpickles these files and can execute code stored in them;
# only load files you created yourself.
# NOTE(review): lda_init.pkl and lda_best.pkl are not written by any cell shown in this
# notebook; presumably they were saved in a separate run. Confirm where they come from.
count_vect = joblib.load('/content/gdrive/MyDrive/CS_490_DATA/count_vect.pkl')
doc_term_matrix = joblib.load("/content/gdrive/MyDrive/CS_490_DATA/doc_term_matrix.pkl")

lda_init = joblib.load("/content/gdrive/MyDrive/CS_490_DATA/lda_init.pkl")
lda_best = joblib.load("/content/gdrive/MyDrive/CS_490_DATA/lda_best.pkl")

# Calling this once is enough; the second, identical call was redundant.
pyLDAvis.enable_notebook()

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
  from collections import Iterable
  from collections import Mapping


In [None]:
# Topic visualization for the initial LDA model (rendered as the cell's output)
vis1 = pyLDAvis.sklearn.prepare(
    lda_init,
    doc_term_matrix,
    count_vect,
)
vis1

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [None]:
# Topic visualization for the best LDA model (rendered as the cell's output)
vis2 = pyLDAvis.sklearn.prepare(
    lda_best,
    doc_term_matrix,
    count_vect,
)
vis2

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [None]:
# Score every shuffled opinion with the best LDA model: Complexity is the entropy of
# the opinion's topic distribution, scaled to 0-100.
dataset = pd.read_csv('/content/gdrive/MyDrive/CS_490_DATA/Shuffled.csv').dropna()

# No tf.device block: scikit-learn does not run through TensorFlow, and wrapping the
# transform in one only made this cell depend on the optional GPU cell.
topic_values = lda_best.transform(doc_term_matrix)

# The base must equal the model's number of topics to normalize the entropy to 0-1;
# reading it from the model keeps that true if the topic count ever changes.
calc = entropy(topic_values, base=lda_best.n_components, axis=1) * 100

dataset['Complexity'] = calc
print(dataset[['Case_id','Complexity']])

       Case_id  Complexity
0     20-01615   59.400737
1     20-56075   38.838116
2     20-02943   34.890994
3     20-50157   57.756222
4     19-03586   45.178934
...        ...         ...
5995  19-01639   67.555699
5996  20-10479   44.450517
5997  20-05027   67.370377
5998  20-02615   45.054450
5999  16-04831   48.812198

[6000 rows x 2 columns]


In [None]:
# Summary statistics of the Complexity score over all opinions
complexity_summary = dataset["Complexity"].describe()
print(complexity_summary)

count    6000.000000
mean       39.383367
std        17.009937
min         0.111783
25%        28.200564
50%        41.806555
75%        52.204849
max        78.863183
Name: Complexity, dtype: float64


In [None]:
# Mean Complexity per court, highest first
mean_by_court = dataset.groupby("Court_id")["Complexity"].mean()
print(mean_by_court.sort_values(ascending=False))

Court_id
2     40.729527
1     40.723258
4     40.058614
12    39.573068
10    39.551457
9     39.496798
3     39.296477
7     39.269191
8     39.125632
5     38.949504
11    38.396673
6     37.430201
Name: Complexity, dtype: float64


In [None]:
# Maximum Complexity per court, highest first.
# Grouping by "Complexity" itself only echoed every distinct score back; the per-court
# maximum needs the court as the grouping key.
print(dataset.groupby("Court_id")["Complexity"].max().sort_values(ascending=False))

Complexity
78.863183    78.863183
77.970967    77.970967
77.119405    77.119405
76.816179    76.816179
76.776103    76.776103
               ...    
0.299773      0.299773
0.233549      0.233549
0.228016      0.228016
0.115672      0.115672
0.111783      0.111783
Name: Complexity, Length: 5572, dtype: float64


In [None]:
# Row(s) holding the highest Complexity score
is_max = dataset["Complexity"] == dataset["Complexity"].max()
print(dataset[is_max])

      Former_Index  Court_id   Case_id  \
2844           784         2  20-01228   

                                                Opinion  Complexity  
2844  case document page l doe v trump corporation u...   78.863183  


In [None]:
# Row(s) holding the lowest Complexity score
is_min = dataset["Complexity"] == dataset["Complexity"].min()
print(dataset[is_min])

      Former_Index  Court_id   Case_id  \
231           2978         6  20-04010   
1147          1244         3  19-03150   
1306           662         2  17-02898   
1747          1785         4  19-07508   
2185          5492        11  19-13315   
2444          5653        12  18-03016   
3353           212         1  19-02095   
3523          1145         3  19-03042   
3574          3958         8  12-02716   
4263          3398         7  20-02919   
4921          5004        11  18-12104   

                                                Opinion  Complexity  
231   case document file page recommend publication ...    0.111783  
1147  case document page date file precedential unit...    0.111783  
1306  case document page martinez de artiga v barr k...    0.111783  
1747  usca appeal doc file pg unpublished unite stat...    0.111783  
2185  case date file page publish unite state court ...    0.111783  
2444  usca case document file page unite state court...    0.111783  
3353 

In [None]:
import plotly.express as px

  if not isinstance(key, collections.Hashable):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.divide(0.4, 1, casting="unsafe", dtype=np.float),
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from .mio5_utils import VarReader5


In [None]:
# Box plot of the Complexity score over all opinions
fig = px.box(
    dataset,
    y="Complexity",
    title='Complexity Box-Plot',
)
fig.show()

In [None]:
# One Complexity box per court, colored by court
fig = px.box(
    dataset,
    x="Court_id",
    y="Complexity",
    color="Court_id",
    title='Complexity By Court (Box-Plot)',
)
fig.show()

In [None]:
# Histogram of the Complexity score
fig = px.histogram(
    dataset,
    x="Complexity",
    title="Complexity Histogram",
)
fig.show()

In [None]:
import plotly.figure_factory as ff

In [None]:
# Distribution plot of the Complexity scores (a single group)
group_labels = ['distplot'] # name of the dataset
x = dataset["Complexity"]
hist_data = [x]

fig = ff.create_distplot(hist_data, group_labels)
fig.show()

In [None]:
# Mean Complexity for courts 1 through 12: y holds the court ids, x the matching means
y = list(range(1, 13))
x = [dataset[dataset["Court_id"] == court]["Complexity"].mean() for court in y]


In [None]:
# Bar chart of the per-court mean Complexity computed in the previous cell
dat = pd.DataFrame({
    'Complexity': x,
    'Court_id': y,
})

fig = px.bar(
    dat,
    x='Court_id',
    y='Complexity',
    color="Court_id",
    title="Average Complexity by Court",
)

fig.show()

In [None]:
# Load the table that carries the polarity and subjectivity columns, dropping rows with missing values
data2 = (
    pd.read_csv('/content/gdrive/MyDrive/CS_490_DATA/cleaned_2_with_pol_and_sub.csv')
    .dropna()
)
print(data2)

In [None]:
# Undo the shuffle: order rows by their pre-shuffle position and renumber the index from 0
dataset = (
    dataset
    .sort_values(by=["Former_Index"], ascending=True)
    .reset_index(drop=True)
)

In [None]:
# Inspect the frame after sorting by Former_Index
print(dataset)

In [None]:
# Attach polarity, subjectivity and the original (uncleaned) opinion text.
# pandas matches a Series to the frame by index label when it is assigned as a column.
# NOTE(review): this assumes data2 and final_data.csv use the same row labels as
# dataset after its reset_index — confirm no rows were dropped from either.
uncleaned_opinions = pd.read_csv("/content/gdrive/MyDrive/CS_490_DATA/final_data.csv")["Opinion"]

dataset["Polarity"] = data2["polarity"]
dataset["Subjectivity"] = data2["subjectivity"]
dataset["Uncleaned_Opinion"] = uncleaned_opinions

In [None]:
# Inspect the frame after adding Polarity, Subjectivity and Uncleaned_Opinion
print(dataset)

In [None]:

# Pairwise scatter plots of the three variables of interest
dimensions_of_interest = ["Complexity", "Polarity", "Subjectivity"]
fig = px.scatter_matrix(
    dataset,
    dimensions=dimensions_of_interest,
    title="Correlation Plot of All Variables of Interest",
)
fig.show()

In [None]:
# Drop the helper column and shorten zero-padded case numbers ("21-01566" -> "21-1566").


def _strip_zero_padding(case_id):
    """Return `case_id` with one leading '0' removed from the part after the first dash.

    Only the first two dash-separated parts are kept. A part that does not start with
    '0' is left untouched, so "20-56075" stays "20-56075" instead of losing its first
    digit.
    """
    words = case_id.split('-')
    sequence = words[1]
    if sequence.startswith('0'):
        sequence = sequence[1:]
    return words[0] + '-' + sequence


dataset = dataset.drop(["Former_Index"],axis=1)
# Assign the whole column at once: the chained form dataset["Case_id"][i] = ... may
# write to a temporary copy and leave the frame unchanged.
dataset["Case_id"] = dataset["Case_id"].map(_strip_zero_padding)


In [None]:
# Inspect the final frame before it is written to disk
print(dataset)

      Court_id  Case_id                                            Opinion  \
0            1  21-1566  case document page date file entry i would uni...   
1            1  19-1622  case document page date file entry i would uni...   
2            1  20-1434  case document page date file entry i would uni...   
3            1  19-2104  case document page date file entry i would uni...   
4            1  20-1942  case document page date file entry i would uni...   
...        ...      ...                                                ...   
5995        12  21-5080  usca case document file page unite state court...   
5996        12  20-1045  usca case document file page unite state court...   
5997        12  19-5199  usca case document file page unite state court...   
5998        12  20-7110  usca case document file page unite state court...   
5999        12  18-3070  usca case document file page unite state court...   

      Complexity  Polarity  Subjectivity  \
0      25.709101 -0

In [None]:
# Write the final table (without the index column) to Drive
output_csv = "/content/gdrive/MyDrive/CS_490_DATA/Court_Complexity_Polarity_Subjectivity_data.csv"
dataset.to_csv(output_csv, index=False)