# Citation scraping for each opinion, followed by an attempt to cluster the opinions through spectral analysis of the citation graph

In [1]:
from bs4 import BeautifulSoup
import pandas
import numpy as np
import matplotlib as mlp
import matplotlib.pyplot as plt
import matplotlib.colors
import networkx as nx
from numpy.linalg import eigh
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

### Import the opinions sample

In [2]:
# Load the semicolon-separated opinions sample, then list its column names.
opinions = pandas.read_csv("opinions_sample.csv", sep=";")
column_names = opinions.columns.tolist()
print(column_names)

['absolute_url', 'author', 'author_str', 'cluster', 'date_created', 'date_modified', 'download_url', 'extracted_by_ocr', 'html', 'html_lawbox', 'html_with_citations', 'joined_by', 'local_path', 'opinions_cited', 'per_curiam', 'plain_text', 'resource_uri']


In [3]:
# Display the raw HTML stored under label 0 of the `html` column, to see
# which <p class="..."> elements are available for scraping.
opinions.html[0]

'<p class="case_cite">544 U.S. 917</p>\n    <p class="parties">BERWICK<br><i>v.</i><br>UNITED STATES.</p>\n    <p class="docket">No. 04-8529.</p>\n    <p class="court">Supreme Court of United States.</p>\n    <p class="date">March 21, 2005.</p>\n    <div class="num" id="p1">\n      <span class="num">1</span>\n      <p class="indent">C. A. 2d Cir. Reported below: 107 Fed. Appx. 253. Motions of petitioners for leave to proceed <i>in forma pauperis</i> granted. Certiorari granted, judgments vacated, and cases remanded for further consideration in light of <i>United States</i> v. <i>Booker,</i> 543 U. S. 220 (2005).</p>\n    </div>\n    '

In [4]:
# One column per <p class="..."> element scraped from each opinion's HTML.
# The last column holds the text of the class="indent" paragraphs, so it is
# named 'indent' (it was previously misspelled 'intent').
columns = ['case_cite', 'parties', 'docket', 'court', 'date', 'indent']
# Empty frame that the scraping loop fills one row at a time.
extracted_data = pandas.DataFrame(columns=columns)

### Extract the id of each opinion

In [5]:
# Split every absolute_url on '/' into one column per piece; the opinion id
# is taken from the third piece (column position 2).
extraction = opinions.absolute_url.str.split('/', expand=True)
extracted_id = extraction.iloc[:, 2]

### Scrape the HTML of each opinion to build a new data frame with one column per HTML class

In [6]:
# Scrape every opinion's HTML into one row of `extracted_data`.
#
# For each opinion, the text of all <p> elements of each class below is
# collected into a list (empty when the class is absent), and the six lists
# are stored positionally in the six columns of `extracted_data`.
# Rows whose HTML cannot be parsed are skipped and their position is recorded
# in `missing_html`.
missing_html = []

# <p> classes to collect, in the same order as the `extracted_data` columns.
paragraph_classes = ("case_cite", "parties", "docket", "court", "date", "indent")

for i, html in enumerate(opinions.html):
    # Only text can be parsed; anything else (e.g. NaN for an empty cell) is
    # recorded as missing instead of being handed to the parser.
    if not isinstance(html, str):
        missing_html.append(i)
        continue
    try:
        soup = BeautifulSoup(html, "html.parser")
        # find_all returns an empty result (it does not raise) when no element
        # matches, so a missing class simply yields an empty list.
        row = [
            [tag.get_text() for tag in soup.find_all("p", class_=css_class)]
            for css_class in paragraph_classes
        ]
    except Exception as err:
        # Best effort: one unparsable document must not abort the whole run,
        # but the failure is reported rather than silently swallowed.
        # KeyboardInterrupt is deliberately not caught.
        print("Could not parse HTML of row {}: {!r}".format(i, err))
        missing_html.append(i)
        continue
    extracted_data.loc[i] = row

### Assign the ID for each observation

In [9]:
# Attach the opinion id to each scraped row (pandas aligns `extracted_id`
# on the row index, so rows skipped during scraping stay skipped).
extracted_data = extracted_data.assign(id=extracted_id)

# Keep only the first whitespace-separated token of every citation string
# (e.g. '544' for '544 U.S. 917'). Blank strings are dropped, because
# ''.split()[0] would raise IndexError.
extracted_data.case_cite = [
    [cite.split()[0] for cite in cites if cite.strip()]
    for cites in extracted_data.case_cite
]

### Create a sub dataframe with id and case cited

In [None]:
# Convert each opinion's citation tokens to integers.
# If any token of an opinion is not a valid integer, the whole list for that
# opinion becomes [] (same all-or-nothing policy as before), but only
# conversion errors are caught instead of every possible exception.
int_list = []
for cites in extracted_data.case_cite:
    try:
        int_list.append([int(cite) for cite in cites])
    except (TypeError, ValueError):
        int_list.append([])
extracted_data.case_cite = int_list

# Two-column frame (id, case_cite) used to build the citation graph.
case_relation = pandas.concat([extracted_data.id, extracted_data.case_cite], axis=1)

In [None]:
# Show the first ten (id, case_cite) rows.
case_relation.head(10)

### Create a graph based on relation between cases

In [None]:
# Directed citation graph: one node per opinion id (is_case=True), one node
# per cited number (is_case=False), and an edge opinion -> cited number.
case_cite_graph = nx.DiGraph()

for case_id, cited in zip(case_relation.id, case_relation.case_cite):
    # Node attributes are passed as keyword arguments. `attr_dict=` is not an
    # add_node parameter in networkx >= 2.0, where it would create an
    # attribute literally called 'attr_dict' and leave 'is_case' unset.
    case_cite_graph.add_node(case_id, is_case=True)
    for link in cited:
        case_cite_graph.add_node(link, is_case=False)
        case_cite_graph.add_edge(case_id, link)

In [None]:
%matplotlib inline
# Draw the citation graph: opinion nodes in red, cited nodes in blue.
plt.figure(figsize=(40, 40))
# Read the attribute through nx.get_node_attributes rather than `graph.node`,
# which was removed in networkx 2.4.
is_case = nx.get_node_attributes(case_cite_graph, 'is_case')
colors = ['r' if is_case[n] else 'b' for n in case_cite_graph]
nx.draw_networkx(
    case_cite_graph,
    pos=nx.spring_layout(case_cite_graph),
    node_size=80,
    node_color=colors,
    linewidths=0,
    width=0.1,
    with_labels=False,
)
plt.axis('off')

### Centrality computation of graph

In [None]:
from collections import Counter

In [None]:
# Three nodes with the highest closeness centrality, as (node, score) pairs.
stats = nx.closeness_centrality(case_cite_graph)
closness = Counter(stats).most_common(3)
print(closness)

In [None]:
# Three nodes with the highest in-degree centrality, as (node, score) pairs.
stats = nx.in_degree_centrality(case_cite_graph)
in_degree = Counter(stats).most_common(3)
print(in_degree)

In [None]:
# Three nodes with the highest out-degree centrality, as (node, score) pairs.
stats = nx.out_degree_centrality(case_cite_graph)
out_degree = Counter(stats).most_common(3)
print(out_degree)

### Laplacian matrix and spectral clustering of the graph

In [None]:
# Build the directed Laplacian matrix of the citation graph; its
# eigenvectors are used below as coordinates for clustering.
laplacian = nx.directed_laplacian_matrix(case_cite_graph)

In [None]:
# Eigen-decomposition of the Laplacian (eigh treats its input as symmetric).
# np.asarray accepts both a numpy.matrix and a plain ndarray, whereas the
# `.A` attribute exists only on numpy.matrix and fails on an ndarray.
eig_vals, eig_vectors = eigh(np.asarray(laplacian))

In [None]:
# Scatter the graph nodes using eigenvector columns 1 and 2 as x/y coordinates.
second_vec = eig_vectors[:, 1]
third_vec = eig_vectors[:, 2]

plt.figure()
plt.plot(second_vec, third_vec, 'o')
plt.xlabel('second eigenvector value')
plt.ylabel('third eigenvector value')
plt.show()

In [None]:
# Feature matrix for clustering: one row per graph node, two columns taken
# from eigenvector columns 1 and 2. column_stack builds the (n, 2) array
# directly instead of round-tripping every value through Python tuples.
X = np.column_stack((eig_vectors[:, 1], eig_vectors[:, 2]))

### Elbow analysis of the clustering (k-means inertia for each number of clusters)

In [None]:
# Fit k-means for k = 1..19 and record the inertia of each fit, then plot
# inertia against k.
cluster_range = range(1, 20)
cluster_errors = [KMeans(k).fit(X).inertia_ for k in cluster_range]

clusters_df = pandas.DataFrame(
    {
        "num_clusters": cluster_range,
        "cluster_errors": cluster_errors,
    }
)

plt.figure(figsize=(12, 6))
plt.plot(clusters_df.num_clusters, clusters_df.cluster_errors, marker="o")

In [None]:
# Display the inertia recorded for each number of clusters.
clusters_df

### Apply K-means to the dataset 

In [None]:
# Final clustering with 7 clusters: keep the fitted model, its centroids and
# the cluster label predicted for every row of X.
kmeans = KMeans(n_clusters=7).fit(X)
centroids = kmeans.cluster_centers_
labels = kmeans.predict(X)
labels

In [None]:
# Same scatter as before, with the k-means centroids overlaid as '+' markers.
node_x, node_y = eig_vectors[:, 1], eig_vectors[:, 2]
centroid_x, centroid_y = centroids[:, 0], centroids[:, 1]

plt.figure(figsize=(10, 6))
plt.plot(node_x, node_y, 'o')
plt.plot(centroid_x, centroid_y, '+')
plt.xlabel('second eigenvector value')
plt.ylabel('third eigenvector value')
plt.show()

In [None]:
# Sanity check: there should be exactly one cluster label per graph node.
node_count = case_cite_graph.number_of_nodes()
label_count = len(labels)
print(node_count)
print(label_count)

In [None]:
# Number of opinions whose HTML could not be scraped.
len(missing_html)

### Assign cluster ID to the dataset 

In [None]:
# Attach to every opinion the cluster of its node in the citation graph.
#
# labels[i] belongs to the i-th row of the Laplacian, which was built without
# an explicit nodelist and therefore follows the graph's own node order.
# Zipping the graph's nodes with `labels` gives the node -> cluster mapping
# (the previous loop indexed an undefined `nodes` list).
node_to_cluster = dict(zip(case_cite_graph.nodes(), labels))

# Opinion nodes were inserted under the exact values of case_relation.id, so
# a direct lookup is enough. No cast to int is needed, which also stops an
# integer citation node from being mistaken for an opinion id.
case_relation['cluster_id'] = case_relation.id.map(node_to_cluster)

In [None]:
# Display the first rows of case_relation after the cluster assignment.
case_relation.head()