# Work on citation scraping for each opinions and try to create clusters of opinions based on spectral analysis of an opinions graph

In [1]:
from bs4 import BeautifulSoup
import pandas
import numpy as np
import matplotlib as mlp
import matplotlib.pyplot as plt
import matplotlib.colors
import networkx as nx
from numpy.linalg import eigh
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

### Import the opinions sample

In [2]:
opinions = pandas.read_csv("opinions_sample.csv", sep=";")
print(list(opinions))

['absolute_url', 'author', 'author_str', 'cluster', 'date_created', 'date_modified', 'download_url', 'extracted_by_ocr', 'html', 'html_lawbox', 'html_with_citations', 'joined_by', 'local_path', 'opinions_cited', 'per_curiam', 'plain_text', 'resource_uri']


In [3]:
opinions.html[0]

'<p class="case_cite">544 U.S. 917</p>\n    <p class="parties">BERWICK<br><i>v.</i><br>UNITED STATES.</p>\n    <p class="docket">No. 04-8529.</p>\n    <p class="court">Supreme Court of United States.</p>\n    <p class="date">March 21, 2005.</p>\n    <div class="num" id="p1">\n      <span class="num">1</span>\n      <p class="indent">C. A. 2d Cir. Reported below: 107 Fed. Appx. 253. Motions of petitioners for leave to proceed <i>in forma pauperis</i> granted. Certiorari granted, judgments vacated, and cases remanded for further consideration in light of <i>United States</i> v. <i>Booker,</i> 543 U. S. 220 (2005).</p>\n    </div>\n    '

In [4]:
columns = ['case_cite','parties','docket','court','date','intent']
extracted_data = pandas.DataFrame(columns=columns)

### Extract ids for each opinions

In [5]:
extraction = opinions.absolute_url.str.split('/',expand = True)
extracted_id = extraction[extraction.columns[2]]

### Scrap the HTML for each opinions in order to create a new data frame based on HTML class

In [6]:
missing_html=[]
for i in range(len(opinions.html)):
    case_cite, parties, docket, court, date, indent = (list(),)*6
    try:
        soup = BeautifulSoup(opinions.html[i],"html.parser")
        try:
            case_cite = soup.findAll("p", {"class": "case_cite"})
        except:
            print("No Case Cite")
        try:
            parties = soup.findAll("p", {"class": "parties"})
        except:
            print("No parties")
        try:
            docket = soup.findAll("p", {"class": "docket"})
        except:
            print("No docket")
        try:
            court = soup.findAll("p", {"class": "court"})
        except:
            print("No court")
        try:
            date = soup.findAll("p", {"class": "date"})
        except:
            print("No date")
        try:
            indent = soup.findAll("p", {"class": "indent"})
        except:
            print("No indent")
        extracted_data.loc[i] = [[obs.get_text() for obs in case_cite],[obs.get_text() for obs in parties],[obs.get_text() for obs in docket],[obs.get_text() for obs in court],[obs.get_text() for obs in date],[obs.get_text() for obs in indent]]
    except:
        missing_html.append(i)

### Assign the ID for each observation

In [7]:
extracted_data = extracted_data.assign(id = extracted_id)
extracted_data


Unnamed: 0,case_cite,parties,docket,court,date,intent,id
0,[544 U.S. 917],[BERWICKv.UNITED STATES.],[No. 04-8529.],[Supreme Court of United States.],"[March 21, 2005.]",[C. A. 2d Cir. Reported below: 107 Fed. Appx. ...,143119
1,[536 U.S. 971],[MORALESv.UNITED STATES.],[No. 01-10512.],[Supreme Court of the United States.],"[June 28, 2002.]",[C. A. 11th Cir. Certiorari denied. Reported b...,122028
2,"[130 U.S. 83, 9 S.Ct. 435, 32 L.Ed. 870]",[CALTONv.PEOPLE OF THE TERRITORY OF UTAH.1],[],[],"[March 11, 1889.]","[Arthur Brown and John H. Mitchell, for plaint...",92451
3,[540 U.S. 1058],"[SAMSONv.LEWIS, WARDEN.]",[No. 03-6710.],[Supreme Court of United States.],"[December 1, 2003.]","[Appeal from the C. A. 9th Cir., Certiorari de...",134300
5,"[97 U.S. 318, 24 L.Ed. 1008]",[HURLEYv.JONES.],[],[],"[October Term, 1877]",[MOTION to reinstate a cause dismissed under t...,89793
6,[543 U.S. 966],"[STRONGv.McCUSKEY, JUDGE, UNITED STATES DISTRI...",[No. 04-6246.],[Supreme Court of United States.],"[November 1, 2004.]",[C. A. 7th Cir. Certiorari denied.],139897
7,[537 U.S. 815],[GRID RADIO ET AL.v.FEDERAL COMMUNICATIONS COM...,[No. 01-1662.],[Supreme Court of United States.],"[October 7, 2002.]",[CERTIORARI TO THE COURT OF APPEALS FOR THE DI...,122421
8,[543 U.S. 1024],"[HOUGHv.DRETKE, DIRECTOR, TEXAS DEPARTMENT OF ...",[No. 04-6598.],[Supreme Court of United States.],"[December 6, 2004.]",[C. A. 5th Cir. Certiorari denied.],140645
9,[544 U.S. 1039],[SANDERSv.CHICAGO & NORTHWESTERN RAILROAD CO. ...,[No. 04-9130.],[Supreme Court of United States.],"[May 16, 2005.]","[App. Ct. Ill., 1st Dist. Certiorari denied. R...",144705
11,"[251 U.S. 380, 40 S.Ct. 176, 64 L.Ed. 317]",[STROUDv.UNITED STATES.],[No. 276.],[],[],"[Petition for Rehearing Received Jan. 5, 1920....",99504


In [8]:
extracted_data.case_cite = [[id.split(id.split()[1]) for id in case] for case in extracted_data.case_cite]
extracted_data

Unnamed: 0,case_cite,parties,docket,court,date,intent,id
0,"[[544 , 917]]",[BERWICKv.UNITED STATES.],[No. 04-8529.],[Supreme Court of United States.],"[March 21, 2005.]",[C. A. 2d Cir. Reported below: 107 Fed. Appx. ...,143119
1,"[[536 , 971]]",[MORALESv.UNITED STATES.],[No. 01-10512.],[Supreme Court of the United States.],"[June 28, 2002.]",[C. A. 11th Cir. Certiorari denied. Reported b...,122028
2,"[[130 , 83], [9 , 435], [32 , 870]]",[CALTONv.PEOPLE OF THE TERRITORY OF UTAH.1],[],[],"[March 11, 1889.]","[Arthur Brown and John H. Mitchell, for plaint...",92451
3,"[[540 , 1058]]","[SAMSONv.LEWIS, WARDEN.]",[No. 03-6710.],[Supreme Court of United States.],"[December 1, 2003.]","[Appeal from the C. A. 9th Cir., Certiorari de...",134300
5,"[[97 , 318], [24 , 1008]]",[HURLEYv.JONES.],[],[],"[October Term, 1877]",[MOTION to reinstate a cause dismissed under t...,89793
6,"[[543 , 966]]","[STRONGv.McCUSKEY, JUDGE, UNITED STATES DISTRI...",[No. 04-6246.],[Supreme Court of United States.],"[November 1, 2004.]",[C. A. 7th Cir. Certiorari denied.],139897
7,"[[537 , 815]]",[GRID RADIO ET AL.v.FEDERAL COMMUNICATIONS COM...,[No. 01-1662.],[Supreme Court of United States.],"[October 7, 2002.]",[CERTIORARI TO THE COURT OF APPEALS FOR THE DI...,122421
8,"[[543 , 1024]]","[HOUGHv.DRETKE, DIRECTOR, TEXAS DEPARTMENT OF ...",[No. 04-6598.],[Supreme Court of United States.],"[December 6, 2004.]",[C. A. 5th Cir. Certiorari denied.],140645
9,"[[544 , 1039]]",[SANDERSv.CHICAGO & NORTHWESTERN RAILROAD CO. ...,[No. 04-9130.],[Supreme Court of United States.],"[May 16, 2005.]","[App. Ct. Ill., 1st Dist. Certiorari denied. R...",144705
11,"[[251 , 380], [40 , 176], [64 , 317]]",[STROUDv.UNITED STATES.],[No. 276.],[],[],"[Petition for Rehearing Received Jan. 5, 1920....",99504


### Create a sub dataframe with id and case cited

In [98]:
int_list = []
for index, element in extracted_data.iterrows():
        try :
            citation = ([(id[0]+id[1]) for id in element.case_cite])
            int_list.append(list(citation))
        except:
            int_list.append([]) 
            
extracted_data.case_cite = int_list
case_relation = pandas.concat([extracted_data.id, extracted_data.case_cite], axis = 1)

In [None]:
int_list = []
for index, element in extracted_data.iterrows():
    try :
        int_list.append(list(map(int, element.case_cite)))
    except:
        int_list.append([])
extracted_data.case_cite = int_list
case_relation = pandas.concat([extracted_data.id, extracted_data.case_cite], axis = 1)

In [99]:
case_relation.head()

Unnamed: 0,id,case_cite
0,143119,[544 917]
1,122028,[536 971]
2,92451,"[130 83, 9 435, 32 870]"
3,134300,[540 1058]
5,89793,"[97 318, 24 1008]"
6,139897,[543 966]
7,122421,[537 815]
8,140645,[543 1024]
9,144705,[544 1039]
11,99504,"[251 380, 40 176, 64 317]"


### Create a graph based on relation between cases

In [100]:
case_cite_graph = nx.DiGraph()

for index, element in case_relation.iterrows():
    case_cite_graph.add_node(element.id, attr_dict={"is_case": True})
    for link in element.case_cite:
        case_cite_graph.add_node(link, attr_dict={"is_case": False})
        case_cite_graph.add_edge(element.id, link)

In [109]:
%matplotlib inline
plt.figure(figsize=(40,40))
colors = ['r' if case_cite_graph.node[n]['is_case'] else 'b' for n in case_cite_graph]
nx.draw_networkx(case_cite_graph, node_size=80, pos = nx.spring_layout(case_cite_graph),node_color=colors, linewidths=0, width=0.1, with_labels = False)
plt.axis('off')

KeyError: 'is_case'

<Figure size 2880x2880 with 0 Axes>

### Centrality computation of graph

In [103]:
from collections import Counter

In [104]:
## Most common cases : 544 US 971, 537 US 1149, 543 US 872
stats = nx.closeness_centrality(case_cite_graph)
closness = Counter(stats).most_common()[:3]
print(closness)

[('544  971', 0.0007055918151349444), ('537  1149', 0.0006173928382430764), ('543  872', 0.0006173928382430764)]


In [105]:
stats = nx.in_degree_centrality(case_cite_graph)
in_degree = Counter(stats).most_common()[:3]
print(in_degree)

[('544  971', 0.0007055918151349444), ('537  1149', 0.0006173928382430764), ('543  872', 0.0006173928382430764)]


In [106]:
stats = nx.out_degree_centrality(case_cite_graph)
out_degree = Counter(stats).most_common()[:3]
print(out_degree)

[('92961', 0.0004409948844593403), ('99621', 0.0003527959075674722), ('93063', 0.0003527959075674722)]


In [117]:
stats = nx.degree(case_cite_graph)
degree = Counter(stats).most_common()[:3]
print(stats)

[('143119', 1), ('544  917', 5), ('122028', 1), ('536  971', 2), ('92451', 3), ('130  83', 1), ('9  435', 1), ('32  870', 1), ('134300', 1), ('540  1058', 2), ('89793', 2), ('97  318', 1), ('24  1008', 2), ('139897', 1), ('543  966', 1), ('122421', 1), ('537  815', 2), ('140645', 1), ('543  1024', 2), ('144705', 1), ('544  1039', 2), ('99504', 3), ('251  380', 1), ('40  176', 1), ('64  317', 1), ('85081', 3), ('12  421', 1), ('8  421', 1), ('3  610', 1), ('89487', 2), ('94  429', 1), ('24  129', 1), ('89681', 2), ('96  128', 1), ('24  772', 1), ('124369', 1), ('537  961', 2), ('124905', 1), ('537  1011', 1), ('123990', 1), ('537  933', 3), ('129977', 1), ('538  1059', 3), ('91307', 3), ('113  506', 1), ('5  612', 2), ('28  1102', 1), ('87849', 3), ('72  509', 1), ('18  524', 1), ('5  509', 1), ('141215', 1), ('543  1070', 4), ('143599', 1), ('544  958', 3), ('89853', 2), ('98  118', 1), ('25  86', 1), ('130882', 1), ('539  961', 3), ('139991', 1), ('543  974', 3), ('99681', 3), ('254  

### Laplacian matrix and spectral clustering of the graph

In [110]:
laplacian = nx.directed_laplacian_matrix(case_cite_graph)

In [111]:
eig_vals, eig_vectors = eigh(laplacian.A)

KeyboardInterrupt: 

In [None]:
plt.figure()
plt.plot(eig_vectors[:,1], eig_vectors[:,2],'o')
plt.xlabel('second eigenvector value')
plt.ylabel('third eigenvector value')
plt.show()

In [None]:
X = np.array(list(zip(eig_vectors[:,1], eig_vectors[:,2])))

### Eclust analysis of the clustering

In [None]:
cluster_range = range(1, 20)
cluster_errors = []
for num_clusters in cluster_range:
    kmean = KMeans(num_clusters)
    kmean.fit(X)
    cluster_errors.append(kmean.inertia_)

clusters_df = pandas.DataFrame({ "num_clusters":cluster_range, "cluster_errors": cluster_errors })
plt.figure(figsize=(12,6))
plt.plot(clusters_df.num_clusters, clusters_df.cluster_errors, marker = "o" )

In [None]:
clusters_df

### Apply K-means to the dataset 

In [None]:
kmeans = KMeans(n_clusters=7)
kmeans = kmeans.fit(X)
labels = kmeans.predict(X)
centroids = kmeans.cluster_centers_
labels

In [None]:
plt.figure()
plt.plot(eig_vectors[:,1], eig_vectors[:,2],'o')
plt.plot(centroids[:,0], centroids[:,1],'+')
plt.xlabel('second eigenvector value')
plt.ylabel('third eigenvector value')
plt.show()

In [None]:
print(len(case_cite_graph.nodes()))
print(len(labels))

In [None]:
len(missing_html)

### Assign cluster ID to the dataset 

In [None]:
for i in range(len(labels)):
    if nodes[i] in list(map(int, case_cite_graph.nodes())):
        mask = case_relation.id.astype('int64') == nodes[i]
        case_relation.loc[mask, 'cluster_id'] = labels[i]

In [None]:
case_relation.head()