In [42]:
# importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import string
import nltk
from nltk.corpus import stopwords

In [43]:
# downloading stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [44]:
# importing the dataset
df = pd.read_csv('/content/drive/MyDrive/Smart-India-Hackathon/datasets/clean_data.csv')
df.head(2)

Unnamed: 0.1,Unnamed: 0,ID,name,href,docket,term,first_party,second_party,facts,facts_len,majority_vote,minority_vote,first_party_winner,decision_type,disposition,issue_area
0,0,50606,Roe v. Wade,https://api.oyez.org/cases/1971/70-18,70-18,1971,Jane Roe,Henry Wade,"<p>In 1970, Jane Roe (a fictional name used in...",501,7,2,True,majority opinion,reversed,
1,1,50613,Stanley v. Illinois,https://api.oyez.org/cases/1971/70-5014,70-5014,1971,"Peter Stanley, Sr.",Illinois,<p>Joan Stanley had three children with Peter ...,757,5,2,True,majority opinion,reversed/remanded,Civil Rights


In [45]:
# some minimal preprocessing for the case abstracts
df.drop(['Unnamed: 0','facts_len'],inplace=True,axis=1)
df.set_index('ID',inplace=True)
df['facts'] = df['facts'].str.replace('<p>','')
df['facts'] = df['facts'].str.replace('</p>','')
df['facts'] = df['facts'].str.replace('\n','')
judgements_pre = df['facts']
df.head(2)

Unnamed: 0_level_0,name,href,docket,term,first_party,second_party,facts,majority_vote,minority_vote,first_party_winner,decision_type,disposition,issue_area
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
50606,Roe v. Wade,https://api.oyez.org/cases/1971/70-18,70-18,1971,Jane Roe,Henry Wade,"In 1970, Jane Roe (a fictional name used in co...",7,2,True,majority opinion,reversed,
50613,Stanley v. Illinois,https://api.oyez.org/cases/1971/70-5014,70-5014,1971,"Peter Stanley, Sr.",Illinois,Joan Stanley had three children with Peter Sta...,5,2,True,majority opinion,reversed/remanded,Civil Rights


In [46]:
# custom pre-processsors
def remove_punctuations(text):
    text = [letter for letter in text if letter not in string.punctuation]
    text = ''.join(text)
    text = text.strip()
    return text

def remove_stop_words(text):
    text = [word for word in text.split(' ') if word not in stopwords.words('english')]
    text = ' '.join(text)
    text = text.strip()
    return text

In [47]:
# remove punctuations and stopwords
df['facts'] = df['facts'].apply(remove_punctuations)
df['facts'] = df['facts'].apply(remove_stop_words)

In [48]:
# create the facts vector
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
facts_matrix = vectorizer.fit_transform(df['facts'])

In [51]:
# user input
from sklearn.metrics.pairwise import linear_kernel
random_sample = vectorizer.transform([df['facts'].iloc[0]])
cosine_similarity = linear_kernel(random_sample,facts_matrix)
df['similarity'] = cosine_similarity.reshape(-1)
df.sort_values(by='similarity',ascending=False).head(2)

Unnamed: 0_level_0,name,href,docket,term,first_party,second_party,facts,majority_vote,minority_vote,first_party_winner,decision_type,disposition,issue_area,similarity
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
50606,Roe v. Wade,https://api.oyez.org/cases/1971/70-18,70-18,1971,Jane Roe,Henry Wade,In 1970 Jane Roe fictional name used court doc...,7,2,True,majority opinion,reversed,,1.0
51602,Maher v. Roe,https://api.oyez.org/cases/1976/75-1440,75-1440,1976,Maher,Roe,In wake Roe v Wade Connecticut Welfare Departm...,6,3,True,majority opinion,reversed/remanded,Privacy,0.234849


In [53]:
# saving the dependencies
pickle.dump(vectorizer,open('vectorizer.pkl','wb'))
pickle.dump(facts_matrix,open('facts_matrix.pkl','wb'))