In [125]:
from user.user import User
from corpus.corpus import Corpus
from corpus.visualise import Visualise
from classifier.classifier import Classifier
from classifier.perfect import Perfect
from majority.majority2 import Majority
from majority.baselines import Baseline
from majority.OptBaseline import Optimal
from corpus.stats import Stats

In [2]:
class Pipeline:
    # ---- Corpus Settings ----
    key = "/Users/josefvalvoda/Dropbox/key/node.pem" #pem key for SSH
    ip = "ec2-18-182-64-196.ap-northeast-1.compute.amazonaws.com" #Amazon EC2 IP
    user = "ubuntu"
    annotators = ["gr", "alice", "jasleen"]
    corPath = "corpus/corpus/"
    annPath = "./annotator/anno/"
    mainAnno = "gr" # Select the annotator to train with
    download = False # Downloads latest data from www.holj.ml
    MJ_size = 0.33 # Size of the test corpus, the rest is used for training ML

    # ---- Classifier Settings ----
    train = False # Retrains the classifier
    test_size = 0.33 # Selects best ML algorithm/hyper-parameters by evaluating on this size of MJ corpus.
    downsample = True # Train on the same amount of positive and negative samples
    info = True # Prints the results of the algorithm/parameters performance

In [3]:

pip = Pipeline()

# Get corpus
amazon = User(pip.key, pip.ip, pip.user, pip.annotators, pip.corPath, pip.annPath, pip.mainAnno)
holj_corpus = Corpus(amazon, pip.MJ_size, pip.download)
ML_corpus = holj_corpus.get_corpus(type = "ml")
ALL_corpus = holj_corpus.get_corpus(type = "all")

print("\n\nTraining Classifier")
#Train ML classifier
classifier = Classifier(ML_corpus, pip.test_size, pip.train)
predicted = classifier.get_prediction(ALL_corpus)



Training Classifier
size of our training corpus 1292 374 42525


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  'partagr': "NAN", 'partdisa': "NAN", 'fulldisa': "NAN", 'factagr': "NAN"})


In [126]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer

class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key.

    The data is expected to be stored in a 2D data structure, where the first
    index is over features and the second is over samples.  i.e.

    >> len(data[key]) == n_samples

    Please note that this is the opposite convention to scikit-learn feature
    matrixes (where the first index corresponds to sample).

    ItemSelector only requires that the collection implement getitem
    (data[key]).  Examples include: a dict of lists, 2D numpy array, Pandas
    DataFrame, numpy record array, etc.

    >> data = {'a': [1, 5, 2, 5, 2, 8],
               'b': [9, 4, 1, 4, 1, 3]}
    >> ds = ItemSelector(key='a')
    >> data['a'] == ds.transform(data)

    ItemSelector is not designed to handle data grouped by sample.  (e.g. a
    list of dicts).  If your data is structured this way, consider a
    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.

    Parameters
    ----------
    key : hashable, required
        The key corresponding to the desired value in a mappable.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]
    
class Cue_phrase(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        # print("original", posts.shape())
        cue = ["For these reasons", "allow the appeal", "dismiss the appeal", "I have had the advantage", "I agree with it", "For the reasons"]
        cue_tags = []
        for text in posts:
            record = "N"
            for c in cue:
                if c in text:
                    record += str(cue.index(c))
                else:
                    record += "N"
            # print(record)
            cue_tags.append({"myner" : record})

        return cue_tags

In [127]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

#len(ALL_corpus.groupby("case").describe()) # Test on ALL corpus
ML_corpus.groupby("case").describe() # Train on ML corpus
def format_train_corp(corpus):
    """
    Formats corpus to fit X, y format, removes unecessary categories.
    """

    corpus['relation'] = corpus['relation'].map({"NAN": "NAN", "other": "NAN",
    "error": "NAN", 'fullagr': "fullagr", 'ackn': "NAN", 'outcome': "NAN",
    'partagr': "NAN", 'partdisa': "NAN", 'fulldisa': "NAN", 'factagr': "NAN"})

    #corpus = self.corp_downsample(corpus)
    X = corpus[["body"]]
    y = corpus["relation"]


    return X, y


X, y = format_train_corp(ML_corpus)
#print(X, y)

text_clf = Pipeline([

            # Use FeatureUnion to combine the BoW and TextStats features
            ('union', FeatureUnion(
                transformer_list=[

                    # Pipeline for Cue phrases
                    #('Cue_phrases', Pipeline([
                     #   ('position', ItemSelector("body")),
                      #  ('cue', Cue_phrase()),  # returns a list of dicts
                       # ('cue_vect', DictVectorizer()),
                    #])),

                    # Pipeline for standard bag-of-words model for body
                    ('body_bow', Pipeline([
                        ('body', ItemSelector("body")),
                        ('vect', TfidfVectorizer()),
                    ])),
                ],
            )),
            ('svc', SVC(kernel='linear', probability=True))
        ])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.1, random_state=42)
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
print(classification_report(predicted, y_test))

             precision    recall  f1-score   support

        NAN       1.00      0.99      0.99      4363
    fullagr       0.35      0.62      0.45        29

avg / total       0.99      0.99      0.99      4392



In [130]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

#len(ALL_corpus.groupby("case").describe()) # Test on ALL corpus
ML_corpus.groupby("case").describe() # Train on ML corpus
def format_train_corp(corpus):
    """
    Formats corpus to fit X, y format, removes unecessary categories.
    """

    corpus['relation'] = corpus['relation'].map({"NAN": "NAN", "other": "NAN",
    "error": "NAN", 'fullagr': "fullagr", 'ackn': "NAN", 'outcome': "NAN",
    'partagr': "NAN", 'partdisa': "NAN", 'fulldisa': "NAN", 'factagr': "NAN"})

    #corpus = self.corp_downsample(corpus)
    X = corpus[["body"]]
    y = corpus["relation"]


    return X, y


X, y = format_train_corp(ML_corpus)
#print(X, y)

text_clf = Pipeline([

            # Use FeatureUnion to combine the BoW and TextStats features
            ('union', FeatureUnion(
                transformer_list=[

                    # Pipeline for Cue phrases
                    #('Cue_phrases', Pipeline([
                     #   ('position', ItemSelector("body")),
                      #  ('cue', Cue_phrase()),  # returns a list of dicts
                       # ('cue_vect', DictVectorizer()),
                    #])),

                    # Pipeline for standard bag-of-words model for body
                    ('body_bow', Pipeline([
                        ('body', ItemSelector("body")),
                        ('vect', TfidfVectorizer()),
                    ])),
                ],
            )),
            ('svc', SVC(kernel='linear', probability=True))
        ])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.1, random_state=42)
print(len(X_train), len(X_test))
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
print(classification_report(predicted, y_test))

39519 4392
             precision    recall  f1-score   support

        NAN       1.00      0.99      0.99      4366
    fullagr       0.31      0.62      0.42        26

avg / total       0.99      0.99      0.99      4392



In [131]:
X, y = format_train_corp(ALL_corpus)
predicted = text_clf.predict(X)
bod = X["body"].tolist()
ar = text_clf.predict_proba(X)

In [122]:
all_bod = []
for a, b, c, d in zip(ar, y, predicted, bod):
    if a[0] > a[1] and a[0] <= 0.90:
        print(a, b, c, d)
        all_bod.append(bod)
    if a[1] > a[0] and a[1] <= 0.90:
        print(a, b, c, d)
        all_bod.append(bod)
        
print(len(all_bod))

[0.87600752 0.12399248] fullagr NAN For the reasons he gives I would dismiss the appeal with costs . 
[0.87600752 0.12399248] NAN NAN For the reasons he gives I would dismiss the appeal with costs . 
[0.87600752 0.12399248] fullagr NAN For the reasons he gives I would dismiss the appeal with costs . 
[0.87600752 0.12399248] NAN NAN For the reasons he gives I would dismiss the appeal with costs . 
[0.58828798 0.41171202] fullagr NAN I agree with it , and for the reasons which he has given I too would dismiss the appeal . 
[0.58828798 0.41171202] NAN NAN I agree with it , and for the reasons which he has given I too would dismiss the appeal . 
[0.86084967 0.13915033] fullagr NAN For the reasons he gives , I too agree that the petition should be refused . 
[0.86084967 0.13915033] NAN NAN For the reasons he gives , I too agree that the petition should be refused . 
[0.87333999 0.12666001] fullagr NAN I agree with it ; and on the basis there set out I concur in the order proposed by my nobl

[0.37590642 0.62409358] fullagr fullagr For the reasons which they give I would allow the appeal and make the order proposed by my noble and learned friend , Lord Nolan . 
[0.37590642 0.62409358] NAN fullagr For the reasons which they give I would allow the appeal and make the order proposed by my noble and learned friend , Lord Nolan . 
[0.37590642 0.62409358] NAN fullagr For the reasons which they give I would allow the appeal and make the order proposed by my noble and learned friend , Lord Nolan . 
[0.37590642 0.62409358] fullagr fullagr For the reasons which they give I would allow the appeal and make the order proposed by my noble and learned friend , Lord Nolan . 
[0.37590642 0.62409358] fullagr fullagr For the reasons which they give I would allow the appeal and make the order proposed by my noble and learned friend , Lord Nolan . 
[0.37590642 0.62409358] NAN fullagr For the reasons which they give I would allow the appeal and make the order proposed by my noble and learned fri

[0.58644274 0.41355726] fullagr NAN I agree with it , and for the reasons which he has given I too would allow the appeal . 
[0.58644274 0.41355726] NAN NAN I agree with it , and for the reasons which he has given I too would allow the appeal . 
[0.89052554 0.10947446] fullagr NAN For the reasons he gives , I agree this appeal should be allowed . 
[0.89052554 0.10947446] NAN NAN For the reasons he gives , I agree this appeal should be allowed . 
[0.80751925 0.19248075] fullagr NAN For the reasons given by my noble and learned friend Lord Nicholls of Birkenhead , I would allow this appeal . 
[0.80751925 0.19248075] NAN NAN For the reasons given by my noble and learned friend Lord Nicholls of Birkenhead , I would allow this appeal . 
[0.58644274 0.41355726] fullagr NAN I agree with it , and for the reasons which he has given I too would allow the appeal . 
[0.58644274 0.41355726] NAN NAN I agree with it , and for the reasons which he has given I too would allow the appeal . 
[0.86198094 

In [None]:
#ALL_corpus[(ALL_corpus.case == 4) & (ALL_corpus.line == 88)].relation.item()
#wrong = predicted[predicted.relation != predicted.predictions]
wrong = predicted[predicted.predictions != "fullagr"]
case = wrong["case"]
w = wrong[["body", "relation", "predictions"]]
x = wrong[["body"]].values.tolist()
y = wrong[["relation"]].values.tolist()
z = wrong[["predictions"]].values.tolist()

#w.to_excel("output.xlsx")
unique = []
for i, j, k, c in zip(x, y, z, case):
    if j == ["fullagr"]:
        print(c)
        unique.append(i[0])
set(unique)

#for i, p in predicted.iterrows():
#    print(p["case"], p["relation"])

A full agreement with a reasoing of another judge has two components:
- A reference to a judge or judges (can be a pronoun instead of a full name).
- An indication of an agreement with their full reasoning.

In [60]:
#ALL_corpus[(ALL_corpus.case == 4) & (ALL_corpus.line == 88)].relation.item()
#wrong = predicted[predicted.relation != predicted.predictions]
wrong = predicted[predicted.predictions == "fullagr"]
case = wrong["case"]
w = wrong[["body", "relation", "predictions"]]
x = wrong[["body"]].values.tolist()
y = wrong[["relation"]].values.tolist()
z = wrong[["predictions"]].values.tolist()

#w.to_excel("output.xlsx")
unique = []
for i, j, k, c in zip(x, y, z, case):
    if j == ["NAN"]:
        #print(c)
        unique.append(i[0])
        
print(len(unique))
sent = list(set(unique))

for s in sent:
    print(s + "\n")

#for i, p in predicted.iterrows():
#    print(p["case"], p["relation"])

816
I respectfully agree with Lord Brown that , if such a surprising result were intended , it ought to have been enacted in the clearest of terms . 

In agreement with the majority of the Court of Appeal and with my noble and learned friend Lord Hope of Craighead , and for the same reasons , I would give a negative answer to the question posed at the outset of this opinion and dismiss the appeals . 

I would dismiss this appeal . 

The Court of Appeal concluded that it did not and I must at the outset pay tribute to the careful judgment of my noble and learned friend Lord Mance , which meticulously confronts and deals with every objection to his view of the case ; a tribute no less sincere for the opinion I have formed that he was wrong . 

For these reasons I would dismiss this appeal . 

For the reasons already given , I do not think that article 8 or article 11 is engaged . 

Had it been necessary , however , in common with all of your Lordships I would have reached the same conclu