# Sample Articles

We have to import:

* Trained models
* TF-IDF model

In [1]:
import pickle
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import punkt
from nltk.corpus.reader import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

#### Trained models

In [2]:
path_models = "Models/"

# SVM
path_svm = path_models + 'best_mnbc.pickle'
with open(path_svm, 'rb') as data:
    mnb_model = pickle.load(data)

#### TF-IDF object

In [3]:
path_tfidf = "Pickles/tfidf.pickle"
with open(path_tfidf, 'rb') as data:
    tfidf = pickle.load(data)

#### Category mapping dictionary

In [4]:
category_codes = {
    'electronics': 1,
    'hardware': 2,
    'machine': 3,
    'none': 4,
    'raw_materials': 5,
    'skilled_manpower' : 6,
    'unskilled_manpower' : 7,
    'vehicle/equipment_hiring' : 8
}

#### Feature engineering workflow

In [5]:
punctuation_signs = list("?:!.,;")
stop_words = list(stopwords.words('english'))

def create_features_from_text(text):
    
    # Dataframe creation
    lemmatized_text_list = []
    df = pd.DataFrame(columns=['Content'])
    df.loc[0] = text
    df['Content_Parsed_1'] = df['Content'].str.replace("\r", " ")
    df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("\n", " ")
    df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("    ", " ")
    df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace('"', '')
    df['Content_Parsed_2'] = df['Content_Parsed_1'].str.lower()
    df['Content_Parsed_3'] = df['Content_Parsed_2']
    for punct_sign in punctuation_signs:
        df['Content_Parsed_3'] = df['Content_Parsed_3'].str.replace(punct_sign, '')
    df['Content_Parsed_4'] = df['Content_Parsed_3'].str.replace("'s", "")
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized_list = []
    text = df.loc[0]['Content_Parsed_4']
    text_words = text.split(" ")
    for word in text_words:
        lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
    lemmatized_text = " ".join(lemmatized_list)    
    lemmatized_text_list.append(lemmatized_text)
    df['Content_Parsed_5'] = lemmatized_text_list
    df['Content_Parsed_6'] = df['Content_Parsed_5']
    for stop_word in stop_words:
        regex_stopword = r"\b" + stop_word + r"\b"
        df['Content_Parsed_6'] = df['Content_Parsed_6'].str.replace(regex_stopword, '')
    df = df['Content_Parsed_6']
#     df = df.rename(columns={'Content_Parsed_6': 'Content_Parsed'})
    
    # TF-IDF
    features = tfidf.transform(df).toarray()
    
    return features

function that tells us the category given the category code:

In [6]:
def get_category_name(category_id):
    for category, id_ in category_codes.items():    
        if id_ == category_id:
            return category

function that includes the whole process:

In [7]:
def predict_from_text(text):
    
    # Predict using the input model
    prediction_mnb = mnb_model.predict(create_features_from_text(text))[0]
    
    # Return result
    category_mnb = get_category_name(prediction_mnb)
    
#     print("The predicted category using the MNB model is %s." %(category_mnb) )
    return (category_mnb)

In [8]:
def predict_from_text_prob(text):
    
    # Predict using the input model
    prediction_mnb_proba = mnb_model.predict_proba(create_features_from_text(text))[0]
    
    
#     print("The conditional probability is: %a" %(prediction_mnb_proba.max()*100))
    return (prediction_mnb_proba.max()*100)

### Text introduction and prediction

Please introduce `text`:

In [15]:
text = """

Procurement of Rough Terrain Crane (100T Capacity) for FM(M)

"""

In [16]:
predict_from_text(text)

'hardware'

In [17]:
predict_from_text_prob(text)

39.225385551288724

In [420]:
# test = pd.read_csv("C:\\Users\\Ayushi.Goel\\Documents\\Work\\Contract\\Untitled Folder\\document.csv")

In [421]:
# test.head()

In [422]:
# test["predicted_class"] = test.apply(lambda row : predict_from_text(row["Clause"]), axis = 1) 

In [423]:
# test["predicted_class_prob"] = test.apply(lambda row : predict_from_text_prob(row["Clause"]), axis = 1) 

In [424]:
# test.predicted_class.value_counts()

In [425]:
# test.head()

In [426]:
# a = test.sort_values(['predicted_class_prob'],ascending=False).groupby(["predicted_class","predicted_class_prob"])

In [427]:
# a.first()

In [471]:
# test1 = test = pd.read_excel("C:\\Users\\Ayushi.Goel\\Documents\\Work\\Contract\\Sample Data\\sample2.xlsx")
b = pd.read_csv(r"C:\Users\Ayushi.Goel\Documents\Work\Contract\Elastic_ContractSearch_Result.csv")

In [472]:
b.Label.value_counts()

Confidentiality            3
Key_personnel              3
Commencement_date          3
Limitation_of_liability    3
Liquidation_Damages        3
Delivery_Locations         3
Governing_Law              3
Extra                      3
Renewal_Term               3
Payment_Term               3
Indemnity                  3
Attrition_Rate             2
Jurisdiction               2
Name: Label, dtype: int64

In [473]:
filtered_list = [    'Limitation_of_liability',
    'Liquidation_damages',
    'Indemnity',
    'Governing_Law',
    'Confidentiality',
    'Key_personnel',
    'Commencement_date',
    'Renewal_Term',
    'Payment_Term'
]

# filtered_list = ["Extra", "Delivery_Locations", "Attrition_Rate"]

In [474]:
a = b[b["Label"].isin(filtered_list)]
# b.drop(b[b['Label'] = filtered_list].index, inplace = True)

In [475]:
# d= {}
# with open(r"C:\Users\Ayushi.Goel\Documents\Work\Contract\Text Classification Code - Copy\elastic_output.txt") as f:
#     for line in f:
#         (key,val) = line.split()
#         dict[key] = val
        

In [476]:
# def dictToDf(dictionary):
#     newDict = {}
#     for x in dictionary:
#         for y in dictionary[x]:
#             newDict[y] = x

#     df = pd.DataFrame(list(newDict.items()), columns=['Values','Label'])
#     cols = df.columns.tolist()
#     cols = cols[-1:] + cols[:-1]
#     df = df[cols]
#     return df

# dictToDf()

In [477]:
a.Label.value_counts()

Payment_Term               3
Governing_Law              3
Renewal_Term               3
Key_personnel              3
Confidentiality            3
Commencement_date          3
Limitation_of_liability    3
Indemnity                  3
Name: Label, dtype: int64

In [478]:
# a["Values"].replace(r"[\d]*[\.][\d]*", " ", regex = True, inplace = True)

In [479]:
a["Values"]

6         11.3    Payment. undisputed Fees will be d...
7           15.4.5    by not less than one hundred a...
8         14.    Price  . If the Benchmarking Result...
9           16.3    Strategic Provider  Software  an...
10        14.8.2    If an audit of Fees charged disc...
11        7.1.4    Strategic  Provider  agrees  that...
14        2.3.4    The Project Agreement will be int...
15        7.2    KABC    . The individuals specified...
16        7.3.3    If CUSTOMER decides that the Stra...
17          7.3     . Strategic Provider will replac...
21          15.3     .  Unless CUSTOMER provides not...
22         9.1    Managed  Agreements. Strategic Pro...
23          19.4    Notwithstanding Section 15.11  a...
26          3.1.2    the services   functions  and  ...
27         7.3.2    After the initial twenty - four ...
28       13.1     . At  any time after twenty - four...
29        18.2    Confidentiality.  Without prejudic...
30        18.2.3     When Strategic Provider pro

In [480]:
a["predicted_class"] = b.apply(lambda row : predict_from_text(row["Values"]), axis = 1) 
a["predicted_class_prob"] = b.apply(lambda row : predict_from_text_prob(row["Values"]), axis = 1) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [481]:
sor = a.sort_values(['predicted_class_prob'],ascending=False).groupby(["Label","predicted_class_prob"])

In [482]:
sor.first()

Unnamed: 0_level_0,Unnamed: 1_level_0,Values,predicted_class
Label,predicted_class_prob,Unnamed: 2_level_1,Unnamed: 3_level_1
Commencement_date,53.101359,7.3.2 After the initial twenty - four ...,Key_personnel
Commencement_date,59.597338,3.1.2 the services functions and ...,Commencement_date
Commencement_date,77.343299,13.1 . At any time after twenty - four...,Commencement_date
Confidentiality,77.063862,18.2.3 When Strategic Provider provide...,Confidentiality
Confidentiality,89.302127,18.3 Unauthorized Use or . Without...,Confidentiality
Confidentiality,90.63585,18.2 Confidentiality. Without prejudic...,Confidentiality
Governing_Law,59.399945,2.3.4 The Project Agreement will be int...,Key_personnel
Governing_Law,99.922344,32.2 Project . Unless otherwise agre...,Governing_Law
Governing_Law,99.991436,32.1 . This Agreement will be govern...,Governing_Law
Indemnity,41.052647,16.3 Strategic Provider Software an...,Key_personnel


In [483]:
a1 = a.drop(a[a.Label != a.predicted_class].index)

In [536]:
sor = a.sort_values(['Label','predicted_class_prob'],ascending=True).groupby(["Label","predicted_class_prob"]).head(2)

In [537]:
sor1 = a1.sort_values(['Label','predicted_class_prob'],ascending=False).groupby(["Label","predicted_class_prob"]).head(2)

In [538]:
sor1

Unnamed: 0,Label,Values,predicted_class,predicted_class_prob
21,Renewal_Term,15.3 . Unless CUSTOMER provides not...,Renewal_Term,93.777946
34,Limitation_of_liability,23.5 Limitation on CUSTOMER . CUST...,Limitation_of_liability,59.246497
35,Limitation_of_liability,23.2.5 payments fines penalties or i...,Limitation_of_liability,58.125711
16,Key_personnel,7.3.3 If CUSTOMER decides that the Stra...,Key_personnel,84.562539
15,Key_personnel,7.2 KABC . The individuals specified...,Key_personnel,79.150004
17,Key_personnel,7.3 . Strategic Provider will replac...,Key_personnel,60.789021
32,Governing_Law,32.1 . This Agreement will be govern...,Governing_Law,99.991436
33,Governing_Law,32.2 Project . Unless otherwise agre...,Governing_Law,99.922344
29,Confidentiality,18.2 Confidentiality. Without prejudic...,Confidentiality,90.63585
31,Confidentiality,18.3 Unauthorized Use or . Without...,Confidentiality,89.302127


In [546]:
count=0
prev_label = ""
prev_value = ""
prev_prob = 0
sor3 = pd.DataFrame([],columns = ["Field","Values","Values_2","Confidence_Score(%)"])
for i,row in sor1.iterrows():
    prev_data = pd.DataFrame({'Field': row['Label'],'Values':prev_value,'Values_2':row['Values'],'Confidence_Score(%)':(row['predicted_class_prob']+prev_prob)/2}, index=[0])
    if count==1 and row['Label']==prev_label:
        sor3 = sor3.append(prev_data, ignore_index=True)  
        count=0
    elif count==0 and row['Label']==prev_label:
        continue
    elif count==0 and row['Label']!=prev_label:
        prev_label = row['Label']
        prev_prob = row['predicted_class_prob']
        prev_value = row['Values']
        count+=1
    elif count==1 and row['Label']!=prev_label:
        #sor3 = sor3.append(pd.DataFrame({'Label': row['Label'],'Values':row['Values'],'Values_2':"",'predicted_class_prob':row['predicted_class_prob']}, index=[0]), ignore_index=True)
        sor3 = sor3.append(pd.DataFrame({'Field': prev_label,'Values':prev_value,'Values_2':"",'Confidence_Score(%)':prev_prob}, index=[0]), ignore_index=True)

        count=0
        prev_label = row['Label']
        prev_prob = row['predicted_class_prob']
        prev_value = row['Values']
        count+=1

In [547]:
for x in set(np.array(a.Label))-set(np.array(a1.Label)):
    print (x)
    for i,row in sor.iterrows():
        
        prev_data = pd.DataFrame({'Field': row['Label'],'Values':prev_value,'Values_2':row['Values'],'Confidence_Score(%)':(row['predicted_class_prob']+prev_prob)/2}, index=[0])
    
        if count==1 and row['Label']==prev_label and row['Label']==x:
            sor3 = sor3.append(prev_data, ignore_index=True)  
            count=0
        elif count==0 and row['Label']==prev_label and row['Label']==x:
            continue
        elif count==0 and row['Label']!=prev_label and row['Label']==x:
            prev_label = row['Label']
            prev_prob = row['predicted_class_prob']
            prev_value = row['Values']
            count+=1
        elif count==1 and row['Label']!=prev_label and row['Label']==x:
            sor3 = sor3.append(pd.DataFrame({'Field': prev_label,'Values':prev_value,'Values_2':"",'Confidence_Score(%)':prev_prob}, index=[0]), ignore_index=True)

            count=0
            prev_label = row['Label']
            prev_prob = row['predicted_class_prob']
            prev_value = row['Values']
            count+=1

Indemnity
Payment_Term


In [548]:
sor3

Unnamed: 0,Field,Values,Values_2,Confidence_Score(%)
0,Renewal_Term,15.3 . Unless CUSTOMER provides not...,,93.777946
1,Limitation_of_liability,23.5 Limitation on CUSTOMER . CUST...,23.2.5 payments fines penalties or i...,58.686104
2,Key_personnel,7.3.3 If CUSTOMER decides that the Stra...,7.2 KABC . The individuals specified...,81.856272
3,Governing_Law,32.1 . This Agreement will be govern...,32.2 Project . Unless otherwise agre...,99.95689
4,Confidentiality,18.2 Confidentiality. Without prejudic...,18.3 Unauthorized Use or . Without...,89.968989
5,Commencement_date,13.1 . At any time after twenty - four...,3.1.2 the services functions and ...,68.470319
6,Indemnity,16.3 Strategic Provider Software an...,14.8.2 If an audit of Fees charged disc...,42.26674
7,Payment_Term,11.3 Payment. undisputed Fees will be d...,14. Price . If the Benchmarking Result...,33.854253


In [491]:
sor2 = a1.groupby(["Label"]).predicted_class_prob.nlargest(3)

In [387]:
sor2

Label                      
Commencement_date        60    97.133119
                         59    81.974119
                         58    59.597338
Confidentiality          64    99.442744
                         62    96.637049
                         61    90.635850
Governing_Law            68    99.991436
                         69    99.922344
                         28    99.785446
Indemnity                26    87.254608
                         23    41.810046
Key_personnel            38    84.562539
                         36    84.115274
                         37    79.150004
Limitation_of_liability  75    83.436744
                         72    59.246497
                         47    58.125711
Renewal_Term             48    93.777946
                         52    51.287944
Name: predicted_class_prob, dtype: float64

In [399]:
sor1.predicted_class[sor1.ix[(sor1["predicted_class"]=="Limitation_of_liability") & (sor1["predicted_class_prob"]<55)].index]="None"

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [390]:
sor1.predicted_class[sor1.ix[(sor1["predicted_class"]=="Renewal_Term") & (sor1["predicted_class_prob"]<90)].index]="None"

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [391]:
sor1.predicted_class[sor1.ix[(sor1["predicted_class"]=="Key_personnel") & (sor1["predicted_class_prob"]<75)].index]="None"

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [392]:
sor1.predicted_class[sor1.ix[(sor1["predicted_class"]=="Indemnity") & (sor1["predicted_class_prob"]<85)].index]="None"

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [393]:
sor1.predicted_class[sor1.ix[(sor1["predicted_class"]=="Indemnity") & (sor1["predicted_class_prob"]<85)].index]="None"

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [394]:
sor1.predicted_class[sor1.ix[(sor1["predicted_class"]=="Governing_Law") & (sor1["predicted_class_prob"]<99)].index]="None"

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [395]:
sor1.predicted_class[sor1.ix[(sor1["predicted_class"]=="Confidentiality") & (sor1["predicted_class_prob"]<90)].index]="None"

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [396]:
sor1.predicted_class[sor1.ix[(sor1["predicted_class"]=="Commencement_date") & (sor1["predicted_class_prob"]<55)].index]="None"

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [400]:
sor1

Unnamed: 0,Label,Values,predicted_class,predicted_class_prob
48,Renewal_Term,15.3 . Unless CUSTOMER provides not...,Renewal_Term,93.777946
52,Renewal_Term,4.4 Terms during Additional Period The ...,,51.287944
75,Limitation_of_liability,23.1 Exceptions to . This Article 23 do...,Limitation_of_liability,83.436744
72,Limitation_of_liability,23.5 Limitation on CUSTOMER . CUST...,Limitation_of_liability,59.246497
47,Limitation_of_liability,23.2.5 payments fines penalties or i...,Limitation_of_liability,58.125711
71,Limitation_of_liability,23.6 Inclusions in recovery Subject to ...,,46.905771
38,Key_personnel,7.3.3 If CUSTOMER decides that the Stra...,Key_personnel,84.562539
36,Key_personnel,910673252.1 4 at ABC 's request shall u...,Key_personnel,84.115274
37,Key_personnel,7.2 KABC . The individuals specified...,Key_personnel,79.150004
41,Key_personnel,7.2.2 Strategic Provider will not for ...,Key_personnel,77.728997


In [403]:
df_final = sor1.drop(sor1[sor1.predicted_class == "None"].index)

In [404]:
df_final

Unnamed: 0,Label,Values,predicted_class,predicted_class_prob
48,Renewal_Term,15.3 . Unless CUSTOMER provides not...,Renewal_Term,93.777946
75,Limitation_of_liability,23.1 Exceptions to . This Article 23 do...,Limitation_of_liability,83.436744
72,Limitation_of_liability,23.5 Limitation on CUSTOMER . CUST...,Limitation_of_liability,59.246497
47,Limitation_of_liability,23.2.5 payments fines penalties or i...,Limitation_of_liability,58.125711
38,Key_personnel,7.3.3 If CUSTOMER decides that the Stra...,Key_personnel,84.562539
36,Key_personnel,910673252.1 4 at ABC 's request shall u...,Key_personnel,84.115274
37,Key_personnel,7.2 KABC . The individuals specified...,Key_personnel,79.150004
41,Key_personnel,7.2.2 Strategic Provider will not for ...,Key_personnel,77.728997
26,Indemnity,22.1 Supplier's indemnities (a) Subje...,Indemnity,87.254608
68,Governing_Law,32.1 . This Agreement will be govern...,Governing_Law,99.991436


In [405]:
df_final.drop("Label", axis = 1 , inplace = True)

In [406]:
df_final

Unnamed: 0,Values,predicted_class,predicted_class_prob
48,15.3 . Unless CUSTOMER provides not...,Renewal_Term,93.777946
75,23.1 Exceptions to . This Article 23 do...,Limitation_of_liability,83.436744
72,23.5 Limitation on CUSTOMER . CUST...,Limitation_of_liability,59.246497
47,23.2.5 payments fines penalties or i...,Limitation_of_liability,58.125711
38,7.3.3 If CUSTOMER decides that the Stra...,Key_personnel,84.562539
36,910673252.1 4 at ABC 's request shall u...,Key_personnel,84.115274
37,7.2 KABC . The individuals specified...,Key_personnel,79.150004
41,7.2.2 Strategic Provider will not for ...,Key_personnel,77.728997
26,22.1 Supplier's indemnities (a) Subje...,Indemnity,87.254608
68,32.1 . This Agreement will be govern...,Governing_Law,99.991436


In [408]:
df_final.rename(index=str,columns={"predicted_class":"Field",'predicted_class_prob':'Confidence_Score(%)'})

Unnamed: 0,Values,Field,Confidence_Score(%)
48,15.3 . Unless CUSTOMER provides not...,Renewal_Term,93.777946
75,23.1 Exceptions to . This Article 23 do...,Limitation_of_liability,83.436744
72,23.5 Limitation on CUSTOMER . CUST...,Limitation_of_liability,59.246497
47,23.2.5 payments fines penalties or i...,Limitation_of_liability,58.125711
38,7.3.3 If CUSTOMER decides that the Stra...,Key_personnel,84.562539
36,910673252.1 4 at ABC 's request shall u...,Key_personnel,84.115274
37,7.2 KABC . The individuals specified...,Key_personnel,79.150004
41,7.2.2 Strategic Provider will not for ...,Key_personnel,77.728997
26,22.1 Supplier's indemnities (a) Subje...,Indemnity,87.254608
68,32.1 . This Agreement will be govern...,Governing_Law,99.991436


In [290]:
# b.predicted_class[b.ix[(b["predicted_class"]=="Limitation_of_Liability") & (b["predicted_class_prob"]<65)].index]="None"

In [291]:
# b.head(11)

In [292]:
# test1["predicted_class"] = test1.apply(lambda row : predict_from_text(row["Clause"]), axis = 1) 

In [293]:
# test1["predicted_class_prob"] = test1.apply(lambda row : predict_from_text_prob(row["Clause"]), axis = 1) 

In [294]:
# test1.predicted_class[test1.ix[(test1["predicted_class"]=="Limitation_of_Liability") & (test1["predicted_class_prob"]<65)].index]="None"

In [295]:
# test1.predicted_class[test1.ix[(test1["predicted_class"]=="Renewal_Term") & (test1["predicted_class_prob"]<65)].index]="None"

In [296]:
# test1.head(11)

In [297]:
# test1.head()

In [299]:
# for val, prob in zip(test1["predicted_class"], test1["predicted_class_prob"]):
#     print(val)
#     if val == "Limitation_of_Liability" and prob < 65:
#         test1.predicted_class = None  
#     elif val == "Renewal_Term" and prob < 70:
#         test1.predicted_class = None 
#     else:
#         test1.predicted_class = val

In [300]:
# test.head(11)

In [301]:
# test1.sort_values("predicted_class", ascending = True).groupby("predicted_class").head(11)

## Threshold determination

### Threshold