In [1]:
import pandas as pd
import os
import json
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk import pos_tag
import numpy as np
import pickle
import string
import random
import timeit

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer 

import warnings
warnings.simplefilter('ignore')

In [4]:
# Load the cleaned legal-help question/answer pairs into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the CSV.
convdata = pd.read_csv(
    filepath_or_buffer="/Users/grayfloyd/Desktop/legal_help_clean.csv"
)

# Preview the first five rows of the dataset.
convdata.head(5)

Unnamed: 0,MESSAGE,RESPONSE
0,What are the requirements for me to adopt my n...,You may wish to visit this site: https://app.a...
1,how to have a properly adopted in accordance w...,You may want to refer to this site:https://app...
2,we have been raising our grandchild from birth...,You would have to apply to the Family Courts: ...
3,I was legally adopted since young. Many years ...,Your biological parents will have to apply to ...
4,Can an adoption be reversed? Both adoptive par...,We are not aware of any such cases as the Adop...


In [5]:
# Convert the DataFrame into a list of plain dicts (one per row) by
# serialising it to a JSON string and parsing that string back.
# NOTE(review): Talk_To_Javris reads "conversation_json.json" from disk, but no
# visible cell writes convdata_json to that file -- confirm where it is created.
convdata_json = json.loads(
    convdata.to_json(orient="records")
)

# Show the first two records.
convdata_json[:2]

[{'MESSAGE': "What are the requirements for me to adopt my nephew whom I have been caring like my own for 7 years after parents' divorce?",
  'RESPONSE': 'You may wish to visit this site: https://app.adoption.gov.sg/AdoptionProcess.aspx'},
 {'MESSAGE': 'how to have a properly adopted in accordance with the laws of Singapore for my godson,who is a PR now',
  'RESPONSE': 'You may want to refer to this site:https://app.msf.gov.sg/Adoption/How-to-adopt-a-citizen-or-PR'}]

In [7]:
#greeting function
# Inputs that count as a greeting. Single words are matched against every word
# of the user's sentence; multi-word phrases are matched against the whole
# sentence.
GREETING_INPUTS = ("hello", "hi", "greetings", "hello i need help", "good day","hey","i need help", "greetings")
GREETING_RESPONSES = ["Good day, How may i of help?", "Hello, How can i help?", "hello", "I am glad! You are talking to me."]


def greeting(sentence):
    """Return a random greeting reply if `sentence` is (or contains) a greeting.

    Matching is case-insensitive and ignores punctuation attached to a word,
    so "Hello!" is recognised as well as "hello". Multi-word entries of
    GREETING_INPUTS (e.g. "good day") match only when they make up the whole
    sentence; otherwise a question such as "i need help with adoption" would be
    answered with a greeting instead of being looked up.

    Parameters
    ----------
    sentence : str
        Raw user input.

    Returns
    -------
    str or None
        One of GREETING_RESPONSES, or None when no greeting was detected.
    """
    # Normalise each word: drop leading/trailing punctuation and lowercase it.
    words = [word.strip(string.punctuation).lower() for word in sentence.split()]
    words = [word for word in words if word]

    # Whole-sentence match: the only way multi-word greeting phrases can hit.
    if " ".join(words) in GREETING_INPUTS:
        return random.choice(GREETING_RESPONSES)

    # Single-word match anywhere in the sentence.
    if any(word in GREETING_INPUTS for word in words):
        return random.choice(GREETING_RESPONSES)

    return None

In [8]:
# WordNet lemmatisation helper

# Single shared lemmatizer instance reused by every call to LemTokens.
lemmer = nltk.WordNetLemmatizer()


def LemTokens(tokens):
    """Return a new list holding the lemma of each token, in the same order.

    Each token is passed to `lemmer.lemmatize` with no part-of-speech argument.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(lemmer.lemmatize(token))
    return lemmas

In [9]:
# Remove punctuation
def RemovePunction(tokens):
    """Return `tokens` without the entries that consist only of punctuation.

    A token is dropped when every character in it is in `string.punctuation`.
    This covers multi-character tokens produced by tokenisers, such as "``",
    "''", "..." and "--", which a plain `token in string.punctuation` substring
    test would let through. Empty strings are dropped as well. Tokens that mix
    punctuation with other characters ("n't", "e.g.") are kept.

    Parameters
    ----------
    tokens : iterable of str

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    punctuation_chars = set(string.punctuation)
    return [
        token for token in tokens
        if not all(ch in punctuation_chars for ch in token)
    ]

In [10]:
# Build the stop-word set from NLTK's standard English stop-word list.
# A set is used so the membership tests done while filtering tokens are cheap.
stop_words = {*stopwords.words('english')}

# Report how many stop words were loaded.
print(len(stop_words))

179


In [11]:
# Directory holding the conversation JSON and the cached TF-IDF pickles.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
path = "/Users/grayfloyd/Desktop/"

# Reply and sentinel "score" returned when no stored question is close enough.
NOT_UNDERSTOOD_REPLY = "Apology, I do not understand. Can you rephrase?"
NOT_UNDERSTOOD_CODE = 2

# Best similarity at or below this value is treated as "not understood".
MIN_MATCH_SCORE = 0.34
# Above this value the match is considered near-exact and any stored question
# scoring within NEAR_MATCH_BAND of the best may be picked at random.
NEAR_MATCH_SCORE = 0.91
NEAR_MATCH_BAND = 0.05


def _normalise_sentence(sentence):
    """Tokenise, strip punctuation, lemmatise, drop stop words, then lowercase.

    The same routine is applied to user queries and to the stored questions so
    both live in the same vocabulary.
    """
    # The original code also ran pos_tag() here but discarded the tags, which
    # left the token list unchanged; that no-op pass has been removed.
    tokens = LemTokens(RemovePunction(nltk.word_tokenize(sentence)))
    # NOTE(review): the stop-word test runs before lowercasing, so capitalised
    # stop words ("The") survive. Kept as-is so that queries stay consistent
    # with TF-IDF pickles that were already trained with this ordering.
    kept = [token for token in tokens if token not in stop_words]
    return " ".join(kept).lower()


def _load_or_train_tfidf(json_file_path, vectorizer_pickle_path, matrix_pickle_path):
    """Return (vectorizer, train_matrix), loading the pickled cache if possible.

    When the cache is missing or cannot be unpickled, a TfidfVectorizer is
    fitted on the normalised 'MESSAGE' field of every record in the JSON file
    and both objects are written back to the pickle paths.
    """
    try:
        # SECURITY: pickle.load can execute arbitrary code; only point these
        # paths at cache files this notebook produced itself.
        with open(vectorizer_pickle_path, 'rb') as cache_file:
            tfidf_vectorizer = pickle.load(cache_file)
        with open(matrix_pickle_path, 'rb') as cache_file:
            tfidf_matrix_train = pickle.load(cache_file)
        return tfidf_vectorizer, tfidf_matrix_train
    except (OSError, EOFError, pickle.UnpicklingError,
            AttributeError, ImportError, IndexError):
        # Cache absent or unreadable: fall through and retrain. These are the
        # failure modes the pickle documentation lists for unpickling, plus
        # OSError for a missing or unreadable file.
        pass

    start = timeit.default_timer()

    with open(json_file_path) as sentences_file:
        rows = json.load(sentences_file)
    sentences = [_normalise_sentence(row['MESSAGE']) for row in rows]

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix_train = tfidf_vectorizer.fit_transform(sentences)

    # train timing
    stop = timeit.default_timer()
    print("Training Time : ")
    print(stop - start)

    with open(vectorizer_pickle_path, 'wb') as cache_file:
        pickle.dump(tfidf_vectorizer, cache_file)
    with open(matrix_pickle_path, 'wb') as cache_file:
        pickle.dump(tfidf_matrix_train, cache_file)

    return tfidf_vectorizer, tfidf_matrix_train


def Talk_To_Javris(test_set_sentence):
    """Answer a user query with the response of the most similar stored question.

    The query is normalised, projected into the TF-IDF space of the stored
    questions, and compared to each of them by cosine similarity.

    Parameters
    ----------
    test_set_sentence : str
        Raw user input.

    Returns
    -------
    tuple
        (response, matched_message, score) for a successful match, where
        `score` is the best cosine similarity. When the best similarity is
        <= MIN_MATCH_SCORE the tuple is
        (NOT_UNDERSTOOD_REPLY, NOT_UNDERSTOOD_REPLY, NOT_UNDERSTOOD_CODE).

    Notes
    -----
    Row i of the cached TF-IDF matrix is assumed to correspond to record i of
    the JSON file; delete the pickles after changing the JSON.
    NOTE(review): no visible notebook cell writes "conversation_json.json".
    """
    json_file_path = os.path.join(path, "conversation_json.json")
    tfidf_vectorizer_pickle_path = os.path.join(path, "tfidf_vectorizer.pkl")
    tfidf_matrix_pickle_path = os.path.join(path, "tfidf_matrix_train.pkl")

    filtered_sentence = _normalise_sentence(test_set_sentence)

    tfidf_vectorizer, tfidf_matrix_train = _load_or_train_tfidf(
        json_file_path, tfidf_vectorizer_pickle_path, tfidf_matrix_pickle_path
    )

    # Use the learnt dimension space to run TF-IDF on the query, then score it
    # against every stored question. scores[i] belongs to stored question i.
    tfidf_matrix_test = tfidf_vectorizer.transform([filtered_sentence])
    scores = cosine_similarity(tfidf_matrix_test, tfidf_matrix_train)[0]
    best_score = scores.max()

    # Nothing similar enough (this also covers an all-zero score vector).
    # The earlier implementation tested the *second*-highest score against 0
    # and deleted element 0 of the flattened matrix, which rejected queries
    # matching exactly one question and made the first question unreachable.
    if best_score <= MIN_MATCH_SCORE:
        return NOT_UNDERSTOOD_REPLY, NOT_UNDERSTOOD_REPLY, NOT_UNDERSTOOD_CODE

    if best_score > NEAR_MATCH_SCORE:
        # Near-exact match: pick at random among the almost-equally good rows.
        candidates = np.where(scores > best_score - NEAR_MATCH_BAND)[0]
        response_index = int(random.choice(candidates.tolist()))
    else:
        # Otherwise return the (first) highest-scoring row.
        response_index = int(scores.argmax())

    with open(json_file_path, "r") as sentences_file:
        rows = json.load(sentences_file)

    matched = rows[response_index]
    return matched["RESPONSE"], matched["MESSAGE"], best_score

In [None]:
# Interactive chat loop: greet the user, then answer each line typed until the
# user enters "bye" (case-insensitive, surrounding whitespace ignored).
SEPARATOR = "......................................................................................"
BOT_INTRO_TAG = '\x1b[1;37;40m' + 'Jarvis' + '\x1b[0m' + ': '
BOT_TAG = '\x1b[1;37;40m' + 'JARVIS' + '\x1b[0m' + ': '
USER_PROMPT = '\x1b[0;30;47m' + "USER  " + '\x1b[0m' + ":"

flag = True
print(SEPARATOR)
print(BOT_INTRO_TAG + 'My name is Jarvis, a Lawyer Apprentice Bot.')
print(BOT_INTRO_TAG + 'I will try my best to answer your query.')
print(BOT_INTRO_TAG + 'If you want to exit, you can type < bye >.')
while flag:
    print(SEPARATOR)
    sentence = input(USER_PROMPT)
    print(SEPARATOR)

    if sentence.strip().lower() == 'bye':
        flag = False
        continue

    # Evaluate greeting() once: it picks a random reply, so calling it a second
    # time for printing is redundant work.
    greeting_reply = greeting(sentence.lower())
    if greeting_reply is not None:
        print(BOT_TAG + greeting_reply)
        continue

    result = Talk_To_Javris(sentence)
    if result is None:
        # Guard against a missing result so tuple unpacking cannot crash the loop.
        print(BOT_TAG + "Apology, I do not understand. Can you rephrase?")
        continue

    # response_message is the matched stored question and line_id_primary the
    # score; print them here when tracing matches.
    response_primary, response_message, line_id_primary = result
    print(BOT_TAG + response_primary)

print(BOT_TAG + "Bye! Hope that i am of help.")

......................................................................................
[1;37;40mJarvis[0m: My name is Jarvis, a Lawyer Apprentice Bot.
[1;37;40mJarvis[0m: I will try my best to answer your query.
[1;37;40mJarvis[0m: If you want to exit, you can type < bye >.
......................................................................................
USER  :Hello
......................................................................................
[1;37;40mJARVIS[0m: hello
......................................................................................
USER  :Can I get divorced?
......................................................................................
[1;37;40mJARVIS[0m: No, you can apply for a divorce yourself, which is known in legal terms as  ?acting in person? . However, you will still need to comply with the legal and procedural requirements of the Family Court proceedings. For example, you must ensure that the necessary documents are prepare

In [14]:
# Download the tag mapping used by pos_tag(..., tagset='universal') in Talk_To_Javris.
# NOTE(review): this cell sits after the chat loop; on a fresh kernel it has to
# run before Talk_To_Javris is called. Consider moving it next to the imports.
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/grayfloyd/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [16]:
# Download the WordNet corpus; presumably required by the nltk.WordNetLemmatizer
# instance used in LemTokens -- confirm.
# NOTE(review): this cell sits after the code that lemmatises tokens; on a
# fresh kernel it has to run first. Consider moving it next to the imports.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/grayfloyd/nltk_data...


True

In [18]:
# Download the 'omw-1.4' package.
# NOTE(review): presumably a companion resource needed by the WordNet
# lemmatizer in the installed NLTK version -- confirm. The tokenizer and
# stop-word resources used earlier are not downloaded anywhere in the visible
# notebook; verify they are installed.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/grayfloyd/nltk_data...


True