# In [10]:
import pandas 
import spacy 
import scipy 
import sklearn 
import nltk 
 
from sklearn import svm 
from sklearn.svm import LinearSVC 
from scipy import sparse 
from scipy.sparse import csr_matrix
from nltk import corpus, stem
from nltk.stem import wordnet
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk import word_tokenize 
from nltk.stem.porter import PorterStemmer 
from nltk.corpus import stopwords

# Load the "en_core_web_md" spaCy pipeline once at module level; the script
# entry point calls it to parse the question typed by the user.
en_nlp = spacy.load("en_core_web_md")

def remove_irrelevant_features(df_question):
    """Strip the non-feature columns from a question frame, in place.

    The 'Class' column is removed from ``df_question`` and handed back to the
    caller; the 'Question' column is removed and discarded. All remaining
    columns stay in ``df_question``.

    Returns:
        The removed 'Class' column.

    Raises:
        KeyError: if 'Class' or 'Question' is not a column of the frame.
    """
    labels = df_question.pop('Class')
    # The raw question text is not used as a feature, so no copy is kept.
    del df_question['Question']
    return labels

 
def pre_process(dta):
    """Return the indicator-encoded form of ``dta``.

    Thin wrapper around ``pandas.get_dummies`` called with its default
    options; the input is passed through unchanged.
    """
    encoded = pandas.get_dummies(dta)
    return encoded

 
def transform_data_matrix(df_question_train, df_question_predict):
    """Align two feature frames on a shared column set and make them sparse.

    Both frames are expanded to the union of their columns; a column that a
    frame lacks is filled with 0. The two results therefore have identical
    column order and can be fed to the same estimator.

    Args:
        df_question_train: DataFrame of numeric/boolean training features.
        df_question_predict: DataFrame of numeric/boolean features for the
            rows to predict.

    Returns:
        A ``(train, predict)`` tuple of ``scipy.sparse.csr_matrix`` objects
        with float values, one row per input row and one column per feature
        in the union.
    """
    # Ordered union: training columns first, then prediction-only columns.
    # dict.fromkeys de-duplicates while keeping a reproducible order; a plain
    # set would shuffle the feature order from run to run.
    all_columns = list(dict.fromkeys(
        list(df_question_train.columns) + list(df_question_predict.columns)
    ))

    def _to_sparse(frame):
        # reindex adds the missing columns (filled with 0) for any number of
        # rows, including zero. Casting to float gives one numeric dtype even
        # when boolean indicator columns sit next to integer filler columns.
        aligned = frame.reindex(columns=all_columns, fill_value=0)
        return csr_matrix(aligned.to_numpy(dtype=float))

    return _to_sparse(df_question_train), _to_sparse(df_question_predict)

 
def get_question_predict_data(en_doc):
    """Extract WH-question features from the first sentence of a parsed doc.

    For the first sentence in ``en_doc.sents`` this records:

    * 'WH'          - text of the WH token (tag WDT, WP, WP$ or WRB)
    * 'WH-POS'      - tag of that WH token
    * 'WH-NBOR-POS' - tag of the token that follows it in the document
    * 'Root-POS'    - tag of the token whose dependency label is "ROOT"

    A feature that is not found stays an empty string. When several WH
    tokens occur, the last one wins.

    Args:
        en_doc: a parsed document exposing ``sents``, ``len()`` and integer
            indexing, whose tokens expose ``tag_``, ``text``, ``i`` and
            ``dep_`` (presumably a spaCy ``Doc``; verify against caller).

    Returns:
        A pandas DataFrame with the four columns above: one row for the
        first sentence, or no rows when the document has no sentences.
    """
    wh_tags = ("WDT", "WP", "WP$", "WRB")
    columns = ['WH', 'WH-POS', 'WH-NBOR-POS', 'Root-POS']
    rows = []

    # Only the first sentence is used; any later sentences are ignored.
    for sentence in list(en_doc.sents)[0:1]:
        root_token, wh_pos, wh_nbor_pos, wh_word = [""] * 4
        for token in sentence:
            # Token is of WH question type.
            if token.tag_ in wh_tags:
                wh_pos = token.tag_
                wh_word = token.text
                nbor_index = token.i + 1
                # The WH word can be the final token of the document, in
                # which case there is no neighbour to look at.
                if nbor_index < len(en_doc):
                    wh_nbor_pos = en_doc[nbor_index].tag_
                else:
                    wh_nbor_pos = ""

            # Token is the root of the sentence.
            if token.dep_ == "ROOT":
                root_token = token.tag_

        rows.append({
            'WH': wh_word,
            'WH-POS': wh_pos,
            'WH-NBOR-POS': wh_nbor_pos,
            'Root-POS': root_token,
        })

    # Built after the loop so that a document without sentences still yields
    # a (empty) frame instead of an unbound local.
    return pandas.DataFrame(rows, columns=columns)

 
def support_vector_machine(df_question_train, df_question_class, df_question_predict):
    """Fit a ``LinearSVC`` on the training data and predict the new rows.

    Args:
        df_question_train: training feature matrix.
        df_question_class: class labels for the training rows.
        df_question_predict: feature matrix of the rows to classify.

    Returns:
        A ``(prediction, classifier)`` tuple: the predicted labels for
        ``df_question_predict`` and the fitted ``LinearSVC`` instance.
    """
    # fit() returns the estimator itself, so construction and training chain.
    classifier = LinearSVC().fit(df_question_train, df_question_class)
    return classifier.predict(df_question_predict), classifier

 
def classify_question(en_doc, training_data_path="qclassify.csv"):
    """Predict the class of the question held in ``en_doc``.

    The training set is read from ``training_data_path`` (a '|'-separated,
    cp1252-encoded CSV with a header row). Its 'Class' column supplies the
    labels and its 'Question' column is dropped. The remaining columns and
    the features extracted from ``en_doc`` are encoded, aligned on a common
    column set, and passed to a linear SVM.

    Note that the CSV is re-read and the classifier re-trained on every
    call.

    Args:
        en_doc: parsed question document, as accepted by
            ``get_question_predict_data``.
        training_data_path: location of the training CSV. Defaults to
            "qclassify.csv" in the current working directory.

    Returns:
        The predicted class labels, as returned by the classifier's
        ``predict``.
    """
    df_question_train = pandas.read_csv(
        training_data_path, sep='|', header=0, encoding='cp1252'
    )

    # Separate the labels from the features (mutates df_question_train).
    df_question_class = remove_irrelevant_features(df_question_train)
    df_question_predict = get_question_predict_data(en_doc)

    df_question_train = pre_process(df_question_train)
    df_question_predict = pre_process(df_question_predict)

    # Both matrices must share one column layout before fitting/predicting.
    df_question_train, df_question_predict = transform_data_matrix(
        df_question_train, df_question_predict
    )
    predicted_class, _svc_clf = support_vector_machine(
        df_question_train, df_question_class, df_question_predict
    )
    return predicted_class

def process(question):
    """Classify a question by its leading word, ignoring case.

    Returns:
        1 if the question starts with one of the single question/auxiliary
        words listed below (each followed by a space), 2 if it starts with
        "how ", and 0 otherwise. ``make_min`` uses this value as the number
        of leading words to skip.
    """
    # str.startswith takes a tuple of alternatives. Chaining the strings
    # with `or` collapses to the first one, so only "who " was ever tested.
    single_word_openers = (
        "who ", "when ", "where ", "what ", "why ",
        "is ", "was ", "will ", "are ", "were ",
        "do ", "does ", "did ", "have ", "has ", "can ",
    )
    lowered = question.lower()
    if lowered.startswith(single_word_openers):
        return 1
    if lowered.startswith("how "):
        return 2
    return 0

      
def make_min(sentence,Qtype):
    """Reduce a sentence to a list of stemmed, stopword-free words.

    The sentence is upper-cased, English stopwords are removed, leading and
    trailing "?" characters are stripped, and every remaining word is
    stemmed with the Porter stemmer.

    Args:
        sentence: the text to reduce.
        Qtype: "q" for a question, in which case the first
            ``process(...)`` words of the filtered sentence are skipped;
            any other value keeps every word.

    Returns:
        A list of stemmed words (possibly empty).
    """
    # Load the stopword list once, as a set. The previous code re-read it
    # for every word.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # The stopword list is lowercase while the sentence has been upper-cased,
    # so compare on the lowercase form. Comparing the upper-cased word
    # directly never matched and the filter removed nothing.
    filtered = ' '.join(
        word for word in sentence.upper().split()
        if word.lower() not in stop_words
    )

    words = filtered.strip("?").split()
    if Qtype == "q":
        # Drop the leading question word(s), as counted by process().
        words = words[process(filtered):]
    return [stemmer.stem(word) for word in words]
  
def findAnAns(question, corpus_path="corpus.txt"):
    """Return the corpus sentence that best matches ``question``.

    The corpus file is split into sentences on ".", "!" and "?". Each
    sentence and the question are reduced with ``make_min``; a sentence's
    confidence is the percentage of question parts that also occur among
    the sentence's parts.

    Args:
        question: the question text.
        corpus_path: path of the corpus text file. Defaults to "corpus.txt"
            in the current working directory.

    Returns:
        A two-element list ``[sentence, confidence]`` for the sentence with
        the highest confidence (the earliest one on ties). Confidence is 0.0
        for every sentence when the question reduces to no parts.
    """
    import re  # local import: only needed for sentence splitting here

    with open(corpus_path, "r") as corpus_file:
        text = corpus_file.read()

    # `"." or "!" or "?"` evaluates to ".", so the old split ignored "!" and
    # "?". A character class splits on all three terminators.
    sentences = re.split(r"[.!?]", text)

    question_parts = make_min(question, "q")

    best = None
    for sent in sentences:
        answer_parts = set(make_min(sent, "a"))
        matched = sum(1 for part in question_parts if part in answer_parts)
        # Guard against a question made only of skipped/stop words.
        if question_parts:
            confidence = matched * 100 / len(question_parts)
        else:
            confidence = 0.0
        # Strict '>' keeps the earliest sentence on ties, matching what a
        # stable descending sort followed by taking the first item gives.
        if best is None or confidence > best[1]:
            best = [sent, confidence]

    # The result no longer depends on a module-level `question_class`;
    # deciding whether to answer at all is the caller's job.
    return best

    
if __name__ == "__main__":
    # Read one question from stdin, classify it, and only look for an answer
    # when the predicted class is 'HUM'.
    question = input()
    # `question_class` is deliberately kept as a module-level name in case
    # other code in this file reads it as a global.
    question_class = classify_question(en_nlp(u'' + question))
    if question_class == [u'HUM']:
        print(question)
        reply = findAnAns(question)
        if reply is not None and reply[1] > 0:
            print(reply[0])
            print(str(reply[1]) + "%" + " confidence")
        else:
            print("Only low confidence answer found")
            # Report what findAnAns returned. The previous code referenced
            # names (answer, matched, questionParts) that do not exist in
            # this scope and raised NameError.
            if reply is not None:
                print("Ans", reply[0])
                print(str(reply[1]) + "% " + "confidence")
    else:
        print("invalid question")

"What did Obama execute"
# Recorded notebook output for the input above: invalid question
