In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [2]:
import numpy as np
import pandas as pd
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()


data=pd.read_json('./../data/essay_prompt_corpus.json')
train_test_id=pd.read_csv('./../data/train-test-split.csv',sep=";")
train_id = train_test_id[train_test_id.SET == 'TRAIN'].index
test_id = train_test_id[train_test_id.SET == 'TEST'].index
train=data.loc[data['id'].isin(train_id+1)]
test=data.loc[data['id'].isin(test_id+1)]
train.to_json('./../data/train_essay.json',orient = 'records')
test.to_json('./../data/test_essay.json',orient = 'records')

essayText = ""
trainFile = './../data/train_essay.json';
testFile = './../data/test_essay.json'
jsonFileInput = pd.read_json(trainFile, 'r')

In [3]:
jsonFileInput = pd.read_json(trainFile, 'r')
trainEssaySentenceDF = pd.DataFrame(columns=['Essay_id','Actual_prompt'])
for row in jsonFileInput.iterrows():
    essayText = row[1]['text']
    promptSentence = row[1]['prompt']
    dataToAppend = {'Essay_id':row[1]['id'],'Actual_prompt':promptSentence}
    trainEssaySentenceDF = trainEssaySentenceDF.append(dataToAppend, ignore_index=True)

trainEssaySentenceDF.head()

Unnamed: 0,Essay_id,Actual_prompt
0,365,Way to reduce the amount of traffic?
1,134,Qualification is still the fundamental determi...
2,131,The government should allocate more funds to p...
3,198,Improve roads or public transports
4,330,"In personal live, we have some responsibilitie..."


## Collecting all nouns from the train prompt list

In [4]:
inputDF = trainEssaySentenceDF
import nltk
nounList = list()
for index,row in inputDF.iterrows():
    txt = row['Actual_prompt']
    for (word, pos) in nltk.pos_tag(nltk.word_tokenize(txt)):
        if pos[0] == 'N':
            nounList.append(word)

## Creating "noun" to "prompt phrase" memory 

In [5]:
actual_promptList = trainEssaySentenceDF['Actual_prompt']
learnedPromptDF = pd.DataFrame(columns=['noun','prompt'])
for i in range(len(nounList)):
    for prompt in actual_promptList:
        prompt = prompt.lower()
        if nounList[i].lower() in prompt.split():
            if(len(prompt.split(nounList[i]))>1):
                data = {'noun':nounList[i],'prompt':prompt.split(nounList[i])[1]}
                learnedPromptDF = learnedPromptDF.append(data,ignore_index = True)

learnedPromptDF= learnedPromptDF.drop_duplicates(keep='first')

In [6]:
learnedPromptDF.head()

Unnamed: 0,noun,prompt
0,amount,of traffic?
1,amount,of time
2,amount,of violence in television programs
3,amount,of money on libraries and sports?
4,amount,of control on media information


##  Example: Predict a "prompt phrase" using a noun - if the noun is "amount"

In [7]:
learnedPromptDF[learnedPromptDF['noun']=='amount']

Unnamed: 0,noun,prompt
0,amount,of traffic?
1,amount,of time
2,amount,of violence in television programs
3,amount,of money on libraries and sports?
4,amount,of control on media information


## Discourse markers indicator list

In [8]:
import re

claim_indicator_word_list = ["considering all","in the end","i advocate","after analyzing","many believe",
                       "in a nutshell","i favor","personally",
                       "all the above","i support","hence","above reasons",
                       "to summarize","to conclude","to conclude,","consequently",
                       "in my opinion",
                       "agree","it seems to me",
                       "to sum up","in conclusion","i would conclude","therefore","in summary",
                       "i firmly believe","my view", "i believe",
                       "i agree","i prefer","i completely agree","i strongly prefer",
                       "all in all","from my experience",
                       "i think","i suppose","my point of view",
                       "based on the reasons",
                       "accordingly", "as far as","to me", "thus", "to sum up"]

## Extracting important sentence in Test Data using discourse indicator list

In [9]:
jsonFileInput = pd.read_json(testFile, 'r')
testEssaySentenceDF = pd.DataFrame(columns=['Essay_id','text','promptIndicatorSentenceList','Actual_prompt'])
for row in jsonFileInput.iterrows():
    essayText = row[1]['text']
#     counter = counter + 1
    tokens = nlp(essayText)
    
    promptSentence =row[1]['prompt']
    sentList = list()
    indicatorSentenceSet = set()
    counter=0
    for sent in tokens.sents:
        if(counter==0):
            counter = counter + 1
            indicatorSentenceSet.add(sent.string.strip())
        sentList.append(sent.string.strip())
        searchString = sent.string.strip()
        foundFlag = False
        for i in range(len(claim_indicator_word_list)):
            if claim_indicator_word_list[i] in searchString.lower():
                searchString = searchString.lower().replace(claim_indicator_word_list[i],"")
                foundFlag = True
        if(foundFlag):
            indicatorSentenceSet.add(searchString)
    promptIndicatorSentenceList = list(indicatorSentenceSet)
    dataToAppend = {'Essay_id':row[1]['id'],'text':sentList,'promptIndicatorSentenceList':promptIndicatorSentenceList,'Actual_prompt':promptSentence}
    testEssaySentenceDF = testEssaySentenceDF.append(dataToAppend, ignore_index=True)

testEssaySentenceDF.head()

Unnamed: 0,Essay_id,text,promptIndicatorSentenceList,Actual_prompt
0,373,"[""Capital punishment or the death penalty is a...","[, capital punishment is a form of legalized r...",Capital punishment; 51% countries have polishe...
1,61,[Computer-a device which has given a whole new...,[Computer-a device which has given a whole new...,"Computers - use, future prospects and over-dep..."
2,180,"[During our life, it is inevitable that we may...","[in , groups provide a place for people to gai...",Why are groups or organizations important to p...
3,211,[Students have become more and more stressed d...,"[however, that it is not a good idea because ...",Non academic subjects should be removed from s...
4,229,[There is an argument regrading weather lettin...,"[some people might say i am silly, but mistak...",Friendship is more important than mistake by a...


## Discourse Indicator sentence list of the first essay

In [10]:
testEssaySentenceDF['promptIndicatorSentenceList'][0]

[', capital punishment is a form of legalized revenge, it is an easy way for serious crimes, and nobody has rights to take others life; , it neither demines crimes of violence nor be essential to control violence in society.',
 '"Capital punishment or the death penalty is a legal process whereby a person is put to death by the state as a punishment for a crime."',
 ', death penalty neither controls the violent in society nor creates a violent culture.',
 ',  it is no evidence about the reduction of crime rates due to the death penalty because of many reasons.']

## Pick one sentence as possible prompt from the indicator sentence list by sentence ranking

In [11]:
# code that ranks sentences based on commonly occuring words 
possiblePromptsDF = pd.DataFrame(columns=['id','promptIndicatorSentenceList','prompt','Actual_prompt'])
from nltk.tokenize import word_tokenize
for index,row in testEssaySentenceDF.iterrows():
    corpus = row['promptIndicatorSentenceList']
    tokenized_claim_words = [word_tokenize(i) for i in corpus]
    word_list_claim=[]
    for i in tokenized_claim_words:
        for j in i:
            word_list_claim.append(j)
    claim_words_lower= [word.lower() for word in word_list_claim]
    count_freq = nltk.FreqDist(claim_words_lower)
    common=count_freq.most_common(10)
    sentence_ranks= [0 for i in range(len(corpus))]
    for i in range(len(corpus)):
        for j in range(len(common)):
            if  common[j][0] in corpus[i]:
                sentence_ranks[i]+=1    
    if(len(sentence_ranks)==0):
        sentence_ranks=[0]
        prompt=" "
    else: 
        #print('Index',index)
        toprank=max(sentence_ranks)
        #print('sentence_rank',sentence_ranks)
        
        highest_ranker_pos=sentence_ranks.index(toprank)
        #print('highest rank position',highest_ranker_pos)
        prompt=row['promptIndicatorSentenceList'][highest_ranker_pos]
        #print(prompt)

    data = {'id':row['Essay_id'],'promptIndicatorSentenceList':row['promptIndicatorSentenceList'],'prompt':prompt,'Actual_prompt':row['Actual_prompt']}
    possiblePromptsDF = possiblePromptsDF.append(data,ignore_index=True)

In [12]:
possiblePromptsDF.head()

Unnamed: 0,id,promptIndicatorSentenceList,prompt,Actual_prompt
0,373,"[, capital punishment is a form of legalized r...",", capital punishment is a form of legalized re...",Capital punishment; 51% countries have polishe...
1,61,[Computer-a device which has given a whole new...,Computer-a device which has given a whole new ...,"Computers - use, future prospects and over-dep..."
2,180,"[in , groups provide a place for people to gai...","During our life, it is inevitable that we may ...",Why are groups or organizations important to p...
3,211,"[however, that it is not a good idea because ...",", school education should not only focus on th...",Non academic subjects should be removed from s...
4,229,"[some people might say i am silly, but mistak...","however, that friendship is more important th...",Friendship is more important than mistake by a...


In [13]:
possiblePromptsDF = possiblePromptsDF[['id','prompt','Actual_prompt']]
possiblePromptsDF.head()

Unnamed: 0,id,prompt,Actual_prompt
0,373,", capital punishment is a form of legalized re...",Capital punishment; 51% countries have polishe...
1,61,Computer-a device which has given a whole new ...,"Computers - use, future prospects and over-dep..."
2,180,"During our life, it is inevitable that we may ...",Why are groups or organizations important to p...
3,211,", school education should not only focus on th...",Non academic subjects should be removed from s...
4,229,"however, that friendship is more important th...",Friendship is more important than mistake by a...


## Predicting prompts in the test data using "noun" to "prompt phrase" memory

In [14]:
promptWithReplaceFromLearnedPrompt = pd.DataFrame(columns=['id','prompt'])

from scipy import spatial
inputDF = possiblePromptsDF

for index,row in inputDF.iterrows():
    txt = row['prompt']
    generatedSet = ""
    nounList = list()
    for (word, pos) in nltk.pos_tag(nltk.word_tokenize(txt)):
        if pos[0] == 'N':
            nounList.append(word)
    for i in range(len(nounList)):
#         print(nounList[i])
        possiblePrompts = list(learnedPromptDF[learnedPromptDF['noun']==nounList[i]]['prompt'])
        if(len(possiblePrompts)>0):
            prevSimilarity = 0
            currentPossiblePrompt = ""
            for j in range(len(possiblePrompts)):
                embeddings = embed([txt,possiblePrompts[j]])
                vA = embeddings[0]
                vB = embeddings[1]
                A=np.array(vA)
                B=np.array(vB)

                similarity = 1 - spatial.distance.cosine(A, B)
                if(similarity>prevSimilarity and similarity>0.1):
                    prevSimilarity = similarity
                    currentPossiblePrompt = possiblePrompts[j]
            
            if(len(currentPossiblePrompt)>0):
                generatedPrompt = " ".join(currentPossiblePrompt.split()[0:5])
                txt = txt.replace(nounList[i],nounList[i]+" "+generatedPrompt)
    data = {'id':row['id'],'prompt':txt}
    promptWithReplaceFromLearnedPrompt = promptWithReplaceFromLearnedPrompt.append(data,ignore_index=True)

In [15]:
promptWithReplaceFromLearnedPrompt.head()

Unnamed: 0,id,prompt
0,373,", capital punishment is necessary or not? is a..."
1,61,Computer-a device which has given a whole new ...
2,180,"During our life starting from the the birth, i..."
3,211,", school student either male or female shoulds..."
4,229,"however, that friendship is more important th..."


## Writing the predictions in json file

In [16]:
with open('./../data/predictions.json', 'w', encoding='utf-8') as file:
    promptWithReplaceFromLearnedPrompt.to_json(file, force_ascii=False,orient='records')