In [211]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt') # one time execution


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sajjadpervaiz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [212]:
csv_df = pd.read_csv('train-test-split.csv', delimiter=';')

In [213]:
csv_df = csv_df.loc[csv_df['SET'] == 'TEST']
csv_df.head()

Unnamed: 0,ID,SET
3,essay004,TEST
4,essay005,TEST
5,essay006,TEST
20,essay021,TEST
41,essay042,TEST


In [214]:
test_essays_id_strings = csv_df['ID']
test_essays_ids = [e_id.split('essay')[1] for e_id in test_essays_id_strings]

In [215]:
df = pd.read_json('essay_prompt_corpus.json')

In [216]:
df.head(5)

Unnamed: 0,id,prompt,text
0,365,Way to reduce the amount of traffic?,Big cities nowadays can be described as meltin...
1,134,Qualification is still the fundamental determi...,With the increased educational level of workfo...
2,131,The government should allocate more funds to p...,The issue of governmental financing provision ...
3,198,Improve roads or public transports,Many people think that governments should spen...
4,330,"In personal live, we have some responsibilitie...","People should sometimes do things, even though..."


In [217]:
# select only test rows from corpus
df = df.loc[df['id'].isin(test_essays_ids)]

In [218]:
df.shape

(80, 3)

In [219]:
def tokenize_essay_sentences(essay_id, text):
    tokenized_essays = {essay_id: sent_tokenize(text)}
    return tokenized_essays

In [220]:
essay_sentences = [tokenize_essay_sentences(essay_id, text) for essay_id, text in zip(df['id'], df['text'])]

In [None]:
nltk.download('stopwords')# one time execution

In [223]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [224]:
# function to remove stopwords
def remove_stopwords(sen):
  sen_new = " ".join([i for i in sen if i not in stop_words])
  return sen_new

In [225]:
clean_essay_list = []
for essay in essay_sentences:
    for k, v in essay.items():
        clean_sentences = [remove_stopwords(s.split()) for s in v]
        clean_essay = {k: clean_sentences}
        clean_essay_list.append(clean_essay)

In [None]:
# download pretrained GloVe word embeddings
#! wget http://nlp.stanford.edu/data/glove.6B.zip

In [228]:
#! unzip glove*.zip

In [229]:
# Extract word vectors
word_embeddings = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [230]:
vectorized_essays = {}
for s in clean_essay_list:
    for key, val in s.items():
        sentence_vectors = []
        for i in val:
            if len(i) != 0:
                v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
            else:
                v = np.zeros((100,))
            sentence_vectors.append(v)
        vectorized_essays[key] = sentence_vectors


The next step is to find similarities among the sentences. We will use cosine similarity to find similarity between a pair of sentences. Let's create an empty similarity matrix for this task and populate it with cosine similarities of the sentences.

In [231]:
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

In [232]:
def create_sim_matrix(sim_matrix, length_of_sentences, essay_id):
    target_essay_sentence_vector = vectorized_essays[essay_id]
    for m in range(length_of_sentences):
        for j in range(length_of_sentences):
            if m != j:
                sim_matrix[m][j] = cosine_similarity(target_essay_sentence_vector[m].reshape(1,100), target_essay_sentence_vector[j].reshape(1,100))[0,0]
    return sim_matrix

In [233]:
# find similarities between the sentences of each essay.
output = []
for e in essay_sentences:
    for k, v in e.items():
        sim_mat = np.zeros([len(v), len(v)])
        sm = create_sim_matrix(sim_mat, len(v), k)
        nx_graph = nx.from_numpy_array(sm)
        scores = nx.pagerank(nx_graph)
        ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(v)), reverse=True)
        # Generate summary
        essay_obj = {'id': k, 'prompt': ranked_sentences[0][1]}
        output.append(essay_obj)

In [234]:
output

[{'id': 373,
  'prompt': 'To conclude, capital punishment is a form of legalized revenge, it is an easy way for serious crimes, and nobody has rights to take others life; thus, it neither demines crimes of violence nor be essential to control violence in society.'},
 {'id': 61,
  'prompt': 'Nanotechnology is working to make computers to the order of microscopic levels.'},
 {'id': 180,
  'prompt': 'We begin taking part in groups or organizations during the early days of life, and especially most people continue their social participation in all through life.'},
 {'id': 211,
  'prompt': 'Thus, school education should not only focus on the academic development of a student as it is much more crucial to teach them how to be independent and live a good life.'},
 {'id': 229,
  'prompt': 'I agree if a friend makes very a big mistake, we will need to end a friendship.'},
 {'id': 278,
  'prompt': "Although multitudes of people think that sports and social activities are not as necessary as acad

In [235]:
import json
json_dump = json.dumps(output, indent=4, ensure_ascii=False)
with open('predictions.json', "w", encoding='utf-8') as outfile:
    outfile.write(json_dump)