# Research Paper Project 
The data set for this project consists of titles and abstracts for different research papers and corresponding classifications by general topic area. Possible topic areas are:
* Computer Science
* Physics
* Mathematics
* Statistics
* Quantitative Biology
* Quantitative Finance


In [1]:
#@title Load your dataset
import numpy as np
import pandas as pd
import cv2
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import string 
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

!wget -q --show-progress "https://storage.googleapis.com/inspirit-ai-data-bucket-1/Data/AI%20%2B%20X/Group/Engineering/Research%20Paper/research_papers.csv"
research_data = pd.read_csv('research_papers.csv')
research_data = research_data.drop(columns=["ID"])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.




In [2]:
research_data

Unnamed: 0,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...
20967,Contemporary machine learning: a guide for pra...,Machine learning is finding increasingly bro...,1,1,0,0,0,0
20968,Uniform diamond coatings on WC-Co hard alloy c...,Polycrystalline diamond coatings have been g...,0,1,0,0,0,0
20969,Analysing Soccer Games with Clustering and Con...,We present a new approach for identifying si...,1,0,0,0,0,0
20970,On the Efficient Simulation of the Left-Tail o...,The sum of Log-normal variates is encountere...,0,0,1,1,0,0


In [2]:
X_train, X_test, y_train, y_test = train_test_split(research_data['ABSTRACT'], research_data['TITLE'], test_size=0.2, train_size=0.8, random_state=8)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(X_test.shape)

(16777,)
(4195,)
(16777,)
(4195,)


In [None]:
import csv

train_data = zip(X_train, y_train)
val_data = zip(X_test, y_test)

myFile = open('train_data.csv', 'w')
writer = csv.writer(myFile)
writer.writerow(["abstract", "title"])
for tr in train_data:
    writer.writerow(tr)
myFile.close()

myFile = open('validation_data.csv', 'w')
writer = csv.writer(myFile)
writer.writerow(["abstract", "title"])
for tr in val_data:
    writer.writerow(tr)
myFile.close()

In [9]:
import csv

predictions = open("generated_predictions2.txt", "r")
val2_data = zip(predictions, y_test)

myFile = open('final_test2.csv', 'w')
writer = csv.writer(myFile)
writer.writerow(["abstract", "title"])
for data in val2_data:
    writer.writerow(data)
myFile.close()


# Dataset exploration and visualization

In [None]:
from collections import Counter

stpwrds = stopwords.words('english')
punctuation = string.punctuation

# try changing to visualize more or less words
num_words = 30

# text cleaning maintaining all of our text as one string
text = " ".join(research_data['ABSTRACT'])
# text = text.lower() # try adding this back in and see what happens!
text = "".join(_ for _ in text if _ not in punctuation)
text = [t for t in text.split() if t not in stpwrds and not t.isdigit()]

# We can use Counter to find the most frequent words in all our titles!
words = [_[0] for _ in Counter(text).most_common(num_words)]
frequency = [_[1] for _ in Counter(text).most_common(num_words)]

# Making our plot look nice!
plt.figure(figsize=(8,12));
ax = sns.barplot(x=frequency, y=words)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.title("Most Frequent Keywords used in Abstracts of Research Papers");
plt.xlabel("Frequency", fontsize=14);
plt.yticks(fontsize=14);
plt.xticks(fontsize=14);

KeyboardInterrupt: ignored

In [None]:
research_data['Computer Science'].sum()

In [None]:
categories = research_data[["Computer Science", "Physics", "Mathematics", "Statistics", "Quantitative Biology", "Quantitative Finance"]]
cat_names = categories.columns
categories = categories.transpose()
cat_sums = categories.sum(axis=1)

for i in range(len(cat_names)):
  print("{cat_name} is the category of {times} research papers.".format(cat_name=cat_names[i], times=str(cat_sums[i])))

fig, ax1 = plt.subplots()
ax1.pie(cat_sums, labels=cat_names, autopct='%.0f%%', shadow=True, startangle=90, radius=2, textprops={'size': 'larger'})
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()

In [None]:
comp_science_abstracts = research_data["ABSTRACT"].where(research_data["Computer Science"] == 1)
comp_science_abstracts = comp_science_abstracts.dropna()
physics_abstracts = research_data["ABSTRACT"].where(research_data["Physics"] == 1)
physics_abstracts = physics_abstracts.dropna()
mathematics_abstracts = research_data["ABSTRACT"].where(research_data["Mathematics"] == 1)
mathematics_abstracts = mathematics_abstracts.dropna()
stats_abstracts = research_data["ABSTRACT"].where(research_data["Statistics"] == 1)
stats_abstracts = stats_abstracts.dropna()
biology_abstracts = research_data["ABSTRACT"].where(research_data["Quantitative Biology"] == 1)
biology_abstracts = biology_abstracts.dropna()
finance_abstracts = research_data["ABSTRACT"].where(research_data["Quantitative Finance"] == 1)
finance_abstracts = finance_abstracts.dropna()

num_words = 8

In [None]:
#@title Computer Science
text = " ".join(comp_science_abstracts)
# text = text.lower() # try adding this back in and see what happens!
text = "".join(_ for _ in text if _ not in punctuation)
text = [t for t in text.split() if t not in stpwrds and not t.isdigit()]

# We can use Counter to find the most frequent words in all our titles!
words = [_[0] for _ in Counter(text).most_common(num_words)]
frequency = [_[1] for _ in Counter(text).most_common(num_words)]

# Making our plot look nice!
plt.figure(figsize=(5,3));
ax = sns.barplot(x=frequency, y=words)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.title("Most Frequent Keywords used in Computer Science abstracts");
plt.xlabel("Frequency", fontsize=14);
plt.yticks(fontsize=14);
plt.xticks(fontsize=14);

In [None]:
#@title Physics
text = " ".join(physics_abstracts)
# text = text.lower() # try adding this back in and see what happens!
text = "".join(_ for _ in text if _ not in punctuation)
text = [t for t in text.split() if t not in stpwrds and not t.isdigit()]

# We can use Counter to find the most frequent words in all our titles!
words = [_[0] for _ in Counter(text).most_common(num_words)]
frequency = [_[1] for _ in Counter(text).most_common(num_words)]

# Making our plot look nice!
plt.figure(figsize=(5,3));
ax = sns.barplot(x=frequency, y=words)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.title("Most Frequent Keywords used in Physics abstracts");
plt.xlabel("Frequency", fontsize=14);
plt.yticks(fontsize=14);
plt.xticks(fontsize=14);

In [None]:
#@title Mathematics
text = " ".join(mathematics_abstracts)
# text = text.lower() # try adding this back in and see what happens!
text = "".join(_ for _ in text if _ not in punctuation)
text = [t for t in text.split() if t not in stpwrds and not t.isdigit()]

# We can use Counter to find the most frequent words in all our titles!
words = [_[0] for _ in Counter(text).most_common(num_words)]
frequency = [_[1] for _ in Counter(text).most_common(num_words)]

# Making our plot look nice!
plt.figure(figsize=(5,3));
ax = sns.barplot(x=frequency, y=words)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.title("Most Frequent Keywords used in Mathematic abstracts");
plt.xlabel("Frequency", fontsize=14);
plt.yticks(fontsize=14);
plt.xticks(fontsize=14);

In [None]:
#@title Statistics
text = " ".join(stats_abstracts)
# text = text.lower() # try adding this back in and see what happens!
text = "".join(_ for _ in text if _ not in punctuation)
text = [t for t in text.split() if t not in stpwrds and not t.isdigit()]

# We can use Counter to find the most frequent words in all our titles!
words = [_[0] for _ in Counter(text).most_common(num_words)]
frequency = [_[1] for _ in Counter(text).most_common(num_words)]

# Making our plot look nice!
plt.figure(figsize=(5,3));
ax = sns.barplot(x=frequency, y=words)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.title("Most Frequent Keywords used in Statistics Abstracts");
plt.xlabel("Frequency", fontsize=14);
plt.yticks(fontsize=14);
plt.xticks(fontsize=14);

In [None]:
#@title Quantitative Biology
text = " ".join(biology_abstracts)
# text = text.lower() # try adding this back in and see what happens!
text = "".join(_ for _ in text if _ not in punctuation)
text = [t for t in text.split() if t not in stpwrds and not t.isdigit()]

# We can use Counter to find the most frequent words in all our titles!
words = [_[0] for _ in Counter(text).most_common(num_words)]
frequency = [_[1] for _ in Counter(text).most_common(num_words)]

# Making our plot look nice!
plt.figure(figsize=(5,3));
ax = sns.barplot(x=frequency, y=words)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.title("Most Frequent Keywords used in Quantitative Biology abstracts");
plt.xlabel("Frequency", fontsize=14);
plt.yticks(fontsize=14);
plt.xticks(fontsize=14);

In [None]:
#@title Quantitative Finance
text = " ".join(finance_abstracts)
# text = text.lower() # try adding this back in and see what happens!
text = "".join(_ for _ in text if _ not in punctuation)
text = [t for t in text.split() if t not in stpwrds and not t.isdigit()]

# We can use Counter to find the most frequent words in all our titles!
words = [_[0] for _ in Counter(text).most_common(num_words)]
frequency = [_[1] for _ in Counter(text).most_common(num_words)]

# Making our plot look nice!
plt.figure(figsize=(5,3));
ax = sns.barplot(x=frequency, y=words)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.title("Most Frequent Keywords used in Quantitative Finance abstracts");
plt.xlabel("Frequency", fontsize=14);
plt.yticks(fontsize=14);
plt.xticks(fontsize=14);

# Data preprocessing

In [3]:
#@title Preprocessor
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
 
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

def process_lang_data(text):
  
  cleaned_text = []
  punctuation = string.punctuation
  our_stopwords = stopwords.words('english')
  lemmatizer = WordNetLemmatizer()

  for token in word_tokenize(text):
    if token not in punctuation and token not in our_stopwords:
      lemmatized = lemmatizer.lemmatize(token)
      cleaned_text.append(lemmatized)

  return cleaned_text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
#@title BOW preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('omw-1.4')

bow = CountVectorizer(analyzer=process_lang_data, max_features=1000, stop_words='english', lowercase=True) # using the function we made above
bow.fit(X_train)                             # fitting to our training data
bow_train = bow.transform(X_train).toarray() # then transforming both training and testing data
bow_test = bow.transform(X_test).toarray()

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
  "The parameter 'stop_words' will not be used"


In [5]:
#@title TdifVectorizer preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(analyzer=process_lang_data, max_features=1000, lowercase=True)
tfidf.fit(X_train)
tfidf_train = tfidf.transform(X_train).toarray()
tfidf_test = tfidf.transform(X_test).toarray()

In [None]:
#@title Word 2 Vec preprocessing
import gensim
# have to pre-tokenize
tokenize = research_data['ABSTRACT'].apply(word_tokenize)

# take a look at the documentation to see what these parameters are changing!
w2vec_model = gensim.models.Word2Vec(tokenize, min_count = 1, size = 10, window = 5, sg = 1)
w2vec_model.train(tokenize, total_examples = len(research_data['ABSTRACT']),epochs=20)
X_w2vec = w2vec_model[w2vec_model.wv.vocab]


  if __name__ == '__main__':


In [None]:
#@title Continue to train Word 2 Vec model with titles too
predictable_tokenized_titles = []
for i in range(len(research_data)):
  predictable_tokenized_titles.append(word_tokenize(research_data.iloc[i][0]))

w2vec_model.build_vocab(predictable_tokenized_titles, update=True)
w2vec_model.train(predictable_tokenized_titles, total_examples=w2vec_model.corpus_count, epochs=w2vec_model.epochs)



(3517581, 4257100)

In [8]:
#@title Pre-trained word 2 vec preprocessing
import spacy
from spacy.lang.en.examples import sentences 
from spacy.lang.en.stop_words import STOP_WORDS

text_to_nlp = spacy.load("en_core_web_lg")

def tokenize_vecs(text):
    clean_tokens = []
    for token in text_to_nlp(text):
        if (not token.is_stop) & (token.lemma_ != '-PRON-') & (not token.is_punct): 
          # -PRON- is a special all inclusive "lemma" spaCy uses for any pronoun, we want to exclude these 
            clean_tokens.append(token)
    return np.array(clean_tokens)

def sum_and_avg_vectors():
  '''
    Returns the average of the embedding vectors
  '''
  tokenized_vectors = research_data["ABSTRACT"].apply(tokenize_vecs)
  return np.array(tokenized_vectors.apply(lambda x: np.sum([w.vector for w in x])/len(x)))



In [7]:
! python -m spacy download en_core_web_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl (587.7 MB)
[K     |████████████████████████████████| 587.7 MB 17 kB/s 
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [None]:
w2vec_model.save("w2vec_trained2.model")

In [57]:
import gensim
imported_w2vec = gensim.models.Word2Vec.load("w2vec_trained2.model")

# Random Sampling
This is a baseline model to sample frequent words with a random, but reasonable, title length

In [29]:
def bow_sample(num_sample):
  rand_sample = num_sample
  indexes = []
  for i in range(round(np.random.randint(6, 10))):
    high_index = 0
    for j in range(len(bow_test[rand_sample])):
      if j in indexes:
        continue
      if bow_test[rand_sample][j] > bow_test[rand_sample][high_index]:
        high_index = j
    indexes.append(high_index)
    high_index = 0
  return indexes, rand_sample

In [37]:
def generate_bow_sampling_title(num_sample):
  rand_sample_title_lst = []
  indexes, rand_sample = bow_sample(num_sample)
  sample = bow_test[rand_sample]
  vocab = bow.vocabulary_
  for i in indexes:
    for j in vocab.items():
      if i == j[1]:
        rand_sample_title_lst.append(j[0])
  rand_sample_title = " ".join(rand_sample_title_lst)
  return [y_test.iloc[num_sample], rand_sample_title]

In [53]:
y_test.iloc[0]

'Rate Optimal Binary Linear Locally Repairable Codes with Small Availability'

In [40]:
bow_title = generate_bow_sampling_title(0)
print(bow_title)

['Rate Optimal Binary Linear Locally Repairable Codes with Small Availability', 'code binary linear rate associated class size 2 A']


In [39]:
X_test.iloc[0]

"  A locally repairable code with availability has the property that every code\nsymbol can be recovered from multiple, disjoint subsets of other symbols of\nsmall size. In particular, a code symbol is said to have $(r,t)$-availability\nif it can be recovered from $t$ disjoint subsets, each of size at most $r$. A\ncode with availability is said to be 'rate-optimal', if its rate is maximum\namong the class of codes with given locality, availability, and alphabet size.\nThis paper focuses on rate-optimal binary, linear codes with small\navailability, and makes four contributions. First, it establishes tight upper\nbounds on the rate of binary linear codes with $(r,2)$ and $(2,3)$\navailability. Second, it establishes a uniqueness result for binary\nrate-optimal codes, showing that for certain classes of binary linear codes\nwith $(r,2)$ and $(2,3)$-availability, any rate optimal code must be a direct\nsum of shorter rate optimal codes. Third, it presents novel upper bounds on the\nrates 

In [47]:
bow_sim_scores = []
for i in range(len(X_test)):
  both_titles = generate_bow_sampling_title(i)
  gen_title = text_to_nlp(both_titles[1])
  actual_title = text_to_nlp(both_titles[0])
  bow_sim_scores.append(actual_title.similarity(gen_title))
  

  


In [48]:
bow_similarity = sum(bow_sim_scores) / len(bow_sim_scores)
bow_similarity

0.5435828548328143

# Adjacency matrix of words
This baseline model creates a matrix of adjacency of words, showing how much words appear next to each other

In [41]:
def adjacency_matrix_words(num_sample):
  rand_sample = num_sample#np.random.randint(len(X_test))
  cleaned_text = process_lang_data(X_test.iloc[rand_sample])
  tokenized_text = []
  tokens = []
  for i in cleaned_text:
    token = i.lower()
    tokenized_text.append(token)
    if token not in tokens:
      tokens.append(token)

  adjacency_matrix = []
  for i in range(len(tokens)):
    adjacency_matrix.append([])

  for i in range(len(adjacency_matrix)):
    for j in range(len(tokens)):
      adjacency_matrix[i].append(0)

  for i in range(len(tokenized_text)-1):
    for j in range(len(tokens)):
      if tokenized_text[i] == tokens[j]:
        for k in range(len(tokens)):
          if tokenized_text[i-1] == tokens[k] and i != 0:
            adjacency_matrix[j][k] += 1
          if tokenized_text[i+1] == tokens[k]:
            adjacency_matrix[j][k] += 1

  df = pd.DataFrame(adjacency_matrix)
  df.columns = tokens
  df.index = tokens
  return rand_sample, adjacency_matrix, df, tokens

In [42]:
def generate_title_adjacency_matrix(num_sample):
  rand_sample, adjacency_matrix, adj_df, tokens = adjacency_matrix_words(num_sample)
  sample = tfidf_test[rand_sample]
  high_index = 0
  for i in range(len(sample)):
    if sample[i] >= sample[high_index]:
      high_index = i

  # find the most common word in the acjacency matrix
  high_index_word = ""
  vocab = tfidf.vocabulary_
  for i in vocab.items():
    if i[1] == sample[high_index]:
      high_index_word = i[0]

  matrix_index = 0
  for i in range(len(tokens)):
    if tokens[i] == high_index_word:
      matrix_index = i

  title_words = []
  already_used_indexes = []

  for i in range(round(np.random.randint(6, 10))):
    next_word_index = 0
    for j in range(len(adjacency_matrix[matrix_index])):
      if adjacency_matrix[matrix_index][j] >= adjacency_matrix[matrix_index][next_word_index] and j not in already_used_indexes:
        next_word_index = j
    title_words.append(tokens[next_word_index])
    matrix_index = next_word_index
    already_used_indexes.append(next_word_index)

  gen_title = " ".join(title_words)
  return [y_test.iloc[rand_sample], gen_title]

In [43]:
title_adj_mtrx = generate_title_adjacency_matrix(0)
print(title_adj_mtrx)

['Rate Optimal Binary Linear Locally Repairable Codes with Small Availability', 'r -availability r,3 2 code linear']


In [49]:
adjmatrix_sim_scores = []
for i in range(len(X_test)):
  both_titles = generate_title_adjacency_matrix(i)
  gen_title = text_to_nlp(both_titles[1])
  actual_title = text_to_nlp(both_titles[0])
  adjmatrix_sim_scores.append(actual_title.similarity(gen_title))

  


In [50]:
adjacency_matrix_similarity = sum(adjmatrix_sim_scores) / len(adjmatrix_sim_scores)
adjacency_matrix_similarity

0.6021719563735695

# Recurrent Neural Network (LSTM)
Using word2vec to input the abstract into it and training it with word2vec titles

In [58]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [59]:
w2vec_vocab_dict = dict({})
for idx, key in enumerate(imported_w2vec.wv.vocab):
    w2vec_vocab_dict[key] = imported_w2vec.wv[key]

In [60]:
# Take predictable papers and add them to a new pd dataframe. Predictable papers have to have 150 words or more in the abstract
predictable_tokenized_titles = []
predictable_tokenized_abstracts = []
for i in range(len(research_data)):
  if len(word_tokenize(research_data.loc[i][1])) >= 150 and len(word_tokenize(research_data.loc[i][0])) >= 6:
    predictable_tokenized_titles.append(word_tokenize(research_data.iloc[i][0]))
    predictable_tokenized_abstracts.append(word_tokenize(research_data.loc[i][1]))

predictable_papers = pd.DataFrame(list(zip(predictable_tokenized_titles, predictable_tokenized_abstracts)), columns=["Tokenized titles", "Tokenized abstracts"])

In [None]:
len(predictable_tokenized_titles)

11435

In [61]:
# Convert all predictable papers' abstracts and titles to their word 2 vec vectors. Abstract vector length are cut down to only 150 words
titles_vectors = np.empty((11435, 6, 10))
abstract_vectors = np.empty((11435, 150, 10))
for i in range(len(predictable_papers)):
  titl_vecs = np.empty((6, 10))
  abst_vecs = np.empty((150, 10))
  for j in range(len(predictable_papers.loc[i][0][:6])):
    titl_vecs[j] = imported_w2vec.wv.get_vector(predictable_papers.loc[i][0][:6][j])
  for j in range(len(predictable_papers.loc[i][1][:150])):
    abst_vecs[j] = imported_w2vec.wv.get_vector(predictable_papers.loc[i][1][:150][j])
  titles_vectors[i] = titl_vecs
  abstract_vectors[i] = abst_vecs

In [None]:
len(abstract_vectors)

11435

In [19]:
predictable_papers["Titles vecs"] = titles_vectors.tolist()
predictable_papers["Abstracts vecs"] = abstract_vectors.tolist()


In [20]:
lstm_X_train, lstm_X_test, lstm_y_train, lstm_y_test = train_test_split(abstract_vectors, titles_vectors, test_size=0.2, train_size=0.8, random_state=8)
print(lstm_X_train.shape)
print(lstm_X_test.shape)
print(lstm_y_train.shape)
print(lstm_y_test.shape)

(9148, 150, 10)
(2287, 150, 10)
(9148, 6, 10)
(2287, 6, 10)


In [21]:
model = keras.Sequential()
model.add(layers.LSTM(units=10, input_shape=(150, 10), return_sequences=True)) # (*, 10)
model.add(layers.BatchNormalization())
# model.add(layers.AveragePooling2D(pool_size=(25, 1), strides=25, padding='same'))
model.add(layers.Reshape((6, 250), input_shape=(150, 10)))
#  model.add(tf.keras.layers.Reshape((3, 4), input_shape=(12,)))
model.add(layers.Dense(units=10))
model.compile(
    loss=keras.losses.MeanAbsoluteError(),
    optimizer="sgd",
    metrics=["accuracy"],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 150, 10)           840       
                                                                 
 batch_normalization (BatchN  (None, 150, 10)          40        
 ormalization)                                                   
                                                                 
 reshape (Reshape)           (None, 6, 250)            0         
                                                                 
 dense (Dense)               (None, 6, 10)             2510      
                                                                 
Total params: 3,390
Trainable params: 3,370
Non-trainable params: 20
_________________________________________________________________


In [22]:
model.fit(
    lstm_X_train, lstm_y_train, batch_size=64, epochs=10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fac61456310>

In [23]:
lstm_y_predict = model.predict(lstm_X_test)



In [None]:
lstm_y_predict.shape

(2287, 6, 10)

In [34]:
from scipy import spatial
from numpy import dot
from numpy.linalg import norm

In [36]:
def cos_sim(A, B):
  avg_a = 0
  avg_b = 0
  counter = 0
  for i in range(len(A)):
    avg_a += A[i]
    avg_b += B[i]
    counter += 1
  avg_a /= counter
  avg_b /= counter
  cosine = np.dot(avg_a,avg_b)/(norm(avg_a)*norm(avg_b))
  return cosine

def cosine_similarity(list_1, list_2):
  cos_sim = dot(list_1, list_2) / (norm(list_1) * norm(list_2))
  return cos_sim

In [39]:
lstm_sim_scores = []
for i in range(len(lstm_y_test)):
  lstm_sim_scores.append(cos_sim(lstm_y_test[i], lstm_y_predict[i]))

In [40]:
lstm_similarity_score = sum(lstm_sim_scores) / len(lstm_sim_scores)
lstm_similarity_score

0.9032022955504698

# GPT-2 summarizer performance

In [10]:
import csv

predictions = open("generated_predictions.txt", "r")

In [13]:
lst_predictions = predictions.readlines()

In [17]:
summarizer_sim_scores = []
for i in range(len(y_test)):
  gen_title = text_to_nlp(lst_predictions[i])
  actual_title = text_to_nlp(y_test.iloc[i])
  summarizer_sim_scores.append(actual_title.similarity(gen_title))

  """


In [18]:
summarizer_similarity = sum(summarizer_sim_scores) / len(summarizer_sim_scores)
summarizer_similarity

0.657356229869745

In [44]:
y_test.iloc[0]

'Rate Optimal Binary Linear Locally Repairable Codes with Small Availability'

In [56]:
lst_predictions[0]

"code symbol said to have $(r,t)$-availability if it can be recovered. if rate is maximum, it is said to be 'rate-optimal'\n"