In [None]:
! pip install fuzzywuzzy summa transformers sentencepiece nltk chardet



In [None]:
!pip install python-Levenshtein



In [None]:
# Mount Google Drive at /content/drive (Colab only) so files stored under
# "My Drive" can be read and written through the normal filesystem.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Root folder on the mounted Drive; the dataset folders, the CSV and the
# embeddings pickle are all addressed relative to this path.
base_dir = "/content/drive/My Drive"

In [None]:
# !unzip "/content/drive/My Drive/ICLR_2018-20230920T071401Z-001" -d "/content/drive/My Drive"
# !unzip "/content/drive/My Drive/ICLR_2018" -d "/content/drive/My Drive"

In [None]:
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
# from summa.summarizer import summarize
import torch
from transformers import AutoTokenizer, AutoModel
import transformers
from nltk.tokenize import sent_tokenize
import json
import pickle as pkl
import os
import pandas as pd
import chardet

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# SciBERT tokenizer and encoder; paper_tokenization and review_tokenizer use
# this pair to turn text into embeddings.
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(device)

In [None]:
# Pegasus (google/pegasus-xsum) model and tokenizer; summarize() uses this pair
# to shorten long section text.
# NOTE(review): the load warning about newly initialized embed_positions weights
# is presumably harmless for this checkpoint — confirm before relying on the summaries.
model1 = transformers.PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum").to(device)
tokenizer1 = transformers.PegasusTokenizer.from_pretrained("google/pegasus-xsum")

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def get_encoding(file):
    """Guess the text encoding of ``file`` with chardet.

    The whole file is read as bytes and passed to ``chardet.detect``; the
    ``'encoding'`` entry of the detection result is returned unchanged.
    """
    with open(file, 'rb') as handle:
        raw_bytes = handle.read()
    detection = chardet.detect(raw_bytes)
    return detection['encoding']

def read_json(file_path):
    """Parse the JSON document stored at ``file_path``.

    The file is read as raw bytes and decoded with ``utf-8-sig`` so that a
    leading byte-order mark, if present, is dropped before parsing.
    """
    with open(file_path, 'rb') as handle:
        raw_bytes = handle.read()
    text = raw_bytes.decode('utf-8-sig')
    return json.loads(text)

In [None]:
def summarize(txt):
    """Summarise ``txt`` piece by piece with Pegasus and join the results.

    The input is cut into consecutive 300-character slices. Each slice is
    summarised on its own by ``model1`` (4 beams, between 20 and 30 generated
    tokens) and the decoded summaries are joined with single spaces. An empty
    input yields an empty string.
    """
    chunk_size = 300

    def _summarize_chunk(piece):
        # Encode one slice, generate its summary and decode the best beam.
        encoded = tokenizer1.encode(piece, return_tensors="pt", max_length=1024, truncation=True).to(device)
        generated = model1.generate(
            encoded,
            max_length=30,
            min_length=20,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
        )
        return tokenizer1.decode(generated[0], skip_special_tokens=True)

    starts = range(0, len(txt), chunk_size)
    return " ".join(_summarize_chunk(txt[start:start + chunk_size]) for start in starts)

In [None]:
def paper_tokenization(paper_path):
    """Embed a parsed paper with SciBERT after condensing it with ``summarize``.

    The JSON at ``paper_path`` is loaded with ``read_json``. The ``'text'`` of
    every entry in its ``'sections'`` list is passed through ``summarize``
    repeatedly until it is at most 200 whitespace-separated words long; the
    per-section results are concatenated and encoded by ``model``.

    Args:
        paper_path: Path to the paper's JSON file.

    Returns:
        ``None`` when the JSON has no top-level ``'sections'`` key. Otherwise
        the first output of ``model`` for the 512-token padded/truncated input
        with the batch axis squeezed away (presumably the per-token hidden
        states — confirm against the model class). The tensor is computed under
        ``torch.no_grad()`` and therefore carries no autograd graph.
    """
    json_data = read_json(paper_path)
    if 'sections' not in json_data:
        return None

    section_summaries = []
    for section in json_data['sections']:
        summary_txt = section['text']
        # Keep condensing until the section fits the 200-word budget.
        while len(summary_txt.split()) > 200:
            summary_txt = summarize(summary_txt)
        section_summaries.append(summary_txt)

    # NOTE(review): sections are concatenated with no separator, exactly as
    # before, so new embeddings stay comparable with those already saved;
    # consider joining with a space if the dataset is ever rebuilt.
    final_summary = ''.join(section_summaries)
    print(len(final_summary))

    input_ids = tokenizer.encode(final_summary, max_length=512, add_special_tokens=True, truncation=True, padding='max_length')
    input_ids = torch.tensor(input_ids).to(device)
    # Padding positions are masked out so the encoder ignores them.
    attention_mask = (input_ids != tokenizer.pad_token_id).long()
    input_ids = input_ids.unsqueeze(0)
    print(input_ids.shape)
    attention_mask = attention_mask.unsqueeze(0)

    # Inference only: without no_grad every returned embedding would keep the
    # whole forward graph (and its activations) alive once stored by the caller.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    sentence_embeddings = outputs[0].squeeze()
    print(f"Paper embedding shape: {sentence_embeddings.shape}")
    return sentence_embeddings

In [None]:
def review_tokenizer(review_path):
    """Embed the text of one review file with SciBERT.

    The file is opened with the encoding reported by ``get_encoding``, its
    content is encoded to a 512-token padded/truncated input, and ``model`` is
    run on it.

    Args:
        review_path: Path to the review text file.

    Returns:
        The first output of ``model`` with the batch axis squeezed away
        (presumably the per-token hidden states — confirm against the model
        class). The tensor is computed under ``torch.no_grad()`` and therefore
        carries no autograd graph.
    """
    with open(review_path, "r", encoding=get_encoding(review_path)) as file:
        content = file.read()

    input_ids = tokenizer.encode(content, add_special_tokens=True, max_length=512, truncation=True, padding='max_length')
    input_ids = torch.tensor(input_ids).to(device)
    # Padding positions are masked out so the encoder ignores them.
    attention_mask = (input_ids != tokenizer.pad_token_id).long()
    input_ids = input_ids.unsqueeze(0)
    attention_mask = attention_mask.unsqueeze(0)

    # Inference only: without no_grad every returned embedding would keep the
    # whole forward graph (and its activations) alive once stored by the caller.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    sentence_embeddings = outputs[0].squeeze()
    print(f"Review embedding shape: {sentence_embeddings.shape}")
    return sentence_embeddings

In [None]:
# Accumulator for the embedding run. The three lists are appended to together,
# one entry per processed paper:
#   'paper_name'        -> "<year>_<category>_<paper folder>" string
#   'tokenized_paper'   -> value returned by paper_tokenization
#   'tokenized_reviews' -> list of (review name, review_tokenizer output) tuples
final_papers = {
   'paper_name': [],
   'tokenized_paper': [],
   'tokenized_reviews': []
}
# Alternative start: resume from a previously saved pickle instead of an empty dict.
# with open(os.path.join(base_dir, 'embeddings_ICLR_2018.pkl'), 'rb') as fr:
#    final_papers = pkl.load(fr)

In [None]:
# Show which papers already have embeddings stored in `final_papers`.
print("Completed paper names:")
print(final_papers['paper_name'])

# init_path='Complete_Dataset'
# init_path = ''
# years = os.listdir(init_path)
# Dataset folders (resolved under base_dir) to process.
years = ["ICLR_2018"]
# Change batch size as per your laptop's capacity
batch_size = 25
# This n is for self, to remember which iteration is going on, how many more are remaining, etc. Not used in code explicitly
# NOTE(review): `n` is not read in this cell, but the processing cell further
# down re-assigns it and uses it to slice the paper list.
n = 1

# CSV whose "paper_name" column is consulted when deciding which review files to keep.
df = pd.read_csv(os.path.join(base_dir, "data_latest_final2.csv"))

Completed paper names:
[]


In [None]:
# import pickle

# filename = 'embeddings_ICLR_2018_test1.pkl'

# with open(filename, 'wb') as file:
#     pickle.dump(final_papers, file)

In [None]:
# Embed one batch of papers (and their reviews) and append the results to the
# embeddings pickle on Drive.
embeddings_path = os.path.join(base_dir, 'embeddings_ICLR_2018_test1.pkl')

# Resume from the embeddings saved so far, or start fresh on the very first run
# when no pickle exists yet.
if os.path.exists(embeddings_path):
    # NOTE(security): pickle.load can execute arbitrary code; only load a file
    # that this notebook wrote itself.
    with open(embeddings_path, 'rb') as fr:
        final_papers = pkl.load(fr)
else:
    final_papers = {
        'paper_name': [],
        'tokenized_paper': [],
        'tokenized_reviews': []
    }

print("Completed paper names:")
print(final_papers['paper_name'])

# init_path='Complete_Dataset'
# init_path = ''
# years = os.listdir(init_path)
years = ["ICLR_2018"]
# Change batch size as per your laptop's capacity
batch_size = 6
# Zero-based index of the batch to process in this run: papers
# [batch_size * n, batch_size * n + batch_size) of the sorted folder listing.
n = 53
#n=52 is over

df = pd.read_csv(os.path.join(base_dir, "data_latest_final2.csv"))

# Here, the metadata (Science Parse) file is used (I had named it {some text here}_new_metadata.json)
METADATA_SUFFIX = 'new_metadata.json'
# Papers already stored; used to keep a re-run of this cell from appending duplicates.
completed = set(final_papers['paper_name'])

for year in years:
    path2 = os.path.join(base_dir, year)
    # Change this list when required
    for cat in ["Poster_Papers"]:
    # for cat in ["Rejected_Papers"]:
        path3 = os.path.join(path2, cat)
        papers = sorted(os.listdir(path3))
        # You will have to change indices here, based on number of papers in a folder.
        batch_start = batch_size * n
        batch_papers = papers[batch_start : batch_start + batch_size]
        print(f"Current ID range {batch_start} to {batch_start + batch_size - 1}")
        # For last batch, you can use the following commented line and put appropriate indices
        # batch_papers = papers[495:496]
        for paper in batch_papers:
            print(f"\nCurrent Paper: {paper}")
            paper_key = f"{year}_{cat}_{paper}"
            if paper_key in completed:
                print(f"Already completed {paper}, skipping")
                continue

            path4 = os.path.join(path3, paper)
            docs = os.listdir(path4)
            metadata_docs = [x for x in docs if x.endswith(METADATA_SUFFIX)]

            # Keep the review files (meta-reviews excluded) whose file stem occurs
            # in the CSV's paper_name column. The stem is matched literally
            # (regex=False) and missing CSV values count as "no match" (na=False).
            review_files = []
            for x in docs:
                if not x.endswith('.txt') or "MetaReview" in x:
                    continue
                stem = x[:-len('.txt')]
                if df["paper_name"].str.contains(stem, regex=False, na=False).any():
                    review_files.append(x)

            if metadata_docs and review_files:
                tokenized_paper = None
                for doc in metadata_docs:
                    tokenized_paper = paper_tokenization(os.path.join(path4, doc))

                # paper_tokenization returns None when the JSON has no 'sections';
                # in that case the paper is dropped, so its reviews are not embedded.
                if tokenized_paper is not None:
                    reviews = []
                    for review_file in review_files:
                        review_name = f"{paper_key}_({review_file})"
                        print(f"Current review: {review_name}")
                        tokenized_review = review_tokenizer(os.path.join(path4, review_file))
                        reviews.append((review_name, tokenized_review))
                    final_papers['paper_name'].append(paper_key)
                    final_papers['tokenized_paper'].append(tokenized_paper)
                    final_papers['tokenized_reviews'].append(reviews)
                    completed.add(paper_key)
                    print(f"Completed {paper}")
                else:
                    print(f"Not Completed {paper}")
            else:
                print(f"{paper} not available in csv")
    print(year)

# Persist everything collected so far (previous batches plus this one).
with open(embeddings_path, 'wb') as f:
    pkl.dump(final_papers, f)

Completed paper names:
['ICLR_2018_Rejected_Papers_B13EC5u6W', 'ICLR_2018_Rejected_Papers_B16_iGWCW', 'ICLR_2018_Rejected_Papers_B16yEqkCZ', 'ICLR_2018_Rejected_Papers_B1CEaMbR-', 'ICLR_2018_Rejected_Papers_B1CNpYg0-', 'ICLR_2018_Rejected_Papers_B1CQGfZ0b', 'ICLR_2018_Rejected_Papers_B1D6ty-A-', 'ICLR_2018_Rejected_Papers_B1EGg7ZCb', 'ICLR_2018_Rejected_Papers_B1EPYJ-C-', 'ICLR_2018_Rejected_Papers_B1EVwkqTW', 'ICLR_2018_Rejected_Papers_B1KFAGWAZ', 'ICLR_2018_Rejected_Papers_B1NGT8xCZ', 'ICLR_2018_Rejected_Papers_B1NOXfWR-', 'ICLR_2018_Rejected_Papers_B1X4DWWRb', 'ICLR_2018_Rejected_Papers_B1ZZTfZAW', 'ICLR_2018_Rejected_Papers_B1bgpzZAZ', 'ICLR_2018_Rejected_Papers_B1i7ezW0-', 'ICLR_2018_Rejected_Papers_B1kIr-WRb', 'ICLR_2018_Rejected_Papers_B1mAkPxCZ', 'ICLR_2018_Rejected_Papers_B1mSWUxR-', 'ICLR_2018_Rejected_Papers_B1nLkl-0Z', 'ICLR_2018_Rejected_Papers_B1nxTzbRZ', 'ICLR_2018_Rejected_Papers_B1spAqUp-', 'ICLR_2018_Rejected_Papers_B1suU-bAW', 'ICLR_2018_Rejected_Papers_B1tC-LT6W', '

In [None]:
# with open(os.path.join(base_dir, 'embeddings_ICLR_2018_test1.pkl'), 'wb') as f:
#     pkl.dump(final_papers, f)