Master's thesis Psychology  
University of Zurich  
Author: Berit Barthelmes  

In [66]:
# import required modules

from fetch_articles_crossref import fetch_articles_crossref
from fetch_articles_ebsco import fetch_articles_ebsco 
from fetch_articles_scihub import fetch_articles_scihub
import csv
import os
import re

In [None]:
# read DOI data from file and fetch from crossref and scihub

# Only the first CSV row is used; its first cell is skipped
# (presumably a row label -- TODO confirm against the CSV layout).
# The file is closed again before the fetches start, so it is not held open
# while the downloads run.
with open('../crossref_relevant_dois.csv', newline='') as f:
    reader = csv.reader(f)
    first_row = next(reader, None)

if first_row is None:
    # an empty file would otherwise surface as an unexplained IndexError
    raise ValueError("../crossref_relevant_dois.csv is empty; no DOIs to fetch")

dois = first_row[1:]
print(dois)
fetch_articles_crossref(dois)
fetch_articles_scihub(dois)

In [None]:
# fetch available articles from ebsco
# NOTE(review): the helper is called without arguments, so whatever it needs
# (search results, credentials, output folder) must be configured inside
# fetch_articles_ebsco itself -- confirm there before re-running this cell.

fetch_articles_ebsco()

In [59]:
# helper function to retrieve the DOIs of all unavailable articles based on an XML document 

import xml.etree.ElementTree as ET

def ebsco_get_unavailable_article_dois(filepath):
    """Collect the DOIs of all records that list no download formats.

    The XML document at ``filepath`` is parsed and every ``rec`` element that
    is a direct child of the root is inspected. A record counts as
    unavailable when it has no ``header/controlInfo/artinfo/formats``
    element. For those records the text of the first ``ui`` element with
    ``type="doi"`` is collected; records without such an element are skipped.

    Args:
        filepath: Anything accepted by ``xml.etree.ElementTree.parse``.

    Returns:
        A list with the DOI text of every unavailable record, in document
        order. An entry is ``None`` when the DOI element has no text.
    """
    records = ET.parse(filepath).getroot().findall("rec")
    missing_dois = []

    for record in records:
        # a <formats> element means the article can be downloaded
        if record.find(".//header/controlInfo/artinfo/formats") is not None:
            continue

        doi_node = record.find(".//ui[@type='doi']")
        if doi_node is None:
            continue

        missing_dois.append(doi_node.text)

    return missing_dois

In [None]:
# check which articles are not available to download from ebsco according to the XML overview file
# The resulting DOI list is kept in ebsco_unavailable_article_dois because the
# following cell passes it to fetch_articles_scihub.

ebsco_search_results = "../ebsco_articles.xml"
ebsco_unavailable_article_dois = ebsco_get_unavailable_article_dois(ebsco_search_results)
# print the number of unavailable articles, then the DOIs themselves
print("Articles that are not retrievable: ", len(ebsco_unavailable_article_dois))
print(ebsco_unavailable_article_dois)

In [67]:
# fetch unavailable ebsco articles from scihub
# Requires ebsco_unavailable_article_dois from the previous cell.

fetch_articles_scihub(ebsco_unavailable_article_dois)

In [None]:
# preparation of articles for doc2vec (deprecated)

# read in XML article files and extract plain text to use it for doc2vec
# plain text is then saved as a .txt file with paragraphs separated by a double line break

# xml_dirs = [os.fsencode('../articles_xml/crossref/api/'),
#             os.fsencode('../articles_xml/crossref/scihub/'),
#             os.fsencode('../articles_xml/ebsco/website/'),
#             os.fsencode('../articles_xml/ebsco/scihub/')]

# for xml_dir in xml_dirs:
#     for file in os.listdir(xml_dir):
#         directory = os.fsdecode(xml_dir)
#         print(file)
#         filename = os.fsdecode(file)
#         path = os.path.join(directory, filename)
#         if filename.endswith(".xml"):
#             paragraphs = helpers.xml_to_txt(path)
#             txt_file_path = f"{os.path.splitext(path)[0]}.txt"
#             txt_file_path = re.sub("\.\.\/articles_xml",
#                                    "../articles_txt", txt_file_path)
#             print(txt_file_path)
#             with open(txt_file_path, "w") as f:
#                 for i, paragraph in enumerate(paragraphs):
#                     if paragraph is not None:
#                         # print(paragraph)
#                         f.write(f"{paragraph}")
#                         if i != len(paragraphs)-1:
#                             f.write(f"\n\n")

In [None]:
# cosine similarity calculation with the help of doc2vec (deprecated)

# from gensim.models import doc2vec
# from gensim.models.doc2vec import TaggedDocument
# import numpy as np
# from numpy.linalg import norm

# import os


# def generate_embeddings(corpus):
#     tagged_documents = []

#     for paragraph_id, paragraph in enumerate(corpus):
#         words = paragraph.split()
#         tagged_documents.append(TaggedDocument(words, tags=["{0:0>4}".format(paragraph_id)]))
#     print("Paragraph count: ", len(tagged_documents))

#     d2v_model = doc2vec.Doc2Vec(vector_size=50, min_count=10, epochs=30)
#     d2v_model.build_vocab(tagged_documents)

#     d2v_model.train(tagged_documents, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

#     print("Vector count: ", len(d2v_model.dv))
#     return d2v_model.dv

# def calc_cos_similarities(embeddings):
#     q_emb = embeddings[0]
#     cos_similarities = []
#     i = 0
#     for i in range(1, len(embeddings)):
#         p_emb = embeddings[i]
#         cos_similarities.append(np.dot(q_emb, p_emb)/(norm(q_emb)*norm(p_emb)))
#     return cos_similarities


# query = ""

# with open("../memory_decay_query.txt", "r") as f:
#     query = f.read()

# txt_dirs = [os.fsencode('../articles_txt/crossref/api/'),
#             os.fsencode('../articles_txt/crossref/scihub/'),
#             os.fsencode('../articles_txt/ebsco/website/'),
#             os.fsencode('../articles_txt/ebsco/scihub/')]

# for txt_dir in txt_dirs:
#     for file in os.listdir(txt_dir): #[0:10]:
#         directory = os.fsdecode(txt_dir)
#         # print(file)
#         filename = os.fsdecode(file)
#         path = os.path.join(directory, filename)
#         if filename.endswith(".txt"):
#             embeddings = ""
            
#             with open(path, "r") as f:
#                 paragraphs = f.read().split("\n\n")
#                 # include the query in the text corpus
#                 corpus = paragraphs[:]
#                 corpus.insert(0, query)
#                 embeddings = generate_embeddings(corpus)
#                 print(len(embeddings))
            
#             cos_similarities = calc_cos_similarities(embeddings)
#             print(len(cos_similarities))
#             relevant_count = np.minimum(5, len(cos_similarities))
#             relevant_indices = np.argpartition(cos_similarities, -relevant_count)[-relevant_count:]
#             print(relevant_indices)
#             relevant_paragraphs = [paragraphs[i] for i in relevant_indices]

#             # Write most relevant paragraphs to .txt file
#             txt_file_path = re.sub("\.\.\/articles_txt",
#                                    "../articles_paragraphs", path)
#             # print(txt_file_path)
#             with open(txt_file_path, "w") as f:
#                 for i, paragraph in enumerate(relevant_paragraphs):
#                     # if paragraph is not None:
#                         # print(paragraph)
#                     f.write(f"{paragraph}")
#                     if i != len(paragraphs)-1:
#                         f.write(f"\n\n")

In [2]:
# import GPT4 functionality for embedding and rating of paragraphs

import os 
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import pandas as pd
from dotenv import load_dotenv
from langchain import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from openai.embeddings_utils import get_embedding, cosine_similarity
import openai

import warnings
# Silences every warning for the rest of the session, including deprecation
# warnings raised by the imported libraries.
warnings.filterwarnings('ignore')

# load_dotenv() must run before os.getenv so that a key stored in a .env file
# is visible in the environment.
load_dotenv()
# The key is read from the GPT4_KEY environment variable; os.getenv returns
# None when the variable is not set.
openai.api_key = os.getenv("GPT4_KEY")

# Number of articles to process (embedding/rating)
# NOTE(review): M is not referenced by any of the cells below -- confirm
# whether it is still needed.
M = 5

In [6]:
# queries the vector embeddings to find the n most relevant paragraphs (default: 5) for the classification task

def search_paragraphs(df, query, n=5):
    """Rank the paragraphs of one article by similarity to ``query``.

    The query is embedded with the "text-embedding-ada-002" engine and
    compared with every entry of ``df['embeddings']`` via cosine similarity.
    The scores are written to a "similarity" column of ``df`` in place.

    Args:
        df: DataFrame with an "embeddings" column holding one vector per row.
        query: Text the paragraphs are compared against.
        n: Number of best-scoring rows to return.

    Returns:
        A tuple ``(df, results)``: the input frame including the new
        "similarity" column, and its ``n`` highest-scoring rows restored to
        their original row order.
    """
    query_vector = get_embedding(query, engine="text-embedding-ada-002")

    def score(paragraph_vector):
        return cosine_similarity(paragraph_vector, query_vector)

    df["similarity"] = df['embeddings'].apply(score)

    ranked = df.sort_values("similarity", ascending=False)
    best = ranked.head(n)
    # back to document order so the selected paragraphs read in sequence
    results = best.sort_index()

    return df, results

In [3]:
# load the csv file and extract the paragraph text of every listed article

xml_dir = "../articles_xml_final_211023/"
input_csv = pd.read_csv("../pre_result.csv")
dfs = []

# Only the "filename" column is needed to locate the XML files, so iterate
# over it directly instead of over whole rows.
for filename in input_csv["filename"]:
    root = ET.parse(f"{xml_dir}{filename}").getroot()

    # find all p tags in namespace http://www.tei-c.org/ns/1.0 and join the
    # text fragments of each tag (including nested elements) into one string
    paragraphs = [
        ''.join(p.itertext())
        for p in root.findall(".//{http://www.tei-c.org/ns/1.0}p")
    ]

    # One data frame per article: the file name plus one row per paragraph.
    # Articles without any paragraph are skipped, so dfs can contain fewer
    # entries than input_csv has rows.
    if paragraphs:
        df = pd.DataFrame(paragraphs, columns=['paragraphs'])
        df.insert(0, 'filename', filename)
        dfs.append(df)
  

In [None]:
# convert the paragraphs of every article into embeddings
# The vectors are only stored in the 'embeddings' column of each data frame;
# nothing is written to disk in this cell.

# The client settings are identical for all articles, so it is created once
# instead of once per article.
embeddings = OpenAIEmbeddings(openai_api_key=openai.api_key, model="text-embedding-ada-002", max_retries=10, request_timeout=20000)

for df in dfs:
    # one embedding vector per paragraph, in row order
    df['embeddings'] = embeddings.embed_documents(df["paragraphs"].tolist())
    print('Article name: ', df['filename'].iloc[0])
    print('# of paragraphs: ', len(df['embeddings']))

In [None]:
# rank the paragraphs of every article against the memory decay query and keep the 3 best matches

# NOTE: the backslash at the end of a line inside the triple-quoted string
# removes the line break, but the leading indentation of the following line
# stays part of the query text.
query = """The concept of memory decay in scientific psychology describes that \
        memory traces are stored with an initial strength value and that this strength decays passively over time unless it is reactivated. \
        Reactivation of memory traces according to the memory decay theory can be done by practice. Once the activation level for a stored memory trace becomes too low, the memory trace is lost. \
        The memory decay theory concerns memory loss in healthy individuals. Changes solely due to aging processes and abnormal changes in memory capacity due to impairments like dementia are not the explanatory focus of this theory."""

for i, df in enumerate(dfs):
    # res holds the (at most) 3 best-matching paragraphs of this article
    df, res = search_paragraphs(df, query, n=3)
    # reset_index() moves the original paragraph position into an "index" column
    res = res.reset_index()
    # The full article frame in dfs is replaced by its top matches, so
    # re-running this cell ranks only the rows that were kept.
    dfs[i] = res
    print(res['paragraphs'])

In [9]:
# append paragraph, paragraph embeddings, cosine similarities, GPT4 rating, GPT4 rationale columns to csv

# Every article contributes one row with the column groups p1, p2 and p3,
# one group per selected paragraph. The rating columns start out empty.
slot_suffixes = ["", "_embedding", "_cos_similarity", "_rating_category", "_rating_rationale"]
slot_columns = ["filename"] + [f"p{k}{suffix}" for k in range(1, 4) for suffix in slot_suffixes]

final_dfs = []

for df in dfs:
    if df.empty:
        continue

    # The file name is carried along so the row can be matched to input_csv
    # by name. Matching by position is wrong as soon as one article was
    # skipped while loading, because dfs then has fewer entries than input_csv.
    record = {"filename": df["filename"].iloc[0]}

    for k in range(1, 4):
        # Articles with fewer than three paragraphs leave the remaining
        # slots empty instead of raising an IndexError.
        has_row = k <= len(df)
        row = df.iloc[k - 1] if has_row else None
        record[f"p{k}"] = row["paragraphs"] if has_row else None
        record[f"p{k}_embedding"] = row["embeddings"] if has_row else None
        record[f"p{k}_cos_similarity"] = row["similarity"] if has_row else None
        record[f"p{k}_rating_category"] = ""
        record[f"p{k}_rating_rationale"] = ""

    final_dfs.append(pd.DataFrame([record], columns=slot_columns))

if final_dfs:
    df_embeddings_similarities = pd.concat(final_dfs, ignore_index=True)
else:
    # keep the expected columns so the merge below still works
    df_embeddings_similarities = pd.DataFrame(columns=slot_columns)

# A file name listed more than once in input_csv yields several identical
# entries; keep one so the merge does not multiply rows.
df_embeddings_similarities = df_embeddings_similarities.drop_duplicates(subset="filename")

# Left merge on the file name: every input row is kept, and rows of articles
# without paragraphs get empty p1..p3 columns.
input_csv_with_embeddings_similiarities = pd.merge(input_csv, df_embeddings_similarities, how="left", on="filename")

input_csv_with_embeddings_similiarities.to_csv("final.csv")