Skip to content

Commit

Permalink
Update code, add keywords, see version 0.3.14
Browse files Browse the repository at this point in the history
  • Loading branch information
s2010515 committed Jul 11, 2023
1 parent d472308 commit 7ddaba8
Show file tree
Hide file tree
Showing 9 changed files with 110 additions and 9 deletions.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,8 @@ The dataframe columns are:
- Abstract (from PubMed metadata).
- mesh <class 'list'>
- MeSH (Medical Subject Headings) provided by Medline.
- keywords <class 'list'>
- This field contains largely non-MeSH subject terms that describe the content of an article. Beginning in January 2013, author-supplied keywords are included.
- authors <class 'list'>
- journal <class 'str'>
- pub_type <class 'list'>
Expand Down Expand Up @@ -272,6 +274,13 @@ A: It seems that you are on a shared computer, you need to identify who is the o

## Version

### Version 0.3.14
-> Add the keyword field from the medline file to the result.

-> Fixed data type, when reading the medline file, in case of add_mesh.

-> Fixed code where 1 article was missing if using list of PMIDs as update.

### Version 0.3.13
-> Crossref retired the API key feature that let Elsevier and Wiley identify the author of a publication request, so wiley_api_key and elsevier_api_key have been added as optional input parameters. They are not mandatory, but they greatly increase the retrieval rate because they give access to Wiley and Elsevier publications respectively.

Expand Down
1 change: 1 addition & 0 deletions cadmus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,4 @@
from cadmus.post_retrieval.parsed_to_df import parsed_to_df
from cadmus.retrieval.edirect import pipeline
from cadmus.pre_retrieval.display_export_path import display_export_path
from cadmus.pre_retrieval.add_keywords import add_keywords
4 changes: 4 additions & 0 deletions cadmus/main/bioscraping.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from cadmus.post_retrieval.clean_up_dir import clean_up_dir
from cadmus.pre_retrieval.add_mesh_remove_preprint import add_mesh_remove_preprint
from cadmus.pre_retrieval.change_output_structure import change_output_structure
from cadmus.pre_retrieval.add_keywords import add_keywords

def bioscraping(input_function, email, api_key, wiley_api_key = None, elsevier_api_key = None, start = None, idx = None , full_search = None, keep_abstract = True, click_through_api_key = 'XXXXXXXX-XXXXXXXX-XXXXXXXX-XXXXXXXX'):
# first bioscraping checks whether this is an update of a previous search or a new search.
Expand All @@ -57,6 +58,9 @@ def bioscraping(input_function, email, api_key, wiley_api_key = None, elsevier_a
if 'mesh' not in original_df.columns:
print('Implementing changes to your previous result due to change in the library.')
original_df = add_mesh_remove_preprint(original_df)
if 'keywords' not in original_df.columns:
print('Implementing changes to your previous result due to change in the library.')
original_df = add_keywords(original_df)
if original_df.iloc[0].content_text == 0 or original_df.iloc[0].content_text == 1:
pass
else:
Expand Down
3 changes: 2 additions & 1 deletion cadmus/pre_retrieval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@
from cadmus.pre_retrieval.check_for_retrieved_df import check_for_retrieved_df
from cadmus.pre_retrieval.add_mesh_remove_preprint import add_mesh_remove_preprint
from cadmus.pre_retrieval.change_output_structure import change_output_structure
from cadmus.pre_retrieval.display_export_path import display_export_path
from cadmus.pre_retrieval.display_export_path import display_export_path
from cadmus.pre_retrieval.add_keywords import add_keywords
82 changes: 82 additions & 0 deletions cadmus/pre_retrieval/add_keywords.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import json
import pandas as pd
import subprocess
import zipfile
import glob
import os

# Directory holding the zipped MEDLINE exports, and the archive (plus its
# temporary backup name) in which the result dataframe is persisted.
_MEDLINE_DIR = "./output/medline/txts"
_RETRIEVED_ZIP = "./output/retrieved_df/retrieved_df2.json.zip"
_RETRIEVED_BACKUP = "./output/retrieved_df/temp_retrieved_df2.json.zip"

# Column order of the dataframe returned (and saved) by add_keywords.
_COLUMN_ORDER = ['pmid', 'pmcid', 'title', 'abstract', 'mesh', 'keywords', 'authors', 'journal', 'pub_type', 'pub_date', 'doi', 'issn', 'crossref', 'full_text_links', 'licenses', 'pdf', 'xml', 'html', 'plain', 'pmc_tgz', 'xml_parse_d', 'html_parse_d', 'pdf_parse_d', 'plain_parse_d', 'content_text']


def _read_medline_lines(directory):
    """Return the text lines of every zipped MEDLINE export in *directory*."""
    # Listing with glob instead of parsing `ls -lR` output keeps this portable
    # and immune to sub-directories or file names containing spaces.
    archives = sorted(glob.glob(os.path.join(directory, "*.zip")))
    if not archives:
        raise FileNotFoundError(f"No zipped medline file found in {directory}")

    lines = []
    for archive in archives:
        with zipfile.ZipFile(archive, "r") as zf:
            for member in zf.namelist():
                lines.extend(zf.read(member).decode('utf-8').split('\n'))
    return lines


def _parse_keywords(lines):
    """Map each PMID found in *lines* to the list of its 'OT' keyword values."""
    keywords_by_pmid = {}
    current = None       # keyword list of the record currently being read
    in_keyword = False   # True while the previous field line was an OT line

    for raw in lines:
        line = raw.rstrip()
        if not line:
            in_keyword = False
            continue

        if line[0].isspace():
            # Indented line: continuation of the previous field. Only kept
            # when that field is a keyword, so wrapped keywords stay whole.
            if in_keyword and current:
                current[-1] = f"{current[-1]} {line.strip()}"
            continue

        # Field lines look like "<TAG><padding>- <value>". Splitting on the
        # first '-' makes the parsing independent of the exact padding.
        head, sep, tail = line.partition('-')
        tag = head.strip()
        in_keyword = False
        if not sep or not (tag.isalpha() and tag.isupper()):
            continue

        if tag == 'PMID':
            # Each PMID line opens a new record. When a PMID shows up more
            # than once, the first occurrence wins and later ones are parsed
            # into a throw-away list.
            current = []
            keywords_by_pmid.setdefault(tail.strip(), current)
        elif tag == 'OT' and current is not None:
            # Exact tag match, so 'OTO' (keyword owner) lines are excluded.
            current.append(tail.strip())
            in_keyword = True

    return keywords_by_pmid


def _save_retrieved_df(df):
    """Write *df* to the retrieved_df2 zip archive, replacing any previous one."""
    # The to_json string is passed through json.dumps a second time; this
    # double encoding is kept as-is so the on-disk format does not change.
    # NOTE(review): presumably the reader decodes twice - confirm before altering.
    payload = json.dumps(df.to_json(orient="index"), indent=4)

    had_previous = os.path.exists(_RETRIEVED_ZIP)
    if had_previous:
        # Keep the previous archive aside until the new one is known to be good.
        os.replace(_RETRIEVED_ZIP, _RETRIEVED_BACKUP)
    try:
        with zipfile.ZipFile(_RETRIEVED_ZIP, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=9) as zip_file:
            zip_file.writestr("retrieved_df2.json", data=payload)
            corrupted = zip_file.testzip()
            if corrupted is not None:
                raise zipfile.BadZipFile(f"Corrupted member {corrupted} in {_RETRIEVED_ZIP}")
    except Exception:
        if had_previous:
            # Put the previous archive back rather than losing it.
            os.replace(_RETRIEVED_BACKUP, _RETRIEVED_ZIP)
        raise
    if had_previous:
        os.remove(_RETRIEVED_BACKUP)


def add_keywords(df: pd.DataFrame) -> pd.DataFrame:
    """Add a 'keywords' column to a previously retrieved dataframe.

    The keywords are the values of the 'OT' fields of the zipped MEDLINE
    files stored in ./output/medline/txts, matched to the rows of *df* on
    'pmid'. Records without any 'OT' field get None. The resulting dataframe
    is also written to ./output/retrieved_df/retrieved_df2.json.zip.

    Parameters
    ----------
    df : pandas.DataFrame
        Previous result, holding every column of the output except 'keywords'.

    Returns
    -------
    pandas.DataFrame
        *df* with the 'keywords' column inserted after 'mesh', the original
        index preserved and 'pub_date' converted to str. Rows whose pmid is
        absent from the MEDLINE files are dropped (inner merge).

    Raises
    ------
    FileNotFoundError
        If no zipped MEDLINE file is available.
    zipfile.BadZipFile
        If a MEDLINE archive is unreadable or the written archive fails its
        integrity check.
    """
    keywords_by_pmid = _parse_keywords(_read_medline_lines(_MEDLINE_DIR))

    df_keywords = pd.DataFrame(
        {
            'pmid': list(keywords_by_pmid),
            # Empty keyword lists are stored as None.
            'keywords': [found or None for found in keywords_by_pmid.values()],
        },
        columns=['pmid', 'keywords'],
    )

    # reset_index/set_index carries the caller's index through the merge.
    df = df.reset_index().merge(df_keywords, on='pmid').set_index('index')
    df = df[_COLUMN_ORDER].copy()
    df['pub_date'] = df['pub_date'].astype(str)

    _save_retrieved_df(df)
    return df
11 changes: 6 additions & 5 deletions cadmus/pre_retrieval/add_mesh_remove_preprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,12 @@ def add_mesh_remove_preprint(df):
my_file = f.read()
f.close()
z.close()
total_list.extend(my_file)
total_list.append(str(str(my_file.decode('utf-8'))))

for i in range(len(total_list)):
total_list[i] = total_list[i].replace('\n', '')
total_list = total_list[0].split('\n')

my_pmid_filtered = []
my_mh_filtered = []
current_pmid = []
current_mh = []
current = False
for i in range(len(total_list)):
Expand All @@ -58,8 +56,11 @@ def add_mesh_remove_preprint(df):
columns =['pmid', 'mesh'])

df_mesh = df_mesh.drop_duplicates(subset=['pmid'])
for index, row in df_mesh.iterrows():
if df_mesh.mesh.loc[index] == []:
df_mesh.loc[index, 'mesh'] = None

df = df.merge(df_mesh, on='pmid')
df = df.reset_index().merge(df_mesh, on='pmid').set_index('index')
df = df[['pmid', 'pmcid', 'title', 'abstract', 'mesh', 'authors', 'journal', 'pub_type', 'pub_date', 'doi', 'issn', 'crossref', 'full_text_links', 'licenses', 'pdf', 'xml', 'html', 'plain', 'pmc_tgz', 'xml_parse_d', 'html_parse_d', 'pdf_parse_d', 'plain_parse_d', 'content_text']]

index_to_keep = []
Expand Down
2 changes: 2 additions & 0 deletions cadmus/pre_retrieval/creation_retrieved_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def creation_retrieved_df(medline_file_name):
if abstract == None or abstract == '':
abstract = record.get('OAB')
mesh_terms = record.get('MH')
keywords = record.get('OT')
authors = record.get('AU')
journal_title = record.get('JT')
pub_type = record.get('PT')
Expand All @@ -81,6 +82,7 @@ def creation_retrieved_df(medline_file_name):
'title': title,
'abstract': abstract,
'mesh': mesh_terms,
'keywords': keywords,
'authors':authors,
'journal':journal_title,
'pub_type':pub_type,
Expand Down
5 changes: 3 additions & 2 deletions cadmus/retrieval/search_terms_to_medline.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,14 @@ def search_terms_to_medline(query_string, api_key):
d = f.read()
f.close()
z.close()
d = str(str(d.decode('utf-8')) + str(search_results)).encode('utf-8')
d = str(str(d.decode('utf-8')) + '\n' + '\n' + str(search_results)).encode('utf-8')
os.rename('./output/medline/txts/medline_output.txt.zip', './output/medline/txts/temp_medline_output.txt.zip')
with zipfile.ZipFile("./output/medline/txts/medline_output.txt.zip", mode="a", compression=zipfile.ZIP_DEFLATED, compresslevel=9) as zip_file:
zip_file.writestr("medline_output.txt", data=d)
zip_file.testzip()
zip_file.close()
os.remove('./output/medline/txts/temp_medline_output.txt.zip')
print('Medline Records retrieved and saved')
else:
#to avoid errors for large pmids list. We now chunk into smaller set of 9000. Finally we append every chunk in the medline text file.
for i in range(len(query_string)):
Expand All @@ -76,7 +77,7 @@ def search_terms_to_medline(query_string, api_key):
d = f.read()
f.close()
z.close()
d = str(str(d.decode('utf-8')) + str(search_results)).encode('utf-8')
d = str(str(d.decode('utf-8')) + '\n' + '\n' + str(search_results)).encode('utf-8')
os.rename('./output/medline/txts/medline_output.txt.zip', './output/medline/txts/temp_medline_output.txt.zip')
with zipfile.ZipFile("./output/medline/txts/medline_output.txt.zip", mode="a", compression=zipfile.ZIP_DEFLATED, compresslevel=9) as zip_file:
zip_file.writestr("medline_output.txt", data=d)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setuptools.setup(
name="cadmus",
version="0.3.13",
version="0.3.14",
author="Jamie Campbell, Ian Simpson, Antoine Lain",
author_email="Jamie.campbell@igmm.ed.ac.uk, Ian.Simpson@ed.ac.uk, Antoine.Lain@ed.ac.uk",
description="This projects is to build full text retrieval system setup for generation of large biomedical corpora from published literature.",
Expand Down

0 comments on commit 7ddaba8

Please sign in to comment.