In [1]:
#Start

#### Code Description

The code loads the JSON file containing PubMed abstracts for PMIDs generated in Part 3. It converts this into a Pandas DataFrame with columns for PMID and Abstract text. It also loads the filtered SNP data TSV from Part 2. The mentions are converted to lowercase for case-insensitive matching.

The abstracts are converted to a dictionary, also lowercased. For each row in the SNP data, it checks if the mention text is found in the corresponding abstract using this dictionary. Based on this, two lists are created - PMIDs with matches and without. After de-duplicating these lists, the code verifies all matching PMIDs exist in the abstracts DataFrame.

Finally, it filters the abstracts DataFrame to only include rows with PMIDs that had matches. This filtered set of 107,631 abstracts with validated SNP mentions is saved to a new JSON file.

In summary, the notebook matches SNP mentions to abstracts, filters out non-matches, and saves the verified abstracts containing mutation mentions to filtered_snp_abstracts.json.

In [3]:
import pandas as pd
import numpy as np

In [10]:
# Abstracts file written by Part 3
json_file = 'snp_abstracts.json'

# orient='index' makes every top-level JSON key a row label; the single value
# column is named 'Abstract' and the row labels are moved into an 'ID' column.
snp_abstracts = pd.read_json(json_file, orient='index')
snp_abstracts.columns = ['Abstract']
snp_abstracts = snp_abstracts.reset_index().rename(columns={'index': 'ID'})
print(snp_abstracts.head(5))

         ID                                           Abstract
0  26280318  11β-Hydroxylase deficiency (11OHD) represents ...
1  18951437  Genetic testing often results in the finding o...
2  21975197  The Vitamin Intervention for Stroke Prevention...
3   8702993  We examined the structure-function relationshi...
4  34386500  Inflammation is regulated by the host and is a...


In [15]:
# Count the missing values in each column of the abstracts table
null_per_column = snp_abstracts.isna().sum()
print("Null values per column:", null_per_column, sep="\n")

Null values per column:
ID          0
Abstract    0
dtype: int64


In [16]:
# Tab-separated SNP annotation table (columns: PMID, Concept ID, Mentions, Resource)
tsv_file = 'filtered_snp_data.tsv'

snp_df = pd.read_csv(tsv_file, sep='\t')
print(snp_df.head(5))

       PMID   Concept ID    Mentions Resource
0  20635000  rs779184767       C243A    tmVar
1  28956000  rs200771233         C>T    tmVar
2  28956000   rs61741349  rs61741349    tmVar
3  28956000    rs2071543    Gln49Lys    tmVar
4  28956000   rs28940578  rs28940578    tmVar


In [17]:
# Count the missing values in each column of the SNP annotation table
null_per_column = snp_df.isna().sum()
print("Null values per column:", null_per_column, sep="\n")

Null values per column:
PMID          0
Concept ID    0
Mentions      0
Resource      0
dtype: int64


In [19]:
# Build a PMID -> abstract lookup from snp_abstracts (key: ID, value: Abstract).
# Abstracts are lowercased so the search below is case-insensitive.
abstracts_dict = snp_abstracts.set_index('ID')['Abstract'].str.lower().to_dict()

# Lowercase the mentions as well so both sides of the comparison share a case.
snp_df['Mentions'] = snp_df['Mentions'].str.lower()

# Flag every row whose mention text occurs as a substring of the abstract
# stored for that row's PMID. A PMID with no abstract falls back to '' and
# therefore never matches a non-empty mention.
# A zip over the two columns replaces the row-wise DataFrame.apply(axis=1):
# it yields the same booleans, avoids building a Series object per row, still
# returns a boolean Series when snp_df is empty, and needs no temporary helper
# column on snp_df.
is_mention_in_abstract = pd.Series(
    [
        mention in abstracts_dict.get(pmid, '')
        for pmid, mention in zip(snp_df['PMID'], snp_df['Mentions'])
    ],
    index=snp_df.index,
    dtype=bool,
)

# Row-level split into matching_pmid / no_match_pmid. A PMID appears once per
# row, and a PMID with several mentions ends up in BOTH lists when only some
# of its mentions are found in the abstract.
matching_pmid = snp_df.loc[is_mention_in_abstract, 'PMID'].tolist()
no_match_pmid = snp_df.loc[~is_mention_in_abstract, 'PMID'].tolist()

In [37]:
# Collapse repeated PMIDs: round-trip each list through a set so that every
# PMID is kept exactly once (element order is not preserved).
matching_pmid, no_match_pmid = (
    list(set(pmids)) for pmids in (matching_pmid, no_match_pmid)
)

In [38]:
# Number of PMIDs that had at least one mention found in their abstract
len(matching_pmid)

107631

In [None]:
# Check that every PMID in matching_pmid exists in snp_abstracts.

In [41]:
# Every PMID for which an abstract was loaded
abstracts_pmids = set(snp_abstracts['ID'].tolist())

# True only when no matched PMID is missing from the abstracts table
all_exist = set(matching_pmid) <= abstracts_pmids

print(all_exist)

True


In [45]:
# Keep only the abstracts whose PMID is listed in matching_pmid
has_match = snp_abstracts['ID'].isin(matching_pmid)
filtered_abstracts = snp_abstracts.loc[has_match]

# Write the result as one JSON record per line
filtered_abstracts.to_json('filtered_snp_abstracts.json', orient='records', lines=True)

print(f'Abstracts that containing the snp mentions are saved in filtered_snp_abstracts.json')
print(f'Total Abstracts Saved {filtered_abstracts.shape[0]}')

Abstracts that containing the snp mentions are saved in filtered_snp_abstracts.json
Total Abstracts Saved 107631


In [None]:
#END