# Import Required Libraries
Import the necessary libraries: pandas and Biopython's Bio.Entrez module.

In [21]:
# Import libraries
import pandas as pd
from Bio import Entrez
# Tell NCBI who is issuing the Entrez requests made in this notebook.
Entrez.email = "araz.rawshani@gu.se"  # Replace with your actual email address

## Search for AI papers by researchers invited to Sanofi's meeting

### Define search terms

In [None]:
# Define Search Parameters

# Researchers whose papers we want; each name is matched on PubMed's [Author] field
authors = [
    "Grenne B",
    "van den Boogaard M",
    "Kylmala Minna",
    "Vejlstrup Niels",
    "Oerlemans Marish",
    "Rawshani A",
]

# Method-related phrases; each one is quoted as-is and carries no field tag
keywords = [
    "deep learning",
    "neural network*",
    "machine learning",
    "lstm",
]

# Assemble "(author OR author ...) AND (keyword OR keyword ...)"
author_query = " OR ".join(map('"{}"[Author]'.format, authors))
keyword_query = " OR ".join(map('"{}"'.format, keywords))
search_query = "({}) AND ({})".format(author_query, keyword_query)

# Show the final query string
print("Search Query:", search_query)

### Perform search

In [None]:
# Search PubMed
from Bio import Medline
import pandas as pd

# Ask PubMed for the PMIDs matching the combined author/keyword query.
handle = Entrez.esearch(db="pubmed", term=search_query, retmax=200)
try:
    record = Entrez.read(handle)
finally:
    # Release the connection even if the response cannot be parsed.
    handle.close()

id_list = record["IdList"]

# Columns of the result table, in display order.
article_columns = ["PMID", "Title", "Abstract", "Authors", "Journal", "Publication Date"]

# Fetch and parse the Medline records of the hits.
articles = []
if id_list:
    # Only call efetch when the search returned ids; a request without ids
    # cannot succeed.
    handle = Entrez.efetch(db="pubmed", id=",".join(id_list), rettype="medline", retmode="text")
    try:
        medline_text = handle.read()
    finally:
        handle.close()

    # Keep the Medline fields of interest; missing fields fall back to an
    # empty string (or an empty list for the author names).
    articles = [
        {
            "PMID": entry.get("PMID", ""),
            "Title": entry.get("TI", ""),
            "Abstract": entry.get("AB", ""),
            "Authors": entry.get("AU", []),
            "Journal": entry.get("JT", ""),
            "Publication Date": entry.get("DP", ""),
        }
        for entry in Medline.parse(medline_text.splitlines())
    ]

# Convert the list of articles to a pandas DataFrame. Passing the columns
# explicitly keeps the expected layout even when nothing was found.
df = pd.DataFrame(articles, columns=article_columns)

# Display the DataFrame
df

In [None]:
# Surnames of the invited researchers to look for in each paper's author list
authors = [
    "Grenne", "Boogaard", "Kylmala",
    "Vejlstrup", "Oerlemans", "Rawshani",
]

# Function to identify authors in the row
def identify_authors(row, authors):
    """Return the searched-for authors that appear in a row's author list.

    Args:
        row: Mapping-like object (e.g. a DataFrame row or a dict) whose
            ``"Authors"`` entry holds the author names of one paper.
        authors: Names (typically surnames) to look for. A name counts as
            found when it is a substring of any listed author.

    Returns:
        The matching names from ``authors``, in their original order, joined
        with ``", "``. An empty string is returned when nothing matches or
        when the row has no usable author list (missing, empty, NaN/None).
    """
    listed = row.get("Authors")

    if isinstance(listed, str):
        # A bare string is one author name, not a sequence of characters.
        listed = [listed]
    elif not isinstance(listed, (list, tuple, set, frozenset)):
        # Missing values (None, NaN) or other scalars carry no author names.
        return ""

    # Ignore non-string entries so the substring test cannot raise.
    names = [name for name in listed if isinstance(name, str)]
    matches = [author for author in authors if any(author in name for name in names)]
    return ", ".join(matches)

# Add an "AuthorIdentified" column naming the searched-for authors found on each paper
df["AuthorIdentified"] = df.apply(lambda paper: identify_authors(paper, authors), axis=1)

# Show the table including the new column
df

In [None]:
import plotly.express as px

# "AuthorIdentified" may hold several names joined by ", " (one paper can have
# more than one of the searched-for authors). Split those apart so every
# author is counted once per paper instead of counting name combinations.
per_author = df["AuthorIdentified"].str.split(", ").explode()

# Papers without any identified author contribute an empty entry; leave those
# out of the per-author tally.
per_author = per_author[per_author.fillna("") != ""]

# Count the number of papers for each identified author
author_counts = per_author.value_counts().reset_index()
author_counts.columns = ["Author", "Count"]

# Create a bar chart
fig = px.bar(author_counts, x="Author", y="Count", title="Counts for Each Identified Author")
fig.show()

In [None]:
# Print the title of every paper in the result table
for title in df["Title"]:
    print(f"Title: {title}")

In [27]:
# Write the search results to a CSV file, leaving out the index column
df.to_csv(path_or_buf="pubmed_search_results_sanofi_team.csv", index=False)

In [None]:
# Load the manually labelled search results from the Excel workbook
df2 = pd.read_excel(io="pubmed_search_results_sanofi_team_manually_labelled.xlsx")
df2

In [None]:
import plotly.express as px

# Number of rows for every (Modality, Topic) pair
modality_counts = (
    df2.groupby(["Modality", "Topic"])
    .size()
    .reset_index(name="Count")
)

# Bar chart with one bar per modality, coloured by topic
fig = px.bar(
    data_frame=modality_counts,
    x="Modality",
    y="Count",
    color="Topic",
    title="Counts for Each Modality Colored by Topic",
)
fig.show()

## Total research on deep learning in HCM

In [55]:
# Search terms: method-related keywords ...
keywords = [
    "deep learning",
    "neural network*",
    "machine learning",
    "transformer*",
    "gradient boost*",
    "artificial intelligence",
]
# ... and the condition terms they are combined with
conditions = [
    "hypertrophic cardiomyopath*",
    "hypertrophic obstructive cardiomyopath*",
]

# Function to search PubMed
def search_pubmed(keywords, conditions, retmax=1000):
    """Search PubMed for records matching any keyword AND any condition.

    The query has the form ``(kw1 OR kw2 ...) AND (cond1 OR cond2 ...)``;
    the terms are inserted exactly as given.

    The contact address in ``Entrez.email`` is deliberately left untouched
    so that the value configured by the caller is the one sent to NCBI.

    Args:
        keywords: Search terms combined with OR in the first group.
        conditions: Search terms combined with OR in the second group.
        retmax: Maximum number of ids requested from esearch.

    Returns:
        The ``"IdList"`` entry of the parsed esearch response.
    """
    query = f'({" OR ".join(keywords)}) AND ({" OR ".join(conditions)})'
    handle = Entrez.esearch(db="pubmed", term=query, retmax=retmax)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even if the response cannot be parsed.
        handle.close()
    return record["IdList"]

# Function to fetch details of articles
def fetch_details(id_list):
    """Fetch the Medline records for the given PubMed ids.

    Args:
        id_list: Sequence of PubMed ids as strings.

    Returns:
        A list with one parsed Medline record per fetched article. An empty
        ``id_list`` yields an empty list without contacting NCBI.
    """
    if not id_list:
        # Nothing to fetch; a request without ids cannot succeed.
        return []

    handle = Entrez.efetch(db="pubmed", id=",".join(id_list), rettype="medline", retmode="text")
    try:
        # Materialise the records before the handle is closed, because the
        # parser reads from the handle lazily.
        return list(Medline.parse(handle))
    finally:
        handle.close()

# Run the search, then download the matching Medline records
id_list = search_pubmed(keywords=keywords, conditions=conditions)
articles = fetch_details(id_list=id_list)

# Collect the fetched records in a pandas DataFrame
df_all = pd.DataFrame(data=articles)

In [None]:
# (number of rows, number of columns) of the fetched article table
df_all.shape

## Which studies are clearly HCM studies?

In [None]:
# Fill NaN values with an empty string so the text checks below work on every row
df_all["TI"] = df_all["TI"].fillna("")
df_all["AB"] = df_all["AB"].fillna("")

# Flag (1/0) whether the title / abstract mentions hypertrophic cardiomyopathy.
# The pattern is a regular expression, not a glob: "hypertrophic card" already
# matches every word starting with "card", so no trailing "*" is used (as a
# regex, "card*" would also accept "hypertrophic car...").
hcm_pattern = r"hypertrophic card|HCM|HOCM"
df_all["HCM_In_Title"] = df_all["TI"].str.contains(hcm_pattern, case=False, regex=True).astype(int)
df_all["HCM_In_Abstract"] = df_all["AB"].str.contains(hcm_pattern, case=False, regex=True).astype(int)


def _ecg_label(text):
    """Return "ECG" if text contains "ECG" or (case-insensitively) "electrocardio", else "Other"."""
    return "ECG" if "ECG" in text or "electrocardio" in text.lower() else "Other"


# Label the title / abstract as "ECG" or "Other"
df_all["ECG_In_Title"] = df_all["TI"].apply(_ecg_label)
df_all["ECG_In_Abstract"] = df_all["AB"].apply(_ecg_label)


# Keep only the identifier, the four flag columns, the title and the abstract,
# in this order; all other Medline fields are dropped here.
df_all = df_all[["PMID", "HCM_In_Title", "HCM_In_Abstract", "ECG_In_Title", "ECG_In_Abstract", "TI", "AB"]]

# Sort so that positive rows come first for every flag. The HCM flags are 1/0,
# so they are sorted descending. The ECG flags are the strings "ECG"/"Other",
# where "ECG" sorts alphabetically before "Other", so they must be sorted
# ascending; a descending sort would put the "Other" rows on top.
df_all = df_all.sort_values(
    by=["HCM_In_Title", "HCM_In_Abstract", "ECG_In_Title", "ECG_In_Abstract"],
    ascending=[False, False, True, True],
)

In [58]:
# Recode the HCM indicator columns from 1/0 to "Yes"/"No"
for flag_column in ("HCM_In_Title", "HCM_In_Abstract"):
    df_all[flag_column] = df_all[flag_column].replace({1: "Yes", 0: "No"})

# Recode the ECG label columns from "ECG"/"Other" to "Yes"/"No"
for flag_column in ("ECG_In_Title", "ECG_In_Abstract"):
    df_all[flag_column] = df_all[flag_column].replace({"ECG": "Yes", "Other": "No"})

# Write the final table to disk, leaving out the index column
df_all.to_csv("df_all.csv", index=False)