In [1]:
import os
from Bio import Entrez
import pandas as pd
from datetime import date
from datetime import datetime
import re

## Make an Output Directory and Get the Input Files

In [2]:
# make an output directory
outdir = 'results/hichip_db/'
os.makedirs(outdir, exist_ok=True)

# get the newest and second newest paper tables
old = "/Users/Cardiff/Downloads/geo-paper-tracker/results/hichip_db/HiChIP_Databases_Mar_8.xlsx"
new = "/Users/Cardiff/Downloads/geo-paper-tracker/results/hichip_db/GEO_Query.2022_03_09_11_53.xlsx"
old_df = pd.read_excel(old)
new_df = pd.read_excel(new)

##  Compare Old and New GEO IDs

In [3]:
# make a regex to extract GEO ID
GEO_pattern = re.compile('GSE[0123456789]+')

# make a set of old GEO IDs
old_GEOs = set()
for x in old_df['GEO / Data link'].tolist():
    GEO = GEO_pattern.findall(x)
    old_GEOs.update(GEO)

# make a set of new GEO IDs
new_GEOs = set()
for x in new_df['GEO / Data link'].tolist():
    GEO = GEO_pattern.findall(x)
    new_GEOs.update(GEO)

# compare the sets of GEO IDs
differences_GEOs = new_GEOs.difference(old_GEOs)

In [4]:
# create an indicate column of papers whose GEO ID is not in the old table 
bools = []
for geoid_strings in new_df['GEO / Data link'].tolist():
    
    geo_status = False
    for geoid in geoid_strings.split():
        if geoid in differences_GEOs:
            geo_status = True
            break
    bools.append(geo_status)

differences_df = new_df.loc[bools]

# replace na with empty string
differences_df.fillna("", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [5]:
differences_df

Unnamed: 0,Paper Title,Journal,Authors,Year,DOI,GEO / Data link,Any other information,Organism,Tissue/Cell Line,Other matched data,Presenter,Date Added
2,A primed immune transcriptional program is act...,Neuron,Meijer et al.,2022,https://doi.org/10.1016/j.neuron.2021.12.034,GSE166177\nGSE166179,Other\nGenome binding/occupancy profiling by h...,Mus musculus,,GSM5065199: H3K27ac_HiChIP_IFN_mOPC_rep1\nGSM5...,,2022-03-09
3,A small set of accessible enhancers enables br...,Nucleic Acids Res,Zaurin et al.,2021,https://doi.org/10.1093/nar/gkab1125,GSE179666,Expression profiling by high throughput sequen...,Homo sapiens,,GSM5425946: aHiChIP PR T0\nGSM5425945: aHiChIP...,,2022-03-09
4,A topological atlas reveals layers of genome r...,Cell,Johnstone et al.,2020,https://doi.org/10.1016/j.cell.2020.07.030,GSE133928,Genome binding/occupancy profiling by high thr...,Homo sapiens,,GSM3930281: MGH1904_HiChIP\nGSM3930275: BRD316...,,2022-03-09
5,Allelic specificity of IGH-DUX4 translocation ...,Nat Commun,Tian et al.,2019,https://doi.org/10.1038/s41467-019-10637-8,GSE115494,Genome binding/occupancy profiling by high thr...,Homo sapiens,,GSM3179300: H3K27Ac HiChip,,2022-03-09
6,Allelic specificity of IGH-DUX4 translocation ...,Nat Commun,Tian et al.,2019,https://doi.org/10.1038/s41467-019-10637-8,GSE115492,Genome binding/occupancy profiling by high thr...,Homo sapiens,,GSM3179300: H3K27Ac HiChip,,2022-03-09
...,...,...,...,...,...,...,...,...,...,...,...,...
119,UTX condensation underlies its tumor suppressi...,Nature,Shi et al.,2021,https://doi.org/10.1038/s41586-021-03903-7,GSE149420,Genome binding/occupancy profiling by high thr...,Homo sapiens; Mus musculus,,GSM5379698: HiChIP_H3K27ac_WT\nGSM5379695: HiC...,,2022-03-09
120,Widespread Long-range Cis-Regulatory Elements ...,Nat Plants,Ricci et al.,2019,https://doi.org/10.1038/s41477-019-0547-0,GSE120304,Expression profiling by high throughput sequen...,Zea mays,,GSM4284450: HiChIP B73 leaf H3K4me3\nGSM428445...,,2022-03-09
121,YY1 is a structural regulator of enhancer-prom...,Cell,Weintraub et al.,2017,https://doi.org/10.1016/j.cell.2017.11.008,GSE99521\nGSE99519,Expression profiling by high throughput sequen...,Homo sapiens; Mus musculus,,GSM2774003: HiChIP_mES_C3_UT_H3K27ac_rep1\nGSM...,,2022-03-09
123,cLoops2: a full-stack comprehensive analytical...,Nucleic Acids Res,Cao et al.,2021,https://doi.org/10.1093/nar/gkab1233,GSE179010,Other,Homo sapiens,,GSM5403564: ChIC_GM12878_IgG_Rep1\nGSM5403558:...,,2022-03-09


## Save the Output File

In [6]:
# determining the current year, month and day
today = date.today()
date_str = today.strftime("%Y_%m_%d")

# determining current hour and minute
now = datetime.now()
time_str = now.strftime("%H_%M")

# setting the output filename
output = os.path.join(outdir, "GEO_Compare.{}_{}".format(date_str, time_str))

In [7]:
output

'results/hichip_db/GEO_Compare.2022_03_09_11_55'

In [8]:
differences_df.to_excel(output + ".xlsx", index=False)