In [13]:
import pandas as pd

In [14]:
gwas = pd.read_csv("../patent/dedup_GWAS_patents_only.csv")
sample_df = pd.read_parquet("../patent/non_GWAS_patents_only.parquet")
df = pd.concat([sample_df, gwas])

In [15]:
df_filtered = df[(df["patent_year"] >= 2000) & (df["patent_year"] <= 2020)]
df_filtered = df_filtered[(df_filtered["gwas_year"].isna()) | (df_filtered["gwas_year"] <= 2020)]

# and .99

cutoff = .99

# # probability
df_filtered = df_filtered[(df_filtered["dis_prob"] > cutoff) & (df_filtered["gene_prob"] > cutoff)]

In [16]:
df_filtered

Unnamed: 0,patent_id,gene_id,disease_id,gene_prob,dis_prob,gene_abstract,gene_title,dis_abstract,dis_title,patent_year,PUBMEDID,gwas_year
6706362,6009875,146,D000070639,0.996795,0.999992,0.0,0.0,0.0,0.0,2000,,
6706363,6009875,146,D000070676,0.996795,0.999992,0.0,0.0,0.0,0.0,2000,,
6706364,6009875,146,D000070779,0.996795,0.999992,0.0,0.0,0.0,0.0,2000,,
6706365,6009875,146,D000071075,0.996795,0.999999,0.0,0.0,0.0,0.0,2000,,
6706366,6009875,146,D000071380,0.996795,0.999992,0.0,0.0,0.0,0.0,2000,,
...,...,...,...,...,...,...,...,...,...,...,...,...
490458,10877039,4313,D012002,0.993383,0.999999,0.0,0.0,2.0,1.0,2020,27207650.0,2016.0
490481,10877049,9369,D000544,0.996324,0.999985,0.0,0.0,4.0,0.0,2020,20197096.0,2010.0
490482,10877049,9369,D000690,0.996324,0.999985,0.0,0.0,5.0,0.0,2020,22959728.0,2012.0
490483,10877049,9369,D003704,0.996324,0.999985,0.0,0.0,5.0,0.0,2020,20197096.0,2010.0


In [17]:
# --- 3. Define Parameters ---

# Define the MeSH ID for Inflammatory Bowel Disease
ibd_mesh_id = 'D015212'
# Define the output filename for the new sample file
output_filename = 'ibd_patent_sample.parquet'


# --- 4. Filter the Data for IBD ---

print("Filtering for IBD-related patents...")

# Ensure the disease_id column is a string and strip any leading/trailing whitespace
# to make the filter more robust.
df_filtered['disease_id'] = df_filtered['disease_id'].astype(str).str.strip()

# Filter the DataFrame to get only the rows matching the IBD MeSH ID
ibd_patent_sample_df = df_filtered[df_filtered['disease_id'] == ibd_mesh_id].copy()

print(f"Found {len(ibd_patent_sample_df)} patent entries related to IBD (MeSH ID: {ibd_mesh_id}).")


# --- 5. Save the Filtered Data ---

print(f"Saving the filtered data to '{output_filename}'...")

# Save the new DataFrame to a parquet file
# The index is not saved as it's often not needed in the output file.
ibd_patent_sample_df.to_parquet(output_filename, index=False)

print("="*50)
print("Process Complete.")
print(f"The file '{output_filename}' has been created with the IBD patent data.")
print("="*50)



Filtering for IBD-related patents...
Found 17749 patent entries related to IBD (MeSH ID: D015212).
Saving the filtered data to 'ibd_patent_sample.parquet'...
Process Complete.
The file 'ibd_patent_sample.parquet' has been created with the IBD patent data.


In [18]:
ibd_patent_sample_df

Unnamed: 0,patent_id,gene_id,disease_id,gene_prob,dis_prob,gene_abstract,gene_title,dis_abstract,dis_title,patent_year,PUBMEDID,gwas_year
6739736,6011048,154,D015212,0.991548,0.999984,1.0,0.0,0.0,0.0,2000,,
6739807,6011048,3290,D015212,0.995764,0.999984,2.0,0.0,0.0,0.0,2000,,
6751324,6013476,939,D015212,0.994148,0.999999,0.0,1.0,2.0,0.0,2000,,
6751973,6013476,55558,D015212,0.992004,0.999999,0.0,0.0,2.0,0.0,2000,,
6767734,6013640,4842,D015212,0.992890,0.995275,0.0,0.0,0.0,0.0,2000,,
...,...,...,...,...,...,...,...,...,...,...,...,...
487805,10857167,2524,D015212,0.992656,0.999991,0.0,0.0,1.0,0.0,2020,20570966.0,2010.0
487811,10857168,2524,D015212,0.996252,0.996587,2.0,0.0,0.0,0.0,2020,20570966.0,2010.0
488370,10858439,2212,D015212,0.995495,0.999998,0.0,0.0,0.0,0.0,2020,19915573.0,2009.0
489729,10870689,629,D015212,0.995851,0.999965,0.0,0.0,0.0,0.0,2020,24837172.0,2014.0


In [21]:
### create panel 

agg_df = (
    ibd_patent_sample_df
    .groupby(["gene_id", "disease_id", "patent_year"])
    .size()
    .reset_index(name="num_patents")
)

In [22]:
all_years = range(2000, 2021)
all_pairs = ibd_patent_sample_df[[
    "gene_id", "disease_id", "gwas_year", "gene_prob", "dis_prob",
    "gene_abstract", "gene_title", "dis_abstract", "dis_title"
]]

# Drop duplicates based on just gene_id and disease_id
all_pairs = all_pairs.drop_duplicates(subset=["gene_id", "disease_id"])

#left merge panel years onto unqiue g-d combos
balanced_panel = (
    all_pairs.assign(key=1)
    .merge(pd.DataFrame({"patent_year": all_years, "key": 1}), on="key")
    .drop(columns="key")
)

panel_df = (
    balanced_panel
    .merge(agg_df, on=["gene_id", "disease_id", "patent_year"], how="left")
    .fillna({"num_patents": 0})
)

In [23]:
panel_df

Unnamed: 0,gene_id,disease_id,gwas_year,gene_prob,dis_prob,gene_abstract,gene_title,dis_abstract,dis_title,patent_year,num_patents
0,154,D015212,,0.991548,0.999984,1.0,0.0,0.0,0.0,2000,2.0
1,154,D015212,,0.991548,0.999984,1.0,0.0,0.0,0.0,2001,0.0
2,154,D015212,,0.991548,0.999984,1.0,0.0,0.0,0.0,2002,2.0
3,154,D015212,,0.991548,0.999984,1.0,0.0,0.0,0.0,2003,0.0
4,154,D015212,,0.991548,0.999984,1.0,0.0,0.0,0.0,2004,1.0
...,...,...,...,...,...,...,...,...,...,...,...
83890,9180,D015212,2015.0,0.996261,0.999999,2.0,1.0,8.0,4.0,2016,0.0
83891,9180,D015212,2015.0,0.996261,0.999999,2.0,1.0,8.0,4.0,2017,0.0
83892,9180,D015212,2015.0,0.996261,0.999999,2.0,1.0,8.0,4.0,2018,0.0
83893,9180,D015212,2015.0,0.996261,0.999999,2.0,1.0,8.0,4.0,2019,0.0


In [24]:
panel_df.to_parquet("ibd_panel.parquet", index=False)

: 