In [1]:
import pathlib
import numpy as np
import pandas as pd

import mygene
from Bio.Seq import Seq

In [2]:
# Load IDR file
file = "idr0080-screenA-library.csv"

df = pd.read_csv(file)

print(df.shape)
df.head(2)

(6912, 17)


Unnamed: 0,Plate,Well,Characteristics [Organism],Term Source 1 REF,Term Source 1 Accession,Characteristics [Cell Line],Term Source 2 REF,Term Source 2 Accession,Reagent Identifier,Sense Sequence,Antisense Sequence,Reagent Design Gene Annotation Build,Gene Identifier,Gene Symbol,Control Type,Channels,Comments
0,SQ00014610__2016-06-16T00_38_35-Measurement2,A1,Homo sapiens,NCBITaxon,NCBITaxon_9606,A549,EFO,EFO_0001086,,,,,,,empty well,Hoechst 33342 (DNA); Concanavalin A/Alexa 488 ...,images are illumination corrected
1,SQ00014610__2016-06-16T00_38_35-Measurement2,A2,Homo sapiens,NCBITaxon,NCBITaxon_9606,A549,EFO,EFO_0001086,MCL1-5,CATTCCTGATGCCACCTTCT,,,MCL1,MCL1,,Hoechst 33342 (DNA); Concanavalin A/Alexa 488 ...,images are illumination corrected


In [3]:
# Get ensemble IDs - this will require a manual check
mg = mygene.MyGeneInfo()

result = mg.querymany(
    df.loc[:, "Gene Symbol"].unique().tolist(),
    scopes="symbol,alias",
    species="human",
    fields="entrezgene,symbol,ensembl.gene,",
    as_dataframe=True
)

ensembl_id_df = (
    result
    .sort_values(by="_score", ascending=False)
    .reset_index()
    .drop_duplicates(subset="query")
)

querying 1-59...done.
Finished.
4 input query terms found dup hits:
	[('GLS', 2), ('RAC1', 2), ('RAF1', 2), ('TXN', 2)]
4 input query terms found no hit:
	['Chr2', 'Luc', 'LacZ', 'ATP50']
Pass "returnall=True" to return complete lists of duplicate or missing query terms.


  df = json_normalize(obj)


In [4]:
# Get antisense sequences
# Also, update the comments if the control type is an empty well
antisense_results = []
comments_update = []
for idx, perturbation in df.iterrows():
    sequence = perturbation["Sense Sequence"]
    control_type = perturbation["Control Type"]
    comments = perturbation["Comments"]

    if isinstance(sequence, str):
        my_dna = Seq(sequence)
        antisense = str(my_dna.complement())
    else:
        antisense = np.nan
    
    antisense_results.append(antisense)
    
    if control_type == "empty well":
        comments = f"{comments}; empty well"
    
    comments_update.append(comments)

assert len(antisense_results) == df.shape[0]

In [5]:
# Drop SMARCB1 (mygene failure)
# Drop query "nan" (mygene false positive)
#{"SMARCB1": "ENSG00000099956"}
ensembl_id_df = ensembl_id_df.dropna(subset=["ensembl.gene"], axis="rows").query("symbol != 'SCN11A'")

ensembl_mapper = dict(zip(ensembl_id_df.loc[:, "query"], ensembl_id_df.loc[:, "ensembl.gene"]))

In [6]:
# Identify ensembl column
ensemble_column = df.loc[:, "Gene Symbol"].replace(ensembl_mapper)
ensemble_column = [x if str(x).startswith("ENSG") else np.nan for x in ensemble_column]
annotation_build_column = [
    "Ensembl release 101 - August 2020" if str(x).startswith("ENSG") else np.nan for x in ensemble_column
]

In [7]:
# Update control type
control_type_col = df.loc[:, "Control Type"].replace({"empty well": "no reagent"})

In [8]:
df.loc[:, "Antisense Sequence"] = antisense_results
df.loc[:, "Gene Identifier"] = ensemble_column
df.loc[:, "Comments"] = comments_update
df.loc[:, "Control Type"] = control_type_col

# Add annotation build
df.loc[:, "Reagent Design Gene Annotation Build"] = annotation_build_column

In [9]:
# Output updated results
output_file = f"updated_{file}"
df.to_csv(output_file, index=False, sep=",")

In [10]:
print(df.shape)
df.head(3)

(6912, 17)


Unnamed: 0,Plate,Well,Characteristics [Organism],Term Source 1 REF,Term Source 1 Accession,Characteristics [Cell Line],Term Source 2 REF,Term Source 2 Accession,Reagent Identifier,Sense Sequence,Antisense Sequence,Reagent Design Gene Annotation Build,Gene Identifier,Gene Symbol,Control Type,Channels,Comments
0,SQ00014610__2016-06-16T00_38_35-Measurement2,A1,Homo sapiens,NCBITaxon,NCBITaxon_9606,A549,EFO,EFO_0001086,,,,,,,no reagent,Hoechst 33342 (DNA); Concanavalin A/Alexa 488 ...,images are illumination corrected; empty well
1,SQ00014610__2016-06-16T00_38_35-Measurement2,A2,Homo sapiens,NCBITaxon,NCBITaxon_9606,A549,EFO,EFO_0001086,MCL1-5,CATTCCTGATGCCACCTTCT,GTAAGGACTACGGTGGAAGA,Ensembl release 101 - August 2020,ENSG00000143384,MCL1,,Hoechst 33342 (DNA); Concanavalin A/Alexa 488 ...,images are illumination corrected
2,SQ00014610__2016-06-16T00_38_35-Measurement2,A3,Homo sapiens,NCBITaxon,NCBITaxon_9606,A549,EFO,EFO_0001086,AKT1-1,GGCCAAGCCCAAGCACCGCG,CCGGTTCGGGTTCGTGGCGC,Ensembl release 101 - August 2020,ENSG00000142208,AKT1,,Hoechst 33342 (DNA); Concanavalin A/Alexa 488 ...,images are illumination corrected
