## Process data from dbesslnc1.0

In [1]:
import pandas as pd

# Load the LncBook BED export; the file has no header row, so column names
# are supplied here.
df = pd.read_csv(
    '../match/test/output/lncbook.bed',
    sep='\t',
    header=None,
    names=['chr', 'start', 'end', 'trans_id', 'score', 'strand',
           'gene_id', 'gene_type', 'gene_name'],
)

# Keep only rows whose gene_type is 'gene' and whose gene_name is not the
# 'N.A.' placeholder.
mask = (df['gene_type'] == 'gene') & (df['gene_name'] != 'N.A.')
filtered_df = df[mask]

# Write coordinates plus gene_id / gene_name as a headerless TSV.
result_df = filtered_df[['chr', 'start', 'end', 'strand', 'gene_id', 'gene_name']]
result_df.to_csv('gene_mapping.tsv', sep='\t', index=False, header=False)

In [1]:
import pandas as pd

def process_lncrna_mapping(v1_file, map_file1, map_file2,
                           matched_out='dbesslnc_temp.tsv',
                           unmatched_out='unmap.txt'):
    """Attach genomic coordinates to lncRNA records by name, then by NONCODE ID.

    Matching runs in two passes:

    1. Each record name is looked up among the (comma-separated) gene names
       of ``map_file1``.
    2. Records still unmatched whose ``noncode_id`` is not ``'N.A.'`` are
       looked up by that ID in ``map_file2``.

    Args:
        v1_file: Tab-separated file with a header row. Its first three
            columns are used as ``name``, ``ncbi_id`` and ``noncode_id``.
        map_file1: Headerless tab-separated file with exactly six columns:
            chr, start, end, strand, gene_id, gene_name.
        map_file2: Headerless tab-separated file with exactly six columns:
            chr, start, end, gene_id, score, strand.
        matched_out: Path the matched table is written to (TSV with header).
        unmatched_out: Path the unmatched records are written to (TSV with
            header).

    Returns:
        A tuple ``(all_matched, unmatched)`` of DataFrames. ``all_matched``
        has the columns chr, start, end, strand, gene_id, name, ncbi_id,
        noncode_id; ``unmatched`` holds the remaining rows of the record
        table.

    Side effects:
        Writes ``matched_out`` and ``unmatched_out`` and prints a four-line
        summary of the match counts.
    """
    out_cols = ['chr', 'start', 'end', 'strand', 'gene_id',
                'name', 'ncbi_id', 'noncode_id']

    lncRNA_cancer = pd.read_csv(v1_file, sep='\t')
    mapping_file = pd.read_csv(map_file1, sep='\t', header=None)
    pos_file = pd.read_csv(map_file2, sep='\t', header=None)

    mapping_file.columns = ['chr', 'start', 'end', 'strand', 'gene_id', 'gene_name']
    pos_file.columns = ['chr', 'start', 'end', 'gene_id', 'score', 'strand']
    txt_data = lncRNA_cancer.iloc[:, [0, 1, 2]].copy()
    txt_data.columns = ['name', 'ncbi_id', 'noncode_id']
    txt_names = set(txt_data['name'])

    # Pass 1: match by gene name.
    # Only the first row for each distinct raw gene_name string is used.
    # Rows with a missing gene_name can never match a record name, so they
    # are dropped up front.
    unique_rows = (
        mapping_file
        .dropna(subset=['gene_name'])
        .drop_duplicates(subset='gene_name')
    )
    # A gene_name cell may list several names separated by commas; expand to
    # one row per individual name.
    expanded = unique_rows.assign(
        matched_gene=unique_rows['gene_name'].astype(str).str.split(',')
    ).explode('matched_gene')
    expanded['matched_gene'] = expanded['matched_gene'].str.strip()
    keep = (
        (expanded['matched_gene'] != 'N.A.')
        & expanded['matched_gene'].isin(txt_names)
    )
    matched_by_tsv = expanded[keep]

    if not matched_by_tsv.empty:
        tsv_merged = matched_by_tsv.merge(
            txt_data,
            left_on='matched_gene',
            right_on='name',
            how='left',
        )[out_cols]
        matched_txt_names = set(matched_by_tsv['matched_gene'])
    else:
        tsv_merged = pd.DataFrame(columns=out_cols)
        matched_txt_names = set()

    # Pass 2: match the leftovers by NONCODE ID.
    unmatched_txt = txt_data[~txt_data['name'].isin(matched_txt_names)]
    valid_noncode = unmatched_txt[unmatched_txt['noncode_id'] != 'N.A.']
    if not valid_noncode.empty:
        bed_matched = pos_file[pos_file['gene_id'].isin(valid_noncode['noncode_id'])]
        bed_merged = bed_matched.merge(
            valid_noncode,
            left_on='gene_id',
            right_on='noncode_id',
            how='left',
        )[out_cols]
    else:
        bed_merged = pd.DataFrame(columns=out_cols)
    # For these rows gene_id is just the join key (equal to noncode_id), so
    # it is replaced by the 'N.A.' placeholder. assign() returns a new frame
    # instead of writing into a frame derived from a slice.
    bed_merged = bed_merged.assign(gene_id='N.A.')

    all_matched = pd.concat([tsv_merged, bed_merged], ignore_index=True)
    unmatched = txt_data[~txt_data['name'].isin(set(all_matched['name']))]

    all_matched.to_csv(matched_out, sep='\t', index=False)
    unmatched.to_csv(unmatched_out, sep='\t', index=False)

    print(f"Total records: {len(txt_data)}")
    print(f"Lncbook matched: {len(tsv_merged)}")
    print(f"Noncode matched: {len(bed_merged)}")
    print(f"Unmatched: {len(unmatched)}")
    return all_matched, unmatched

if __name__ == "__main__":
    # Inputs for the mapping step, keyed by the parameter they are passed as.
    input_paths = {
        'v1_file': 'dbesslnc.txt',
        'map_file1': 'gene_mapping.tsv',
        'map_file2': '../clinvar_map/lncRNA_reference/NONCODE/NONCODEv6_hg38.lncRNAGene.bed',
    }
    merged, unmatched = process_lncrna_mapping(**input_paths)


Total records: 173
Lncbook matched: 156
Noncode matched: 11
Unmatched: 6


### Fill in the missing external link IDs

In [28]:
import requests
import pandas as pd
import time
import json


# Send one batch of IDs to the LncBook conversion API
def send_request(ids_batch, timeout=30):
    """POST one batch of IDs to the LncBook conversion endpoint.

    Args:
        ids_batch: Sequence of ID strings; they are joined with newlines and
            sent as the ``ids`` form field.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on an unresponsive
            server.

    Returns:
        The response body parsed as JSON when the status code is 200;
        ``None`` when the request fails or returns any other status code.
    """
    # LncBook conversion API URL
    url = "https://ngdc.cncb.ac.cn/lncbook/conversion"

    # One ID per line, sent as form data
    payload = {'ids': "\n".join(ids_batch)}

    try:
        response = requests.post(url, data=payload, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as a non-200 status: message
        # plus None, so the caller can carry on with the remaining batches.
        print(f"Request failed: {exc}")
        return None

    if response.status_code == 200:
        return json.loads(response.text)  # Parse as JSON

    print(f"Request failed, status code: {response.status_code}")
    return None


# Merge the conversion results into the intermediate table and save it as TSV
def process_and_save_results(id, results, output_file, temp_file='dbesslnc_temp.tsv'):
    """Fill ``gene_id`` in the intermediate table from conversion results.

    Args:
        id: Key under which each result entry stores the queried ID
            (for example ``"noncode"``).
        results: List with one element per request batch. Each element is
            either a list of dict entries or a falsy value (such as ``None``
            for a failed batch), which is skipped.
        output_file: Path of the TSV file to write.
        temp_file: Path of the intermediate TSV to update. It must contain
            ``noncode_id`` and ``gene_id`` columns.

    Side effects:
        Writes ``output_file`` (same columns as ``temp_file``) and prints a
        confirmation line.
    """
    # Collect (queried ID, gene ID) pairs from every successful batch
    data = []
    for result in results:
        if not result:
            continue
        for entry in result:
            data.append([entry.get(id, "N/A"), entry.get("geneid", "N/A")])

    # Keep the first conversion seen for each queried ID
    df = pd.DataFrame(data, columns=["noncode", "geneid"])
    df = df.drop_duplicates(subset="noncode")

    # Read the whole intermediate table
    temp_df = pd.read_csv(temp_file, sep='\t')

    # Left-join so every row of the intermediate table is kept
    merged_df = temp_df.merge(df, left_on='noncode_id', right_on='noncode', how='left')

    # Overwrite gene_id only where a conversion result exists. This is an
    # explicit assignment because a chained in-place
    # `merged_df['gene_id'].update(...)` does not modify the frame when
    # pandas Copy-on-Write is active.
    converted = merged_df['geneid']
    merged_df['gene_id'] = converted.where(converted.notna(), merged_df['gene_id'])

    # Restore the original column set and order, then save
    df = merged_df[temp_df.columns]
    df.to_csv(output_file, sep='\t', index=False)
    print(f"Saved conversion results to {output_file}")


# Main function
def main(NONCODE_ids, output_file_noncode, batch_size=200):
    """Convert NONCODE IDs in batches and save the merged result.

    Args:
        NONCODE_ids: List of ID strings to convert.
        output_file_noncode: Path of the TSV file that
            ``process_and_save_results`` writes.
        batch_size: Maximum number of IDs sent per request.
    """
    print("Processing NONCODEID...")
    noncode_results = []
    batch_starts = range(0, len(NONCODE_ids), batch_size)
    for batch_no, start in enumerate(batch_starts, start=1):
        ids_batch = NONCODE_ids[start:start + batch_size]
        print(f"Sending batch {batch_no} of NONCODEID requests...")
        # A failed batch yields None; it is kept so the saver can skip it.
        noncode_results.append(send_request(ids_batch))
        time.sleep(1)  # Delay 1 second to avoid frequent requests

    # Save NONCODEID results
    print(f"Saving NONCODEID conversion results to {output_file_noncode}")
    process_and_save_results("noncode", noncode_results, output_file_noncode)

# Call the main function
if __name__ == "__main__":
    output_file = 'dbesslnc_gene.tsv'  # Output file path for NONCODE results
    temp_df = pd.read_csv('dbesslnc_temp.tsv', sep='\t')
    # Rows matched through NONCODE are written with the 'N.A.' placeholder in
    # gene_id by process_lncrna_mapping. Selecting on that placeholder picks
    # exactly those rows, whatever their count, instead of assuming they are
    # the last 11 rows of the file.
    NONCODE_ids = temp_df.loc[temp_df['gene_id'] == 'N.A.', 'noncode_id'].tolist()

    main(NONCODE_ids, output_file)


Processing NONCODEID...
Sending batch 1 of NONCODEID requests...
Saving NONCODEID conversion results to dbesslnc_gene.tsv
Saved conversion results to dbesslnc_gene.tsv


In [29]:
!rm -f dbesslnc_temp.tsv merge.tsv 