## Step 1: Create a database connection and Import data


**Preparations before executing the code**
```
## Create a database
create database ess_test;
use ess_test;

## Import table structure
source /your_path/store/dbess_schema.sql;

## modify the database table structure
ALTER TABLE esslnc
ADD CONSTRAINT unique_lnc 
UNIQUE (chr, start, end, strand);
```

In [None]:
import pandas as pd
from sqlalchemy import create_engine,text


# Connection settings for the MySQL server that hosts the `ess_test` database.
from urllib.parse import quote_plus

username = 'root'
password = 'root'
host = 'localhost'  # usually 'localhost'
port = 3307  # NOTE: MySQL's default port is 3306; adjust to match your server.
database = 'ess_test'

# Percent-encode the credentials so that characters such as '@', ':' or '/'
# in a user name or password cannot corrupt the connection URL.
engine = create_engine(
    f'mysql+pymysql://{quote_plus(username)}:{quote_plus(password)}'
    f'@{host}:{port}/{database}'
)

## Step2: Import gene entries

The files needed.
1. /match/test/map/lncbook_map.tsv, res_lncbook_map.tsv
2. /match/test/map/noncode_map.tsv, res_noncode_map.tsv
3. /match/test/map/gencode_map.tsv, res_gencode_map.tsv
4. /match/test/map/ncbi_map.tsv, res_ncbi_map.tsv
5. /match/go_map.txt
6. /match/crispr_all.bed
7. /match/merge.txt
8. /clinvar_map/db/crispr_overlap/final_lncRNA_nocrispr.bed
9. /cancer/unmap_from_dbesslnc.txt

### Step2.1 : import lncRNA verified by CRISPR 
 1. main crispr lncRNA gene
 2. update vitro column
 3. Export data for variants mapping.

In [None]:
# Load the CRISPR BED file and collapse its records to one esslnc row per gene.
relative_path = '../match/'
crispr_bed_file = f'{relative_path}test/output/crispr_all.bed'

df = pd.read_csv(crispr_bed_file, sep='\t', header=None)

# Column 3 holds "<gene_id>-<suffix>"; keep everything before the last '-'.
df['gene_id'] = df[3].map(lambda name: name.rsplit('-', 1)[0])

# The gene_id itself is the grouping key (no additional merging is done).
df['merge_group'] = df['gene_id']

# One output row per gene: first chr/strand, smallest start (+1, presumably
# converting the 0-based BED start to 1-based -- confirm), largest end.
result = (
    df.groupby('merge_group')
    .agg(
        chr=(0, 'first'),
        start=(1, lambda starts: min(starts) + 1),
        end=(2, 'max'),
        strand=(5, 'first'),
        target=('gene_id', lambda ids: ';'.join(sorted(set(ids)))),
    )
    .reset_index(drop=True)
)

result.to_sql('esslnc', engine, if_exists='append', index=False)

In [None]:
# Append the manually retrieved supplementary lncRNA gene entries to esslnc.
columns = ['Noncode_id', 'Lncbook_id', 'ensembl_id', 'target',
           'chr', 'start', 'end', 'strand']
records = [
    ('N.A.', 'N.A.', 'N.A.', 'LH00477',
     'chr1', 145410838, 145413269, '+'),
    # NOTE(review): Ensembl gene ids normally carry 11 digits; this one has 12
    # -- verify against the source before relying on it.
    ('NONHSAG005780.3', 'HSALNG0077928', 'ENSG000000290921.2', 'LH02126',
     'chr10', 45888164, 45972422, '+'),
    ('NONHSAG097226.1', 'HSALNG0060060', 'N.A.', 'LH14878',
     'chr7', 100963828, 100968124, '-'),
]
data = [dict(zip(columns, record)) for record in records]

# Build the frame from the record dicts and append it to the table.
df = pd.DataFrame(data)

df.to_sql('esslnc', engine, if_exists='append', index=False)
print(f"Successfully inserted {len(df)} supplementary lncRNA gene entries.")

In [None]:
# 2. Set vitro = 1 and Organism = 'Human'.
# The statement has no WHERE clause, so every row currently in esslnc is updated.
update_sql = text("""
        UPDATE esslnc 
        SET vitro = 1,Organism = 'Human'
    """)

with engine.connect() as conn:
    conn.execute(update_sql)
    conn.commit()

print("Update successfully")

#### Import the externally mapped database ID

In [None]:
# Import the mapped database gene IDs and gene names, along with other mapped
# data (lncbook/ncbi/noncode/gencode_map.tsv).
def import_mapped_data(map_file, db):
    """Copy mapped database ids (and gene names) from a map file into esslnc.

    Columns 0, 3 and 4 of the headerless, tab-separated ``map_file`` are used
    as (target id, mapped database id, mapped gene name). For every distinct
    triple, each esslnc row whose ``target`` starts with the target id or
    contains ``;<target id>`` gets the mapped id written to column ``db``.
    Unless ``db`` is ``'Noncode_id'``, ``gene_name`` is filled in as well, but
    only where it is still NULL or ``'N.A.'``; mapped names that start with
    ``ENSG`` or ``LOC`` are stored as ``'N.A.'``.

    Args:
        map_file: Path of the mapping TSV file.
        db: Name of the esslnc column that receives the mapped id.

    Raises:
        ValueError: If ``db`` is not a plain identifier. The name is spliced
            into the SQL text, so anything else is rejected.
    """
    if not str(db).isidentifier():
        raise ValueError(f"invalid esslnc column name: {db!r}")

    map_df = pd.read_csv(map_file, sep='\t', header=None)
    map_dict = map_df[[0, 3, 4]].drop_duplicates()

    # Both statements are loop-invariant, so build them once.
    update_sql = text(f"""
        UPDATE esslnc
        SET {db} = :id
        WHERE target LIKE :pattern1
        OR target LIKE :pattern2
    """)
    # The OR is parenthesised on purpose: AND binds tighter than OR, so without
    # the parentheses the NULL/'N.A.' guard would only apply to pattern2 and
    # rows matching pattern1 would have an existing gene_name overwritten.
    update_sql_name = text("""
        UPDATE esslnc
        SET gene_name = :gene_name
        WHERE (target LIKE :pattern1 OR target LIKE :pattern2)
        AND (gene_name IS NULL OR gene_name = 'N.A.')
    """)
    update_name = db != 'Noncode_id'

    with engine.connect() as conn:
        for _, row in map_dict.iterrows():
            # pattern1: target begins with the id; pattern2: the id follows a
            # ';' inside a merged target list.
            # NOTE(review): both are prefix matches, and '%'/'_' inside an id
            # would act as LIKE wildcards -- confirm ids cannot over-match.
            patterns = {
                "pattern1": f"{row[0]}%",
                "pattern2": f"%;{row[0]}%",
            }
            conn.execute(update_sql, {"id": row[3], **patterns})

            if update_name:
                mapped_name = str(row[4])
                gene_name = 'N.A.' if mapped_name.startswith(('ENSG', 'LOC')) else row[4]
                conn.execute(update_sql_name, {"gene_name": gene_name, **patterns})
            conn.commit()

# Run the import once per mapping source, in this order: NCBI, GENCODE,
# LncBook, NONCODE.
for map_name, id_column in (
    ('ncbi', 'NCBI_id'),
    ('gencode', 'ensembl_id'),
    ('lncbook', 'Lncbook_id'),
    ('noncode', 'Noncode_id'),
):
    import_mapped_data(f'../match/test/map/{map_name}_map.tsv', id_column)


#### Import the reason summary of genes verified by CRISPR

In [None]:
# Summarise, per CRISPR target, the experiment types and PMIDs found in
# exp_crispr.csv and store them on the esslnc row with the same target.
df = pd.read_csv('../curated/exp_crispr.csv')


def _format_pmid(value):
    """Render a PMID as text, dropping the '.0' of integral floats.

    pandas reads an integer column that contains missing values as float, so a
    plain str() would store '12345678.0'.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# Group by target_id; both lists are sorted so repeated runs write the same
# text (iterating a bare set of strings has no stable order).
target_summary = df.groupby('target_id').agg({
    'exp_type': lambda x: ', '.join(sorted(set(x))),
    'PMID': lambda x: ','.join(sorted({_format_pmid(p) for p in x if pd.notna(p)})),
}).to_dict('index')

update_sql = text("""
    UPDATE esslnc
    SET reason_summary = :reason_summary,
    PMID = :pmids
    WHERE target = :target_id
""")

# Update database: one UPDATE per target, committed together at the end.
with engine.connect() as conn:
    for target_id, data in target_summary.items():
        exp_types = data['exp_type']
        pmids = data['PMID']
        reason_summary = f"[{exp_types}] Verified by the {exp_types} experiment. "

        conn.execute(update_sql, {
            "reason_summary": reason_summary,
            "pmids": pmids,
            "target_id": target_id
        })

    conn.commit()

print(f"Successfully updated {len(target_summary)} genes.")


### Step2.3: Import variant-associated lncRNA genes into the database

In [None]:
# 4. Insert the variant-associated lncRNA genes (those not covered by CRISPR).
# Adjust the order of column names according to different files.

input_file = '../clinvar_map/final_lncRNA_nocrispr.bed'
df = pd.read_csv(input_file, sep='\t', header=None,
                 names=['chr', 'start', 'end', 'Lncbook_id', 'Noncode_id',
                        'strand', 'gene_name', 'NCBI_id', 'variants_num', 'disease_related'])
print(len(df))

# '-' marks a missing value in the input; the table uses 'N.A.' instead.
for column in ('Lncbook_id', 'Noncode_id', 'NCBI_id', 'gene_name'):
    df[column] = df[column].replace('-', 'N.A.')

# variants_num is read from the file but is not inserted.
insert_data = df[['chr', 'start', 'end', 'Lncbook_id', 'Noncode_id',
                  'strand', 'gene_name', 'NCBI_id', 'disease_related']]

insert_sql = text("""
    INSERT IGNORE INTO esslnc
    (chr, start, end, Lncbook_id, Noncode_id, strand, gene_name, NCBI_id,disease_related)
    VALUES (:chr, :start, :end, :Lncbook_id, :Noncode_id, :strand, :gene_name, :NCBI_id,:disease_related)
""")

with engine.connect() as conn:
    for _, row in insert_data.iterrows():
        try:
            conn.execute(insert_sql, row.to_dict())
        except Exception as e:
            # Keep the best-effort behaviour, but say which row failed and why
            # instead of printing a bare "skip".
            print(f"skip {row['chr']}:{row['start']}-{row['end']}: {e}")
            continue
    conn.commit()


# try:
#     insert_data.to_sql('esslnc', engine, if_exists='append', index=False)
# except Exception as e:
#     print(f"error {str(e)}")


In [None]:
# 5. Rows that carry none of the other evidence flags (vitro, vivo and
# cancer_related all equal 0) are marked disease_related = 1 and receive the
# ClinVar reason summary.
update_sql = text("""
        UPDATE esslnc
        SET disease_related = 1, reason_summary = '[ClinVar] Associated with variants in ClinVar.'
        WHERE vitro = 0 AND vivo = 0 AND cancer_related = 0
    """)

with engine.connect() as conn:
    conn.execute(update_sql)
    conn.commit()

print("Update successfully")

### Step2.4 Import entries from dbesslnc that have been supplemented with annotations.

In [None]:
# 6. dbesslnc_gene.csv: dbesslnc entries supplemented with extra annotation.
input_file = 'dbesslnc_gene.csv'
df = pd.read_csv(input_file, sep=',')
# Shift the start coordinate by one before inserting.
df['start'] += 1
print(df.columns)

insert_sql = text("""
                INSERT IGNORE INTO esslnc 
                (chr, start, end, Lncbook_id, Noncode_id, strand, gene_name, NCBI_id)
                VALUES (:chr, :start, :end, :Lncbook_id, :Noncode_id, :strand, :gene_name, :NCBI_id)
            """)

with engine.connect() as conn:
    for _, record in df.iterrows():
        try:
            conn.execute(insert_sql, record.to_dict())
        except Exception as e:
            # Report the failure and carry on with the next row.
            print(e)
            print(f"skip")
    conn.commit()

In [None]:
# 7. unmap_from_dbesslnc.txt: entries whose annotation was supplemented after
# manually searching the public database GeneCards.
input_file = 'unmap_from_dbesslnc.txt'
df = pd.read_csv(input_file, sep='\t')

insert_sql = text("""
    INSERT IGNORE INTO esslnc
    (chr, start, end, Lncbook_id, Noncode_id, strand, gene_name, NCBI_id,PMID)
    VALUES (:chr, :start, :end, :Lncbook_id, :Noncode_id, :strand, :gene_name, :NCBI_id, :PMID)
""")

with engine.connect() as conn:
    for _, row in df.iterrows():
        try:
            conn.execute(insert_sql, row.to_dict())
        except Exception as e:
            # Print the error before skipping, as the dbesslnc_gene.csv import
            # does, so a failed row can be diagnosed.
            print(e)
            print(f"skip")
            continue
    conn.commit()

In [None]:
# 8. For lncRNAs from dbesslnc, mark the cancer-related and in-vivo columns.
# dbesslnc_reason.txt is read with the columns Name, Role, Reason and PMID.
df = pd.read_csv('dbesslnc_reason.txt', sep='\t')

# One statement per role, built once and executed in this fixed order:
# General -> vivo = 1, Tumor suppressor gene -> cancer_related = 2,
# Oncogene -> cancer_related = 1.
role_updates = [
    ('General', text("""
        UPDATE esslnc
        SET vivo = 1, reason_summary = :Reason, PMID = :PMID
        WHERE gene_name = :genes
    """)),
    ('Tumor suppressor gene', text("""
        UPDATE esslnc
        SET cancer_related = 2, reason_summary = :Reason, PMID = :PMID
        WHERE gene_name = :genes
    """)),
    ('Oncogene', text("""
        UPDATE esslnc
        SET cancer_related = 1, reason_summary = :Reason, PMID = :PMID
        WHERE gene_name = :genes
    """)),
]

# Names listed under each role; sets give constant-time membership tests.
names_by_role = {
    role: set(df.loc[df['Role'] == role, 'Name'])
    for role, _ in role_updates
}

with engine.connect() as conn:
    for _, row in df.iterrows():
        params = {'genes': row['Name'], 'Reason': row['Reason'], 'PMID': row['PMID']}
        # A name listed under several roles triggers every matching update,
        # each using the Reason/PMID of the row being processed.
        for role, statement in role_updates:
            if row['Name'] in names_by_role[role]:
                conn.execute(statement, params)

    conn.commit()

### Step2.5: Import the mouse's essential lncRNA.

In [None]:
# Import the mouse essential lncRNA genes into esslnc and flag them in vivo.
df = pd.read_csv('../dbesslnc/dbesslnc_mouse.csv', sep=',', encoding='latin-1')


def _strip_quotes(value):
    """Strip surrounding double quotes from strings; pass other values through.

    Casting whole columns with astype(str) would turn missing values into the
    literal text 'nan', which would then be stored instead of NULL.
    """
    return value.strip('"') if isinstance(value, str) else value


for column in df.columns:
    df[column] = df[column].map(_strip_quotes)

if 'reason_summary' in df.columns:
    # Prefix real summaries only; 'N.A.' and missing values stay untouched.
    df['reason_summary'] = df['reason_summary'].map(
        lambda x: f"[Literature] {x}" if isinstance(x, str) and x != 'N.A.' else x
    )

with engine.connect() as conn:
    df.to_sql('esslnc',
              con=conn,
              if_exists='append',
              index=False)
    update_sql = text("""
        UPDATE esslnc
        SET vivo = 1
        WHERE Organism = 'Mouse'
    """)
    conn.execute(update_sql)
    conn.commit()

print(f"successfully {len(df)} rows to esslnc table")

## Step3: Generating Unique Identifiers for lncRNA Entries

### Step3.1

In [None]:
# Assign sequential identifiers (ELH000001, ELH000002, ...) to the human
# entries, numbered in the order the database returns them for
# ORDER BY chr, start.
query = """
SELECT num_id, chr, start 
FROM esslnc 
WHERE Organism = 'Human'
ORDER BY chr, start
"""

df = pd.read_sql(query, engine)
df['new_uid'] = [f'ELH{position:06d}' for position in range(1, len(df) + 1)]

update_sql = text("""
        UPDATE esslnc 
        SET UID = :new_uid 
        WHERE num_id = :num_id
        """)

# Write each UID back by primary key; a single commit covers all rows.
with engine.connect() as conn:
    for num_id, new_uid in zip(df['num_id'].tolist(), df['new_uid'].tolist()):
        conn.execute(update_sql, {"new_uid": new_uid, "num_id": num_id})
    conn.commit()

print(f"Successfully updated {len(df)} records。")

### Step4.2:

In [None]:
# Export a UID <-> single-target mapping for the entries with vitro = 1.
query = """
SELECT UID, target 
FROM esslnc WHERE vitro = 1
"""
df = pd.read_sql(query, engine)


def split_target(row):
    """Return one {'UID', 'target'} dict per ';'-separated target in ``row``."""
    pieces = row['target'].split(';')
    uid = row['UID']
    return [{'UID': uid, 'target': piece} for piece in pieces]


# Flatten the per-row lists into one list of records.
expanded_rows = []
for _, record in df.iterrows():
    expanded_rows.extend(split_target(record))
result_df = pd.DataFrame(expanded_rows)

output_file = 'crispr_target_UID.tsv'
result_df.to_csv(output_file, sep='\t', index=False)

print(f"exported {len(result_df)} records {output_file}")

### Step4.3: 

In [None]:
# Export UID and the unsplit target column of every entry with vitro = 1
# to crispr_UID.txt (tab-separated, with header).
output_file = 'crispr_UID.txt'
query = """
SELECT UID, target 
FROM esslnc WHERE vitro = 1
"""

df = pd.read_sql(query, engine)
df.to_csv(output_file, sep='\t', index=False)

print(f"exported {len(df)} records {output_file}")

### Step4.4:

In [None]:
# Export Lncbook_id, Noncode_id and UID of the disease-related entries that
# are not flagged vitro (disease_related = 1 AND vitro = 0).
output_file = 'disease_related_UID.txt'
query = text("""
    SELECT Lncbook_id,Noncode_id,UID
    FROM esslnc 
    WHERE disease_related = 1 AND vitro = 0;
""")

# Tab-separated with a header row; missing values are written as 'N.A.'.
csv_options = {'sep': '\t', 'index': False, 'header': True, 'na_rep': 'N.A.'}

with engine.connect() as conn:
    result = pd.read_sql(query, conn)
    result.to_csv(output_file, **csv_options)

print(f"exported {len(result)} records {output_file}")

### Step4.5:

In [None]:
# Export Lncbook_id, Noncode_id and UID of every entry with vitro = 0.
output_file = 'non_crispr_UID.txt'
query = text("""
    SELECT Lncbook_id,Noncode_id,UID
    FROM esslnc 
    WHERE  vitro = 0;
""")

# Tab-separated with a header row; missing values are written as 'N.A.'.
csv_options = {'sep': '\t', 'index': False, 'header': True, 'na_rep': 'N.A.'}

with engine.connect() as conn:
    result = pd.read_sql(query, conn)
    result.to_csv(output_file, **csv_options)

print(f"exported {len(result)} records {output_file}")

## Step4: Import transcript Entries
The required files:
1. /match/seq_splice.bed seq_delete.bed seq_casrx.bed seq_crispri.bed
2. /match/esslnc2.fa


### Step5.1

In [None]:

# import Genomic location information.
import pandas as pd

# Custom bed file generated by gen_fa.ipynb step 1; switch the file name as needed.
custum_bed_file = 'seq_casrx.bed'
# FASTA file that holds the transcript sequences.
seq_file = 'lncRNAV2.fasta'

# NOTE(review): read without a header row; if crispr_mapping.tsv does carry a
# header line it becomes an ordinary (harmless) entry -- confirm the format.
mapping_df = pd.read_csv('crispr_mapping.tsv', sep='\t', names=['UID', 'target'])

# target -> UID lookup; when a target occurs more than once the last row wins.
target_to_uid = dict(zip(mapping_df['target'], mapping_df['UID']))

bed_columns = ['chr', 'start', 'end', 'name', 'score', 'strand',
               'block_starts', 'block_sizes']
df = pd.read_csv(custum_bed_file, sep='\t', header=None, names=bed_columns)


def parse_fasta(fasta_file):
    """Read a FASTA file into a dict of header -> sequence.

    The key is the complete header line without the leading '>' (it is not
    cut at whitespace). The sequence lines of a record are stripped and
    concatenated. Records with an empty header are dropped, as are lines that
    appear before the first header.
    """
    records = {}
    header = None
    chunks = []

    with open(fasta_file) as handle:
        for raw in handle:
            stripped = raw.strip()
            if not stripped.startswith('>'):
                chunks.append(stripped)
                continue
            # A new header closes the record collected so far.
            if header:
                records[header] = ''.join(chunks)
            header, chunks = stripped[1:], []

    # Flush the final record, which no later header closes.
    if header:
        records[header] = ''.join(chunks)

    return records


transcript_fa = parse_fasta(seq_file)


def process_transcript(row):
    """Turn one row of the bed file into a record for the ``trans`` table.

    ``row['name']`` must have the form ``<target>-<transcript_id>`` (split at
    the last '-'). The UID is looked up in ``target_to_uid`` and the sequence
    in ``transcript_fa``; a missing UID yields None and a missing sequence
    yields the placeholder text "Sequence not found".

    Args:
        row: Mapping with the keys chr, start, end, name, strand,
            block_starts and block_sizes.

    Returns:
        dict with the keys UID, transcript_id, chr, start, end, length,
        exon_num, exon_pos, strand and FASTA.

    Raises:
        IndexError: If ``row['name']`` contains no '-'.
    """
    name_parts = row['name'].rsplit('-', 1)
    target, transcript_id = name_parts[0], name_parts[1]
    UID = target_to_uid.get(target)
    fasta_seq = transcript_fa.get(transcript_id)
    if fasta_seq is None:
        FASTA = f">{transcript_id}<br/>Sequence not found"
    else:
        FASTA = f">{transcript_id}<br/>{fasta_seq}"

    # str() guards against pandas parsing a column of single, comma-free block
    # values as integers, which would make .split() fail.
    starts = [int(x) for x in str(row['block_starts']).split(',') if x]
    sizes = [int(x) for x in str(row['block_sizes']).split(',') if x]

    exon_positions = []
    for rel_start, size in zip(starts, sizes):
        abs_start = row['start'] + rel_start + 1  # 1-base
        abs_end = abs_start + size - 1
        exon_positions.append(f"{abs_start}-{abs_end}")

    return {
        'UID': UID,
        'transcript_id': transcript_id,
        'chr': row['chr'],
        'start': row['start'] + 1,  # 1-base
        'end': row['end'],
        'length': sum(sizes),
        'exon_num': len(sizes),
        'exon_pos': ','.join(exon_positions),
        'strand': row['strand'],
        'FASTA': FASTA
    }


result_df = pd.DataFrame([process_transcript(row) for _, row in df.iterrows()])

# Surface transcripts whose target has no UID instead of inserting them silently.
if not result_df.empty:
    missing_uid = int(result_df['UID'].isna().sum())
    if missing_uid:
        print(f"Warning: {missing_uid} transcripts have no UID in the target mapping.")

result_df.to_sql('trans', engine, if_exists='append', index=False)

print(f"Successfully import {len(result_df)} records to trans table.")

In [None]:
# import non verified by crispr lncRNA 

import pandas as pd
from sqlalchemy import text

# Insert the rows of non_crispr_lncrna.csv into trans; the CSV header row
# supplies both the column list and the bind-parameter names.
non_crispr_file = 'non_crispr_lncrna.csv'
headers = list(pd.read_csv(non_crispr_file, nrows=0).columns)
print(headers)

column_list = ', '.join(headers)
placeholder_list = ', '.join(f':{col}' for col in headers)
sql = text(f"""
    INSERT IGNORE INTO trans
    ({column_list})
    VALUES ({placeholder_list})
""")

# total_rows counts statements that executed without raising; success_rows
# sums their rowcount, so the difference is the rows INSERT IGNORE dropped.
total_rows = 0
success_rows = 0

for chunk in pd.read_csv(non_crispr_file, chunksize=1000):
    # One connection and one commit per 1000-row chunk.
    with engine.connect() as conn:
        for _, record in chunk.iterrows():
            try:
                result = conn.execute(sql, record.to_dict())
            except Exception as e:
                print(e)
            else:
                success_rows += result.rowcount
                total_rows += 1
        conn.commit()

print(f"Total processed: {total_rows}")
print(f"Successfully imported: {success_rows}")
print(f"Skipped duplicates: {total_rows - success_rows}")

### Step5.3 Import sequence (lncRNA non verified by crispr)
The required files (downloaded from LncBook v2.0 and NONCODE v6.0):
1. LncBookv2_OnlyLnc.fa
2. outLncRNA.fa 


In [None]:
# import seq from lncbook/noncode
import pandas as pd
from sqlalchemy import text

# Fill in FASTA and length for transcripts in trans that have no sequence yet.
fa_file = 'LncBookv2_OnlyLnc.fa'

sql_select = text("""
    SELECT transcript_id
    FROM trans
    WHERE FASTA IS NULL
""")

# transcript id (first whitespace-delimited token of the header) -> sequence.
# Records with an empty id or an empty sequence are not stored.
fasta_dict = {}
with open(fa_file, 'r') as f:
    tid = ''
    seq_parts = []  # collected pieces, joined once per record
    for line in f:
        if line.startswith('>'):
            seq = ''.join(seq_parts)
            if tid and seq:
                fasta_dict[tid] = seq
            header_fields = line.strip().lstrip('>').split()
            # A bare '>' line has no id; treat it as an unnamed record.
            tid = header_fields[0] if header_fields else ''
            seq_parts = []
        else:
            seq_parts.append(line.strip())
    seq = ''.join(seq_parts)
    if tid and seq:
        fasta_dict[tid] = seq

sql_update = text("""
    UPDATE trans
    SET FASTA = :fasta , length = :length
    WHERE transcript_id = :tid
""")

with engine.connect() as conn:
    transcript_ids = conn.execute(sql_select).fetchall()
    print(f"Found {len(transcript_ids)} transcripts without FASTA data.")
    for (tid,) in transcript_ids:
        print(f"Processing transcript ID: {tid}")
        if tid in fasta_dict:
            # Same "><id><br/><sequence>" layout that the CRISPR transcript
            # import writes into trans.FASTA.
            fasta_content = f">{tid}<br/>{fasta_dict[tid]}"
            length = len(fasta_dict[tid])
            try:
                print(f"Updating {tid} with length {length}")
                conn.execute(sql_update, {"fasta": fasta_content, "tid": tid, "length": length})
            except Exception as e:
                print(f"Error updating {tid}: {e}")

    conn.commit()


### Step5.4 import Mouse transcripts from dbesslnc
The required files:
1. /dbesslnc/mouse_trans.csv

In [None]:
# Import the mouse transcripts into trans (every CSV column except 'Name').
df = pd.read_csv('mouse_trans.csv', sep=',', usecols=lambda column: column != 'Name')

# Strip surrounding double quotes from string cells only. Casting whole
# columns with astype(str) would turn missing values into the text 'nan',
# which would then be stored instead of NULL.
for column in df.columns:
    df[column] = df[column].map(
        lambda value: value.strip('"') if isinstance(value, str) else value
    )

with engine.connect() as conn:
    df.to_sql('trans',
              con=conn,
              if_exists='append',
              index=False)
    # Commit explicitly, as the mouse gene import does; this does nothing if
    # pandas has already committed its own transaction.
    conn.commit()
print(f"successfully {len(df)} rows to trans table.")


In [None]:
# Copy the UID of each mouse gene in esslnc onto the mouse transcripts in
# trans that share its Noncode_id.
query = text("""
        SELECT Noncode_id, UID
        FROM esslnc
        WHERE Organism = 'Mouse'
    """)
update_sql = text("""
            UPDATE trans
            SET UID = :uid
            WHERE Organism = 'Mouse' AND Noncode_id = :noncode_id 
        """)

with engine.connect() as conn:
    rows = conn.execute(query).fetchall()

    # Noncode_id -> UID; the first UID seen for an id is kept.
    noncode_to_uid = {}
    for noncode_id, uid in rows:
        noncode_to_uid.setdefault(noncode_id, uid)

    for noncode_id, uid in noncode_to_uid.items():
        conn.execute(update_sql, {"uid": uid, "noncode_id": noncode_id})

    conn.commit()

print("Update successfully")

## Step6: Import the mapped transcript IDs
The required files
1. /match/lncbook_map.tsv
2. /match/noncode_map.tsv

In [None]:
# Write the mapped LncBook gene and transcript ids onto trans.
# Change the SQL statement, input file and column names to update other fields.
map_file = 'lncbook_map.tsv'  # *_map.tsv

map_df = pd.read_csv(map_file, sep='\t', header=None)
# Keep only the rows whose column 6 equals 'transcript'.
is_transcript = map_df[6] == 'transcript'
map_dict = map_df.loc[is_transcript, [1, 3, 4]].drop_duplicates()
map_dict.columns = ['transcript_id', 'Lncbook_trans_id', 'Lncbook_id']  # You can modify the corresponding column names.

update_sql = text("""
            UPDATE trans 
            SET Lncbook_trans_id = :Lncbook_trans_id, 
                Lncbook_id = :Lncbook_id
            WHERE transcript_id = :transcript_id
            """)

# Each row uses its own connection and commit, so a failing row does not
# affect the others.
for _, lnc_row in map_dict.iterrows():
    try:
        with engine.connect() as conn:
            params = {
                "transcript_id": lnc_row['transcript_id'],
                "Lncbook_id": lnc_row['Lncbook_id'],
                "Lncbook_trans_id": lnc_row['Lncbook_trans_id'],
            }
            result = conn.execute(update_sql, params)
            rows_affected = result.rowcount
            conn.commit()
    except Exception as e:
        print(f"update fail: {lnc_row['transcript_id']}, error: {str(e)}")

## Step7: import CRISPR experiment record `/curated/exp_crispr.csv`

In [None]:

# Append the CRISPR experiment records to the exp_crispr table.
# NOTE(review): this reads ../store/exp_crispr.csv while the reason-summary
# step reads ../curated/exp_crispr.csv -- confirm which copy is intended.
exp_df = pd.read_csv('../store/exp_crispr.csv')

exp_df.to_sql('exp_crispr',
              engine,
              if_exists='append',
              index=False,
              chunksize=1000)  # insert in batches of 1000 rows

# The message names the table that was actually written.
print(f"Successfully import {len(exp_df)} records to exp_crispr table.")

### step7.1 
Group the records by the `exp_type` and `target_id` columns and update the `role` column for each group: groups with a count of 1 are marked 'cell-line specific', groups with a count of 2-5 are marked 'common essential', and groups with a count greater than 5 are marked 'core essential'.

In [None]:
import pandas as pd
from sqlalchemy import text


# Label every (exp_type, target_id) group in exp_crispr by its number of rows.
sql_select = text("""
    SELECT exp_type, target_id 
    FROM exp_crispr
""")

update_sql = text("""
        UPDATE exp_crispr
        SET role = :etype 
        WHERE exp_type = :exp AND target_id = :tid
    """)

with engine.connect() as conn:
    df = pd.read_sql(sql_select, conn)

    # Rows per (exp_type, target_id) pair.
    group_counts = df.groupby(['exp_type', 'target_id']).size()

    # 1 row -> cell-line specific, 2-5 rows -> common, more than 5 -> core.
    cell_specific = group_counts[group_counts == 1].index
    common = group_counts[group_counts.between(2, 5)].index
    core = group_counts[group_counts > 5].index

    labelled_groups = (
        ("cell-line specific", cell_specific),
        ("common essential", common),
        ("core essential", core),
    )
    for etype, pairs in labelled_groups:
        for exp, tid in pairs:
            conn.execute(update_sql, {"etype": etype, "exp": exp, "tid": tid})

    conn.commit()

### Step7.2: Assign generated UIDs.

In [None]:

# Copy the generated UIDs onto exp_crispr, matching target_id to the target
# column of the mapping file.
map_file = 'crispr_mapping.tsv'  # *_map.tsv

map_df = pd.read_csv(map_file, sep='\t', names=['UID', 'target'])

update_sql = text("""
            UPDATE exp_crispr 
            SET UID= :UID
            WHERE target_id = :target
            """)

# Each mapping row uses its own connection and commit, so one failing row
# does not affect the others.
for _, lnc_row in map_df.iterrows():
    try:
        with engine.connect() as conn:
            params = {
                "UID": lnc_row['UID'],
                "target": lnc_row['target'],
            }
            result = conn.execute(update_sql, params)
            rows_affected = result.rowcount
            conn.commit()
    except Exception as e:
        print(f"update fail: {lnc_row['target']}, error: {str(e)}")

### step7.3: Update the PUBMED ID of the esslnc table.

In [None]:
# Merge the PMIDs recorded in exp_crispr into the comma-separated PMID column
# of esslnc, matching rows by UID.
# DISTINCT collapses the many exp_crispr rows that share one (UID, PMID) pair,
# so each pair is processed once.
query = text("""
    SELECT DISTINCT UID, PMID
    FROM exp_crispr
    WHERE PMID IS NOT NULL
""")

# FIND_IN_SET leaves the column unchanged when the PMID is already one of its
# comma-separated items, so re-running this step (or running it after the
# reason-summary step has stored the same PMIDs) no longer appends duplicates.
update_sql = text("""
    UPDATE esslnc
    SET PMID = CASE
        WHEN PMID IS NULL THEN :pmid
        WHEN FIND_IN_SET(:pmid, PMID) > 0 THEN PMID
        ELSE CONCAT(PMID, ',', :pmid)
    END
    WHERE UID = :uid
""")

with engine.connect() as conn:
    rows = conn.execute(query).fetchall()

    for uid, pmid in rows:
        conn.execute(update_sql, {"pmid": pmid, "uid": uid})

    conn.commit()

print("PMID updated successfully")

## Step8: Import the variant mapping table and variants
The required files.
1. /clinvar_map/db/construct_map/disease_mapping.csv
2. /clinvar_map/db/construct_map/crispr_variant_mapping.csv
3. /clinvar_map/db/crispr_overlap/variants_nocrispr.csv
4. /clinvar_map/db/crispr_map/crispr_variants.csv


In [None]:
# Import variant information and mapping tables into the database.
import pandas as pd
from sqlalchemy import text

# Insert the rows of the variants file into lncrna_variant_mapping; the CSV
# header row supplies both the column list and the bind-parameter names.
variants_file = 'disease_mapping.csv'
headers = pd.read_csv(variants_file, nrows=0).columns.tolist()

# table name variants/lncrna_variant_mapping
sql = text(f"""
    INSERT IGNORE INTO lncrna_variant_mapping
    ({', '.join(headers)})
    VALUES ({', '.join([':' + col for col in headers])})
""")

# total_rows counts statements that executed without raising; success_rows
# sums their rowcount, so the difference is the rows INSERT IGNORE dropped.
total_rows = 0
success_rows = 0

for chunk in pd.read_csv(variants_file, chunksize=1000):
    with engine.connect() as conn:
        for _, row in chunk.iterrows():
            try:
                result = conn.execute(sql, row.to_dict())
                success_rows += result.rowcount
                total_rows += 1
            except Exception as e:
                # Report the failure, as the other CSV import cells do,
                # instead of discarding it silently.
                print(e)
                continue
        conn.commit()
print(f"Total processed: {total_rows}")
print(f"Successfully imported: {success_rows}")
print(f"Skipped duplicates: {total_rows - success_rows}")



### Step8.2: Marking whether lncRNAs are associated with diseases

In [None]:
# Set disease_related = 1 on every esslnc row whose UID occurs in
# lncrna_variant_mapping.
update_sql = text("""
    UPDATE esslnc 
    SET disease_related = 1
    WHERE UID IN (
        SELECT DISTINCT UID 
        FROM lncrna_variant_mapping
    )
    """)

with engine.connect() as conn:
    conn.execute(update_sql)
    conn.commit()


## Step9: import expression profile
The required file
1. /clinvar_map/db/exp/exp_profile.csv

In [None]:
# import all lncRNA expression profile

import pandas as pd
from sqlalchemy import text

# Insert the rows of exp_profile.csv into exp_profile; the CSV header row
# supplies both the column list and the bind-parameter names.
expression_file = 'exp_profile.csv'
headers = list(pd.read_csv(expression_file, nrows=0).columns)
print(headers)

column_list = ', '.join(headers)
placeholder_list = ', '.join(f':{col}' for col in headers)
sql = text(f"""
    INSERT IGNORE INTO exp_profile
    ({column_list})
    VALUES ({placeholder_list})
""")

# total_rows counts statements that executed without raising; success_rows
# sums their rowcount, so the difference is the rows INSERT IGNORE dropped.
total_rows = 0
success_rows = 0

for chunk in pd.read_csv(expression_file, chunksize=1000):
    # One connection and one commit per 1000-row chunk.
    with engine.connect() as conn:
        for _, record in chunk.iterrows():
            try:
                result = conn.execute(sql, record.to_dict())
            except Exception as e:
                print(e)
            else:
                success_rows += result.rowcount
                total_rows += 1
        conn.commit()

print(f"Total processed: {total_rows}")
print(f"Successfully imported: {success_rows}")
print(f"Skipped duplicates: {total_rows - success_rows}")

## Step10: All NULL entries in the database should be changed to N.A.

In [None]:
# Replace NULL with 'N.A.' in the listed text columns of esslnc.
with engine.connect() as conn:
    update_esslnc = text("""
    UPDATE esslnc 
    SET 
        NCBI_id = COALESCE(NCBI_id, 'N.A.'),
        gene_name = COALESCE(gene_name, 'N.A.'),
        Alias = COALESCE(Alias, 'N.A.'),
        ensembl_id = COALESCE(ensembl_id, 'N.A.'),
        Noncode_id = COALESCE(Noncode_id, 'N.A.'),
        Lncbook_id = COALESCE(Lncbook_id, 'N.A.'),
        reason = COALESCE(reason, 'N.A.'),
        Go_annotation = COALESCE(Go_annotation, 'N.A.'),
        target = COALESCE(target, 'N.A.')
    """)
    conn.execute(update_esslnc)
    # The only commit used to sit inside the disabled trans block below, so
    # this UPDATE was never committed before the connection closed.
    conn.commit()

    # Disabled: the same NULL -> 'N.A.' replacement for the trans table.
    # update_trans = text("""
    # UPDATE trans
    # SET 
    #     Noncode_id = COALESCE(Noncode_id, 'N.A.'),
    #     Noncode_trans_id = COALESCE(Noncode_trans_id, 'N.A.'),
    #     Lncbook_id = COALESCE(Lncbook_id, 'N.A.'),
    #     Lncbook_trans_id = COALESCE(Lncbook_trans_id, 'N.A.')
    # """)
    # conn.execute(update_trans)
    # conn.commit()

print("NULL2N.A. Update successfully")

In [None]:

# import Genomic location information.
import pandas as pd

# Input CSV with (at least) the columns transcript_id and UID; switch the
# file name as needed.
custum_bed_file = 'lncRNA_with_transcripts.csv'
# Transcript sequence file (assigned here but not read in this cell).
seq_file = 'lncRNAV2.fasta'

# mapping_df = pd.read_csv('crispr_mapping.tsv', sep='\t',names=['UID','target'])
# target_to_uid = {}

# for _, row in mapping_df.iterrows():
#     target_to_uid[row['target']] = row['UID']

df = pd.read_csv(custum_bed_file, sep=',')


def process_transcript(row):
    """Write ``row['UID']`` onto the trans rows whose transcript_id matches.

    Prints the UID and the transcript id, then runs and commits one UPDATE on
    a fresh connection. Returns None.
    """
    UID = row['UID']
    transcript_id = row['transcript_id']
    print(UID)
    print(transcript_id)

    update_sql = text("""
            UPDATE trans
            SET UID = :UID
            WHERE transcript_id = :transcript_id
        """)
    params = {"UID": UID, "transcript_id": transcript_id}
    with engine.connect() as conn:
        conn.execute(update_sql, params)
        conn.commit()


# process_transcript returns None, so result_df only mirrors the row count.
outcomes = []
for _, record in df.iterrows():
    outcomes.append(process_transcript(record))
result_df = pd.DataFrame(outcomes)



## Generate the FA file used by the BLAST service

In [None]:
# Generate the FA file used by the BLAST service
import pandas as pd
from sqlalchemy import text

# Select transcript_id and FASTA for every transcript that has FASTA data.
query = text("""
    SELECT transcript_id, FASTA 
    FROM trans 
    WHERE FASTA IS NOT NULL
""")

output_file = 'lncRNA2.fasta'

# Number of records actually written, which can be lower than the number of
# rows fetched.
written = 0

with engine.connect() as conn:
    result = conn.execute(query)
    rows = result.fetchall()

    with open(output_file, 'w') as f:
        for transcript_id, fasta_content in rows:
            if fasta_content and '<br/>' in fasta_content:
                # The sequence is everything after the first <br/>.
                sequence = fasta_content.split('<br/>', 1)[1]

                # The transcript import stores this placeholder when no
                # sequence was found; it must not reach the BLAST database.
                if sequence == 'Sequence not found':
                    print(f"Warning: No sequence stored for {transcript_id}")
                    continue

                # Write a standard two-line FASTA record.
                f.write(f">{transcript_id}\n")
                f.write(f"{sequence}\n")
                written += 1
            else:
                print(f"Warning: Invalid FASTA format for {transcript_id}")

print(f"Successfully generated {output_file} with {written} sequences")