In [1]:
import pandas as pd
import requests
import csv
import time



In [2]:
# Load 'pubmed_concepts.csv' into a DataFrame; the cells below read its
# 'gene_id' column, which they split on ';'.
df = pd.read_csv('pubmed_concepts.csv')

In [3]:
# Count the semicolon-separated IDs in each 'gene_id' value.
# The vectorised .str.len() is used instead of .apply(len) so that a missing
# gene_id (NaN after the split) yields NaN rather than raising TypeError.
# NaN fails every numeric comparison, so such rows are excluded from both the
# "> 1" subset here and the "== 1" subset built in the next cell.
df['gene_id_count'] = df['gene_id'].str.split(';').str.len()

# Rows that reference more than one gene ID
result = df[df['gene_id_count'] > 1]

In [4]:
# Keep only the rows that carry exactly one gene ID, and discard the helper
# 'gene_id_count' column in the same step since it only existed for filtering.
filtered_df = (
    df.loc[df['gene_id_count'].eq(1)]
    .drop(columns='gene_id_count')
)

In [5]:
# Save the single-gene rows to CSV. index=False keeps the DataFrame index out
# of the file. This file is later passed to retrieve_gene_info() as its input.
filtered_df.to_csv('filtered_gene_data.csv', index = False)

In [6]:
## Fetch Alias and Symbol

In [7]:
BATCH_SIZE = 500  # Number of gene IDs sent in a single POST request
SLEEP_TIME = 2  # Seconds to wait before retrying a failed request
MAX_RETRIES = 3  # Maximum number of attempts per batch
REQUEST_TIMEOUT = 30  # Seconds before an unresponsive request is abandoned


def _read_input_gene_ids(input_file):
    """Return the set of non-empty values found in the 'gene_id' column of input_file."""
    with open(input_file, newline='') as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if header is None:  # completely empty file: nothing to look up
            return set()
        gene_id_index = header.index('gene_id')  # ValueError if the column is absent
        return {
            row[gene_id_index]
            for row in reader
            if len(row) > gene_id_index and row[gene_id_index]
        }


def _read_existing_gene_ids(output_file):
    """Return (gene IDs already stored in output_file, whether it has a header row).

    A missing file is treated as "nothing stored yet" and yields (set(), False).
    """
    try:
        with open(output_file, newline='') as f:
            reader = csv.DictReader(f)
            # fieldnames is None for an empty file, so the header still has to be written.
            has_header = bool(reader.fieldnames)
            return {row['gene_id'] for row in reader}, has_header
    except FileNotFoundError:
        return set(), False


def _fetch_batch(batch_ids):
    """POST one batch of IDs; return the decoded JSON, or None if every attempt fails."""
    query_params = {
        'ids': ','.join(batch_ids),
        'fields': 'symbol,alias'
    }
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = requests.post(
                'https://mygene.info/v3/gene',
                json=query_params,
                timeout=REQUEST_TIMEOUT,
            )
            if response.status_code == 200:
                return response.json()
            problem = f"HTTP {response.status_code}"
        except (requests.RequestException, ValueError) as exc:
            # Network failures and undecodable bodies are retried like a bad status.
            problem = repr(exc)
        print(f"Request attempt {attempt}/{MAX_RETRIES} failed: {problem}")
        if attempt < MAX_RETRIES:
            time.sleep(SLEEP_TIME)
    return None


def _gene_to_row(gene):
    """Convert one API result dict into an output row, using "None" for missing values."""
    alias_value = gene.get('alias') or []
    if isinstance(alias_value, str):
        # A lone alias arrives as a plain string; joining it directly would
        # insert a comma between every character.
        alias_value = [alias_value]
    return {
        'gene_id': gene.get('query'),  # 'query' echoes the ID that was requested
        'symbol': gene.get('symbol') or "None",
        'alias': ','.join(alias_value) or "None",
    }


def retrieve_gene_info(input_file, output_file):
    """Fetch symbol and alias for the gene IDs in input_file and append them to output_file.

    Gene IDs are read from the 'gene_id' column of ``input_file``. IDs that are
    already present in ``output_file`` are not requested again. The remaining
    IDs are sent to the mygene.info ``/v3/gene`` endpoint in batches of
    ``BATCH_SIZE``; each batch is attempted up to ``MAX_RETRIES`` times. A batch
    that never succeeds is reported and skipped, so its IDs are retried on the
    next run because they never reach the output file.

    Args:
        input_file: Path of a CSV file with a header row containing 'gene_id'.
        output_file: Path of the CSV file ('gene_id', 'symbol', 'alias') to
            append to. It is created, with a header, if missing or empty.

    Returns:
        None. A short summary is printed.

    Raises:
        ValueError: If ``input_file`` has no 'gene_id' column.
        KeyError: If an existing ``output_file`` has no 'gene_id' column.
    """
    gene_ids = _read_input_gene_ids(input_file)
    existing_gene_ids, output_has_header = _read_existing_gene_ids(output_file)

    # Sorted once so batches are deterministic and the set is not re-listed per batch.
    gene_ids_to_retrieve = sorted(gene_ids - existing_gene_ids)

    output_rows = []
    total_gene_ids_retrieved = 0
    batch_number = 0

    for start in range(0, len(gene_ids_to_retrieve), BATCH_SIZE):
        batch_number += 1
        genes = _fetch_batch(gene_ids_to_retrieve[start:start + BATCH_SIZE])
        if genes is None:
            print(f"Batch {batch_number} skipped after {MAX_RETRIES} failed attempts")
            continue
        total_gene_ids_retrieved += len(genes)
        output_rows.extend(_gene_to_row(gene) for gene in genes)

    with open(output_file, 'a', newline='') as f:
        writer = csv.DictWriter(f, ['gene_id', 'symbol', 'alias'])
        if not output_has_header:
            # Decided by the file's state, not the batch count, so the header
            # appears exactly once however many batches or runs there are.
            writer.writeheader()
        writer.writerows(output_rows)

    print(f"Total Batches Processed: {batch_number}")
    print(f"Total Gene IDs Retrieved: {total_gene_ids_retrieved}")
    print(f"Gene IDs Skipped (Already Retrieved): {len(gene_ids & existing_gene_ids)}")

In [8]:
# Usage: look up symbol/alias for the gene IDs in filtered_gene_data.csv and
# append the results to results.csv. IDs already present in results.csv are
# not requested again.
retrieve_gene_info('filtered_gene_data.csv', 'results.csv')

Total Batches Processed: 0
Total Gene IDs Retrieved: 0
Gene IDs Skipped (Already Retrieved): 391


In [None]:
# Combine into one CSV

In [9]:
input_file1 = 'filtered_gene_data.csv'
input_file2 = 'results.csv'
output_file = 'combined_data.csv'

# Build a lookup table from results.csv: gene_id -> {'symbol': ..., 'alias': ...}.
# If a gene_id occurs more than once, the last occurrence wins.
with open(input_file2) as results_fh:
    gene_info = {
        record['gene_id']: {'symbol': record['symbol'], 'alias': record['alias']}
        for record in csv.DictReader(results_fh)
    }

# Write the combined file: every row of filtered_gene_data.csv followed by the
# symbol and alias looked up for its gene_id ('None' when there is no match).
with open(output_file, 'w', newline='') as combined_fh:
    combined_writer = csv.writer(combined_fh)
    combined_writer.writerow(['pubmed_id', 'concept_name', 'count', 'gene_id', 'symbol', 'alias'])

    with open(input_file1) as source_fh:
        source_rows = csv.reader(source_fh)
        next(source_rows)  # discard the header row of the source file

        # Each source row is expected to have exactly these four columns.
        for pubmed_id, concept_name, count, gene_id in source_rows:
            match = gene_info.get(gene_id, {})
            combined_writer.writerow([
                pubmed_id,
                concept_name,
                count,
                gene_id,
                match.get('symbol', 'None'),
                match.get('alias', 'None'),
            ])

print(f"Combined data saved to {output_file}.")


Combined data saved to combined_data.csv.


In [10]:
# Reload the combined CSV written by the previous cell so it can be inspected.
cd = pd.read_csv('combined_data.csv')

In [12]:
# Show the last 10 combined rows. A bare last expression uses the notebook's
# rich DataFrame rendering instead of the wrapped plain-text output of print().
cd.tail(10)

      pubmed_id                        concept_name  count  gene_id    symbol  \
1307    1672728                                drb1      5     3123  HLA-DRB1   
1308    1672728                                 b18      4     4713    NDUFB7   
1309    1672728                                sco1      1     6341      SCO1   
1310    1673033   transforming growth factor beta 1      1     7040     TGFB1   
1311    1673033                            tgf-beta      6     7039      TGFA   
1312    1673033                          tgf-beta 1      2     7040     TGFB1   
1313    1673033  retinoblastoma gene product (p105)      1     4790     NFKB1   
1314    1673033                                 p60      1     7984   ARHGEF5   
1315    1673792                                 p53      8     7157      TP53   
1316    1673843                                 cd2     18      914       CD2   

                                                  alias  
1307                          DRB1,HLA-DR1B,HLA-DR

In [14]:
#Enriching the original data with additional gene symbol & alias information from an external API. 