In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

# Map each dataset name to the path of its CSV file
datasets = {
    accession: f'/content/{accession}.csv'
    for accession in ('GSE210787', 'GSE229571', 'GSE271851')
}

# Cut-offs applied when filtering genes
logFC_threshold, adj_pvalue_threshold = 0.5, 0.05  # adjusted p-value below 0.05 counts as significant
# A gene has to show up in at least this many datasets to be considered duplicated
min_files = 2

def filter_genes(data, logfc_cutoff=None, adj_p_cutoff=None):
    """Return the set of gene symbols that pass the logFC and adj.P.Val filters.

    A row passes when its absolute ``logFC`` is at least ``logfc_cutoff`` and
    its ``adj.P.Val`` is strictly below ``adj_p_cutoff``.

    Args:
        data: DataFrame with ``logFC``, ``adj.P.Val`` and ``Gene.symbol`` columns.
        logfc_cutoff: Minimum absolute logFC. Defaults to the module-level
            ``logFC_threshold`` when omitted.
        adj_p_cutoff: Exclusive upper bound on adj.P.Val. Defaults to the
            module-level ``adj_pvalue_threshold`` when omitted.

    Returns:
        Set of the ``Gene.symbol`` values of the passing rows, with missing
        symbols left out.
    """
    if logfc_cutoff is None:
        logfc_cutoff = logFC_threshold
    if adj_p_cutoff is None:
        adj_p_cutoff = adj_pvalue_threshold

    passes_fold_change = data['logFC'].abs() >= logfc_cutoff
    passes_significance = data['adj.P.Val'] < adj_p_cutoff
    symbols = data.loc[passes_fold_change & passes_significance, 'Gene.symbol']

    # Rows without a gene symbol are read as NaN; drop them so that a missing
    # value is never treated as a gene in the cross-dataset comparison.
    return set(symbols.dropna())

def get_duplicated_genes(datasets, min_files):
    """Find the genes that pass the filter in at least ``min_files`` datasets.

    Args:
        datasets: Mapping of dataset name to the path of its CSV file.
        min_files: Minimum number of datasets a gene must appear in.

    Returns:
        A tuple ``(duplicated_genes, genes_in_datasets)``: the set of genes
        found in at least ``min_files`` datasets, and a dict mapping each
        dataset name to the set of genes returned by ``filter_genes`` for it.
        An empty ``datasets`` mapping yields ``(set(), {})``.
    """
    from collections import Counter

    # Filtered gene set for every dataset, keyed by dataset name
    genes_in_datasets = {name: filter_genes(pd.read_csv(path)) for name, path in datasets.items()}

    # Each dataset contributes a set, so a gene is counted at most once per
    # dataset. One counting pass also works when there are no datasets at all,
    # where set.union(*[]) would raise a TypeError.
    occurrences = Counter(gene for genes in genes_in_datasets.values() for gene in genes)
    duplicated_genes = {gene for gene, count in occurrences.items() if count >= min_files}
    return duplicated_genes, genes_in_datasets

def plot_volcano(datasets, duplicated_genes, logFC_threshold, adj_pvalue_threshold):
    """Write one interactive volcano plot (HTML file) per dataset.

    Each dataset CSV is read again from disk and plotted as logFC against
    -log10(adj.P.Val). Rows whose ``Gene.symbol`` is in ``duplicated_genes``
    are coloured light red (logFC >= threshold) or light blue
    (logFC <= -threshold); every other row is light gray.

    Args:
        datasets: Mapping of dataset name to the path of its CSV file. The
            name is used as the plot title and as the output file prefix.
        duplicated_genes: Collection of gene symbols to highlight.
        logFC_threshold: logFC cut-off used for colouring and for the two
            vertical guide lines.
        adj_pvalue_threshold: Adjusted p-value cut-off drawn as the horizontal
            guide line.

    Returns:
        None. Writes ``<name>_volcano_plot.html`` for every dataset.
    """
    for title, path in datasets.items():
        # Load dataset
        data = pd.read_csv(path)

        # Add -log10(adj.P.Val) column. An adjusted p-value of exactly 0 would
        # give -log10(0) = inf (NumPy warns "divide by zero encountered in
        # log10"), i.e. a y value that has no finite position on the axis.
        # Floor the p-values at the smallest positive value in the dataset so
        # those rows sit level with the most significant finite point.
        adj_p = data['adj.P.Val']
        positive_p = adj_p[adj_p > 0]
        p_floor = positive_p.min() if not positive_p.empty else np.finfo(float).tiny
        data['-log10(adj.P.Val)'] = -np.log10(adj_p.clip(lower=p_floor))

        # Default color is light gray
        data['color'] = 'lightgray'

        # Highlight duplicated genes with appropriate colors.
        # NOTE(review): only the logFC cut-off is applied here, so a highlighted
        # gene can lie below the significance line in this dataset - confirm
        # that this is intended.
        is_duplicated = data['Gene.symbol'].isin(duplicated_genes)
        data.loc[is_duplicated & (data['logFC'] >= logFC_threshold), 'color'] = 'lightcoral'  # Light red
        data.loc[is_duplicated & (data['logFC'] <= -logFC_threshold), 'color'] = 'lightskyblue'  # Light blue

        # Create the plot
        fig = px.scatter(
            data,
            x='logFC',
            y='-log10(adj.P.Val)',
            color='color',
            hover_data={'Gene.symbol': True, 'logFC': True, '-log10(adj.P.Val)': True, 'color': False},
            title=title,
            labels={
                'logFC': 'log2 Fold Change',
                '-log10(adj.P.Val)': '-log10(Adjusted p-value)'
            },
            # The values of the 'color' column are already color names, so map each to itself
            color_discrete_map={'lightcoral': 'lightcoral', 'lightskyblue': 'lightskyblue', 'lightgray': 'lightgray'}
        )

        # Add threshold lines
        fig.add_hline(y=-np.log10(adj_pvalue_threshold), line_dash="dash", line_color="gray", annotation_text="Adj. p-value threshold")
        fig.add_vline(x=logFC_threshold, line_dash="dash", line_color="gray", annotation_text="LogFC threshold")
        fig.add_vline(x=-logFC_threshold, line_dash="dash", line_color="gray", annotation_text="LogFC threshold")

        fig.update_traces(marker=dict(size=8, opacity=0.7))
        fig.update_layout(showlegend=False)

        # Save the plot as HTML file (one per dataset)
        fig.write_html(f"{title}_volcano_plot.html")

# Step 1: collect the genes that pass the filter in at least `min_files` datasets
duplicated_genes, genes_in_datasets = get_duplicated_genes(
    datasets=datasets,
    min_files=min_files,
)

# Step 2: write a volcano plot for every dataset, highlighting those genes
plot_volcano(
    datasets=datasets,
    duplicated_genes=duplicated_genes,
    logFC_threshold=logFC_threshold,
    adj_pvalue_threshold=adj_pvalue_threshold,
)



divide by zero encountered in log10



In [None]:
# Build a one-column table of the duplicated genes. The symbols are sorted so
# the table (and any file written from it) has the same row order on every run;
# iterating the set directly gives an arbitrary order. key=str keeps the sort
# from failing if a non-string value such as NaN is present in the set.
duplicated_genes_df = pd.DataFrame(sorted(duplicated_genes, key=str), columns=['Gene.symbol'])

In [None]:
# Bare expression: shows the duplicated-gene table as the cell output
duplicated_genes_df

Unnamed: 0,Gene.symbol
0,ISG20
1,TMEM220
2,IQCJ-SCHIP1
3,DMRT2
4,EDNRB
...,...
1278,NHERF1
1279,RNFT1
1280,CORO1A
1281,LOC100129697


In [None]:
# Write the duplicated-gene table to CSV without the index column, then confirm on stdout
duplicated_genes_df.to_csv(path_or_buf='Step1_China.csv', index=False)
print("Duplicated genes data saved as 'Step1_China.csv'.")


Duplicated genes data saved as 'Step1_China.csv'.
