In [1]:
import pandas as pd
import os

# Replace 'your_folder_path' with the path to the folder containing the Excel files
folder_path = '/home/dusty/Desktop/All of Us'

# Initialize an empty DataFrame to store the results
df = pd.DataFrame(columns=['File', 'Sheet Count'])

# Iterate through each file in the folder
for file in os.listdir(folder_path):
    if file.endswith('.xlsx') or file.endswith('.xls'):
        file_path = os.path.join(folder_path, file)
        # Count the number of sheets in the Excel file
        xls = pd.ExcelFile(file_path)
        sheet_count = len(xls.sheet_names)
        # Append the results to the DataFrame
        df = df.append({'File': file, 'Sheet Count': sheet_count}, ignore_index=True)

# Display the DataFrame
print(df)


                                File Sheet Count
0   blackwithout18-50_converted.xlsx          78
1   whitewithout18-40_converted.xlsx          76
2      whitewith66-75_converted.xlsx          78
3     whitewithout63-_converted.xlsx          78
4   whitewithout41-62_converted.xlsx          78
5        whitewith76-_converted.xlsx          75
6      whitewith18-50_converted.xlsx          76
7        blackwith51-_converted.xlsx          78
8      whitewith51-65_converted.xlsx          77
9     blackwithout51-_converted.xlsx          76
10       blackwith18-50_updated.xlsx          76


In [2]:
import pandas as pd
import os

# Specify the path to the folder containing the Excel files
folder_path = '/home/dusty/Desktop/All of Us'

# Step 1: Identify the complete set of unique tab names
all_tabs = set()
for filename in os.listdir(folder_path):
    if filename.endswith('.xlsx') or filename.endswith('.xls'):
        file_path = os.path.join(folder_path, filename)
        xls = pd.ExcelFile(file_path)
        all_tabs.update(xls.sheet_names)

# Step 2: Read each workbook as before
dataframes = {}
for filename in os.listdir(folder_path):
    if filename.endswith('.xlsx') or filename.endswith('.xls'):
        file_path = os.path.join(folder_path, filename)
        df_name = os.path.splitext(filename)[0]
        dataframes[df_name] = pd.read_excel(file_path, sheet_name=None)

# all_tabs now contains the names of all unique tabs across all workbooks
# dataframes is a dictionary with each key being a workbook name and each value being a dictionary of DataFrames


In [3]:
import pandas as pd
import ast

# Function to process DataFrames for each gene within a single population group
def process_genes_for_population(df_dict):
    population_data = []
    
    for gene_name, df in df_dict.items():
        for index, row in df.iterrows():
            allele_count_str = row['Allele Count']
            try:
                # Converting string representation of a list to an actual list and getting the first element
                allele_count = ast.literal_eval(allele_count_str)[0] if allele_count_str else 0
            except (ValueError, SyntaxError):
                allele_count = 0  # Default to 0 in case of error
            
            # Add a row of data for this gene
            population_data.append({
                'Gene-Position': f"{gene_name}_{row['Position']}",
                'Allele Count': allele_count,
                'Allele Number': row['Allele Number']
            })
            
    return pd.DataFrame(population_data)

# Dictionary to hold the DataFrame for each population group
population_dataframes = {}

# Loop through all population groups in the `dataframes` dictionary
for population_name, df_dict in dataframes.items():
    population_dataframes[population_name] = process_genes_for_population(df_dict)

# At this point, `population_dataframes` contains a separate DataFrame for each population group
# You can access them like this:
blackwith18_50_df = population_dataframes['blackwith18-50_updated']  # for example


In [7]:
blackwith18_50_df = population_dataframes['blackwith18-50_updated']  # for example

blackwith18_50_df

Unnamed: 0,Gene-Position,Allele Count,Allele Number
0,ACTA1_229432340,2,3144
1,ABCG8_43839108,207,3144
2,ABCG8_43839131,0,3142
3,ABCG8_43839156,23,3144
4,ABCG8_43839213,17,3144
...,...,...,...
50340,VCL_74112023,10,3144
50341,ZIC3_137567523,1,3144
50342,ZIC3_137567552,94,3144
50343,ZIC3_137567584,1,3144


In [9]:
blackwithout18_50_df = population_dataframes['blackwithout18-50_converted']
blackwithout18_50_df

Unnamed: 0,Gene-Position,Allele Count,Allele Number
0,ACTA1_229432340.0,5,7340.0
1,ACTA1_229432384.0,1,7312.0
2,ABCG8_43839108.0,498,7340.0
3,ABCG8_43839129.0,1,7338.0
4,ABCG8_43839131.0,0,7336.0
...,...,...,...
76854,ZIC3_137567464.0,1,7340.0
76855,ZIC3_137567552.0,224,7340.0
76856,ZIC3_137567573.0,0,7338.0
76857,ZIC3_137567603.0,94,7340.0


In [None]:
population_dataframes.keys()

In [10]:
# Ensure all relevant DataFrames are set with 'Gene-Position' as the index
blackwith18_50_df.set_index('Gene-Position', inplace=True)
blackwithout18_50_df.set_index('Gene-Position', inplace=True)

# List of DataFrames to aggregate. Add more as needed.
dfs_to_aggregate = [blackwith18_50_df, blackwithout18_50_df]

# Concatenate and sum
aggregated_df = pd.concat(dfs_to_aggregate, axis=0).groupby(level=0).sum(min_count=1).fillna(0).astype(int)

# Resetting the index if needed, to use 'Gene-Position' as a column again
aggregated_df.reset_index(inplace=True)

# Display the aggregated DataFrame
print(aggregated_df)


KeyError: "None of ['Gene-Position'] are in the columns"

In [None]:
whitewith66_75_df = population_dataframes['whitewith66-75_converted']

In [None]:
whitewith66_75_df

In [None]:
whitewith18_50_df

In [None]:
whitewith18_50_df

In [None]:
valid_fishers_results

In [None]:
from scipy.stats import fisher_exact
import pandas as pd
import numpy as np

# Function to perform Fisher's Exact Test for a specific gene position
def fishers_test_for_gene_position(gene_position, df1, df2):
    # Extract rows for the specific gene position
    row_df1 = df1.loc[df1['Gene-Position'] == gene_position]
    row_df2 = df2.loc[df2['Gene-Position'] == gene_position]

    # Check if the gene position is found in both dataframes and that no values are NaN
    if not row_df1.empty and not row_df2.empty and \
       not row_df1.isnull().values.any() and not row_df2.isnull().values.any():

        # Build the contingency table with allele counts in one population vs. the other
        contingency_table = [
            [row_df1.iloc[0]['Allele Count'], row_df1.iloc[0]['Allele Number'] - row_df1.iloc[0]['Allele Count']],
            [row_df2.iloc[0]['Allele Count'], row_df2.iloc[0]['Allele Number'] - row_df2.iloc[0]['Allele Count']]
        ]

        # Perform Fisher's Exact Test
        odds_ratio, p_value = fisher_exact(contingency_table, alternative='two-sided')

        return odds_ratio, p_value
    else:
        # Return None if data is incomplete or contains NaN
        return None

# Get the common gene positions between both populations
common_gene_positions = set(whitewith18_50_df['Gene-Position']).intersection(blackwith18_50_df['Gene-Position'])

# Perform Fisher's Exact Test on the common gene positions
fishers_results = {}
for gene_position in common_gene_positions:              #Put one class here, and another here to compare!
    result = fishers_test_for_gene_position(gene_position, blackwith18_50_df, whitewith18_50_df)
    if result:
        fishers_results[gene_position] = result

# Create dataframes from the results, separating valid and invalid results
valid_fishers_results_df = pd.DataFrame([
    {'Gene-Position': gene_pos, 'Odds_Ratio': res[0], 'P_Value': res[1]}
    for gene_pos, res in fishers_results.items() if res is not None
], columns=['Gene-Position', 'Odds_Ratio', 'P_Value'])

# Filter out significant results
alpha = 0.05  # significance level
significant_results_df = valid_fishers_results_df[valid_fishers_results_df['P_Value'] < alpha]

# Display the significant results
print(significant_results_df.sort_values(by='P_Value').head())


In [None]:
significant_results_df