In [1]:
import pandas as pd
import pingouin as pg
from statsmodels.stats.multitest import multipletests
import numpy as np
from scipy.stats import *

In [12]:
# Delete genes if half or more of them have a 0 value and if not turn to NaN values
def delete_0s(df):
    deleted_cols = df.columns[(df == 0).sum() >= len(df)/2]
    df = df.drop(columns = deleted_cols)
    print(f'Columnas eliminadas {deleted_cols}')if len(deleted_cols) else None
    # df = df.replace(0, np.nan)
    return df
    
def delete_outliers(df):
    
    means = df.mean()
    std_devs = df.std()
    
    for column in df.columns:
        mean = means[column]
        std_dev = std_devs[column]     
        
        # Turn to 0 outliers, an outlier is over/below the mean+2*desv
        df.loc[(df[column] > mean + 2 * std_dev) | (df[column] < mean - 2 * std_dev), column] = np.nan
        #print(column)

def metrics(df_young, df_old):
    p_values_ttest, normality_young, normality_old, homoscedasticity, p_values_anova = [], [], [], [], []
    
    for column in df_young.columns:
        young_values = df_young[column].dropna().tolist()
        old_values = df_old[column].dropna().tolist()
        
        # t- student P-value
        # result = pg.ttest(young_values.dropna().tolist(), old_values.dropna().tolist(), correction=False)
        result = pg.ttest(young_values, old_values, correction=False)
        p_value = result['p-val'].iloc[0] 
        p_values_ttest.append(p_value)

        # normality by groups
        _, p_value = shapiro(young_values)
        normality_young.append(p_value)
        _, p_value = shapiro(old_values)
        normality_old.append(p_value)

        # homocedasticity
        _, p_value = levene(young_values, old_values)
        homoscedasticity.append(p_value)

        # ANOVA
        _, p_value = f_oneway(young_values, old_values)
        p_values_anova.append(p_value)
    

    # Adjusted p-values
    # adjusted_p_values_ttest = multipletests(p_values_ttest, method='fdr_bh')[1]
    # adjusted_p_values_anova = multipletests(p_values_anova, method='fdr_bh')[1]
    
    result_df = pd.DataFrame({
        'Normality_young': normality_young,
        'Normality_old': normality_old,
        'Homocedasticity': homoscedasticity,
        'P-value ANOVA': p_values_anova,
        # 'Adjusted_p-value ANOVA': adjusted_p_values_anova,
        'P-value T-test': p_values_ttest,
        # 'Adjusted_p-value T-test': adjusted_p_values_ttest,
        'Fold_change': df_old.mean(axis=0)/df_young.mean(axis=0),
        'Mean_young': df_young.mean(axis=0),
        'Mean_old': df_old.mean(axis=0),
        'Std_young': df_young.std(axis=0),
        'Std_old': df_old.std(axis=0),
    })
    result_df = result_df.transpose()
    return(result_df)

def normalize_columns(df):
    for col in df.columns:
        df[col] = np.log10(df[col])
        # mask = ~df[col].isnull()
        # transformed_values, _ = boxcox(df[col][mask]) 
        # df[col][mask] = transformed_values
    return df

In [14]:
df= pd.read_excel('PATH', sheet_name='Data_4')
demographic_data = pd.read_excel('PATH', sheet_name='Demographic_data_2')
df = df.replace(0, np.nan)
df = df.loc[df['Disease'] == 'C']

# Merge both df with the sex info of each sample
demographic_data = demographic_data[['Sample', 'Edad']]
df = pd.merge(demographic_data, df, how='inner', on='Sample')

df.set_index('Sample', inplace=True)
df = df.drop(columns=['Type', 'Disease'])
# print(df)

# Split by sex
df_young = df.loc[df['Edad'] < 40]
# df_young = df_young.drop(columns=['Edad'])
df_old = df.loc[df['Edad'] >= 40]
# df_old = df_old.drop(columns=['Edad'])
demographic_data = pd.concat([df_young.pop('Edad'), df_old.pop('Edad')], axis=0)
# print( df_young, df_old)

# Delete outliers
delete_outliers(df_young)
delete_outliers(df_old)

# Delete mewtabolites if half or more of them have a 0 value and if not turn to NaN values
df_young = delete_0s(df_young)
df_old = delete_0s(df_old)
# print( df_female, df_male)

# Metabolites' names without outliers
col_names_no_OLs = df_young.columns.intersection(df_old.columns).tolist()

print(len(col_names_no_OLs))

# Delete columns removed from the other df
df_young_no_OLs = df_young.loc[:, df_young.columns.isin(col_names_no_OLs)]
df_old_no_OLs = df_old.loc[:, df_old.columns.isin(col_names_no_OLs)]

# Normalize
df_young_no_OLs = normalize_columns(df_young_no_OLs.copy())
df_old_no_OLs = normalize_columns(df_old_no_OLs.copy())
# print(df_control_no_OLs, df_treated_no_OLs)

# Join dfs
df_no_OLs =  pd.concat([df_young_no_OLs, df_old_no_OLs])
# df_no_OLs.to_excel('data_no_OLs.xlsx')
# print(df_no_OLs)

df_no_OLs = pd.concat([demographic_data, df_no_OLs], axis=1)

# Linearity
correlation_results = {}
associated_p_value = {}
for col in df_no_OLs.columns:
    if col != 'Edad':
        # Ignore Nan values
        mask = ~df_no_OLs['Edad'].isnull() & ~df_no_OLs[col].isnull()
        correlation_coefficient, p_value = pearsonr(df_no_OLs['Edad'][mask], df_no_OLs[col][mask]) #Apply the mask
        correlation_results[col] = correlation_coefficient
        associated_p_value[col] = p_value        
df_no_OLs.loc['linearity'] = pd.Series(correlation_results)
df_no_OLs.loc['associated p-value'] = pd.Series(associated_p_value)
print(df_no_OLs)
df_no_OLs = df_no_OLs.drop(columns=['Edad'])


# Get metrics
metrics = metrics(df_young_no_OLs, df_old_no_OLs)
results = pd.concat([df_no_OLs, metrics])
# print(results)

# col_names = results.columns[(results.loc['Fold_change'] >= 1.3) | (results.loc['Fold_change'] <= 1/1.3) ]
# col_names = results.columns[(results.loc['Fold_change'] >= 1.15) | (results.loc['Fold_change'] <= 1/1.15) ]
col_names = results.columns
print(len(col_names))

results_filtered = results[col_names]
p_values_anova = results_filtered.loc['P-value ANOVA'].tolist()
p_values_ttest = results_filtered.loc['P-value T-test'].tolist()
adjusted_p_values_anova = multipletests(p_values_anova, method='fdr_bh')[1]
adjusted_p_values_ttest = multipletests(p_values_ttest, method='fdr_bh')[1]

results_filtered = results_filtered.copy()
results_filtered.loc['Adjusted_p-value ANOVA'] = adjusted_p_values_anova
results_filtered.loc['Adjusted_p-value T-test'] = adjusted_p_values_ttest

print(results_filtered)

# results_filtered.to_excel('results_CTRL.xlsx')
#'''

49
                    Edad        C0        C2        C3  C3-DC (C4-OH)  \
Sample                                                                  
c1                  33.0  0.828946  0.010959 -0.903863      -2.246672   
c2                  24.0  0.869232 -1.191564 -1.405239      -2.397940   
c5                  26.0  0.892095 -0.063821 -0.731266      -2.273001   
c10                 28.0  1.001445  0.204120 -0.527487      -2.187087   
c11                 32.0  0.757948 -0.068265 -0.851061      -2.000000   
RF_138              27.0  0.926857  0.201397 -0.853097      -2.154902   
RF_142              33.0  0.957847  0.146903 -0.671620      -2.110698   
RF_143              35.0  0.935759  0.270096 -0.639691      -2.083546   
c3                  47.0  0.888367  0.068186 -0.644612      -2.154902   
c4                  65.0  0.873127 -0.077448 -0.859073      -2.198368   
c6                  66.0  0.927712  0.015639 -0.822945      -2.079181   
c7                  53.0       NaN -0.420216 -1.

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[(df[column] > mean + 2 * std_dev) | (df[column] < mean - 2 * std_dev), column] = np.nan


TypeError: 'DataFrame' object is not callable

In [54]:
df= pd.read_excel('PATH', sheet_name='Data_3')
df.set_index('Sample', inplace=True)
old = df.loc[df['Type'] == 'VIEJO']
new = df.loc[df['Type'] == 'NUEVO']

new_control = new.loc[df['Disease'].str.startswith('C')]
new_control = new_control.drop(columns=['Disease', 'Type'])
new_treated = new.loc[df['Disease'].str.startswith('P')]
new_treated = new_treated.drop(columns=['Disease', 'Type'])
old_control = old.loc[df['Disease'].str.startswith('C')]
old_control = old.drop(columns=['Disease', 'Type'])
old_treated = old.loc[df['Disease'].str.startswith('P')]
old_treated = old.drop(columns=['Disease', 'Type'])


result_df = pd.DataFrame({
        'Mean_control_new': new_control.mean(axis=0),
        'Mean_control_old': old_control.mean(axis=0),
        'Mean_PD_new': new_treated.mean(axis=0),
        'Mean_PD_old': old_treated.mean(axis=0),
        'Std_control_new': new_control.std(axis=0),
        'Std_control_old': old_control.std(axis=0),
        'Std_PD_new': new_treated.std(axis=0),
        'Std_PD_old': old_treated.std(axis=0),
    })
result_df = result_df.transpose()
result_df.to_excel('Means_desv.xlsx')
