In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [2]:
# Load the raw table and keep a working copy without the rows where 'F' equals 3.
df1 = pd.read_csv('dataset.csv')
mask = df1['F'].eq(3)
df = df1.loc[~mask].copy()


def _first_keyword(label, keywords, fallback):
    """Return the first entry of ``keywords`` that occurs in ``label``, else ``fallback``."""
    return next((word for word in keywords if word in label), fallback)


# Halide family read from the label text: 'Chloride' is checked first, then
# 'Bromide'; every other label is assigned 'Iodide'.
halide_groups = df['Label'].apply(
    lambda label: _first_keyword(label, ('Chloride', 'Bromide'), 'Iodide')
)
df['X_group'] = halide_groups

# Cation family read from the label text: 'Germanium' is checked first, then
# 'Tin'; every other label is assigned 'Lead'.
cation_families = df['Label'].apply(
    lambda label: _first_keyword(label, ('Germanium', 'Tin'), 'Lead')
)
df['B_group'] = cation_families

In [3]:
# Family names recognised in the 'Label' text, in the order they are tried.
_AMMONIUM_FAMILY_NAMES = (
    'Ammonium',
    'Methylammonium',
    'Dimethylammonium',
    'Trimethylammonium',
    'Tetramethylammonium',
    'Ethylammonium',
    'Propylammonium',
    'Isopropylammonium',
    'Butylammonium',
    'Hydroxylammonium',
    'Formamidinium',
    'Acetamidinium',
    'Hydrazinium',
    'Guanidinium',
    'Azetidinium',
    'Imidazolium',
)

# Maps each family name to the regular expression used to detect it; at the
# moment every pattern is simply the literal family name itself.
patterns = {family_name: family_name for family_name in _AMMONIUM_FAMILY_NAMES}

# Resolve a label string to the family whose pattern it contains.
def map_ammonium_family(label):
    """Return the name of the first family in ``patterns`` that matches ``label``.

    Families are tried in the iteration order of the module-level ``patterns``
    dict, each with ``re.search`` (case-sensitive, match anywhere in the
    string). When no pattern matches, the string ``'Unknown'`` is returned.
    """
    matching_families = (
        family
        for family, pattern in patterns.items()
        if re.search(pattern, label)
    )
    return next(matching_families, 'Unknown')

# Tag every row with the family detected in its 'Label' text.
df['A_group'] = df['Label'].map(map_ammonium_family)

In [4]:
# Pairwise interaction labels: each new column joins two existing family
# columns with an underscore (A with X, A with B, and B with X).
combination_sources = {
    'A_X_combination': ('A_group', 'X_group'),
    'A_B_combination': ('A_group', 'B_group'),
    'B_X_combination': ('B_group', 'X_group'),
}
for combined_name, (first_part, second_part) in combination_sources.items():
    df[combined_name] = df[first_part] + '_' + df[second_part]

In [7]:


# Response columns tested with one-way ANOVA.
columns_to_analyze = [
    'A SITE DFE',
    'B SITE DFE',
    'X SITE DFE',
    'Bandgap, GGA (eV)'
]

# Categorical columns whose levels define the ANOVA groups.
group_columns = ['A_group', 'A_X_combination', 'B_X_combination', 'A_B_combination']

# Run one ANOVA per (grouping column, response column) pair and print its F statistic.
for group_col in group_columns:
    # Split the frame once per grouping column instead of re-filtering it for
    # every level; sort=False keeps the levels in first-appearance order.
    grouped = df.groupby(group_col, sort=False)
    for col in columns_to_analyze:
        samples = [values.dropna() for _, values in grouped[col]]
        # A level with no non-null observations would make f_oneway return NaN
        # for the whole test, so such levels are left out.
        samples = [sample for sample in samples if len(sample) > 0]

        # f_oneway needs at least two samples; report and move on rather than
        # aborting the remaining tests.
        if len(samples) < 2:
            print(f"One-Way ANOVA for {group_col} and {col}: skipped (fewer than two non-empty groups)")
            continue

        # Perform One-Way ANOVA
        anova_result = stats.f_oneway(*samples)

        # Print the results
        print(f"One-Way ANOVA for {group_col} and {col}: F={anova_result.statistic}")

One-Way ANOVA for A_group and A SITE DFE: F=25.649653740112367
One-Way ANOVA for A_group and B SITE DFE: F=100.74810067906498
One-Way ANOVA for A_group and X SITE DFE: F=116.29643700864575
One-Way ANOVA for A_group and Bandgap, GGA (eV): F=2.2369851172208945
One-Way ANOVA for A_X_combination and A SITE DFE: F=9.998412958266247
One-Way ANOVA for A_X_combination and B SITE DFE: F=34.902239886784145
One-Way ANOVA for A_X_combination and X SITE DFE: F=38.139064463643024
One-Way ANOVA for A_X_combination and Bandgap, GGA (eV): F=27.702948337919935
One-Way ANOVA for B_X_combination and A SITE DFE: F=4.1235753131805915
One-Way ANOVA for B_X_combination and B SITE DFE: F=1.8122454888650203
One-Way ANOVA for B_X_combination and X SITE DFE: F=1.4155184568468486
One-Way ANOVA for B_X_combination and Bandgap, GGA (eV): F=172.01059998292462
One-Way ANOVA for A_B_combination and A SITE DFE: F=29.24687305472002
One-Way ANOVA for A_B_combination and B SITE DFE: F=96.20169125179724
One-Way ANOVA for A_

In [None]:
# Tukey HSD comparison for every (grouping column, response column) pair,
# using only the rows where the response value is present.
factor_response_pairs = [
    (factor, response)
    for factor in group_columns
    for response in columns_to_analyze
]
for group_col, col in factor_response_pairs:
    try:
        # One mask selects both the values and their group labels, so the two
        # inputs always cover the same rows.
        has_value = df[col].notna()
        tukey = pairwise_tukeyhsd(
            endog=df.loc[has_value, col],
            groups=df.loc[has_value, group_col],
            alpha=0.05
        )
        print(f"Tukey HSD for {group_col} and {col}:")
        print(tukey.summary())
        print("\n" + "-"*80 + "\n")
    except Exception as e:
        # Best effort: report the failing pair and carry on with the rest.
        print(f"Could not perform Tukey HSD for {group_col} and {col}: {e}")