In [1]:
import pandas as pd

from perovskite_prediction_api.common.storage import GoogleDriveStorage
from perovskite_prediction_api.common.credentials import google_credentials

In [2]:
storage = GoogleDriveStorage(google_credentials())
df = storage.download_dataframe("perovskite/prepared/data.csv")
df

Unnamed: 0,Cell_stack_sequence,Cell_area_total,Cell_area_measured,Cell_number_of_cells_per_substrate,Cell_architecture,Cell_flexible,Cell_semitransparent,Cell_semitransparent_wavelength_range,Module,Module_number_of_cells_in_module,...,B_1_coef,B_2_coef,C_1,C_2,C_3,C_4,C_1_coef,C_2_coef,C_3_coef,C_4_coef
0,SLG | ITO | PEDOT:PSS | Perovskite | PCBM-70 |...,,0.06,0,pin,False,False,nan; nan,False,0,...,1,0.0,I,0,0,0,3,0,0.0,0
1,SLG | ITO | PEDOT:PSS | Perovskite | PCBM-70 |...,,0.06,0,pin,False,False,nan; nan,False,0,...,1,0.0,I,0,0,0,3,0,0.0,0
2,SLG | FTO | TiO2-c | Perovskite | Spiro-MeOTAD...,,0.04,0,nip,False,False,nan; nan,False,0,...,1,0.0,I,0,0,0,3,0,0.0,0
3,SLG | FTO | TiO2-c | Perovskite | Spiro-MeOTAD...,,0.04,0,nip,False,False,nan; nan,False,0,...,1,0.0,I,0,0,0,3,0,0.0,0
4,SLG | FTO | TiO2-c | Perovskite | Spiro-MeOTAD...,,0.06,0,nip,False,False,nan; nan,False,0,...,1,0.0,Br,I,0,0,0.51,2.49,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47458,SLG | FTO | TiO2-c | TiO2-mp | Perovskite | Sp...,,0.20,0,nip,False,False,nan; nan,False,0,...,0,0.0,Br,I,0,0,2.16,0.84,0.0,0
47459,SLG | FTO | TiO2-c | TiO2-mp | Perovskite | Cu...,,0.25,0,nip,False,False,nan; nan,False,0,...,0,0.0,I,0,0,0,3,0,0.0,0
47460,SLG | FTO | TiO2-c | Perovskite | Spiro-MeOTAD...,,0.15,0,nip,False,False,nan; nan,False,0,...,0,0.0,I,0,0,0,3,0,0.0,0
47461,SLG | FTO | TiO2-c | TiO2-mp | Perovskite | Sp...,,0.12,0,nip,False,False,nan; nan,False,0,...,0,0.0,Br,I,0,0,0.45,2.55,0.0,0


### We will predict TS80 - the time it took for that specific solar cell's efficiency to fall to 80% of its initial value (e.g., 500 hours, 1200 hours, 850 hours).

In [3]:
# drop na TS80
nan_mask = df["TS80"].isna()
print(f"NaN count in TS80 - {nan_mask.sum()}")
df = df[~nan_mask]
df

NaN count in TS80 - 41275


Unnamed: 0,Cell_stack_sequence,Cell_area_total,Cell_area_measured,Cell_number_of_cells_per_substrate,Cell_architecture,Cell_flexible,Cell_semitransparent,Cell_semitransparent_wavelength_range,Module,Module_number_of_cells_in_module,...,B_1_coef,B_2_coef,C_1,C_2,C_3,C_4,C_1_coef,C_2_coef,C_3_coef,C_4_coef
2,SLG | FTO | TiO2-c | Perovskite | Spiro-MeOTAD...,,0.04,0,nip,False,False,nan; nan,False,0,...,1,0.0,I,0,0,0,3,0,0.0,0
3,SLG | FTO | TiO2-c | Perovskite | Spiro-MeOTAD...,,0.04,0,nip,False,False,nan; nan,False,0,...,1,0.0,I,0,0,0,3,0,0.0,0
4,SLG | FTO | TiO2-c | Perovskite | Spiro-MeOTAD...,,0.06,0,nip,False,False,nan; nan,False,0,...,1,0.0,Br,I,0,0,0.51,2.49,0.0,0
5,SLG | FTO | TiO2-c | TiO2-mp | Perovskite | BD...,,0.09,0,nip,False,False,nan; nan,False,0,...,1,0.0,Br,I,0,0,0.45,2.55,0.0,0
6,SLG | FTO | SnO2-c | Perovskite | Spiro-MeOTAD...,,0.16,0,nip,False,False,nan; nan,False,0,...,1,0.0,Br,I,0,0,0.369,2.631,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6990,SLG | FTO | SnO2 | Perovskite | Spiro-MeOTAD | Au,,0.05,0,nip,False,False,nan; nan,False,0,...,0,0.0,0,0,0,0,0,0,0.0,0
6992,SLG | FTO | TiO2-c | TiO2-mp | Perovskite | N2...,,0.04,0,nip,False,False,nan; nan,False,0,...,1,0.0,I,0,0,0,3,0,0.0,0
6993,SLG | FTO | TiO2-c | TiO2-mp | Perovskite | N2...,,0.04,0,nip,False,False,nan; nan,False,0,...,1,0.0,I,0,0,0,3,0,0.0,0
6994,SLG | FTO | TiO2-c | TiO2-mp | Perovskite | Sp...,,0.16,0,nip,False,False,nan; nan,False,0,...,0,0.0,Br,I,0,0,0.45,2.55,0.0,0


In [4]:
# we will check some columns for NaN and unknown values
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 6188 entries, 2 to 6995
Data columns (total 280 columns):
 #    Column                                                          Non-Null Count  Dtype  
---   ------                                                          --------------  -----  
 0    Cell_stack_sequence                                             6188 non-null   object 
 1    Cell_area_total                                                 137 non-null    float64
 2    Cell_area_measured                                              6053 non-null   float64
 3    Cell_number_of_cells_per_substrate                              6188 non-null   int64  
 4    Cell_architecture                                               6188 non-null   object 
 5    Cell_flexible                                                   6188 non-null   bool   
 6    Cell_semitransparent                                            6188 non-null   bool   
 7    Cell_semitransparent_wavelength_range        

In [5]:
# drop some nan columns
threshold = 6000
columns_to_drop = [col for col in df.columns if df[col].isna().sum() > threshold]
columns_to_drop

['Cell_area_total',
 'Perovskite_deposition_quenching_media_mixing_ratios',
 'HTL_additives_concentrations',
 'JV_light_source_brand_name',
 'JV_light_mask_area',
 'JV_preconditioning_protocol']

In [6]:
df = df.drop(columns=columns_to_drop)
df

Unnamed: 0,Cell_stack_sequence,Cell_area_measured,Cell_number_of_cells_per_substrate,Cell_architecture,Cell_flexible,Cell_semitransparent,Cell_semitransparent_wavelength_range,Module,Module_number_of_cells_in_module,Module_JV_data_recalculated_per_cell,...,B_1_coef,B_2_coef,C_1,C_2,C_3,C_4,C_1_coef,C_2_coef,C_3_coef,C_4_coef
2,SLG | FTO | TiO2-c | Perovskite | Spiro-MeOTAD...,0.04,0,nip,False,False,nan; nan,False,0,False,...,1,0.0,I,0,0,0,3,0,0.0,0
3,SLG | FTO | TiO2-c | Perovskite | Spiro-MeOTAD...,0.04,0,nip,False,False,nan; nan,False,0,False,...,1,0.0,I,0,0,0,3,0,0.0,0
4,SLG | FTO | TiO2-c | Perovskite | Spiro-MeOTAD...,0.06,0,nip,False,False,nan; nan,False,0,False,...,1,0.0,Br,I,0,0,0.51,2.49,0.0,0
5,SLG | FTO | TiO2-c | TiO2-mp | Perovskite | BD...,0.09,0,nip,False,False,nan; nan,False,0,False,...,1,0.0,Br,I,0,0,0.45,2.55,0.0,0
6,SLG | FTO | SnO2-c | Perovskite | Spiro-MeOTAD...,0.16,0,nip,False,False,nan; nan,False,0,False,...,1,0.0,Br,I,0,0,0.369,2.631,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6990,SLG | FTO | SnO2 | Perovskite | Spiro-MeOTAD | Au,0.05,0,nip,False,False,nan; nan,False,0,False,...,0,0.0,0,0,0,0,0,0,0.0,0
6992,SLG | FTO | TiO2-c | TiO2-mp | Perovskite | N2...,0.04,0,nip,False,False,nan; nan,False,0,False,...,1,0.0,I,0,0,0,3,0,0.0,0
6993,SLG | FTO | TiO2-c | TiO2-mp | Perovskite | N2...,0.04,0,nip,False,False,nan; nan,False,0,False,...,1,0.0,I,0,0,0,3,0,0.0,0
6994,SLG | FTO | TiO2-c | TiO2-mp | Perovskite | Sp...,0.16,0,nip,False,False,nan; nan,False,0,False,...,0,0.0,Br,I,0,0,0.45,2.55,0.0,0


In [7]:
# we do not need JV features - drop
df = df.drop(columns=[col for col in df.columns if col.startswith("JV")])
df

Unnamed: 0,Cell_stack_sequence,Cell_area_measured,Cell_number_of_cells_per_substrate,Cell_architecture,Cell_flexible,Cell_semitransparent,Cell_semitransparent_wavelength_range,Module,Module_number_of_cells_in_module,Module_JV_data_recalculated_per_cell,...,B_1_coef,B_2_coef,C_1,C_2,C_3,C_4,C_1_coef,C_2_coef,C_3_coef,C_4_coef
2,SLG | FTO | TiO2-c | Perovskite | Spiro-MeOTAD...,0.04,0,nip,False,False,nan; nan,False,0,False,...,1,0.0,I,0,0,0,3,0,0.0,0
3,SLG | FTO | TiO2-c | Perovskite | Spiro-MeOTAD...,0.04,0,nip,False,False,nan; nan,False,0,False,...,1,0.0,I,0,0,0,3,0,0.0,0
4,SLG | FTO | TiO2-c | Perovskite | Spiro-MeOTAD...,0.06,0,nip,False,False,nan; nan,False,0,False,...,1,0.0,Br,I,0,0,0.51,2.49,0.0,0
5,SLG | FTO | TiO2-c | TiO2-mp | Perovskite | BD...,0.09,0,nip,False,False,nan; nan,False,0,False,...,1,0.0,Br,I,0,0,0.45,2.55,0.0,0
6,SLG | FTO | SnO2-c | Perovskite | Spiro-MeOTAD...,0.16,0,nip,False,False,nan; nan,False,0,False,...,1,0.0,Br,I,0,0,0.369,2.631,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6990,SLG | FTO | SnO2 | Perovskite | Spiro-MeOTAD | Au,0.05,0,nip,False,False,nan; nan,False,0,False,...,0,0.0,0,0,0,0,0,0,0.0,0
6992,SLG | FTO | TiO2-c | TiO2-mp | Perovskite | N2...,0.04,0,nip,False,False,nan; nan,False,0,False,...,1,0.0,I,0,0,0,3,0,0.0,0
6993,SLG | FTO | TiO2-c | TiO2-mp | Perovskite | N2...,0.04,0,nip,False,False,nan; nan,False,0,False,...,1,0.0,I,0,0,0,3,0,0.0,0
6994,SLG | FTO | TiO2-c | TiO2-mp | Perovskite | Sp...,0.16,0,nip,False,False,nan; nan,False,0,False,...,0,0.0,Br,I,0,0,0.45,2.55,0.0,0


In [8]:
# extract the columns which we need
features_for_stability_model = [
    # --- Composition Features (The "Recipe") ---
    'Perovskite_composition_inorganic',
    'Perovskite_composition_leadfree',
    'Perovskite_band_gap',
    'A_1', 'A_2', 'A_3', 'A_4',
    'A_1_coef', 'A_2_coef', 'A_3_coef', 'A_4_coef',
    'B_1', 'B_2',
    'B_1_coef', 'B_2_coef',
    'C_1', 'C_2', 'C_3',
    'C_1_coef', 'C_2_coef', 'C_3_coef',

    # --- Architecture & Structure Features (The "Blueprint") ---
    'Cell_architecture',
    'Perovskite_dimension_2D',
    'Perovskite_dimension_3D',
    'Perovskite_dimension_2D3D_mixture',
    'Perovskite_dimension_3D_with_2D_capping_layer',
    'ETL_stack_sequence',
    'HTL_stack_sequence',
    'Backcontact_stack_sequence',

    # --- Fabrication & Process Features (The "Instructions") ---
    'Perovskite_deposition_procedure',
    'Perovskite_deposition_quenching_induced_crystallisation',
    'Perovskite_deposition_solvent_annealing',

    # --- Device & Test Condition Features ---
    'Cell_area_measured',
    'Encapsulation',
]
df = df[features_for_stability_model]
df

Unnamed: 0,Perovskite_composition_inorganic,Perovskite_composition_leadfree,Perovskite_band_gap,A_1,A_2,A_3,A_4,A_1_coef,A_2_coef,A_3_coef,...,Perovskite_dimension_2D3D_mixture,Perovskite_dimension_3D_with_2D_capping_layer,ETL_stack_sequence,HTL_stack_sequence,Backcontact_stack_sequence,Perovskite_deposition_procedure,Perovskite_deposition_quenching_induced_crystallisation,Perovskite_deposition_solvent_annealing,Cell_area_measured,Encapsulation
2,False,False,1.60,MA,0,0,0,1,0,0,...,False,False,TiO2-c,Spiro-MeOTAD,Au,Spin-coating,True,False,0.04,False
3,False,False,1.60,MA,0,0,0,1,0,0,...,False,False,TiO2-c,Spiro-MeOTAD,Au,Spin-coating,True,False,0.04,False
4,False,False,1.59,Cs,FA,MA,0,0.05,0.788,0.162,...,False,False,TiO2-c,Spiro-MeOTAD,Au,Spin-coating,True,False,0.06,False
5,False,False,,FA,MA,0,0,0.85,0.15,0,...,False,False,TiO2-c | TiO2-mp,BDT2FMeDPA,Carbon,Spin-coating,True,False,0.09,False
6,False,False,,Cs,FA,MA,0,0.05,0.827,0.123,...,False,False,SnO2-c,Spiro-MeOTAD,Au,Spin-coating,True,False,0.16,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6990,False,False,,0,0,0,0,0,0,0,...,False,False,SnO2,Spiro-MeOTAD | MoO3,Au,Spin-coating,False,False,0.05,False
6992,False,False,,MA,0,0,0,1,0,0,...,False,False,TiO2-c | TiO2-mp,"N2,N2,N12,N12-Tetrakis(4-methoxyphenyl)-9-meth...",Ag,Spin-coating,True,False,0.04,False
6993,False,False,,MA,0,0,0,1,0,0,...,False,False,TiO2-c | TiO2-mp,"N2,N2,N12,N12-Tetrakis(4-methoxyphenyl)-9-meth...",Ag,Spin-coating,True,False,0.04,False
6994,False,False,,FA,MA,0,0,0.85,0.15,0,...,False,False,TiO2-c | TiO2-mp,Spiro-MeOTAD,Au,Spin-coating,True,False,0.16,False


In [9]:
import json


def create_and_save_mappings(df, categorical_cols, filepath):
    """
    Creates and saves label encoding mappings for categorical features.
    """
    mappings = {}
    for column in categorical_cols:
        # Using factorize to get both codes and unique values (the mapping)
        codes, uniques = pd.factorize(df[column])
        df[column] = codes # Update the column with numerical codes
        # Create a dictionary mapping from category name to its code
        mappings[column] = {name: i for i, name in enumerate(uniques)}

    # Save the mappings dictionary to a JSON file
    with open(filepath, 'w') as f:
        json.dump(mappings, f, indent=4)

    print(f"Encoder mappings saved to '{filepath}'")
    return df, mappings

In [14]:
import os

filepath = os.path.join(os.path.abspath(os.path.curdir), 'stacks_and_deposition_mapping.json')
non_atomic_cols = ["ETL_stack_sequence", "HTL_stack_sequence", "Perovskite_deposition_procedure", "Backcontact_stack_sequence", "Perovskite_deposition_solvent_annealing"]
df, mappings = create_and_save_mappings(df, non_atomic_cols, filepath=filepath)
df

Encoder mappings saved to '/Users/ahovhannisyan/perovskite/perovskite_data_analysis/notebooks/stacks_and_deposition_mapping.json'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = codes # Update the column with numerical codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = codes # Update the column with numerical codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = codes # Update the column with numerical codes
A value is trying to be s

Unnamed: 0,Perovskite_composition_inorganic,Perovskite_composition_leadfree,Perovskite_band_gap,A_1,A_2,A_3,A_4,A_1_coef,A_2_coef,A_3_coef,...,Perovskite_dimension_2D3D_mixture,Perovskite_dimension_3D_with_2D_capping_layer,ETL_stack_sequence,HTL_stack_sequence,Backcontact_stack_sequence,Perovskite_deposition_procedure,Perovskite_deposition_quenching_induced_crystallisation,Perovskite_deposition_solvent_annealing,Cell_area_measured,Encapsulation
2,False,False,1.60,MA,0,0,0,1,0,0,...,False,False,0,0,0,0,True,0,0.04,False
3,False,False,1.60,MA,0,0,0,1,0,0,...,False,False,0,0,0,0,True,0,0.04,False
4,False,False,1.59,Cs,FA,MA,0,0.05,0.788,0.162,...,False,False,0,0,0,0,True,0,0.06,False
5,False,False,,FA,MA,0,0,0.85,0.15,0,...,False,False,1,1,1,0,True,0,0.09,False
6,False,False,,Cs,FA,MA,0,0.05,0.827,0.123,...,False,False,2,0,0,0,True,0,0.16,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6990,False,False,,0,0,0,0,0,0,0,...,False,False,408,559,0,0,False,0,0.05,False
6992,False,False,,MA,0,0,0,1,0,0,...,False,False,1,773,2,0,True,0,0.04,False
6993,False,False,,MA,0,0,0,1,0,0,...,False,False,1,773,2,0,True,0,0.04,False
6994,False,False,,FA,MA,0,0,0.85,0.15,0,...,False,False,1,0,0,0,True,0,0.16,False


In [15]:
# drop cell area measured NaN
df = df.dropna(subset=["Cell_area_measured"])
print("DF length -", df.shape[0])

DF length - 6053


In [16]:
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 6053 entries, 2 to 6995
Data columns (total 34 columns):
 #   Column                                                   Non-Null Count  Dtype  
---  ------                                                   --------------  -----  
 0   Perovskite_composition_inorganic                         6053 non-null   bool   
 1   Perovskite_composition_leadfree                          6053 non-null   bool   
 2   Perovskite_band_gap                                      4216 non-null   float64
 3   A_1                                                      6053 non-null   object 
 4   A_2                                                      6053 non-null   object 
 5   A_3                                                      6053 non-null   object 
 6   A_4                                                      6053 non-null   object 
 7   A_1_coef                                                 6053 non-null   object 
 8   A_2_coef                         

In [None]:
# calculate features` octahedral and tolerance
from perovskite_prediction_api.features.calc_factors import compute_octahedral_factor, compute_tolerance_factor
from perovskite_prediction_api.features.structure_features import compute_effective_radii, compute_ionic_radius_ratios

