In [52]:
import pandas as pd

excel_input = 'updated_materials_data.xlsx'

In [53]:
production_data_Abrasives = pd.read_excel(excel_input, sheet_name='Abrasives_raw_info')

production_data_Fluorspar = pd.read_excel(excel_input, sheet_name='Fluorspar_raw_info')

composition_df = pd.read_excel(excel_input, sheet_name='chem_compo_db')

In [54]:
composition_df

Unnamed: 0,sub_material_name,chemical_composition
0,Fused aluminum oxidee,"{'Al': 52.9%, 'O': 47.1%}"
1,Silicon Carbide,"{'Si': 70%, 'C': 30%}"
2,Metallic Abrasives,"{'Fe': 98%, 'C': 1.5%, 'Mn': 0.5%}"
3,Antimony,{'Sb': 100%}
4,Arsenic,{'As': 100%}
5,Beryllium,{'Be': 100%}
6,Bromine,{'Br': 100%}
7,Cesium Oxide (Cs���,"{'Cs': 76.2%, 'O': 23.8%}"
8,Chromium,{'Cr': 100%}
9,Cobalt,{'Co': 100%}


In [59]:
production_df_abrasives = production_data_Abrasives.drop(['remark', 'chemical_composition', 'metric_conv_factor', 'Empty_0', 'Empty_1', 'Unnamed: 10'], axis=1)

production_df_abrasives 

Unnamed: 0,text.1,text_1.1,text_2.1,text_3.1,text_4.1
0,,Fused aluminum oxidee,Fused aluminum oxidee,Silicon carbidee,Silicon carbidee
1,,2022,2023,2022,2023
2,United States,—,—,40000,40000
3,United States and Canada,60000,60000,—,—
4,Australia,50000,50000,—,—
5,Austria,90000,90000,—,—
6,Brazil,50000,50000,40000,40000
7,China,800000,800000,450000,450000
8,France,40000,40000,20000,20000
9,Germany,80000,80000,35000,35000


In [56]:
production_df_fluorspar = production_data_Fluorspar.drop(['remark', 'chemical_composition', 'metric_conv_factor', 'Empty_0', 'Empty_1'], axis=1)

production_df_fluorspar 

Unnamed: 0,text.1,text_1.1,text_2.1,text_3.1
0,,Mine production,Mine production,Reserves4
1,,2022,2023e,
2,United States,,,
3,China,5700,5700,67000
4,Germany,60,60,
5,Iran,116,120,4500
6,Mexico,1000,1000,68000
7,Mongolia,425,930,34000
8,Pakistan,52,52,
9,South Africa,406,410,41000


In [60]:
import pandas as pd
import numpy as np

def process_production_table(df, composition_df):
    # Remove 'text.' columns and rename remaining columns
    df = df.drop(columns=[col for col in df.columns if col.startswith('text.')])
    df.columns = ['Country'] + list(df.iloc[1])
    df = df.drop([0, 1])  # Remove the first two rows which were headers
    
    # Replace '—' with NaN and convert to numeric
    for col in df.columns[1:]:
        df[col] = pd.to_numeric(df[col].replace('—', np.nan), errors='coerce')
    
    # Find materials in the table
    materials = []
    for index, row in composition_df.iterrows():
        if any(row['sub_material_name'].lower() in col.lower() for col in df.columns):
            materials.append((row['sub_material_name'], row['chemical_composition']))
    
    for material, composition in materials:
        # Parse the composition
        composition_dict = {k.strip(): float(v.strip('%'))/100 for k, v in 
                            [item.split(':') for item in composition.strip('{}').split(',')]}
        
        if len(composition_dict) < 2:
            continue
        
        # Identify columns related to this material
        material_cols = [col for col in df.columns if material.lower() in col.lower()]
        
        # Create new columns for each element in the composition
        for col in material_cols:
            for element, percentage in composition_dict.items():
                new_col = f'{col}_{element}_content'
                df[new_col] = df[col] * percentage
    
    return df

In [61]:
processed_abrasives = process_production_table(production_df_abrasives, composition_df)
processed_fluorspar = process_production_table(production_df_fluorspar, composition_df)

print("Processed Abrasives Table:")
print(processed_abrasives)
print("\nProcessed Fluorspar Table:")
print(processed_fluorspar)

ValueError: Length mismatch: Expected axis has 4 elements, new values have 5 elements

In [68]:
processed_fluorspar

Unnamed: 0,text.1,1.1,2.1,3.1
0,,Mine production,Mine production,Reserves4
1,,2022,2023e,
2,United States,,,
3,China,5700,5700,67000
4,Germany,60,60,
5,Iran,116,120,4500
6,Mexico,1000,1000,68000
7,Mongolia,425,930,34000
8,Pakistan,52,52,
9,South Africa,406,410,41000


In [74]:
import pandas as pd
import re

# Sample data as DataFrames
data1 = pd.DataFrame({
    'text_1': [None, None, 'United States', 'United States and Canada'],
    'text_1.1': ['Fused aluminum oxidee', '2022', None, '60,000'],
    'text_2.1': ['Fused aluminum oxidee', '2023', None, '60,000'],
    'text_3.1': ['Silicon carbidee', '2022', '40,000', None],
    'text_4.1': ['Silicon carbidee', '2023', '40,000', None],
    'Unnamed: 10': [None, None, None, None]
})

data2 = pd.DataFrame({
    'sub_material_name': ['Fused aluminum oxidee', 'Silicon carbidee', 'Metallic Abrasives', 'Antimony'],
    'chemical_composition': [
        {'Al': 52.9, 'O': 47.1}, 
        {'Si': 70, 'C': 30}, 
        {'Fe': 98, 'C': 1.5, 'Mn': 0.5}, 
        {'Sb': 100}
    ]
})

# Extracting the relevant chemical composition for each material
composition_dict = data2.set_index('sub_material_name')['chemical_composition'].to_dict()

# Function to calculate the production for each element
def calculate_production(row, year, element):
    material_name = None

    # Identify the material name in the row
    for material in composition_dict.keys():
        if material.lower() in [str(row[col]).lower() for col in row.index]:
            material_name = material
            break

    if material_name is None:
        return None

    for col in row.index:
        col_str = str(row[col]) if row[col] is not None else ""

        # Match the year and material production column
        if re.search(f"{year}", col_str) and material_name.lower() in str(row['text_1.1']).lower():
            try:
                production = float(re.sub(r'[^\d.]', '', col_str))  # Remove commas and non-numeric characters
                return production * (composition_dict[material_name][element] / 100)
            except ValueError:
                return None
    return None

# Dynamically adding columns for each element based on the chemical composition
for material in composition_dict.keys():
    for element in composition_dict[material].keys():
        for year in ['2022', '2023']:
            col_name = f"{element}_{year}"
            data1[col_name] = data1.apply(lambda row: calculate_production(row, year, element), axis=1)

# Show the result
print(data1)

                     text_1               text_1.1               text_2.1  \
0                      None  Fused aluminum oxidee  Fused aluminum oxidee   
1                      None                   2022                   2023   
2             United States                   None                   None   
3  United States and Canada                 60,000                 60,000   

           text_3.1          text_4.1 Unnamed: 10 Al_2022 Al_2023 O_2022  \
0  Silicon carbidee  Silicon carbidee        None    None    None   None   
1              2022              2023        None    None    None   None   
2            40,000            40,000        None    None    None   None   
3              None              None        None    None    None   None   

  O_2023 Si_2022 Si_2023 C_2022 C_2023 Fe_2022 Fe_2023 Mn_2022 Mn_2023  \
0   None    None    None   None   None    None    None    None    None   
1   None    None    None   None   None    None    None    None    None   
2   None   