In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
!pip install catboost
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import ElasticNet, Ridge, Lasso
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings('ignore')



In [2]:
train_data = pd.read_csv('/kaggle/input/metallurgica2025/train.csv')
test_data = pd.read_csv("/kaggle/input/metallurgica2025/test.csv")

In [3]:
print(train_data.shape)
train_data.head()

(1600, 41)


Unnamed: 0,ID,Alloy formula,Alloy class,Cu,Al,Ag,B,Be,Ca,Co,...,tss (h),CR reduction (%),Aging,Tag (K),tag (h),Secondary thermo-mechanical process,Hardness (HV),Yield strength (MPa),Ultimate tensile strength (MPa),Electrical conductivity (%IACS)
0,969,Cu-6Ni-1Si-0.5Al-0.15Mg-0.1Cr,Cu low alloyed,95.557137,0.24615,7.5e-05,0.0,0.0,0,0.0,...,3.97213,4.366903,Y,778.447643,4.447236,N,163.585875,,,22.06701
1,241,Cu-4.5Ni-1Si-1.2Co-0.15Mg,Cu-Ni-Si alloys,95.083982,0.001252,0.0,0.0,0.000947,0,0.0,...,,90.214778,Y,,0.0,N,300.692142,567.094341,,43.761314
2,820,Cu-4.5Ni-1Si-1.2Co-0.15Mg,Cu-Ni-Si alloys,91.894209,0.022183,0.0,0.00326,0.0,0,0.008681,...,4.064446,0.243155,Y,637.878753,0.887991,N,327.858374,,,36.114651
3,693,Cu-4.0Ni-2.0Si,Cu-Ni-Si alloys,92.624741,0.0,0.0,0.0,0.015469,0,0.000438,...,7.041734,0.0,Y,772.859083,0.535209,Y,331.311269,,,26.008459
4,421,Cu-0.28Cr-0.19Mg,Cu-Ni-Si alloys,95.575242,0.009056,0.0,0.0,0.008676,0,0.0,...,27.77,0.0,Y,768.312615,,N,83.737011,,,29.370901


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 41 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   ID                                   1600 non-null   int64  
 1   Alloy formula                        1600 non-null   object 
 2   Alloy class                          1600 non-null   object 
 3   Cu                                   1600 non-null   float64
 4   Al                                   1600 non-null   float64
 5   Ag                                   1600 non-null   float64
 6   B                                    1600 non-null   float64
 7   Be                                   1600 non-null   float64
 8   Ca                                   1600 non-null   int64  
 9   Co                                   1600 non-null   float64
 10  Ce                                   1600 non-null   float64
 11  Cr                            

In [5]:
target = 'Electrical conductivity (%IACS)'
train_data = train_data.dropna(subset=[target])
train_data.shape

(1598, 41)

In [6]:
train_data['Alloy class'].unique() 

array(['Cu low alloyed', 'Cu-Ni-Si alloys', 'Cu-Ti alloys',
       'Cu-Be alloys'], dtype=object)

In [7]:
# Correlation with target
numeric_cols = train_data.select_dtypes(include=['float64', 'int64']).columns
correlations = train_data[numeric_cols].corr()[target].sort_values(ascending=False)
print("\nTop correlations with Electrical conductivity:")
print(correlations.head(15))


Top correlations with Electrical conductivity:
Electrical conductivity (%IACS)    1.000000
Ce                                 0.038213
Be                                 0.036596
Mg                                 0.032796
Zn                                 0.032528
Ni                                 0.030969
P                                  0.030443
Ag                                 0.030295
Zr                                 0.029891
Tss (K)                            0.028979
Tag (K)                            0.022701
tss (h)                            0.015568
Sn                                 0.010761
CR reduction (%)                   0.004557
Al                                -0.002062
Name: Electrical conductivity (%IACS), dtype: float64


In [8]:
missing_percentages = (train_data.isnull().sum() / len(train_data)) * 100
high_missing_cols = missing_percentages[missing_percentages > 80].index.tolist()
print("Columns with more than 80% missing values:", high_missing_cols)

# Features aur target alag karna with removed columns
train_data = train_data.drop(columns=high_missing_cols)

Columns with more than 80% missing values: ['Yield strength (MPa)', 'Ultimate tensile strength (MPa)']


In [9]:
# Check for missing values
missing_values = train_data.isnull().sum()
print("Missing values per column:")
print(missing_values[missing_values > 0])

Missing values per column:
Tss (K)                                 42
tss (h)                                 68
Tag (K)                                 59
tag (h)                                 85
Secondary thermo-mechanical process     28
Hardness (HV)                          175
dtype: int64


In [10]:
# Combine train and test for preprocessing
train_id = train_data['ID']
test_id = test_data['ID']
target = train_data['Electrical conductivity (%IACS)']

# Drop ID and target from train
train_features = train_data.drop(['ID',  'Alloy formula','Electrical conductivity (%IACS)'], axis=1)
test_features = test_data.drop(['ID','Alloy formula'], axis=1)

# Identify categorical and numerical columns
categorical_cols = train_features.select_dtypes(include=['object']).columns.tolist()
numerical_cols = train_features.select_dtypes(include=['int64', 'float64']).columns.tolist()

print(f"Categorical columns: {categorical_cols}")
print(f"Numerical columns: {len(numerical_cols)}")

Categorical columns: ['Alloy class', 'Aging', 'Secondary thermo-mechanical process']
Numerical columns: 33


In [11]:
def engineer_features(df):
    # Create a copy to avoid modifying the original dataframe
    df_new = df.copy()
    
    # Extract element ratios that might be important for conductivity
    # Cu to Al ratio (copper is highly conductive, aluminum less so)
    df_new['Cu_Al_ratio'] = df_new['Cu'] / (df_new['Al'] + 0.001)
    
    # Sum of highly conductive elements
    df_new['conductive_elements_sum'] = df_new['Cu'] + df_new['Ag'] + df_new['Au'] if 'Au' in df_new.columns else df_new['Cu'] + df_new['Ag']
    
    # Sum of alloying elements
    df_new['total_alloying'] = df_new[numerical_cols].sum(axis=1)
    
    # Temperature and time interaction for thermal processes
    df_new['thermal_factor'] = df_new['Tss (K)'] * np.log1p(df_new['tss (h)'])
    df_new['aging_factor'] = df_new['Tag (K)'] * np.log1p(df_new['tag (h)'])
    
    # Interaction between mechanical properties
    if 'Hardness (HV)' in df_new.columns and 'Yield strength (MPa)' in df_new.columns:
        df_new['hardness_yield_ratio'] = df_new['Hardness (HV)'] / (df_new['Yield strength (MPa)'] + 0.001)
    
    return df_new

# Apply feature engineering
train_features = engineer_features(train_features)
test_features = engineer_features(test_features)

In [12]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),   
            ('scaler', StandardScaler()) 
        ]), numerical_cols),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), 
            ('onehot', OneHotEncoder(handle_unknown='ignore'))  
        ]), categorical_cols)
    ])

In [13]:
# Split the training data for validation
X_train, X_val, y_train, y_val = train_test_split(train_features, target, test_size=0.01, random_state=42)

# Define base models for stacking
base_models = [
    ('rf', RandomForestRegressor(n_estimators=200, random_state=42)),
    ('gbr', GradientBoostingRegressor(n_estimators=200, random_state=42)),
    ('xgb', XGBRegressor(n_estimators=200, learning_rate=0.05, random_state=42)),
    ('lgbm', LGBMRegressor(n_estimators=200, learning_rate=0.05, random_state=42)),
    ('ridge', Ridge(alpha=1.0, random_state=42)),
    ('lasso', Lasso(alpha=0.01, random_state=42)),
    ('elasticnet', ElasticNet(alpha=0.01, l1_ratio=0.5, random_state=42))
]

# Create the stacking ensemble
stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=Ridge(random_state=42)
)

# Create the full pipeline
# model_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('model', stacking_regressor)
# ])

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', CatBoostRegressor(
        iterations=500,
        learning_rate=0.03,
        depth=3,
        loss_function='MAE',  # Using MAE as loss function since that's our evaluation metric
        verbose=100,  # Print progress every 100 iterations
        random_seed=42
    ))
])

# Train the model
model_pipeline.fit(X_train, y_train)

# Evaluate on validation set
val_predictions = model_pipeline.predict(X_val)
val_mae = mean_absolute_error(y_val, val_predictions)
print(f"Validation MAE: {val_mae:.4f}")

0:	learn: 13.8326971	total: 54.9ms	remaining: 27.4s
100:	learn: 13.1498206	total: 165ms	remaining: 650ms
200:	learn: 12.7341079	total: 275ms	remaining: 409ms
300:	learn: 12.3473247	total: 388ms	remaining: 256ms
400:	learn: 11.9571039	total: 497ms	remaining: 123ms
499:	learn: 11.6410655	total: 607ms	remaining: 0us
Validation MAE: 12.9885


In [14]:
# Train the final model on all training data
model_pipeline.fit(train_features, target)

# Make predictions on the test set
test_predictions = model_pipeline.predict(test_features)

# Create submission file
submission = pd.DataFrame({
    'ID': test_id,
    'Electrical conductivity (%IACS)': test_predictions
})

# Save the submission file
submission.to_csv('submission.csv', index=False)
print("Submission file created successfully!")

0:	learn: 13.8281557	total: 1.25ms	remaining: 626ms
100:	learn: 13.1312579	total: 111ms	remaining: 440ms
200:	learn: 12.7140010	total: 222ms	remaining: 330ms
300:	learn: 12.3119627	total: 332ms	remaining: 219ms
400:	learn: 11.9232705	total: 443ms	remaining: 109ms
499:	learn: 11.6308398	total: 551ms	remaining: 0us
Submission file created successfully!
