In [1]:
import pandas as pd
import numpy as np
import warnings

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor

# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by the libraries
# used below are hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings(action='ignore')

In [None]:
# Load the input table into `df`; later cells expect the columns
# 'material_id' and 'GVRH' to be present.
# NOTE(review): the path ends in "/" and therefore looks like a directory, not
# a CSV file — confirm the intended file name before running.
df = pd.read_csv("../data/FP/")
# Bare expression: shows the frame as this cell's output.
df

In [3]:
# Drop the identifier column, then split the remaining table into the
# regression target ('GVRH') and the predictor matrix (everything else).
data_set = df.drop(columns='material_id')
y = data_set['GVRH']
X = data_set.drop(columns='GVRH')

### Training the model

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=68)

# Fit the LightGBM regressor on the training part only.
lgbm_pipeline = LGBMRegressor(n_estimators=1800, num_leaves=31)
lgbm_pipeline.fit(X_train, y_train)

# Predict the held-out rows once and reuse the result for both error metrics
# (the original evaluated the 1800-tree model separately for MAE and RMSE).
y_test_pred = lgbm_pipeline.predict(X_test)

print('test R2 = ' + str(lgbm_pipeline.score(X_test, y_test)))
print('test MAE = ' + str(mean_absolute_error(y_true=y_test, y_pred=y_test_pred)))
print('test RMSE = ' + str(np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_test_pred))))

### Feature selection

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Normalised feature importances of the fitted model, drawn as a horizontal
# bar chart: the `top_n` strongest features plus one aggregated "others" bar.
feature_importances = lgbm_pipeline.feature_importances_

# Take the names from the training frame rather than from `X`: `X` is
# reassigned in later cells, so re-running this cell afterwards would pair the
# model's importances with a different column list.
feature_importance_df1 = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})

# Rescale so that all importances sum to 1, then rank from highest to lowest.
importance_total = feature_importance_df1['Importance'].sum()
feature_importance_df1['Importance'] = feature_importance_df1['Importance'] / importance_total
feature_importance_df1 = feature_importance_df1.sort_values(by='Importance', ascending=False)

top_n = 100
top_features1 = feature_importance_df1.head(top_n)

# Everything after the first `top_n` rows. Positional slicing gives an empty
# remainder (sum 0) when there are fewer than `top_n` features, whereas
# `tail(len - top_n)` with a negative count would return rows again.
others_importance = feature_importance_df1.iloc[top_n:]['Importance'].sum()

others_row = pd.DataFrame({'Feature': ['others'], 'Importance': [others_importance]})
top_features1 = pd.concat([top_features1, others_row], ignore_index=True)

fig, ax = plt.subplots(figsize=(12, 8))
bars = ax.barh(top_features1['Feature'], top_features1['Importance'], color='skyblue')
ax.set_xlabel('Normalized Importance', fontsize=14)
ax.set_ylabel('Feature', fontsize=14)
ax.tick_params(axis='both', labelsize=12)
ax.invert_yaxis()  # first (highest-ranked) row at the top

# Write each bar's value just to the right of the bar end.
for bar in bars:
    ax.text(
        bar.get_width() + 0.001,
        bar.get_y() + bar.get_height() / 2,
        f'{bar.get_width():.4f}',
        va='center',
        fontsize=12,
    )

plt.show()


#### Get the top 100 features ranked by feature importance

In [None]:
# Rank all features by the importance values returned by the fitted model
# (not normalised here) and keep the `top_n` best. `feature_importance_df`
# and `top_features` are reused by the cells below.
feature_importances = lgbm_pipeline.feature_importances_

# Take the names from the training frame rather than from `X`: `X` is
# reassigned in later cells, so re-running this cell afterwards would pair the
# model's importances with a different column list.
feature_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)
top_n = 100
top_features = feature_importance_df.head(top_n)

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(top_features['Feature'], top_features['Importance'], color='skyblue')
ax.set_xlabel('Importance', fontsize=14)
ax.set_ylabel('Feature', fontsize=14)
ax.set_title(f'Top {top_n} Feature Importance', fontsize=16)
ax.tick_params(axis='both', labelsize=12)
ax.invert_yaxis()  # first (highest-ranked) row at the top
plt.show()

In [None]:
# Rebuild the predictors from the full table, keeping only the top-ranked
# features (in ranking order); the target is unchanged.
top_feature_names = top_features['Feature'].tolist()
y = df['GVRH']
X = df.drop(columns=['GVRH', 'material_id'])[top_feature_names]
X.head()

### Select key features through hierarchical clustering

#### Carry out clustering to select important features

In [None]:
# Absolute pairwise correlation of the retained features; entries that could
# not be computed (NaN) are replaced by 0.
corr = X.corr().abs().fillna(0)

# Dissimilarity 1 - |corr| as a plain ndarray, with any remaining non-finite
# value mapped to a finite one.
distance_matrix = np.nan_to_num(1 - corr, nan=0.0, posinf=1.0, neginf=0.0)

# Agglomerative clustering with Ward linkage.
# NOTE(review): `linkage` receives the square matrix as-is. SciPy interprets a
# 2-D input as observation vectors (one row per feature here); a precomputed
# distance matrix has to be passed in condensed form
# (`scipy.spatial.distance.squareform`). Left unchanged because the threshold
# below is taken from the paper — confirm which behaviour is intended.
linkage_matrix = linkage(distance_matrix, method='ward')

# Cut the dendrogram at a fixed distance.
# NOTE(review): `k` is not defined anywhere in the visible notebook; assign the
# value reported in the paper before running this cell.
threshold = k  # k can be obtained from the paper
clusters = fcluster(linkage_matrix, threshold, criterion='distance')

# Keep one representative per cluster: the member with the largest importance.
selected_features = []
for cluster_id in np.unique(clusters):
    members = corr.columns[clusters == cluster_id]
    member_mask = feature_importance_df['Feature'].isin(members)
    candidates = feature_importance_df[member_mask]

    # A cluster without any ranked member contributes nothing.
    if candidates.empty:
        continue

    best_row = candidates['Importance'].idxmax()
    selected_features.append(candidates.loc[best_row, 'Feature'])

# Reduce the predictors to the selected representatives.
X = df[selected_features]
X.head()

In [None]:
# Show the column names of the current `X` (the features kept by the
# cluster-based selection in the previous cell).
X.columns