<div align="center">

# Playground Series S3 E24

## 💎 Predicting the Hardness of minerals from data 💎

### Welcome to my Kaggle notebook! Although I'm not a professional data scientist, I'm deeply interested in data analysis as a hobby. This project is an exploration of how well the hardness of minerals can be predicted from their physical and chemical properties.

</div>


In [1]:
# Data handling and plotting imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

# Plotting libraries
#import plotly.express as px

# ML imports
from catboost import CatBoostRegressor
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score




## Description of data points
---

* id: A unique identifier for each record in the dataset.
* allelectrons_Total: The total number of electrons in the materials.
* density_Total: The total density of the materials.
* allelectrons_Average: The average number of electrons in the materials.
* val_e_Average: The average number of valence electrons.
* atomicweight_Average: The average atomic weight of the elements in the materials.
* ionenergy_Average: The average ionization energy.
* el_neg_chi_Average: The average electronegativity.
* R_vdw_element_Average: The average van der Waals radius of the elements.
* R_cov_element_Average: The average covalent radius of the elements.
* zaratio_Average: The average ratio of atomic number to atomic mass (Z/A) of the elements.
* density_Average: The average density of the materials.
* Hardness: The hardness of the materials on the Mohs scale.

In [2]:
# Load the competition data. Forward slashes are used because they resolve on
# every operating system (Windows included), whereas the backslash form only
# works on Windows; this also matches the '../data/...' paths used when the
# submission is written.
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

## EDA
---

In [None]:
def analyze_dataframe(df):
    """
    Print and display a summary report of a pandas DataFrame.

    The report contains, in order: the ``info()`` listing, head, tail,
    transposed ``describe()`` statistics, null counts per column, number of
    duplicated rows, number of unique values per column, and the shape.

    Parameters:
    df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
    None
    """
    rule = "______________________"

    print("DataFrame Information:")
    print(rule)
    # DataFrame.info() writes its report to stdout and returns None, so it is
    # called directly; wrapping it in display() would render a stray "None".
    df.info(verbose=True, show_counts=True)
    print("\n")

    # (section title, callable producing the object to display)
    sections = [
        ("DataFrame Head:", df.head),
        ("DataFrame Tail:", df.tail),
        ("DataFrame Description:", lambda: df.describe().T),
        ("Number of Null Values:", lambda: df.isnull().sum()),
        ("Number of Duplicated Rows:", lambda: df.duplicated().sum()),
        ("Number of Unique Values:", df.nunique),
    ]
    for title, build in sections:
        print(title)
        print(rule)
        display(build())
        print("\n")

    print("DataFrame Shape:")
    print(rule)
    print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")

# Print the summary report for the training data.
analyze_dataframe(df_train)

# Visualizing the data
---

In [None]:
#TODO: Add a function to check for outliers


In [None]:
def prelim_eda_histplot(df):
    """
    Create a histogram (with KDE overlay) for each column of a DataFrame.

    The 'id' column is skipped when present. Plots are laid out on a grid
    that is four panels wide; the number of rows grows with the number of
    columns, so the function keeps working when the frame has more than
    twelve columns to plot.

    Parameters:
    df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
    None
    """
    # errors='ignore' lets the function accept frames that have no 'id' column.
    columns_to_plot = df.drop(columns=['id'], errors='ignore')

    num_columns = len(columns_to_plot.columns)
    num_cols = 4  # Panels per row
    # Ceiling division; at least one row so subplots() is always valid.
    num_rows = max(1, (num_columns + num_cols - 1) // num_cols)

    # squeeze=False guarantees a 2-D axes array even for a single row.
    # Four inches of height per row reproduces the (20, 12) figure for 3 rows.
    fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols,
                             figsize=(20, 4 * num_rows), squeeze=False)
    flat_axes = axes.flatten()

    # One histogram per column, filling the grid row by row
    for ax, column in zip(flat_axes, columns_to_plot.columns):
        sns.histplot(data=df, x=column, ax=ax, kde=True, bins=20)
        ax.set_title(f'{column}', fontsize=14)
        ax.set_aspect('auto')

    # Remove the unused panels at the end of the grid (if any)
    for ax in flat_axes[num_columns:]:
        fig.delaxes(ax)

    # Adjust layout
    plt.tight_layout()
    plt.show()

# Plot the distribution of every training column except 'id'.
prelim_eda_histplot(df_train)


In [None]:
def prelim_eda_boxplot(df):
    """
    Create a horizontal box plot for each column of a DataFrame.

    The 'id' column is skipped when present. Plots are laid out two per row.

    Parameters:
    df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
    None
    """
    # errors='ignore' lets the function accept frames that have no 'id' column.
    columns_to_plot = df.drop(columns=['id'], errors='ignore')

    num_columns = len(columns_to_plot.columns)
    # Two plots per row, rounded up; at least one row so subplots() is valid.
    num_rows = max(1, (num_columns + 1) // 2)

    # squeeze=False keeps `axes` 2-D even when a single row is enough.
    fig, axes = plt.subplots(nrows=num_rows, ncols=2, figsize=(20, 12), squeeze=False)
    flat_axes = axes.flatten()

    # Each plot holds a single box, so one colour is enough. Taking the first
    # "Set3" colour keeps the intended look without passing `palette` without
    # `hue`, which seaborn has deprecated.
    box_color = sns.color_palette("Set3")[0]

    # One box plot per column, filling the grid row by row
    for ax, column in zip(flat_axes, columns_to_plot.columns):
        sns.boxplot(x=df[column], ax=ax, color=box_color)
        ax.set_title(f'Box Plot of {column}', fontsize=14)

    # Remove the unused panel at the end of the grid (if any)
    for ax in flat_axes[num_columns:]:
        fig.delaxes(ax)

    # Adjust layout
    plt.tight_layout()
    plt.show()
# Draw a box plot for every training column except 'id'.
prelim_eda_boxplot(df_train)

In [None]:
def prelim_eda_heatmap(df):
    """
    Plot an annotated heatmap of the pairwise correlations between columns.

    Only numeric columns are included, so a frame that also holds text or
    categorical columns can be passed without raising.

    Parameters:
    df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
    None
    """
    # Compute the correlation matrix once and reuse it for the plot.
    corr = df.corr(numeric_only=True)
    plt.figure(figsize=(16, 12))
    sns.heatmap(corr, annot=True, fmt='.1f', cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.show()
# Show the correlation heatmap for the training data.
prelim_eda_heatmap(df_train)

In [None]:
from sklearn.ensemble import IsolationForest

# Columns the outlier detector is fitted on: the eleven descriptive features of
# the dataset. They are listed explicitly (rather than "everything except id
# and Hardness") so that re-running the cell never feeds the 'outlier_score' /
# 'outlier' columns created below back into the model.
outliers_inputs = [
    'allelectrons_Total', 'density_Total', 'allelectrons_Average',
    'val_e_Average', 'atomicweight_Average', 'ionenergy_Average',
    'el_neg_chi_Average', 'R_vdw_element_Average', 'R_cov_element_Average',
    'zaratio_Average', 'density_Average',
]

model_IF = IsolationForest(n_estimators=100, max_samples='auto', contamination=0.1,
                           max_features=1.0, bootstrap=False, n_jobs=-1,
                           random_state=42, verbose=0)
model_IF.fit(df_train[outliers_inputs])

# decision_function(): the lower the score, the more abnormal the row.
df_train['outlier_score'] = model_IF.decision_function(df_train[outliers_inputs])

# predict() returns -1 for outliers and 1 for inliers; recode to 1 = outlier,
# 0 = inlier in a single step.
df_train['outlier'] = (model_IF.predict(df_train[outliers_inputs]) == -1).astype(int)

df_train['outlier'].value_counts()


In [None]:
def outlier_plot(df, outlier_method_name, x_var, y_var):
    """
    Scatter-plot two columns, colouring each point by its outlier flag.

    NOTE(review): the behaviour is inferred from the parameter names; confirm
    that this is the plot that was intended.

    Parameters:
    df (pandas.DataFrame): Data to plot. Must contain an 'outlier' column
        (1 = outlier, 0 = inlier) in addition to x_var and y_var.
    outlier_method_name (str): Name of the detection method, used in the title.
    x_var (str): Column plotted on the x axis.
    y_var (str): Column plotted on the y axis.

    Returns:
    None
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(data=df, x=x_var, y=y_var, hue='outlier', ax=ax)
    ax.set_title(f'Outliers detected by {outlier_method_name}: {x_var} vs {y_var}')
    plt.tight_layout()
    plt.show()

In [6]:
# NOTE(review): the K-Means feature-engineering code in this cell is disabled
# (commented out); only the final df_train.head(20) executes. The saved output
# under this cell lists distance_to_cluster_* and cluster_*_mean_density
# columns, which only the commented-out code creates, so that output appears to
# come from an earlier run and will not be reproduced by Restart & Run All.

# from sklearn.cluster import KMeans

# # Assuming df_train is your DataFrame

# # Select the number of clusters
# n_clusters = 7  # This is an arbitrary choice; you might need to tune this number

# # Perform K-Means clustering
# kmeans = KMeans(n_clusters=n_clusters, random_state=42)
# df_train['cluster'] = kmeans.fit_predict(df_train.drop(['Hardness', 'id'], axis=1))

# # Calculate distance to cluster centroids
# centroids = kmeans.cluster_centers_
# distances = kmeans.transform(df_train.drop(['Hardness', 'id', 'cluster'], axis=1))
# for i in range(n_clusters):
#     df_train[f'distance_to_cluster_{i}'] = distances[:, i]

# # Optional: Mean properties of each cluster
# for i in range(n_clusters):
#     cluster_mean = df_train[df_train['cluster'] == i].mean()
#   df_train[f'cluster_{i}_mean_density'] = df_train['cluster'].map(lambda x: cluster_mean['density_Total'] if x == i else 0)

# View the modified DataFrame
df_train.head(20)


Unnamed: 0,id,allelectrons_Total,density_Total,allelectrons_Average,val_e_Average,atomicweight_Average,ionenergy_Average,el_neg_chi_Average,R_vdw_element_Average,R_cov_element_Average,...,distance_to_cluster_4,distance_to_cluster_5,distance_to_cluster_6,cluster_0_mean_density,cluster_1_mean_density,cluster_2_mean_density,cluster_3_mean_density,cluster_4_mean_density,cluster_5_mean_density,cluster_6_mean_density
0,0,100.0,0.841611,10.0,4.8,20.612526,11.0881,2.766,1.732,0.86,...,3228.053176,382.242276,15213.563533,0.0,0.0,10.208979,0.0,0.0,0.0,0.0
1,1,100.0,7.558488,10.0,4.8,20.298893,12.04083,2.755,1.631,0.91,...,3228.02254,381.598311,15213.281276,0.0,0.0,10.208979,0.0,0.0,0.0,0.0
2,2,76.0,8.885992,15.6,5.6,33.739258,12.0863,2.828,1.788,0.864,...,3252.035036,405.229828,15237.216342,0.0,0.0,10.208979,0.0,0.0,0.0,0.0
3,3,100.0,8.795296,10.0,4.8,20.213349,10.9485,2.648,1.626,0.936,...,3228.018474,381.492359,15213.229746,0.0,0.0,10.208979,0.0,0.0,0.0,0.0
4,4,116.0,9.577996,11.6,4.8,24.988133,11.82448,2.766,1.682,0.896,...,3212.014526,365.363794,15197.213282,0.0,0.0,10.208979,0.0,0.0,0.0,0.0
5,5,131.0,24.529328,21.833333,4.666667,50.1315,11.9482,2.858333,1.758333,0.896667,...,3197.133084,349.914965,15181.651139,0.0,0.0,10.208979,0.0,0.0,0.0,0.0
6,6,50.0,3.253996,10.0,4.8,20.077018,11.02384,2.727,1.767,0.88,...,3278.040391,431.766709,15263.417647,0.0,0.0,10.208979,0.0,0.0,0.0,0.0
7,7,30.0,2.701748,6.0,4.0,11.663548,12.41584,2.766,1.495,0.71,...,3298.066571,452.076146,15283.42335,0.0,0.0,10.208979,0.0,0.0,0.0,0.0
8,8,120.0,14.632,12.0,4.8,24.808518,11.33182,2.87,1.7185,0.91,...,3208.003627,360.970135,15193.006911,0.0,0.0,10.208979,0.0,0.0,0.0,0.0
9,9,97.0,13.477328,12.8125,5.0,24.988133,11.037175,2.81875,1.60125,0.88125,...,3231.005726,383.982364,15216.035296,0.0,0.0,10.208979,0.0,0.0,0.0,0.0


In [None]:
from sklearn.neighbors import NearestNeighbors

# Features the neighbour search runs on. They are listed explicitly so that
# columns engineered elsewhere in the notebook (including the ones this cell
# adds) are never fed back into the distance computation on a re-run.
knn_features = [
    'allelectrons_Total', 'density_Total', 'allelectrons_Average',
    'val_e_Average', 'atomicweight_Average', 'ionenergy_Average',
    'el_neg_chi_Average', 'R_vdw_element_Average', 'R_cov_element_Average',
    'zaratio_Average', 'density_Average',
]
X = df_train[knn_features]

# Initialize NearestNeighbors with exactly the five neighbours the feature
# names refer to.
# NOTE(review): the features are not scaled, so columns with large ranges
# presumably dominate the distances - consider standardising first.
knn = NearestNeighbors(n_neighbors=5, algorithm='kd_tree', n_jobs=-1)
knn.fit(X)

# Calling kneighbors() without an argument queries the fitted points and does
# NOT count a point as its own neighbour. Querying with X instead would return
# each row itself at distance 0 and bias every mean below.
distances, indices = knn.kneighbors()

# Mean distance to the 5 nearest neighbors
df_train['mean_distance_to_5_neighbors'] = distances.mean(axis=1)

# Mean of selected properties over each row's neighbours. `indices` has shape
# (n_rows, 5) and holds positional row numbers, so fancy-indexing the column
# values and averaging along axis 1 replaces the per-row Python loop.
df_train['mean_density_neighbors'] = df_train['density_Total'].to_numpy()[indices].mean(axis=1)
df_train['mean_val_e_neighbors'] = df_train['val_e_Average'].to_numpy()[indices].mean(axis=1)
df_train['mean_allelectrons_neighbors'] = df_train['allelectrons_Total'].to_numpy()[indices].mean(axis=1)

# Display the head of the modified DataFrame
df_train.head()


In [None]:
def preprocessing_newfeat(df):
    """
    Drop the id column and add BMI, age-group and blood-pressure features.

    NOTE(review): the columns this function needs ('weight(kg)', 'height(cm)',
    'age', 'systolic', 'relaxation') are not among the mineral features
    described in this notebook; it presumably comes from a different dataset.
    Confirm whether it is still needed here.

    Parameters:
    df (pandas.DataFrame): Input data. Must contain 'id', 'weight(kg)',
        'height(cm)', 'age', 'systolic' and 'relaxation'.

    Returns:
    pandas.DataFrame: A new frame (the input is not modified) without 'id' and
        with the extra columns 'BMI', 'age_group' and 'hypertension_group'.

    Raises:
    KeyError: If any required column is missing.
    """
    required = ['id', 'weight(kg)', 'height(cm)', 'age', 'systolic', 'relaxation']
    missing = [col for col in required if col not in df.columns]
    if missing:
        # Fail up front with the full list instead of on the first bad lookup.
        raise KeyError(f"preprocessing_newfeat: missing required column(s): {missing}")

    # Drop Id (drop() returns a new frame, so the caller's data is untouched)
    df = df.drop(columns=['id'])

    # Define BMI: weight in kg divided by squared height in metres
    df['BMI'] = df['weight(kg)'] / ((df['height(cm)'] / 100) ** 2)

    # Define age groups. The labels describe right-inclusive ranges
    # ('21-30' means 21..30), so the bins must be closed on the right;
    # include_lowest keeps age 0 inside the first group.
    bins = [0, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    labels = ['0-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100']
    df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=True, include_lowest=True)

    # Define hypertension groups. np.select evaluates the conditions in order
    # and takes the first match, mirroring an if/elif chain without a row-wise
    # apply; rows matching nothing (e.g. missing readings) are 'Unclassified'.
    systolic = df['systolic']
    relaxation = df['relaxation']
    conditions = [
        (systolic < 120) & (relaxation < 80),
        (systolic >= 120) & (systolic < 130) & (relaxation < 80),
        ((systolic >= 130) & (systolic < 140)) | ((relaxation >= 80) & (relaxation < 90)),
        (systolic >= 140) | (relaxation >= 90),
    ]
    categories = ['Normal', 'Elevated', 'Hypertension Stage 1', 'Hypertension Stage 2']
    df['hypertension_group'] = np.select(conditions, categories, default='Unclassified')

    return df

In [None]:
def transform_outliers(df, features):
    """
    Cap outliers in the given features of the dataframe using the IQR method.

    For each feature, values below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR are
    replaced by the nearest of those two bounds; missing values are left as
    they are.

    Note: the columns are overwritten on the frame that is passed in, so the
    caller's DataFrame is modified. The same object is returned for chaining.

    :param df: DataFrame containing the data
    :param features: List of features in which to transform outliers
    :return: The same DataFrame, with the listed features capped
    """
    for feature in features:
        # Q1 (25th percentile) and Q3 (75th percentile) of the feature
        q1, q3 = df[feature].quantile(0.25), df[feature].quantile(0.75)

        # Interquartile range and the resulting outlier fences
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Vectorised capping: values outside the fences become the fence value
        df[feature] = df[feature].clip(lower=lower_bound, upper=upper_bound)

    return df

In [None]:
def master_preprocessing(df, features_to_transform):
    """
    Run the full preprocessing chain on a dataframe.

    The new-feature step runs first and its result is then passed through the
    outlier transformation.

    :param df: DataFrame to be processed
    :param features_to_transform: List of features to transform outliers in
    :return: Preprocessed DataFrame
    """
    # Step 1: feature engineering
    with_new_features = preprocessing_newfeat(df)

    # Step 2: outlier handling on the requested features
    return transform_outliers(with_new_features, features_to_transform)

# ML Pipeline
___

### Dropping the id column

In [None]:
# Remove the identifier column from both splits; it is not a modelling input.
pp_train, pp_test = (frame.drop(columns=["id"]) for frame in (df_train, df_test))

In [None]:

# Single seed shared by the train/validation split and every stochastic
# estimator, so the comparison below is reproducible across runs.
RANDOM_STATE = 42

X = pp_train.drop(columns=["Hardness"])
y = pp_train["Hardness"]

# Define numeric features
numeric_features = ['allelectrons_Total', 'density_Total', 'allelectrons_Average',
                    'val_e_Average', 'atomicweight_Average', 'ionenergy_Average',
                    'el_neg_chi_Average', 'R_vdw_element_Average', 'R_cov_element_Average',
                    'zaratio_Average', 'density_Average']

# Create a transformer for numeric features
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),  # Standardize the data
])

# Use ColumnTransformer to apply the numeric transformer. Only the columns in
# numeric_features are passed on to the model; with ColumnTransformer's default
# remainder='drop', any other column present in X is discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ]
)

# list of regressors and their respective hyperparameter grids
# (tree-based estimators are seeded; SVR has no random component to seed)
regressors = {
    'DecisionTreeRegressor': (DecisionTreeRegressor(random_state=RANDOM_STATE),
                              {'regressor__max_depth': [4,5,6]}),
    'RandomForestRegressor': (RandomForestRegressor(random_state=RANDOM_STATE),
                              {'regressor__n_estimators': [211,216,220], 'regressor__max_depth': [21,25,30]}),
    'GradientBoostingRegressor': (GradientBoostingRegressor(random_state=RANDOM_STATE),
                                  {'regressor__n_estimators': [286], 'regressor__learning_rate': [0.1]}),
    'SVR': (SVR(), {'regressor__C': [10], 'regressor__gamma': ['scale']})
}


# Create a dictionary to store the best models and their scores
best_models = {}
model_scores = {}

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Initialize variables to track the best overall model and its score
best_overall_score = float('inf')
best_overall_model = None

# Iterate through the regressors and perform GridSearchCV
for reg_name, (reg, param_grid) in regressors.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', reg)])

    grid_search = GridSearchCV(pipeline, param_grid, cv=10, n_jobs=-1, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    # Predict on the validation set
    y_val_pred = grid_search.predict(X_val)

    # Calculate MSE and R-squared
    mse_val = mean_squared_error(y_val, y_val_pred)
    r2_val = r2_score(y_val, y_val_pred)
    print(f'{reg_name} MSE on Validation: {mse_val:.5f}')
    print(f'{reg_name} R-squared on Validation: {r2_val:.5f}')

    # Store the best model and its score. The scorer is *negative* MSE, so the
    # sign is flipped to get a plain MSE where lower is better.
    best_models[reg_name] = grid_search.best_estimator_
    best_score = -grid_search.best_score_
    model_scores[reg_name] = best_score
    print(f'Best {reg_name} Model: {grid_search.best_params_}')
    print(f'Best {reg_name} Cross-Validation MSE: {best_score:.5f}')

    # Update the best overall model and score if current model is better
    if best_score < best_overall_score:
        best_overall_score = best_score
        best_overall_model = reg_name

# After all models are evaluated, print the best model overall
print(f'Best Overall Model: {best_overall_model}')
print(f'Best Overall Cross-Validation MSE: {best_overall_score:.5f}')

In [None]:
# Use the model that won the cross-validation comparison. Looking it up with
# `reg_name` would silently pick whichever regressor the training loop visited
# last, regardless of its score.
final_model = best_models[best_overall_model]

X_test = pp_test

# Ids come from the test frame that is already loaded; there is no need to
# read test.csv a second time.
id_df = df_test[['id']]

y_pred = final_model.predict(X_test)

# Both frames share df_test's row order, so a column-wise concat pairs each id
# with its prediction.
predicted_df = pd.DataFrame({'Hardness': y_pred}, index=id_df.index)
submission_df = pd.concat([id_df, predicted_df], axis=1)

submission_df.to_csv('../data/submission.csv', index=False)


In [None]:
# Preview the first rows of the submission frame.
submission_df.head()