<div align="center">

# Playground Series S3 E25

## 💎 Predicting the Hardness of minerals from data 💎

### Welcome to my Kaggle notebook! Although I'm not a professional data scientist, I'm deeply interested in data analysis as a hobby. This project is an exciting exploration into using data to predict the hardness of minerals.

</div>


In [None]:

# Data handling and plotting imports
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

# ML imports
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor



## Description of data points
---

* id: A unique identifier for each record in the dataset.
* allelectrons_Total: The total number of electrons in the materials.
* density_Total: The total density of the materials.
* allelectrons_Average: The average number of electrons in the materials.
* val_e_Average: The average number of valence electrons.
* atomicweight_Average: The average atomic weight of the elements in the materials.
* ionenergy_Average: The average ionization energy.
* el_neg_chi_Average: The average electronegativity.
* R_vdw_element_Average: The average van der Waals radius of the elements.
* R_cov_element_Average: The average covalent radius of the elements.
* zaratio_Average: The average ratio of atomic number to atomic mass (Z/A) of the elements in the materials.
* density_Average: The average density of the materials.
* Hardness: The hardness of the materials on the Mohs scale.

In [None]:
# Read the competition's train and test CSV files (in that order) into DataFrames.
df_train, df_test = map(
    pd.read_csv,
    (
        '/kaggle/input/playground-series-s3e25/train.csv',
        '/kaggle/input/playground-series-s3e25/test.csv',
    ),
)

## Initial data exploration
---

In [None]:
def analyze_dataframe(df):
    """
    Print and display a quick structural summary of a pandas DataFrame.

    Shown in order: the ``df.info()`` report, head, tail, transposed
    ``describe()``, per-column null counts, number of duplicated rows,
    per-column unique-value counts, and the overall shape.

    Parameters:
    df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
    None
    """
    rule = "______________________"

    print("DataFrame Information:")
    print(rule)
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in display() -- that would render a stray "None".
    df.info(verbose=True, show_counts=True)
    print("\n")

    # (section title, zero-argument callable producing the object to display).
    # Callables keep each computation lazy, so sections are still produced
    # one after another in the listed order.
    sections = (
        ("DataFrame Head:", df.head),
        ("DataFrame Tail:", df.tail),
        ("DataFrame Description:", lambda: df.describe().T),
        ("Number of Null Values:", lambda: df.isnull().sum()),
        ("Number of Duplicated Rows:", lambda: df.duplicated().sum()),
        ("Number of Unique Values:", df.nunique),
    )
    for title, build in sections:
        print(title)
        print(rule)
        display(build())
        print("\n")

    print("DataFrame Shape:")
    print(rule)
    print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")

# Print the structural summary for the training data.
analyze_dataframe(df_train)

In [None]:
def prelim_eda_boxplot(df):
    """
    Draw one box plot per feature column, laid out in a two-column grid.

    The ``id`` and ``Hardness`` columns are excluded when present, so the
    function accepts frames with or without them.

    Parameters:
    df (pandas.DataFrame): Frame whose remaining columns are plotted.

    Returns:
    None
    """
    # errors='ignore' lets frames lacking 'id' or 'Hardness' through instead of raising.
    feature_names = list(df.drop(columns=['id', 'Hardness'], errors='ignore').columns)
    num_columns = len(feature_names)
    if num_columns == 0:
        # Nothing to plot; a zero-row subplot grid is not valid.
        return

    num_rows = (num_columns + 1) // 2  # ceil(num_columns / 2) rows of two plots
    # squeeze=False keeps `axes` two-dimensional even when there is only one row.
    fig, axes = plt.subplots(nrows=num_rows, ncols=2, figsize=(20, 12), squeeze=False)
    flat_axes = axes.flatten()

    # A single colour taken from "Set3" replaces `palette="Set3"`: passing
    # `palette` without `hue` is deprecated in seaborn 0.13.
    box_color = sns.color_palette("Set3")[0]

    # Fill the grid row by row, one feature per subplot.
    for ax, column in zip(flat_axes, feature_names):
        sns.boxplot(x=df[column], ax=ax, color=box_color)
        ax.set_title(f'Box Plot of {column}', fontsize=14)

    # Remove the unused trailing subplot when the feature count is odd.
    for ax in flat_axes[num_columns:]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()
# Draw the per-feature box plots for the training data.
prelim_eda_boxplot(df_train)

### Dropping the `id` column before modeling

In [None]:
# Build id-free copies of both frames; df_train and df_test themselves are not modified.
pp_train, pp_test = (frame.drop(columns=["id"]) for frame in (df_train, df_test))

## ML Pipeline
---

In [None]:
# Separate features and target variable
X = pp_train.drop(columns=["Hardness"])
y = pp_train["Hardness"]

# Columns that are standardized and passed to the regressors
numeric_features = ['allelectrons_Total', 'density_Total', 'allelectrons_Average',
                    'val_e_Average', 'atomicweight_Average', 'ionenergy_Average',
                    'el_neg_chi_Average', 'R_vdw_element_Average', 'R_cov_element_Average',
                    'zaratio_Average', 'density_Average']

# Transformer for numeric features: standardization only
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),  # Standardize the data
])

# Apply the numeric transformer to the numeric feature columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ]
)

# Candidate regressors and their hyperparameter grids.
# Grid keys carry the 'regressor__' prefix because each estimator is placed in
# a pipeline step named 'regressor'.
# The tree-based estimators use randomness internally; they are seeded (with
# the same value as the train/validation split) so that repeated runs give the
# same scores and therefore the same selected model.
regressors = {
    'LinearRegression': (LinearRegression(), {}),
    'Ridge': (Ridge(), {'regressor__alpha': [0.1, 1, 10]}),
    'Lasso': (Lasso(), {'regressor__alpha': [0.1, 1, 10]}),
    'DecisionTreeRegressor': (
        DecisionTreeRegressor(random_state=42),
        {'regressor__max_depth': [3, 5, 10]},
    ),
    'RandomForestRegressor': (
        RandomForestRegressor(random_state=42),
        {'regressor__n_estimators': [100, 200], 'regressor__max_depth': [3, 5, 10]},
    ),
    'GradientBoostingRegressor': (
        GradientBoostingRegressor(random_state=42),
        {'regressor__n_estimators': [100, 200], 'regressor__learning_rate': [0.05, 0.1, 0.2]},
    ),
    'SVR': (SVR(), {'regressor__C': [0.1, 1, 10], 'regressor__gamma': ['scale', 'auto']})
}


# Hold out 20% of the rows for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Per-regressor results, keyed by regressor name
best_models = {}   # best pipeline found by each grid search
model_scores = {}  # its mean cross-validation MSE (as a positive number)

# Running winner: the lowest cross-validation MSE seen so far
best_overall_model = None
best_overall_score = float('inf')

# Tune each candidate with a 5-fold grid search, then report and record it
for reg_name, (reg, param_grid) in regressors.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', reg),
    ])

    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=5,
        n_jobs=-1,
        scoring='neg_mean_squared_error',
    )
    grid_search.fit(X_train, y_train)

    # Hold-out metrics for the tuned model
    y_val_pred = grid_search.predict(X_val)
    mse_val = mean_squared_error(y_val, y_val_pred)
    r2_val = r2_score(y_val, y_val_pred)
    print(f'{reg_name} MSE on Validation: {mse_val:.5f}')
    print(f'{reg_name} R-squared on Validation: {r2_val:.5f}')

    # The scorer is the negated MSE, so flip the sign to get a plain MSE
    best_score = -grid_search.best_score_
    best_models[reg_name] = grid_search.best_estimator_
    model_scores[reg_name] = best_score
    print(f'Best {reg_name} Model: {grid_search.best_params_}')
    print(f'Best {reg_name} Cross-Validation MSE: {best_score:.5f}')

    # Strict comparison: on a tie the earlier regressor stays the winner
    if best_score < best_overall_score:
        best_overall_model, best_overall_score = reg_name, best_score

# Report the winner once every candidate has been evaluated
print(f'Best Overall Model: {best_overall_model}')
print(f'Best Overall Cross-Validation MSE: {best_overall_score:.5f}')


## Submission
---

In [None]:
# Use the pipeline with the lowest cross-validation MSE. `reg_name` is only the
# loop variable of the model-search cell and holds the last regressor that was
# tried, not the best one, so it must not be used for this lookup.
final_model = best_models[best_overall_model]

# Test features with the id column already removed
X_test = pp_test

y_pred = final_model.predict(X_test)

# Pair each test id with its prediction. The prediction array is assigned
# positionally, so the result does not depend on index alignment.
submission_df = pd.DataFrame({'id': df_test['id'], 'Hardness': y_pred})

submission_df.to_csv('submission.csv', index=False)
