# Thermophysical Property: Melting Point

Dataset - https://www.kaggle.com/competitions/melting-point/data


In [98]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Loading dataset


In [99]:
train_df = pd.read_csv("./melting-point/train.csv")
test_df = pd.read_csv("./melting-point/test.csv")

### Data inspection


In [100]:
train_df.head()

Unnamed: 0,id,SMILES,Tm,Group 1,Group 2,Group 3,Group 4,Group 5,Group 6,Group 7,...,Group 415,Group 416,Group 417,Group 418,Group 419,Group 420,Group 421,Group 422,Group 423,Group 424
0,2175,FC1=C(F)C(F)(F)C1(F)F,213.15,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1222,c1ccc2c(c1)ccc3Nc4ccccc4c23,407.15,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2994,CCN1C(C)=Nc2ccccc12,324.15,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1704,CC#CC(=O)O,351.15,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2526,CCCCC(S)C,126.15,2,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [101]:
train_df.shape

(2662, 427)

In [102]:
train_df["Tm"].describe()

count    2662.000000
mean      278.263452
std        85.117914
min        53.540000
25%       217.000000
50%       277.300000
75%       325.150000
max       897.150000
Name: Tm, dtype: float64

In [103]:
# Checking for missing values

train_df.isnull().any().sum()

0

### Data preprocessing


In [104]:
# Splitting predictor and target variables

x = train_df.drop(columns=["Tm", "id"], axis=1)
y = train_df["Tm"]

In [105]:
# Checking categorical columns

categorical_cols = x.select_dtypes(include="object").columns.to_list()
categorical_cols

['SMILES']

In [106]:
# Checking numerical columns

numerical_cols = x.select_dtypes(include=["int64", "float64"]).columns.to_list()
numerical_cols

['Group 1',
 'Group 2',
 'Group 3',
 'Group 4',
 'Group 5',
 'Group 6',
 'Group 7',
 'Group 8',
 'Group 9',
 'Group 10',
 'Group 11',
 'Group 12',
 'Group 13',
 'Group 14',
 'Group 15',
 'Group 16',
 'Group 17',
 'Group 18',
 'Group 19',
 'Group 20',
 'Group 21',
 'Group 22',
 'Group 23',
 'Group 24',
 'Group 25',
 'Group 26',
 'Group 27',
 'Group 28',
 'Group 29',
 'Group 30',
 'Group 31',
 'Group 32',
 'Group 33',
 'Group 34',
 'Group 35',
 'Group 36',
 'Group 37',
 'Group 38',
 'Group 39',
 'Group 40',
 'Group 41',
 'Group 42',
 'Group 43',
 'Group 44',
 'Group 45',
 'Group 46',
 'Group 47',
 'Group 48',
 'Group 49',
 'Group 50',
 'Group 51',
 'Group 52',
 'Group 53',
 'Group 54',
 'Group 55',
 'Group 56',
 'Group 57',
 'Group 58',
 'Group 59',
 'Group 60',
 'Group 61',
 'Group 62',
 'Group 63',
 'Group 64',
 'Group 65',
 'Group 66',
 'Group 67',
 'Group 68',
 'Group 69',
 'Group 70',
 'Group 71',
 'Group 72',
 'Group 73',
 'Group 74',
 'Group 75',
 'Group 76',
 'Group 77',
 'Group 

In [107]:
# Splitting data into training and testing variables

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

In [108]:
# Scaling and one-hot encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

preprocessor = ColumnTransformer(
    transformers=[
        ("scaling", StandardScaler(), numerical_cols),
        (
            "ohe",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            categorical_cols,
        ),
    ],
    remainder="passthrough",
)

X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

### Model training

In [109]:
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor

In [110]:
models = {
    'LinearRegression': LinearRegression(),
    'RandomForestRegressor': RandomForestRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'XGBRegressor': XGBRegressor()
}

In [111]:
accuracy_scores = {}

for name,model in models.items():
    model.fit(X_train_transformed, y_train)
    y_pred = model.predict(X_test_transformed)
    score = r2_score(y_test,y_pred)
    accuracy_scores[name] = score

In [112]:
for name, accuracy in accuracy_scores.items():
    print(f"{name} - {accuracy}")

LinearRegression - 0.5599588000447572
RandomForestRegressor - 0.6082457971007282
GradientBoostingRegressor - 0.47402831197431816
XGBRegressor - 0.5923373051997993


### Model testing

In [113]:
from sklearn.pipeline import Pipeline

testing_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model',RandomForestRegressor())
])

testing_pipeline.fit(X_train, y_train)

In [114]:
X_test = test_df.drop(columns=['id'], axis=1)
y_pred = testing_pipeline.predict(X_test)

In [115]:
submission = pd.DataFrame({
    'id':test_df['id'],
    'Tm': y_pred
})

In [116]:
submission.head()

Unnamed: 0,id,Tm
0,1022,326.432
1,1146,284.951
2,79,226.048
3,2279,224.728
4,1342,252.694


In [117]:
submission.to_csv('submission.csv', index=False)