In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.preprocessing import StandardScaler  
from sklearn.neural_network import MLPRegressor

# Importing the datasets
#### There are 2 datasets.  One contains the physical and chemical properties including the Boiling point, Melting point, Standard Enthalpies, etc. The other file contains the chemical decomposition of the material.

In [20]:
train_frame = pd.read_csv("train.csv")
train_frame=train_frame.drop(train_frame.columns[train_frame.shape[1]-1], axis=1)
train_frame2 = pd.read_csv("unique_m.csv")
train_frame=pd.concat([train_frame, train_frame2], axis=1)
train_frame=train_frame.drop(train_frame.columns[train_frame.shape[1]-1], axis=1)
X_all=train_frame.iloc[:, 0:train_frame.shape[1]-1]
Y_all=train_frame.iloc[:, train_frame.shape[1]-1:train_frame.shape[1]]
X_train, X_test, Y_train, Y_test= train_test_split(X_all, Y_all, test_size=0.2, random_state=42)
X_train=X_train.values
Y_train=Y_train.values
X_test=X_test.values
Y_test=Y_test.values
print("Shape of X_train : "+str(X_train.shape)+" the type is "+str(type(X_train)))
print("Shape of Y_train : "+str(Y_train.shape)+" the type is "+str(type(Y_train)))
print("Shape of X_test :" +str(X_test.shape)+" the type is "+str(type(X_test)))
print("Shape of the Y_test: "+str(Y_test.shape)+" the type is "+str(type(Y_test)))
# #these are arrays now we need to check the outputs for various different classifier.
# #note that its a regression problem, depending upon the featurs, we need to predict the value of the Temperature
# #Lets Normalize the data first.


Shape of X_train : (17010, 167) the type is <class 'numpy.ndarray'>
Shape of Y_train : (17010, 1) the type is <class 'numpy.ndarray'>
Shape of X_test :(4253, 167) the type is <class 'numpy.ndarray'>
Shape of the Y_test: (4253, 1) the type is <class 'numpy.ndarray'>


# Making the input standard to improve learning

In [22]:
#Standardize the data for better learning 
scaler = StandardScaler()  
scaler.fit(X_train)  
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)


# Using a Support Vector Machine Regressor 
### Higher the Penalty factor, more the rigid decision boundaries are and increased computation cost on the system. Score metric- R^2 loss.
#### The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum(). The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.

In [62]:
clf1=svm.SVR(C=30)
clf1.fit(X_train_std, Y_train)
clf1.score(X_test_std, Y_test)

  y = column_or_1d(y, warn=True)


0.8547894574946237

# Neural Network Regression Model
## Uses squared error loss metric and relu activation in between the layers
### Regularised used- Early Stopping. Score metric R^2 loss. 
#### The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum(). The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.



In [46]:
clf2=MLPRegressor(batch_size=32, early_stopping=True, verbose=True,learning_rate="invscaling", hidden_layer_sizes=(500,100,20,))
clf2.fit(X_train_std, Y_train)

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 173.74580957
Validation score: 0.810732
Iteration 2, loss = 104.88880099
Validation score: 0.843735
Iteration 3, loss = 95.15120024
Validation score: 0.853921
Iteration 4, loss = 88.17802535
Validation score: 0.864891
Iteration 5, loss = 83.06189378
Validation score: 0.871603
Iteration 6, loss = 77.34945783
Validation score: 0.868033
Iteration 7, loss = 74.70103033
Validation score: 0.873347
Iteration 8, loss = 74.12671053
Validation score: 0.872941
Iteration 9, loss = 70.89684799
Validation score: 0.873124
Iteration 10, loss = 70.19262212
Validation score: 0.879820
Iteration 11, loss = 68.03874526
Validation score: 0.883598
Iteration 12, loss = 66.54157745
Validation score: 0.887163
Iteration 13, loss = 63.39973741
Validation score: 0.876365
Iteration 14, loss = 62.56462945
Validation score: 0.882795
Iteration 15, loss = 60.65536497
Validation score: 0.885462
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


MLPRegressor(activation='relu', alpha=0.0001, batch_size=32, beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(500, 100, 20), learning_rate='invscaling',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=True, warm_start=False)

In [47]:
clf2.score(X_test_std, Y_test)

0.8879774695782413

In [51]:
clf3=MLPRegressor(batch_size=32, early_stopping=True, verbose=True,learning_rate="adaptive", hidden_layer_sizes=(500,100,20,))
clf3.fit(X_train_std, Y_train)
clf3.score(X_test_std, Y_test)

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 175.05550141
Validation score: 0.818540
Iteration 2, loss = 105.61352249
Validation score: 0.833150
Iteration 3, loss = 96.40270102
Validation score: 0.847072
Iteration 4, loss = 85.43809116
Validation score: 0.842014
Iteration 5, loss = 81.39629065
Validation score: 0.847854
Iteration 6, loss = 78.24058413
Validation score: 0.863327
Iteration 7, loss = 73.86253278
Validation score: 0.858489
Iteration 8, loss = 72.10419566
Validation score: 0.866759
Iteration 9, loss = 70.09787502
Validation score: 0.872933
Iteration 10, loss = 67.38422541
Validation score: 0.870045
Iteration 11, loss = 66.21732248
Validation score: 0.872182
Iteration 12, loss = 65.98101825
Validation score: 0.874400
Iteration 13, loss = 63.18328309
Validation score: 0.877872
Iteration 14, loss = 62.58062633
Validation score: 0.877271
Iteration 15, loss = 61.91643628
Validation score: 0.877298
Iteration 16, loss = 59.73537474
Validation score: 0.880298
Iteration 17, loss = 59.08020533
Validation sco

0.8935364266650208

In [49]:
clf4=MLPRegressor(batch_size=64, early_stopping=True, verbose=True,learning_rate="adaptive", hidden_layer_sizes=(500,100,20,))
clf4.fit(X_train_std, Y_train)
clf4.score(X_test_std, Y_test)

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 213.84100382
Validation score: 0.796523
Iteration 2, loss = 107.91713014
Validation score: 0.818604
Iteration 3, loss = 97.82378075
Validation score: 0.826815
Iteration 4, loss = 89.09075550
Validation score: 0.843979
Iteration 5, loss = 83.92387188
Validation score: 0.850580
Iteration 6, loss = 79.10142948
Validation score: 0.854356
Iteration 7, loss = 75.34382585
Validation score: 0.851999
Iteration 8, loss = 74.76930055
Validation score: 0.867361
Iteration 9, loss = 70.40918615
Validation score: 0.856065
Iteration 10, loss = 69.32189181
Validation score: 0.865879
Iteration 11, loss = 67.24946586
Validation score: 0.848532
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


0.8758956268619689

In [53]:
clf7=MLPRegressor(batch_size=32, early_stopping=True, verbose=True,learning_rate="adaptive", hidden_layer_sizes=(500,100,20,))
clf7.fit(X_train_std, Y_train)
clf7.score(X_test_std, Y_test)

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 159.16750993
Validation score: 0.821094
Iteration 2, loss = 101.92259746
Validation score: 0.844458
Iteration 3, loss = 92.71528641
Validation score: 0.851593
Iteration 4, loss = 86.28530578
Validation score: 0.845682
Iteration 5, loss = 82.52380637
Validation score: 0.851729
Iteration 6, loss = 76.85495881
Validation score: 0.858882
Iteration 7, loss = 74.93437903
Validation score: 0.824006
Iteration 8, loss = 72.75177619
Validation score: 0.864155
Iteration 9, loss = 70.84384786
Validation score: 0.870486
Iteration 10, loss = 68.29285459
Validation score: 0.875336
Iteration 11, loss = 65.98404374
Validation score: 0.872384
Iteration 12, loss = 64.46145694
Validation score: 0.867741
Iteration 13, loss = 64.12483146
Validation score: 0.879985
Iteration 14, loss = 63.05516240
Validation score: 0.885587
Iteration 15, loss = 60.77034863
Validation score: 0.884495
Iteration 16, loss = 59.81281801
Validation score: 0.883296
Iteration 17, loss = 58.05872028
Validation sco

0.8910338101702322

In [54]:
clf8=MLPRegressor(batch_size=32, early_stopping=False, verbose=True,learning_rate="invscaling", hidden_layer_sizes=(500,100,20,))
clf8.fit(X_train_std, Y_train)
clf8.score(X_test_std, Y_test)

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 165.69187342
Iteration 2, loss = 104.49895384
Iteration 3, loss = 92.47038400
Iteration 4, loss = 86.22899868
Iteration 5, loss = 81.70295475
Iteration 6, loss = 77.14981213
Iteration 7, loss = 74.06432963
Iteration 8, loss = 70.92371556
Iteration 9, loss = 70.06862639
Iteration 10, loss = 66.95599309
Iteration 11, loss = 65.62439144
Iteration 12, loss = 65.01318693
Iteration 13, loss = 61.96536594
Iteration 14, loss = 62.35832564
Iteration 15, loss = 59.41693334
Iteration 16, loss = 59.57695405
Iteration 17, loss = 58.61965290
Iteration 18, loss = 56.31074944
Iteration 19, loss = 54.86195276
Iteration 20, loss = 54.33171126
Iteration 21, loss = 54.14790112
Iteration 22, loss = 53.38640222
Iteration 23, loss = 52.36690918
Iteration 24, loss = 51.00426875
Iteration 25, loss = 50.45049084
Iteration 26, loss = 49.31639210
Iteration 27, loss = 49.36544533
Iteration 28, loss = 48.51796611
Iteration 29, loss = 48.54640048
Iteration 30, loss = 47.48097502
Iteration 31, los

0.9179005127483747

In [61]:
clf9=MLPRegressor(batch_size=32, early_stopping=True, verbose=True,learning_rate="invscaling", hidden_layer_sizes=(500,100,20,), max_iter=1000)
clf9.fit(X_train_std, Y_train)
clf9.score(X_test_std, Y_test)

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 171.46322826
Validation score: 0.792167
Iteration 2, loss = 104.91485516
Validation score: 0.833154
Iteration 3, loss = 94.33602053
Validation score: 0.847224
Iteration 4, loss = 89.45549020
Validation score: 0.837888
Iteration 5, loss = 84.16378131
Validation score: 0.858469
Iteration 6, loss = 77.57434386
Validation score: 0.854015
Iteration 7, loss = 75.19996180
Validation score: 0.858919
Iteration 8, loss = 72.66200981
Validation score: 0.874206
Iteration 9, loss = 71.90231266
Validation score: 0.868283
Iteration 10, loss = 69.28922541
Validation score: 0.874431
Iteration 11, loss = 66.92032488
Validation score: 0.878654
Iteration 12, loss = 64.83989741
Validation score: 0.882034
Iteration 13, loss = 63.79111416
Validation score: 0.877192
Iteration 14, loss = 64.59042496
Validation score: 0.889599
Iteration 15, loss = 60.90144059
Validation score: 0.883335
Iteration 16, loss = 60.38007878
Validation score: 0.884716
Iteration 17, loss = 58.70327924
Validation sco

0.8986828361103292

In [67]:
clf10=MLPRegressor(batch_size=32, early_stopping=False, verbose=True,learning_rate="adaptive", hidden_layer_sizes=(500,100,20,))
clf10.fit(X_train_std, Y_train)
clf10.score(X_test_std, Y_test)

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 173.40242690
Iteration 2, loss = 101.71611983
Iteration 3, loss = 92.32044854
Iteration 4, loss = 85.18434040
Iteration 5, loss = 80.98216878
Iteration 6, loss = 75.70970200
Iteration 7, loss = 73.15589379
Iteration 8, loss = 71.46387221
Iteration 9, loss = 70.90720522
Iteration 10, loss = 67.95617032
Iteration 11, loss = 65.46792258
Iteration 12, loss = 63.72137064
Iteration 13, loss = 61.85805928
Iteration 14, loss = 61.03213098
Iteration 15, loss = 60.09319157
Iteration 16, loss = 58.90535863
Iteration 17, loss = 57.71707483
Iteration 18, loss = 56.26515009
Iteration 19, loss = 55.53939701
Iteration 20, loss = 53.61214691
Iteration 21, loss = 52.51564420
Iteration 22, loss = 52.98059167
Iteration 23, loss = 51.12332691
Iteration 24, loss = 49.79314687
Iteration 25, loss = 48.73416962
Iteration 26, loss = 48.53011733
Iteration 27, loss = 47.36702669
Iteration 28, loss = 45.97248545
Iteration 29, loss = 46.59965976
Iteration 30, loss = 45.12841258
Iteration 31, los

0.9150003154331632

#### As results, we get the maximum score of 0.8986 with early stopping criteria of 10% of maximum iterations on 10% of training set. 
#### Without Early stopping we achieved the score of 0.915. 