#Importing all the necessary libraries

In [53]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#Importing our dataset 'QM9_HOMO_LUMO_Gap.csv'

In [54]:
dataset = pd.read_csv("QM9_HOMO_LUMO_Gap.csv")
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
missing_data = dataset.isnull().sum()

##Printing all the matrix of features(x)

In [55]:
print(x)

[[ 1.58e+02  1.58e+02  1.58e+02 ... -3.99e+02 -4.01e+02 -3.72e+02]
 [ 2.94e+02  2.94e+02  1.91e+02 ... -2.79e+02 -2.80e+02 -2.59e+02]
 [ 8.00e+02  4.38e+02  2.83e+02 ... -2.14e+02 -2.15e+02 -2.01e+02]
 ...
 [ 3.39e+00  2.36e+00  1.40e+00 ... -1.40e+03 -1.41e+03 -1.30e+03]
 [ 3.31e+00  2.38e+00  1.40e+00 ... -1.47e+03 -1.47e+03 -1.36e+03]
 [ 3.30e+00  2.42e+00  1.41e+00 ... -1.44e+03 -1.45e+03 -1.34e+03]]


##Printing all the dependent variable vectors(y)

In [56]:
print (y)

[0.5  0.34 0.36 ... 0.19 0.17 0.17]


##Printing if there are missing_data in the dataset

In [57]:
print(missing_data)

A            0
B            0
C            0
mu           0
alpha        0
homo         0
lumo         0
r2           0
zpve         0
u0           0
u298         0
h298         0
g298         0
cv           0
u0_atom      0
u298_atom    0
h298_atom    0
g298_atom    0
gap          0
dtype: int64


#Splitting the entire dataset into training & test sets

In [58]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1)

##Printing all the x_train, y_train, x_test, & y_test

In [59]:
print (x_train)
print (y_train)
print (x_test)
print (y_test)

[[ 7.72e+00  1.10e+00  1.00e+00 ... -1.43e+03 -1.44e+03 -1.34e+03]
 [ 7.50e+00  1.93e+00  1.57e+00 ... -1.29e+03 -1.29e+03 -1.19e+03]
 [ 3.88e+00  1.67e+00  1.26e+00 ... -1.88e+03 -1.89e+03 -1.73e+03]
 ...
 [ 5.99e+00  2.84e+00  1.92e+00 ... -1.16e+03 -1.17e+03 -1.08e+03]
 [ 6.50e+00  7.95e-01  7.12e-01 ... -1.32e+03 -1.32e+03 -1.23e+03]
 [ 9.68e+00  2.17e+00  1.81e+00 ... -1.13e+03 -1.13e+03 -1.05e+03]]
[0.25 0.25 0.34 ... 0.21 0.17 0.24]
[[ 3.72e+00  2.26e+00  1.61e+00 ... -1.62e+03 -1.63e+03 -1.50e+03]
 [ 4.60e+00  1.41e+00  1.10e+00 ... -1.57e+03 -1.58e+03 -1.46e+03]
 [ 1.03e+01  2.03e+00  1.69e+00 ... -8.92e+02 -8.96e+02 -8.35e+02]
 ...
 [ 2.00e+00  1.80e+00  9.54e-01 ... -1.48e+03 -1.49e+03 -1.37e+03]
 [ 6.61e+00  1.26e+00  1.12e+00 ... -1.51e+03 -1.52e+03 -1.40e+03]
 [ 4.80e+00  2.65e+00  2.16e+00 ... -1.50e+03 -1.51e+03 -1.39e+03]]
[0.22 0.24 0.21 0.26 0.21 0.19 0.31 0.3  0.2  0.25 0.22 0.23 0.33 0.25
 0.23 0.25 0.34 0.18 0.22 0.23 0.39 0.23 0.24 0.23 0.28 0.33 0.2  0.22
 0.22 

#Converting y_train & y_test into 2D array (because StandardScaler class expects inputs as 2D array)

In [60]:
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

#Applying feature scaling (Standardisation) to all these splitted datasets

In [61]:
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
sc_y = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)
y_train = sc_y.fit_transform(y_train)
y_test = sc_y.transform(y_test)

#Building and training the SVR model on the training set (x_train, y_train) using the RBF kernel

In [62]:
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf') #RBF stands for Radial Basis Function,which is a kernel
regressor.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


#Predicting results for the test set (x_test) and applying inverse transformation to return to the original scale

In [63]:
y_pred = sc_y.inverse_transform(regressor.predict(x_test).reshape(-1, 1))

In [64]:
print(y_pred)

[[0.22]
 [0.24]
 [0.21]
 [0.26]
 [0.21]
 [0.18]
 [0.31]
 [0.3 ]
 [0.19]
 [0.26]
 [0.21]
 [0.23]
 [0.32]
 [0.25]
 [0.23]
 [0.25]
 [0.33]
 [0.18]
 [0.22]
 [0.23]
 [0.38]
 [0.23]
 [0.24]
 [0.22]
 [0.28]
 [0.32]
 [0.2 ]
 [0.22]
 [0.21]
 [0.33]
 [0.28]
 [0.32]
 [0.25]
 [0.3 ]
 [0.24]
 [0.26]
 [0.22]
 [0.34]
 [0.32]
 [0.31]
 [0.24]
 [0.22]
 [0.23]
 [0.27]
 [0.24]
 [0.19]
 [0.24]
 [0.33]
 [0.21]
 [0.27]
 [0.26]
 [0.33]
 [0.26]
 [0.22]
 [0.3 ]
 [0.23]
 [0.2 ]
 [0.24]
 [0.3 ]
 [0.22]
 [0.28]
 [0.25]
 [0.26]
 [0.28]
 [0.22]
 [0.23]
 [0.32]
 [0.29]
 [0.19]
 [0.34]
 [0.25]
 [0.24]
 [0.29]
 [0.34]
 [0.23]
 [0.21]
 [0.27]
 [0.32]
 [0.34]
 [0.17]
 [0.23]
 [0.26]
 [0.21]
 [0.24]
 [0.23]
 [0.25]
 [0.19]
 [0.2 ]
 [0.34]
 [0.27]
 [0.3 ]
 [0.22]
 [0.27]
 [0.19]
 [0.25]
 [0.14]
 [0.22]
 [0.18]
 [0.39]
 [0.21]
 [0.26]
 [0.16]
 [0.19]
 [0.32]
 [0.24]
 [0.21]
 [0.32]
 [0.27]
 [0.21]
 [0.32]
 [0.24]
 [0.21]
 [0.22]
 [0.22]
 [0.23]
 [0.34]
 [0.2 ]
 [0.35]
 [0.29]
 [0.26]
 [0.23]
 [0.34]
 [0.33]
 [0.24]
 [0.29]


#Displaying the predicted and actual results side by side

In [65]:
np.set_printoptions(precision = 2)
y_test = sc_y.inverse_transform(y_test)
print (np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[[0.22 0.22]
 [0.24 0.24]
 [0.21 0.21]
 ...
 [0.18 0.17]
 [0.3  0.29]
 [0.34 0.34]]


#Evaluating the Model Performance (R² Score)

In [66]:
from sklearn.metrics import r2_score
print(r2_score(y_test, y_pred))

0.991372946717914
