In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from sklearn.metrics import r2_score,mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from joblib import dump, load

In [2]:
# Read the training and test tables from their CSV files.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (r'data/new_train.csv', r'data/new_test.csv')
)

In [3]:
# Load the pickled list of column labels used to select the feature columns.
# The file is opened in a `with` block so the handle is closed right after
# the object has been read.
with open("data/desc_10.pkl", "rb") as desc_file:
    label_10 = pickle.load(desc_file)
label_10

['MATS8i',
 'GATS7i',
 'VCH-6',
 'nssAsH',
 'SdssC',
 'SddsAs',
 'minssssBem',
 'maxsPH2',
 'nAtomLAC',
 'TDB6e']

In [4]:
# Features: only the columns listed in label_10, for each split.
x_train, x_test = train.loc[:, label_10], test.loc[:, label_10]
# Targets: the last column of each table, selected with a list so the result
# stays a single-column DataFrame.
y_train, y_test = train.iloc[:, [-1]], test.iloc[:, [-1]]

In [5]:
# Display the two feature-matrix shapes followed by the full target frames.
x_train.shape, x_test.shape, y_train, y_test

((107, 10),
 (27, 10),
      pIC50
 0    6.374
 1    5.678
 2    8.000
 3    6.824
 4    8.108
 ..     ...
 102  7.387
 103  6.821
 104  6.652
 105  5.958
 106  5.509
 
 [107 rows x 1 columns],
     pIC50
 0   6.228
 1   7.569
 2   8.201
 3   7.081
 4   7.796
 5   5.602
 6   6.167
 7   7.026
 8   6.358
 9   8.319
 10  5.469
 11  7.921
 12  5.051
 13  6.066
 14  8.658
 15  6.292
 16  5.886
 17  7.301
 18  6.393
 19  8.167
 20  6.114
 21  5.745
 22  6.215
 23  7.221
 24  8.495
 25  8.456
 26  6.541)

In [6]:
# Fit the min-max scaler on the training features only, then apply that same
# fitted scaling to both splits.
mmscaler = MinMaxScaler().fit(x_train)
sc_x_train, sc_x_test = mmscaler.transform(x_train), mmscaler.transform(x_test)

In [7]:
# Hyperparameter grid handed to the grid search.
# NOTE: per the scikit-learn SVR documentation, `degree` is only used by the
# 'poly' kernel; the key is still present here, so best_params_ will carry it.
paramgrid = dict(
    C=[0.1, 1, 10, 100, 1000],
    degree=[1],
    epsilon=[0.1, 1, 10, 100, 1000],
)

In [8]:
# 5-fold cross-validated grid search over `paramgrid` for a linear-kernel SVR,
# scored by negative mean squared error and run on all available cores.
grid = GridSearchCV(
    estimator=SVR(kernel='linear'),
    param_grid=paramgrid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
# y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn does
# not emit the "column-vector y was passed" DataConversionWarning.
grid.fit(sc_x_train, np.ravel(y_train))

  y = column_or_1d(y, warn=True)


GridSearchCV(cv=5, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='linear',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.1, 1, 10, 100, 1000], 'degree': [1],
                         'epsilon': [0.1, 1, 10, 100, 1000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [9]:
# Show the hyperparameter combination selected by the grid search.
grid.best_params_

{'C': 1, 'degree': 1, 'epsilon': 0.1}

In [10]:
# Persist the selected hyperparameters; the `with` block guarantees the file
# is flushed and closed once the dump finishes.
with open("l_data/linear_params_desc_10.p", "wb") as params_file:
    pickle.dump(grid.best_params_, params_file)

In [11]:
# Final model: a linear-kernel SVR built from the grid-search winner and fit
# on the whole scaled training set.
# NOTE: GridSearchCV refits the best estimator by default (refit=True), so
# grid.best_estimator_ should already be an equivalent fitted model.
model = SVR(
    kernel='linear',
    C=grid.best_params_['C'],
    degree=grid.best_params_['degree'],
    epsilon=grid.best_params_['epsilon'],
)
# y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn does
# not emit the "column-vector y was passed" DataConversionWarning.
model.fit(sc_x_train, np.ravel(y_train))

  y = column_or_1d(y, warn=True)


SVR(C=1, cache_size=200, coef0=0.0, degree=1, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [12]:
# Predict on the scaled training and test features with the fitted model.
y_train_pred, y_test_pred = model.predict(sc_x_train), model.predict(sc_x_test)

In [13]:
# Coefficient of determination (R^2) of the predictions on each split.
train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)

In [14]:
# Show the R^2 score computed on the training split.
train_r2

0.6042187347691881

In [15]:
# Persist the training R^2; the `with` block guarantees the file is flushed
# and closed once the dump finishes.
with open("l_data/train_linear_10.p", "wb") as train_score_file:
    pickle.dump(train_r2, train_score_file)

In [16]:
# Show the R^2 score computed on the test split.
test_r2

0.49547601539648223

In [17]:
# Persist the test R^2; the `with` block guarantees the file is flushed and
# closed once the dump finishes.
with open("l_data/test_linear_10.p", "wb") as test_score_file:
    pickle.dump(test_r2, test_score_file)

In [18]:
# Load Pickle
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened in a ``with`` block so its handle is closed as soon as
    the object has been read, instead of being left open until garbage
    collection.
    """
    # NOTE: only unpickle files you trust; pickle.load can execute arbitrary code.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Saved parameters plus train/test scores for each file suffix (5 through 25).
params5 = _load_pickle("l_data/linear_params_desc_5.p")
train_linear5 = _load_pickle("l_data/train_linear_5.p")
test_linear5 = _load_pickle("l_data/test_linear_5.p")

params10 = _load_pickle("l_data/linear_params_desc_10.p")
train_linear10 = _load_pickle("l_data/train_linear_10.p")
test_linear10 = _load_pickle("l_data/test_linear_10.p")

params15 = _load_pickle("l_data/linear_params_desc_15.p")
train_linear15 = _load_pickle("l_data/train_linear_15.p")
test_linear15 = _load_pickle("l_data/test_linear_15.p")

params20 = _load_pickle("l_data/linear_params_desc_20.p")
train_linear20 = _load_pickle("l_data/train_linear_20.p")
test_linear20 = _load_pickle("l_data/test_linear_20.p")

params25 = _load_pickle("l_data/linear_params_desc_25.p")
train_linear25 = _load_pickle("l_data/train_linear_25.p")
test_linear25 = _load_pickle("l_data/test_linear_25.p")

In [19]:
# Show the parameters and train/test scores loaded from the "_5" pickle files.
params5, train_linear5, test_linear5

({'C': 1, 'degree': 1, 'epsilon': 0.1},
 0.5113273122284105,
 0.47955806709546733)

In [20]:
# Show the parameters and train/test scores loaded from the "_10" pickle files.
params10, train_linear10, test_linear10

({'C': 1, 'degree': 1, 'epsilon': 0.1},
 0.6042187347691881,
 0.49547601539648223)

In [21]:
# Show the parameters and train/test scores loaded from the "_15" pickle files.
params15, train_linear15, test_linear15

({'C': 1, 'degree': 1, 'epsilon': 0.1}, 0.697594031920671, 0.6690744727127131)

In [22]:
# Show the parameters and train/test scores loaded from the "_20" pickle files.
params20, train_linear20, test_linear20

({'C': 1, 'degree': 1, 'epsilon': 0.1}, 0.5992867584178037, 0.5256362078573839)

In [23]:
# Show the parameters and train/test scores loaded from the "_25" pickle files.
params25, train_linear25, test_linear25

({'C': 1, 'degree': 1, 'epsilon': 0.1}, 0.6843844059428645, 0.5885541722223695)