In [1]:
# import libraries 

import pandas as pd 

from sklearn.model_selection import train_test_split 

from sklearn.linear_model import LinearRegression 

# Compute R^2 Score

In [3]:
# Column names to assign while reading; the file is read with these supplied
# via `names=`, so pandas treats every line of the file as data.
_headers = [
    'CIC0', 'SM1', 'GATS1i', 'NdsCH', 'Ndssc', 'MLOGP',
    'response',
]

# The CSV is semicolon-delimited.
df = pd.read_csv(
    'Data/qsar_fish_toxicity.csv',
    sep=';',
    names=_headers,
)

In [4]:
# Preview the first rows to confirm the columns were parsed and named as expected.
df.head()

Unnamed: 0,CIC0,SM1,GATS1i,NdsCH,Ndssc,MLOGP,response
0,3.26,0.829,1.676,0,1,1.453,3.77
1,2.189,0.58,0.863,0,0,1.348,3.115
2,2.125,0.638,0.831,0,0,1.348,3.531
3,3.027,0.331,1.472,1,0,1.807,3.51
4,2.094,0.827,0.86,0,0,1.886,5.39


In [5]:
# Separate the predictor columns from the target column, as NumPy arrays.
# `labels` keeps a 2-D (n_rows, 1) shape because the column is selected with a list.
features = df.drop(columns='response').to_numpy()
labels = df[['response']].to_numpy()

# Stage 1: hold out 20% of the rows for evaluation, train on the other 80%.
X_train, X_eval, y_train, y_eval = train_test_split(
    features, labels, test_size=0.2, random_state=0
)

# Stage 2: divide the held-out rows into validation and test parts.
# 0.25 is the value train_test_split uses when no size is given, spelled out
# here so the validation/test proportions are visible.
X_val, X_test, y_val, y_test = train_test_split(
    X_eval, y_eval, test_size=0.25, random_state=0
)

In [6]:
# Create an ordinary least-squares linear regression; it is unfitted until .fit() runs.
model = LinearRegression()

In [7]:
# Fit on the training split only; validation and test rows stay unseen.
model.fit(X_train, y_train)

In [8]:
# Predict the response for the validation rows; y_pred is compared with y_val below.
y_pred = model.predict(X_val)

In [9]:
# Score the fitted model on the validation split. `score` re-predicts from
# X_val internally, so it does not depend on the y_pred computed earlier.
r2 = model.score(X_val, y_val)

print('R^2 score: {}'.format(r2))

R^2 score: 0.5623861754188693


In [10]:
# Side-by-side table of true and predicted responses for the validation rows.
# Both arrays are flattened to 1-D so that each becomes a single column.
_ys = pd.DataFrame({
    'actuals': y_val.ravel(),
    'predicted': y_pred.ravel(),
})

_ys.head()

Unnamed: 0,actuals,predicted
0,3.742,4.155885
1,6.143,6.398238
2,4.674,5.183181
3,4.865,3.771333
4,4.732,4.593059


# Compute Mean Absolute Error

In [11]:
from sklearn.metrics import mean_absolute_error

In [12]:
# Separate the predictor columns from the target column, as NumPy arrays.
# `labels` keeps a 2-D (n_rows, 1) shape because the column is selected with a list.
features = df.drop(columns='response').to_numpy()
labels = df[['response']].to_numpy()

# Stage 1: hold out 20% of the rows for evaluation, train on the other 80%.
X_train, X_eval, y_train, y_eval = train_test_split(
    features, labels, test_size=0.2, random_state=0
)
# Stage 2: divide the held-out rows into validation and test parts.
# 0.25 is the value train_test_split uses when no size is given, made explicit.
X_val, X_test, y_val, y_test = train_test_split(
    X_eval, y_eval, test_size=0.25, random_state=0
)

In [13]:
# create a simple Linear Regression model
# (rebinds `model` to a fresh, unfitted estimator)
model = LinearRegression()

In [14]:
# train the model on the training split only
model.fit(X_train, y_train)

In [15]:
# let's use our model to predict on our validation dataset
y_pred = model.predict(X_val)

In [16]:
# Let's compute our MEAN ABSOLUTE ERROR
# (average of |actual - predicted| over the validation rows, in the units of `response`)
mae = mean_absolute_error(y_val, y_pred)
print('MAE: {}'.format(mae))

MAE: 0.724344084644794


In [17]:
# Let's get the R2 score on the validation split.
# `score` re-predicts from X_val internally rather than reusing y_pred.
r2 = model.score(X_val, y_val)
print('R^2 score: {}'.format(r2))

R^2 score: 0.5623861754188693


# Compute MAE of Second Model

In [22]:
from sklearn.pipeline import Pipeline
#preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

In [19]:
# Load the data again for the second model.
# Column names are supplied via `names=`, so pandas treats every line of the
# file as data.
_headers = [
    'CIC0', 'SM1', 'GATS1i', 'NdsCH', 'Ndssc', 'MLOGP',
    'response',
]

# The CSV is semicolon-delimited.
df = pd.read_csv(
    'Data/qsar_fish_toxicity.csv',
    sep=';',
    names=_headers,
)

In [20]:
# Separate the predictor columns from the target column, as NumPy arrays.
# `labels` keeps a 2-D (n_rows, 1) shape because the column is selected with a list.
features = df.drop(columns='response').to_numpy()
labels = df[['response']].to_numpy()

# Stage 1: hold out 20% of the rows for evaluation, train on the other 80%.
X_train, X_eval, y_train, y_eval = train_test_split(
    features, labels, test_size=0.2, random_state=0
)
# Stage 2: divide the held-out rows into validation and test parts.
# 0.25 is the value train_test_split uses when no size is given, made explicit.
X_val, X_test, y_val, y_test = train_test_split(
    X_eval, y_eval, test_size=0.25, random_state=0
)

In [23]:
# Pipeline stages, listed in the order they run:
#   scaler - min-max scaling of each feature
#   poly   - expansion to degree-2 (quadratic) polynomial features
#   model  - linear regression fitted on the expanded features
steps = [
    ('scaler', MinMaxScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', LinearRegression()),
]

In [24]:
# Chain the steps into a single estimator: scaling -> quadratic features -> linear regression.
# `model` is now a Pipeline, not a bare LinearRegression.
model = Pipeline(steps)

In [25]:
# train the model: every pipeline stage is fitted on the training split only,
# so the scaler's min/max come from X_train and not from validation rows
model.fit(X_train, y_train)

In [26]:
# predict on validation dataset; the pipeline applies its fitted scaler and
# polynomial expansion to X_val before the regression step predicts
y_pred = model.predict(X_val)

In [27]:
# compute MAE of the pipeline model on the validation split
# (directly comparable with the plain linear model's MAE: same split, same seed)
mae = mean_absolute_error(y_val, y_pred)
print('MAE: {}'.format(mae))

MAE: 0.6605526100836078


In [28]:
# let's get the R2 score of the pipeline model on the validation split;
# `score` runs X_val through the whole pipeline rather than reusing y_pred
r2 = model.score(X_val, y_val)
print('R^2 score: {}'.format(r2))

R^2 score: 0.6284921344153387


# Mean Squared Error (MSE) and Root Mean Squared Error (RMSE)

In [29]:
from sklearn.metrics import mean_squared_error

In [30]:
# MSE: mean of the squared residuals on the validation split.
mse = mean_squared_error(y_val, y_pred)

# RMSE: square root of the MSE, which puts the error back in the units of the target.
# It is derived from `mse` instead of calling mean_squared_error(..., squared=False):
# the `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
# where passing it raises TypeError. With a single target column, as here,
# sqrt(MSE) is the same quantity the old call returned.
rmse = mse ** 0.5

print("MSE:", mse)
print("RMSE:", rmse)


MSE: 0.7754934417658146
RMSE: 0.8806210545778557




# Mean Squared Log Error (MSLE)

In [31]:
from sklearn.metrics import mean_squared_log_error
# MSLE compares log(1 + actual) with log(1 + predicted), so it penalises
# relative rather than absolute differences.
# NOTE(review): scikit-learn documents that this metric rejects negative values;
# a linear model can in principle predict below zero, so confirm this still
# holds if the data or model changes.
msle = mean_squared_log_error(y_val, y_pred)
print("MSLE:", msle)

MSLE: 0.04380799336445415
