Linear Regression From SciKit-Learn

In [12]:
import numpy as np
from numpy import genfromtxt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import math 
from sklearn.preprocessing import PolynomialFeatures

# Load data into a Pandas DataFrame
x_data = genfromtxt('../data/embs/embsdensity-ince/carbon/embs250-5000woccspca.csv',delimiter=',',encoding='utf-8-sig',skip_header=1)

# Split data into X and y
X = x_data[:,0:128]
#128 --> pka
#138 --> nmr
#138 onwards --> electron density 
y = x_data[:,138:141]

#label to split as well
label = x_data[:,128:136]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test, label_train, label_test = train_test_split(X, y, label, test_size=0.2, random_state=42)

use_train_for_test = False

save_pred_filepath = '../data/embs/embsdensity-ince/carbon/lrtestpredictions' #NO .csv!!

In [13]:
# Transform features to quadratic form
#poly = PolynomialFeatures(degree=2)  # Use degree 2 for quadratic regression
#X_train = poly.fit_transform(X_train)
#X_test = poly.fit_transform(X_test)

# Create a LinearRegression object
lr = LinearRegression()

# Standardize the data
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

y_train_scaled = scaler_y.fit_transform(y_train)
y_test_scaled = scaler_y.transform(y_test)

# Fit the model
lr.fit(X_train_scaled, y_train_scaled)

if use_train_for_test == True:
    # Make predictions
    predictions_scaled = lr.predict(X_train_scaled)
    predictions = scaler_y.inverse_transform(predictions_scaled)

    # Evaluate the model
    mse = mean_squared_error(y_train, predictions)
    r2 = r2_score(y_train, predictions)
else:
    # Make predictions
    predictions_scaled = lr.predict(X_test_scaled)
    predictions = scaler_y.inverse_transform(predictions_scaled)

    # Evaluate the model
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

print('Mean square-root error:', math.sqrt(mse))
print('R-squared:', r2)


Mean square-root error: 0.006797923292683738
R-squared: 0.8894137487964855


In [14]:

print(np.shape(label_train))
print(np.shape(predictions))

print(predictions)

for i in range(np.shape(predictions)[1]):

    if use_train_for_test == True: 
        save_predictions = np.column_stack((predictions[:,i],y_train[:,i],label_train))
    elif use_train_for_test == False: 
        save_predictions = np.column_stack((predictions[:,i],y_test[:,i],label_test))
        
    filepath =  save_pred_filepath + '%s.csv' %(i) 
    np.savetxt(filepath,save_predictions,delimiter=',')


(18338, 8)
(4585, 3)
[[1.99455543 0.65660389 2.10136477]
 [1.99681025 0.64991693 2.0721682 ]
 [1.99588224 0.63759669 2.14522473]
 ...
 [1.9955826  0.64986369 2.13923683]
 [1.9991228  0.65405281 2.09456079]
 [1.99555814 0.65674251 2.15087748]]


Custom testing

In [None]:
custom_test_path = '../data/datasets/embspKA-stef/induction/embs.csv'

custom_test_data = genfromtxt(custom_test_path,delimiter=',',encoding='utf-8-sig',skip_header=1)

X_test_custom = custom_test_data[:,0:128]
# Make predictions
predictions = lr.predict(X_test_custom)

print(predictions)