In [23]:
# Common use 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

#Metrics
import sklearn.metrics as metrics
from sklearn.preprocessing import StandardScaler

#Models
import xgboost
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

XGBoostError: XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed (vcomp140.dll or libgomp-1.dll for Windows, libomp.dylib for Mac OSX, libgomp.so for Linux and other UNIX-like OSes). Mac OSX users: Run `brew install libomp` to install OpenMP runtime.
  * You are running 32-bit Python on a 64-bit OS
Error message(s): ['dlopen(/Users/acorrochanon/opt/anaconda3/envs/my-rdkit-env/lib/python3.9/site-packages/xgboost/lib/libxgboost.dylib, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib\n  Referenced from: /Users/acorrochanon/opt/anaconda3/envs/my-rdkit-env/lib/python3.9/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: image not found']


## Visualization methods

In [10]:
# Compute regression metrics 
def plot_regression_metrics(model_name, y_true, y_pred, mode = 0):
    """Compute, print and optionally plot regression metrics for one model.

    Parameters
    ----------
    model_name : str
        Label used in the plot title and in the printed summary line.
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, aligned element-wise with ``y_true``.
    mode : int, default 0
        When equal to 1, an actual-vs-predicted scatter plot is shown with a
        least-squares straight line and the R-squared value annotated.

    Returns
    -------
    tuple
        ``(r2, mae, mse)`` as computed by ``sklearn.metrics``.
    """
    # Compute metrics
    r2 = metrics.r2_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)

    # Mode to 1 displays r-squared plots
    if mode == 1:
        # Flatten so that column vectors / pandas objects are accepted by
        # np.polyfit, which requires a 1-D x argument.
        actual = np.ravel(y_true)
        predicted = np.ravel(y_pred)

        fig, ax = plt.subplots(figsize=(10, 5), dpi = 95)
        ax.scatter(actual, predicted, color='salmon', s=5)

        # A degree-1 fit needs at least two distinct x values; skip the trend
        # line otherwise instead of failing or drawing a meaningless line.
        xs = np.unique(actual)
        if xs.size >= 2:
            trend = np.poly1d(np.polyfit(actual, predicted, 1))
            ax.plot(xs, trend(xs), color='black')

        # Anchor the annotation in axes coordinates (top-left corner) so it is
        # always visible, whatever the range of the target variable is.
        ax.text(0.05, 0.95, 'R-squared = %0.2f' % r2,
                transform=ax.transAxes, verticalalignment='top')
        ax.set_xlabel('Actual values')
        ax.set_ylabel('Predicted Values')
        ax.set_title('Prediction results using {}'.format(model_name))
        plt.show()

    print(model_name,'| R2: %0.3f, MAE: %0.3f, MSE: %0.3f' %(r2, mae, mse))

    return r2, mae, mse

In [22]:
def models_comparison(names, met):
    """Show a grouped bar chart comparing R2, MAE and MSE across models.

    Parameters
    ----------
    names : sequence
        Model labels, used as the x-axis tick labels.
    met : sequence of sequences
        One row per model; elements 0, 1 and 2 of each row are read as
        R2, MAE and MSE respectively.
    """
    sns.set(font_scale = 1)

    # Width of a single bar; the three bars of a model sit side by side.
    barWidth = 0.25
    plt.figure(figsize=(8, 4), dpi = 95)

    # Pull out the three metric columns (R2, MAE, MSE) before plotting.
    columns = [[row[k] for row in met] for k in range(3)]

    # Left-most bar position of every model group on the x axis.
    base = np.arange(len(columns[0]))

    # One bar series per metric, each shifted by one bar width.
    series = (('R2', 'r'), ('MAE', 'g'), ('MSE', 'b'))
    for k, (label, colour) in enumerate(series):
        plt.bar(base + k * barWidth, columns[k], color=colour,
                width=barWidth, edgecolor='grey', label=label)

    # Axis labels; ticks are centred under the middle bar of each group.
    plt.xlabel('Models', fontweight ='bold', fontsize = 15)
    plt.ylabel('Metrics', fontweight ='bold', fontsize = 15)
    plt.xticks(base + barWidth, list(names))
    plt.legend()
    plt.title('Metrics obtained for each regression model')

    plt.show()

In [None]:
# We visualize the distributions of the columns that contain NaN values in order to decide which value the NaNs should be replaced with

# import matplotlib.pyplot as plt 

# fig, axs = plt.subplots(4, 3, figsize=(15,15))
# fig.tight_layout()

# def iterate_columns(cols, counter):
#     for ind, col in enumerate(cols):
#         col.hist(df[nan_cols[ind+counter]])
#         col.axvline(df[nan_cols[ind+counter]].mean(), color='k', linestyle='dashed', linewidth=1, label='Mean')
#         col.axvline(df[nan_cols[ind+counter]].median(), color='r', linestyle='dashed', linewidth=1, label='Median')
#         col.legend()
#         col.set_title(nan_cols[ind+counter])

# counter = [0,3,6,9]
# aux = 0
# for row in axs:
#     iterate_columns(row, counter[aux])
#     aux += 1

# plt.show()

## Computation methods

In [None]:
# Save the model 
# joblib.dump(rf_lipo_baseline, 'rf_lipo_baseline.pkl')

# Loading model
# rf_model = joblib.load('rf_lipo_baseline.pkl')

In [21]:
def normalize_data(X_train, X_val, X_test):
    """Standardize the three feature splits with one shared scaler.

    The ``StandardScaler`` is fitted on ``X_train`` only and then applied to
    the validation and test splits.

    Returns
    -------
    tuple
        ``(X_train_norm, X_val_norm, X_test_norm)``.
    """
    scaler = StandardScaler()

    # Fit on the training split, then reuse the fitted scaler for the others.
    train_scaled = scaler.fit_transform(X_train)
    val_scaled, test_scaled = (scaler.transform(split) for split in (X_val, X_test))

    return train_scaled, val_scaled, test_scaled

In [19]:
def reg_models_comparison(X_train, X_train_norm, y_train, X_val, X_val_norm, y_val, random_state = None):
    """Fit several baseline regressors and compare their validation metrics.

    Parameters
    ----------
    X_train, X_val : array-like
        Raw training / validation features, used by the tree-based models.
    X_train_norm, X_val_norm : array-like
        Scaled training / validation features, used by the linear regression
        and the SVR.
    y_train, y_val : array-like
        Training / validation targets.
    random_state : int or None, default None
        Seed forwarded to the random forest and decision tree so the
        comparison can be reproduced. ``None`` keeps the estimators'
        default (unseeded) behaviour.

    The metrics of every model are printed by ``plot_regression_metrics`` and
    then drawn as a grouped bar chart by ``models_comparison``.
    """
    # (name, estimator, whether the estimator is trained on the scaled features)
    models = [('LR', LinearRegression(), True),
              ('RFR', RandomForestRegressor(random_state=random_state), False),
              ('DTR', DecisionTreeRegressor(random_state=random_state), False),
              ('SVM', SVR(), True),
             ]

    names = []
    results = []

    for name, model, use_scaled in models:
        # Pick the feature set from the model's flag, not from its name.
        if use_scaled:
            X_fit, X_eval = X_train_norm, X_val_norm
        else:
            X_fit, X_eval = X_train, X_val

        model.fit(X_fit, y_train)
        y_pred = model.predict(X_eval)

        # Compute metrics
        r2, mae, mse = plot_regression_metrics(name, y_val, y_pred)

        # Save values to later plot them
        names.append(name)
        results.append([r2, mae, mse])

    # We plot the metrics obtained for each model
    models_comparison(names, results)