This notebook applies data discretization to the target variable Topt in order to compare model performance across different Topt subgroups.

# **Import libraries and data**


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from collections import Counter

In [42]:
# Build the training frame: the target (y_train) comes first, followed by X_train_ig
# (the 500 features selected by Information Gain in the Feature Selection notebook).
# NOTE(review): y_train and X_train_ig are not created in the visible part of this
# notebook - presumably they are already in the kernel namespace; confirm before
# relying on Restart & Run All.
data_train_ig = (
    pd.concat([y_train, X_train_ig], axis=1)
    .reset_index(level=0)       # move the old row labels into a column ...
    .drop(columns=['index'])    # ... and discard that 'index' column
)
data_train_ig.head()

Unnamed: 0,seq_TEMP,A,C,E,I,K,L,N,Q,R,...,Normalized van der Waals Volume-T2332,Polarity-T1331,Polarizability-T2332,Charge-T1221,Charge-T1331,Solvent accessibility-T1221,Hydrophobicity_CASG920101-G1D100,Hydrophobicity_FASG890101-G1D100,Normalized van der Waals Volume-G1D100,Charge-G2D100
0,26,0.128743,0.023952,0.05988,0.041916,0.017964,0.083832,0.020958,0.020958,0.083832,...,0.156156,0.213213,0.189189,0.15015,0.048048,0.237237,100.0,100.0,100.0,100.0
1,30,0.076923,0.008547,0.094017,0.065527,0.068376,0.096866,0.051282,0.025641,0.031339,...,0.18,0.262857,0.2,0.165714,0.034286,0.265714,99.7151,98.005698,100.0,100.0
2,74,0.043937,0.001757,0.072056,0.108963,0.077329,0.075571,0.047452,0.015817,0.038664,...,0.209507,0.278169,0.228873,0.181338,0.026408,0.288732,100.0,100.0,98.242531,100.0
3,90,0.066055,0.009174,0.102752,0.047706,0.051376,0.095413,0.011009,0.020183,0.106422,...,0.200368,0.257353,0.25,0.213235,0.053309,0.321691,100.0,100.0,99.816514,99.816514
4,29,0.067146,0.007194,0.086331,0.074341,0.05036,0.076739,0.047962,0.069544,0.045564,...,0.197115,0.204327,0.209135,0.149038,0.028846,0.257212,100.0,100.0,100.0,100.0


In [43]:
# Build the test frame: the target (y_test) comes first, followed by X_test_ig
# (the 500 features selected by Information Gain in the Feature Selection notebook).
# NOTE(review): y_test and X_test_ig are not created in the visible part of this
# notebook - presumably they are already in the kernel namespace; confirm before
# relying on Restart & Run All.
data_test_ig = (
    pd.concat([y_test, X_test_ig], axis=1)
    .reset_index(level=0)       # move the old row labels into a column ...
    .drop(columns=['index'])    # ... and discard that 'index' column
)
data_test_ig.head()

Unnamed: 0,seq_TEMP,A,C,E,I,K,L,N,Q,R,...,Normalized van der Waals Volume-T2332,Polarity-T1331,Polarizability-T2332,Charge-T1221,Charge-T1331,Solvent accessibility-T1221,Hydrophobicity_CASG920101-G1D100,Hydrophobicity_FASG890101-G1D100,Normalized van der Waals Volume-G1D100,Charge-G2D100
0,37,0.129845,0.007752,0.067829,0.027132,0.017442,0.100775,0.02907,0.01938,0.079457,...,0.153398,0.198058,0.170874,0.151456,0.027184,0.264078,99.806202,99.612403,100.0,100.0
1,60,0.122727,0.004545,0.027273,0.05,0.045455,0.059091,0.077273,0.038636,0.018182,...,0.116173,0.182232,0.125285,0.100228,0.013667,0.223235,100.0,96.136364,100.0,100.0
2,36,0.092742,0.008065,0.068548,0.056452,0.032258,0.084677,0.064516,0.03629,0.056452,...,0.178138,0.238866,0.194332,0.145749,0.016194,0.287449,100.0,100.0,97.580645,100.0
3,74,0.051447,0.001608,0.061093,0.094855,0.065916,0.090032,0.078778,0.016077,0.041801,...,0.228663,0.278583,0.252818,0.15781,0.019324,0.273752,99.678457,99.196141,98.713826,100.0
4,28,0.057221,0.013624,0.079019,0.049046,0.108992,0.089918,0.06267,0.035422,0.032698,...,0.20765,0.26776,0.226776,0.169399,0.04918,0.251366,100.0,100.0,99.182561,99.182561


# **Chi-Merge algorithm**

The ChiMerge (Chi2) discretization algorithm is applied to compute the best intervals for splitting the continuous variable (Topt), given the desired number of intervals. The code is borrowed from https://gist.github.com/alanzchen/17d0c4a45d59b79052b1cd07f531689e


In [12]:
def _adjacent_chi2(count_0, count_1):
    """Return the chi-square statistic for the label counts of two adjacent intervals.

    ``count_0`` and ``count_1`` are 1-D integer arrays holding, for each label
    (in the same order), the number of observations in the respective interval.
    """
    # A label that occurs in neither interval has an expected count of 0 and
    # produces a 0/0 term.  Such a term contributes nothing to the statistic,
    # so the floating-point warning is silenced and the NaN replaced by 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        count_total = count_0 + count_1
        total = count_total.sum()
        expected_0 = count_total * count_0.sum() / total
        expected_1 = count_total * count_1.sum() / total
        terms = (count_0 - expected_0)**2/expected_0 + (count_1 - expected_1)**2/expected_1
    return sum(np.nan_to_num(terms))


def chimerge(data, attr, label, max_intervals):
    """Discretize ``data[attr]`` with the bottom-up ChiMerge procedure.

    Every distinct value of ``data[attr]`` starts as its own closed interval
    ``[v, v]``.  While more than ``max_intervals`` intervals remain, the
    chi-square statistic of the ``label`` counts is computed for each pair of
    adjacent intervals, and the pair with the smallest statistic (the first
    such pair on ties) is merged.  The final intervals are printed, one per
    line, as ``[low,high]`` and also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the ``attr`` and ``label`` columns.
    attr : str
        Name of the column whose values are discretized.
    label : str
        Name of the column holding the class labels used by the chi-square test.
    max_intervals : int
        Number of intervals at which merging stops; must be at least 1.

    Returns
    -------
    list
        The final ``[low, high]`` intervals in ascending order.

    Raises
    ------
    ValueError
        If ``max_intervals`` is smaller than 1.

    Notes
    -----
    Rows whose ``attr`` or ``label`` entry is missing are ignored.
    """
    if max_intervals < 1:
        raise ValueError(
            'max_intervals must be at least 1, got {!r}'.format(max_intervals))

    distinct_vals = sorted(set(data[attr].dropna()))  # candidate cut points, ascending
    labels = sorted(set(data[label].dropna()))        # all possible labels, ascending

    # Count the (value, label) pairs once up front instead of re-filtering the
    # whole frame for every adjacent pair on every iteration.
    row_of = {value: r for r, value in enumerate(distinct_vals)}
    col_of = {lab: c for c, lab in enumerate(labels)}
    table = np.zeros((len(distinct_vals), len(labels)), dtype=int)
    for (value, lab), n in Counter(zip(data[attr], data[label])).items():
        r, c = row_of.get(value), col_of.get(lab)
        if r is not None and c is not None:  # skips pairs with a missing entry
            table[r, c] += n

    counts = list(table)                         # one label-count vector per interval
    intervals = [[v, v] for v in distinct_vals]  # each value starts in its own interval

    while len(intervals) > max_intervals:
        chi = [_adjacent_chi2(counts[k], counts[k + 1])
               for k in range(len(intervals) - 1)]
        k = chi.index(min(chi))  # first pair with the minimal chi-square value
        # Merge intervals k and k+1; their label counts simply add up.
        intervals[k:k + 2] = [[intervals[k][0], intervals[k + 1][1]]]
        counts[k:k + 2] = [counts[k] + counts[k + 1]]

    for low, high in intervals:
        print('[', low, ',', high, ']', sep='')
    return intervals

In [21]:
# Run ChiMerge on the training frame, stopping at 3 intervals of Topt.
# The same column, 'seq_TEMP', is passed as both the attribute to split and the label.
for attr in ('seq_TEMP',):
    print('Interval for', attr)
    chimerge(
        data=data_train_ig,
        attr=attr,
        label='seq_TEMP',
        max_intervals=3,
    )

Interval for seq_TEMP


  chi_ = (count_0 - expected_0)**2/expected_0 + (count_1 - expected_1)**2/expected_1


[8,29]
[30,36]
[37,99]


In [10]:
# Run ChiMerge on the training frame, stopping at 4 intervals of Topt.
# The same column, 'seq_TEMP', is passed as both the attribute to split and the label.
for attr in ('seq_TEMP',):
    print('Interval for', attr)
    chimerge(
        data=data_train_ig,
        attr=attr,
        label='seq_TEMP',
        max_intervals=4,
    )

Interval for seq_TEMP


  chi_ = (count_0 - expected_0)**2/expected_0 + (count_1 - expected_1)**2/expected_1


[8,29]
[30,36]
[37,37]
[40,99]


In [11]:
# Run ChiMerge on the training frame, stopping at 5 intervals of Topt.
# The same column, 'seq_TEMP', is passed as both the attribute to split and the label.
for attr in ('seq_TEMP',):
    print('Interval for', attr)
    chimerge(
        data=data_train_ig,
        attr=attr,
        label='seq_TEMP',
        max_intervals=5,
    )

Interval for seq_TEMP


  chi_ = (count_0 - expected_0)**2/expected_0 + (count_1 - expected_1)**2/expected_1


[8,25]
[26,29]
[30,36]
[37,37]
[40,99]


# **Testing computed intervals**

To select an optimal number of splits, the performance of the random forest model was compared across the computed intervals, specifically the change in mean squared error (MSE) from interval to interval.

In [27]:
# Random forest baseline: fit on the training features, then evaluate on the whole test set.
clf = RandomForestRegressor(random_state=0).fit(X_train_ig, y_train)
y_pred = clf.predict(X_test_ig)

print('MSE on test set: {:.2f}'
      .format(mean_squared_error(y_test, y_pred)))
# For a regressor, .score() returns the coefficient of determination (R^2), not a
# classification accuracy, and the model is a random forest rather than a decision
# tree, so the printed labels name the metric and the model accordingly.
print('R^2 of Random Forest regressor on training set: {:.2f}'
      .format(clf.score(X_train_ig, y_train)))
print('R^2 of Random Forest regressor on test set: {:.2f}'
      .format(clf.score(X_test_ig, y_test)))

MSE on test set: 119.06
Accuracy of Decision Tree regressor on training set: 0.95
Accuracy of Decision Tree regressor on test set: 0.70


## 3 intervals (8-29°C, 30-36°C, 37-99°C)

In [49]:
# Lowest bin of the 3-interval split: test rows with seq_TEMP below 30
# (named 4-29 here; ChiMerge printed [8,29] for the training data).
bin4_29 = data_test_ig.loc[data_test_ig['seq_TEMP'].lt(30)]

# Features and target restricted to this bin.
X_test4_29 = bin4_29.drop(columns=['seq_TEMP'])
y_test4_29 = bin4_29['seq_TEMP']

bin4_29.shape

(295, 501)

In [50]:
# MSE of the already-fitted random forest (clf) on the 4-29°C test bin.
y_pred4_29 = clf.predict(X_test4_29)

print('MSE on test set: {:.2f}'.format(
    mean_squared_error(y_test4_29, y_pred4_29)))


MSE on test set: 82.46


In [51]:
# Middle bin of the 3-interval split: test rows with 30 <= seq_TEMP < 37.
bin30_36 = data_test_ig.loc[
    data_test_ig['seq_TEMP'].ge(30) & data_test_ig['seq_TEMP'].lt(37)
]

# Features and target restricted to this bin.
X_test30_36 = bin30_36.drop(columns=['seq_TEMP'])
y_test30_36 = bin30_36['seq_TEMP']

bin30_36.shape

(254, 501)

In [52]:
# MSE of the already-fitted random forest (clf) on the 30-36°C test bin.
y_pred30_36 = clf.predict(X_test30_36)

print('MSE on test set: {:.2f}'.format(
    mean_squared_error(y_test30_36, y_pred30_36)))

MSE on test set: 45.64


In [53]:
# Highest bin of the 3-interval split: test rows with seq_TEMP of 37 or more
# (no upper bound is applied here).
bin37_99 = data_test_ig.loc[data_test_ig['seq_TEMP'].ge(37)]

# Features and target restricted to this bin.
X_test37_99 = bin37_99.drop(columns=['seq_TEMP'])
y_test37_99 = bin37_99['seq_TEMP']

bin37_99.shape

(327, 501)

In [54]:
# MSE of the already-fitted random forest (clf) on the 37-99°C test bin.
y_pred37_99 = clf.predict(X_test37_99)

print('MSE on test set: {:.2f}'.format(
    mean_squared_error(y_test37_99, y_pred37_99)))

MSE on test set: 209.10


Overall, relative to the MSE of 119.06 on the whole test set, the MSE decreased to 45.64 for the interval 30-36°C and to 82.46 for the interval 4-29°C. The exception is the interval 37-99°C, where the MSE increased significantly to 209.10.

## 4 intervals (8-29°C, 30-36°C, 37-37°C, 40-99°C)

In [56]:
# Single-value bin of the 4-interval split: test rows with seq_TEMP exactly 37.
bin37_37 = data_test_ig.loc[data_test_ig['seq_TEMP'].eq(37)]

# Features and target restricted to this bin.
X_test37_37 = bin37_37.drop(columns=['seq_TEMP'])
y_test37_37 = bin37_37['seq_TEMP']

bin37_37.shape

(107, 501)

In [57]:
# MSE of the already-fitted random forest (clf) on the 37-37°C test bin.
y_pred37_37 = clf.predict(X_test37_37)

print('MSE on test set: {:.2f}'.format(
    mean_squared_error(y_test37_37, y_pred37_37)))

MSE on test set: 57.33


In [58]:
# Highest bin of the 4-interval split: test rows with 40 <= seq_TEMP < 100.
# NOTE(review): the recorded shapes give 107 rows (37-37) + 219 rows (40-99) = 326,
# but the 37-99 bin holds 327 rows, so one test row (seq_TEMP of 38-39 or >= 100)
# appears to fall outside both sub-bins - confirm whether that is intended.
bin40_99 = data_test_ig.loc[
    data_test_ig['seq_TEMP'].ge(40) & data_test_ig['seq_TEMP'].lt(100)
]

# Features and target restricted to this bin.
X_test40_99 = bin40_99.drop(columns=['seq_TEMP'])
y_test40_99 = bin40_99['seq_TEMP']

bin40_99.shape

(219, 501)

In [59]:
# MSE of the already-fitted random forest (clf) on the 40-99°C test bin.
y_pred40_99 = clf.predict(X_test40_99)

print('MSE on test set: {:.2f}'.format(
    mean_squared_error(y_test40_99, y_pred40_99)))

MSE on test set: 284.15


When splitting Topt into 4 intervals, ChiMerge suggests further dividing the interval 37-99°C into two intervals, 37-37°C and 40-99°C. This decreases the MSE from 209.10 for the interval 37-99°C to 57.33 for the interval 37-37°C, but increases it to 284.15 for the interval 40-99°C.


## 5 intervals (8-25°C, 26-29°C, 30-36°C, 37-37°C, 40-99°C)

In [62]:
# Lowest bin of the 5-interval split: test rows with seq_TEMP of 25 or less
# (named 4-25 here; ChiMerge printed [8,25] for the training data).
bin4_25 = data_test_ig.loc[data_test_ig['seq_TEMP'].le(25)]

# Features and target restricted to this bin.
X_test4_25 = bin4_25.drop(columns=['seq_TEMP'])
y_test4_25 = bin4_25['seq_TEMP']

bin4_25.shape

(118, 501)

In [63]:
# MSE of the already-fitted random forest (clf) on the 4-25°C test bin.
y_pred4_25 = clf.predict(X_test4_25)

print('MSE on test set: {:.2f}'.format(
    mean_squared_error(y_test4_25, y_pred4_25)))


MSE on test set: 102.93


In [64]:
# Second bin of the 5-interval split: test rows with 26 <= seq_TEMP < 30.
bin26_29 = data_test_ig.loc[
    data_test_ig['seq_TEMP'].ge(26) & data_test_ig['seq_TEMP'].lt(30)
]

# Features and target restricted to this bin.
X_test26_29 = bin26_29.drop(columns=['seq_TEMP'])
y_test26_29 = bin26_29['seq_TEMP']

bin26_29.shape

(177, 501)

In [65]:
# MSE of the already-fitted random forest (clf) on the 26-29°C test bin.
y_pred26_29 = clf.predict(X_test26_29)

print('MSE on test set: {:.2f}'.format(
    mean_squared_error(y_test26_29, y_pred26_29)))

MSE on test set: 68.82


When splitting Topt into 5 intervals, ChiMerge suggests further dividing the interval 8-29°C into two intervals, 8-25°C and 26-29°C. This decreases the MSE from 82.46 for the interval 8-29°C to 68.82 for the interval 26-29°C, but increases it to 102.93 for the interval 8-25°C.


Although splitting Topt into more intervals may have its advantages, the 3-interval split (8-29°C, 30-36°C, 37-99°C) was chosen for this work.