## **1. Import libraries**

In [24]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

## **2. Load the data set**

In [25]:
df = pd.read_csv('Cruzipain_06_bioactivity_data_3class_pIC50_pubchem_fp.csv')

In [26]:
df1 = pd.read_csv('acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv')

## **3. Input features**
The ***Cruzipain*** data set contains 881 input features and 1 output variable (pIC50 values).

### **3.1. Input features**

In [27]:
X = df.drop('pIC50', axis=1)
X

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
600,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
601,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
602,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [28]:
XX = df1.drop('Name', axis=1)
XX

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4687,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4688,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4689,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4690,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


### **3.2. Output features**

In [29]:
Y = df.pIC50
Y

0      7.301030
1      5.000000
2      5.000000
3      6.000000
4      5.000000
         ...   
599    5.136677
600    4.869666
601    5.602060
602    4.283997
603    4.853872
Name: pIC50, Length: 604, dtype: float64

### **3.3. Let's examine the data dimension**

In [30]:
X.shape

(604, 881)

In [31]:
XX.shape

(4692, 881)

In [32]:
Y.shape

(604,)

In [33]:
X.shape

(604, 881)

In [34]:
XX.shape

(4692, 881)

## **4. Data split (80/20 ratio)**

In [35]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [36]:
X_train.shape, Y_train.shape

((483, 881), (483,))

In [37]:
X_test.shape, Y_test.shape

((121, 881), (121,))

## **5. Building a Regression Model using Random Forest**

In [38]:
model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, Y_train)
r2 = model.score(X_test, Y_test)
r2

0.6287346089017966

In [39]:
Y_pred = model.predict(X_test)

In [40]:
Y_pred_neg = model.predict(XX)

In [44]:
Avg = sum(Y_pred_neg) / len(Y_pred_neg)
Avg

4.769707854020896

## **6. Scatter Plot of Experimental vs Predicted pIC50 Values**

In [46]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(color_codes=True)
sns.set_style("white")

ax = sns.regplot(Y_test, Y_pred, scatter_kws={'alpha':0.4})
ax.set_xlabel('Experimental pIC50', fontsize='large', fontweight='bold')
ax.set_ylabel('Predicted pIC50', fontsize='large', fontweight='bold')
ax.set_xlim(0, 12)
ax.set_ylim(0, 12)
ax.figure.set_size_inches(5, 5)
plt.show

TypeError: ignored