# Practice: training a model

### 1. Importing usual libraries

In [1]:
import pandas as pd
import numpy as np

### 2. Import the dataset

In [8]:
# Read the heart-disease CSV (relative path) into a DataFrame.
# NOTE(review): the name `housing_data` is misleading, since the file loaded is
# heart-disease.csv. Renaming it would also require updating every later cell
# that references it.
housing_data = pd.read_csv("../data/heart-disease.csv")
# Preview the first rows.
housing_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


### 3. Split the dataset into features and target, i.e., `X` and `y`

In [10]:
# Features: every column except the label column.
X = housing_data.drop(columns=["target"])
# Label: the `target` column on its own.
y = housing_data["target"]
# Show the dtype of each feature column and of the label.
X.dtypes, y.dtype

(age           int64
 sex           int64
 cp            int64
 trestbps      int64
 chol          int64
 fbs           int64
 restecg       int64
 thalach       int64
 exang         int64
 oldpeak     float64
 slope         int64
 ca            int64
 thal          int64
 dtype: object,
 dtype('int64'))

### 4. Split the data into training and testing sets, i.e., `X_train`, `X_test`, `y_train`, `y_test`

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Preview the first rows of each of the four pieces.
X_train.head(), X_test.head(), y_train.head(), y_test.head()

(     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
 132   42    1   1       120   295    0        1      162      0      0.0   
 202   58    1   0       150   270    0        0      111      1      0.8   
 196   46    1   2       150   231    0        1      147      0      3.6   
 75    55    0   1       135   250    0        0      161      0      1.4   
 176   60    1   0       117   230    1        1      160      1      1.4   
 
      slope  ca  thal  
 132      2   0     2  
 202      2   0     3  
 196      1   0     2  
 75       1   0     2  
 176      2   2     3  ,
      age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
 179   57    1   0       150   276    0        0      112      1      0.6   
 228   59    1   3       170   288    0        0      159      0      0.2   
 111   57    1   2       150   126    1        1      173      0      0.2   
 246   56    0   0       134   409    0        0      150      1      1.9   
 60   

### Using an ensemble model for training: `RandomForestRegressor`

In [12]:
#Random forest regression is a supervised learning algorithm that uses the ensemble learning method for regression.
#Ensemble learning is a technique that combines predictions from multiple
#machine learning models to make a more accurate prediction than any single model.
#*************************************************************************************************************************
#How it works:
#1. Pick k data points at random from the training set.
#2. Build a decision tree on these k data points.
#3. Choose the number N of trees you want to build and repeat steps 1 and 2.
#4. For a new data point, have each of the N trees predict the value of y for that
#   point, and assign the new data point the average of all the predicted y values.

In [13]:
from sklearn.ensemble import RandomForestRegressor

# Build a random forest of 94 trees. The forest is fitted with randomness
# (bootstrap samples / feature sub-sampling), so pin `random_state` to the
# same seed used for the train/test split; otherwise every
# Restart-and-Run-All produces a different model and a different score.
# NOTE(review): in the previewed rows `target` only takes the values 0 and 1;
# a classifier (e.g. RandomForestClassifier) may be the better fit here.
# Confirm the intent before changing it.
model = RandomForestRegressor(n_estimators=94, random_state=42)

### Fitting the model on the training data

In [14]:
# Train the forest on the training features and their labels.
model.fit(X=X_train, y=y_train)

### Predicting

In [15]:
# Predict the target for every row of the held-out test features.
y_preds = model.predict(X=X_test)
# Display the predicted values.
y_preds

array([0.09574468, 0.55319149, 0.40425532, 0.13829787, 0.87234043,
       0.85106383, 0.72340426, 0.0212766 , 0.0106383 , 0.4787234 ,
       0.92553191, 0.20212766, 0.89361702, 0.17021277, 0.93617021,
       1.        , 1.        , 0.04255319, 0.09574468, 0.03191489,
       0.5106383 , 0.0212766 , 0.85106383, 0.76595745, 0.69148936,
       0.53191489, 0.90425532, 0.76595745, 0.05319149, 0.89361702,
       0.12765957, 0.        , 0.        , 0.29787234, 0.57446809,
       0.04255319, 0.4893617 , 0.87234043, 0.5       , 0.95744681,
       0.93617021, 0.86170213, 0.90425532, 0.56382979, 0.63829787,
       0.27659574, 0.62765957, 1.        , 0.11702128, 0.0212766 ,
       0.11702128, 0.34042553, 0.91489362, 0.63829787, 0.18085106,
       0.11702128, 0.39361702, 1.        , 0.10638298, 0.        ,
       0.14893617])

In [16]:
# Display the true test labels so they can be compared with `y_preds`.
y_test

179    0
228    0
111    1
246    0
60     1
      ..
249    0
104    1
300    0
193    0
184    0
Name: target, Length: 61, dtype: int64

### Evaluating the model with the R² score

In [17]:
from sklearn.metrics import r2_score

# R² (coefficient of determination) of the predictions against the true test
# labels. This is a regression metric, not classification accuracy.
r2 = r2_score(y_true=y_test, y_pred=y_preds)
r2

0.5158792742464214