### Load the packages for the classification problem

In [39]:
import pandas as pd 
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier # import classifier 
from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score

### Let's define all the functions that we need for performing the following steps: 

- load and transform the data 

- split the data into training and testing sets

- train the model and visualize the first prediction

- Measure the performance of the model with accuracy, precision and recall (and, eventually, F1-score)

- Cross-validation to measure bias and variance.

In [88]:
def return_data(iris):
    """Assemble the iris measurements and labels into a single DataFrame.

    Parameters
    ----------
    iris : object
        Anything exposing ``data`` (indexed here as ``data[:, k]`` for
        k = 0..3) and ``target``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: 'sepal length', 'sepal width', 'petal length',
        'petal width' (taken from columns 0-3 of ``iris.data``) and
        'species' (taken from ``iris.target``).
    """
    feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')
    # Column k of the measurement array becomes the k-th named feature.
    columns = {
        name: iris.data[:, position]
        for position, name in enumerate(feature_names)
    }
    columns['species'] = iris.target
    return pd.DataFrame(columns)


def split_data(df, ts, random_state=None):
    """Split the iris DataFrame into train/test features and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the feature columns 'sepal length', 'sepal width',
        'petal length', 'petal width' and the label column 'species'.
    ts : float or int
        Forwarded unchanged as ``test_size`` to ``train_test_split``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``. Pass an int to get the same
        split on every run; the default ``None`` keeps the previous
        (unseeded, run-to-run varying) behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    feature_columns = ['sepal length', 'sepal width', 'petal length', 'petal width']
    X = df[feature_columns]  # Features
    y = df['species']  # Labels
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=ts, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

def fitpredict_model(X_train, y_train, est, X_test=None, random_state=None):
    """Fit a random forest on the training data and predict on a test set.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``RandomForestClassifier.fit``.
    est : int
        Number of trees (``n_estimators``).
    X_test : optional
        Features to predict on. When omitted, the module-level variable
        ``X_test`` is used, which is what this function silently relied on
        before the parameter existed. Prefer passing it explicitly so the
        call does not depend on notebook state.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``RandomForestClassifier``. Pass an int for
        reproducible forests; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    The predictions returned by ``RandomForestClassifier.predict``.

    Raises
    ------
    NameError
        If ``X_test`` is not passed and no module-level ``X_test`` exists.
    """
    if X_test is None:
        # Backward compatibility: older calls pass only three arguments and
        # expect the notebook-global X_test to be picked up. The parameter
        # shadows that global, so look it up explicitly.
        try:
            X_test = globals()["X_test"]
        except KeyError:
            raise NameError(
                "name 'X_test' is not defined; pass X_test explicitly"
            ) from None

    rf_classifier = RandomForestClassifier(n_estimators=est, random_state=random_state)
    rf_classifier.fit(X_train, y_train)
    y_pred = rf_classifier.predict(X_test)

    return y_pred


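# To inspect feature importances, add print(rf_classifier.feature_importances_)
# inside fitpredict_model after the classifier has been fitted.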
def evaluate_model(y_test,y_pred):
    """Print accuracy, weighted precision and weighted recall.

    Parameters
    ----------
    y_test
        True labels.
    y_pred
        Predicted labels to compare against ``y_test``.

    Returns
    -------
    None
        The three scores are printed, one per line, and not returned.
    """
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # "weighted" is the averaging mode passed to both multi-class scores.
    precision = metrics.precision_score(y_test, y_pred, average="weighted")
    print("Precision score:", precision)

    recall = metrics.recall_score(y_test, y_pred, average="weighted")
    print("Recall score:", recall)

In [89]:
### Load and transform the data

In [90]:
# Load the iris dataset from scikit-learn and assemble it into a DataFrame:
# four feature columns plus the integer-coded 'species' label column.
iris = datasets.load_iris()
data = return_data(iris)

In [91]:
data

Unnamed: 0,sepal length,sepal width,petal length,petal width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [92]:
# Hold out 25% of the rows as the test set (0.25 is passed through as test_size).
# No seed is given here, so the split differs from run to run.
X_train, X_test, y_train, y_test = split_data(data, 0.25)

In [93]:
X_train.shape

(112, 4)

In [102]:
# Fit a 20-tree random forest on the training split; the predictions are made
# on the notebook-level X_test created by the split cell, so run that cell first.
y_hat = fitpredict_model(X_train, y_train, 20)

In [103]:
# Print accuracy and weighted precision/recall of the predictions against the held-out labels.
evaluate_model(y_test, y_hat)


Accuracy: 0.9473684210526315
Precision score: 0.9473684210526315
Recall score: 0.9473684210526315


### Using Cross-validation

In [84]:
# Unfitted 20-tree forest with a fixed seed, used only for the cross-validation below.
rfc = RandomForestClassifier(n_estimators=20, random_state = 4)

In [85]:
# Prepare the data for cross-validation: the first four columns are the
# features and the last column ('species') is the label.
X = data.iloc[:, 0:4]
y = data.iloc[:, -1]

In [65]:
y

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: species, Length: 150, dtype: int32

In [66]:
# 10-fold cross-validation on the full dataset, scored by accuracy (one value per fold).
score = cross_val_score(rfc, X, y, cv = 10, scoring="accuracy")

In [67]:
score

array([1.        , 0.93333333, 1.        , 0.93333333, 0.86666667,
       0.93333333, 0.86666667, 1.        , 1.        , 1.        ])

### How to optimize the hyperparameters