# Scikit learn introduction
**Steps:**
0. An end-to-end Scikit-Learn workflow.
1. Getting the data ready.
2. Choose the right estimator / algorithm for our probelms .
3. Fit the model / algorithm and use it to make predictions on our data.
4. Evaluating a model.
5. Improve the model.
6. Save and load the trained model.
7. Putting it all together!

In [36]:
steps = [
"0. An end-to-end Scikit-Learn workflow.",
"1. Getting the data ready.",
"2. Choose the right estimator / algorithm for our probelms .",
"3. Fit the model / algorithm and use it to make predictions on our data.",
"4. Evaluating a model.",
"5. Improve the model.",
"6. Save and load the trained model.",
"7. Putting it all together!"
        ]

In [37]:
steps

['0. An end-to-end Scikit-Learn workflow.',
 '1. Getting the data ready.',
 '2. Choose the right estimator / algorithm for our probelms .',
 '3. Fit the model / algorithm and use it to make predictions on our data.',
 '4. Evaluating a model.',
 '5. Improve the model.',
 '6. Save and load the trained model.',
 '7. Putting it all together!']

## 0. An end-to-end Scikit-learn workflow

In [1]:
# import libraries
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot

In [2]:
# 1.Getting the data ready 
heart_disease = pd.read_csv('heart-disease.csv')
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# Create x(feature matrix)
x = heart_disease.drop('target', axis=1)
# Create y(labels)
y = heart_disease['target']

In [4]:
x.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [5]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [6]:
# 2nd way to get rid of warnings
# import warnings
# warnings.filterwarnings('ignore')

In [7]:
# 2.Choose the right model hyperparameters
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier() # clf = classification model
# we'll keep the deafault hyperparameters
clf.get_params() # to get the paramters of clf

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [8]:
# 3.Fit the model to the training data 
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# for fix the warnings we can use 2 ways
# 1- n_estemators=100
# 2- import warnings
# warnings.filterwarnigs('ignore') or warning.filterwarnings('default') to get back

In [9]:
clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [10]:
# Make a prediction
# y_label = clf.predict(np.array([1, 2, 3, 4])) ==> Error
# Shape of array doesn't fit

In [11]:
x_test.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
48,53,0,2,128,216,0,0,115,0,0.0,2,0,0
204,62,0,0,160,164,0,0,145,0,6.2,0,3,3
260,66,0,0,178,228,1,1,165,1,1.0,1,2,3
235,51,1,0,140,299,0,1,173,1,1.6,2,0,3
278,58,0,1,136,319,1,0,152,0,0.0,2,2,2


In [12]:
y_preds = clf.predict(x_test)
y_preds

array([1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0])

In [13]:
y_test.head()

48     1
204    0
260    0
235    0
278    0
Name: target, dtype: int64

In [14]:
# 4. Evaluate the model on the trainig data and test data
clf.score(x_train, y_train), clf.score(x_test, y_test)

(1.0, 0.7868852459016393)

In [15]:
# See the accuracy more details
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test, y_preds))


              precision    recall  f1-score   support

           0       0.81      0.73      0.77        30
           1       0.76      0.84      0.80        31

    accuracy                           0.79        61
   macro avg       0.79      0.79      0.79        61
weighted avg       0.79      0.79      0.79        61



In [16]:
print(confusion_matrix(y_test, y_preds))

[[22  8]
 [ 5 26]]


In [17]:
print(accuracy_score(y_test, y_preds))

0.7868852459016393


In [25]:
# 5.Improve the model
# Try diffrent amount of n_estimator
best_score = 0

for i in range(10, 100, 5):
    clf = RandomForestClassifier(n_estimators=i).fit(x_train, y_train)
    print(f"Trying model with {i} estimators ...")
    print(f"The model accuracy on the test set:{clf.score(x_test, y_test) * 100: .2f}%")
    
    if clf.score(x_test, y_test) > best_score:
        best_clf = clf
        best_score = clf.score(x_test, y_test)

print(f'The best score is ==> {best_score}')

Trying model with 10 estimators ...
The model accuracy on the test set: 72.13%
Trying model with 15 estimators ...
The model accuracy on the test set: 75.41%
Trying model with 20 estimators ...
The model accuracy on the test set: 78.69%
Trying model with 25 estimators ...
The model accuracy on the test set: 73.77%
Trying model with 30 estimators ...
The model accuracy on the test set: 77.05%
Trying model with 35 estimators ...
The model accuracy on the test set: 73.77%
Trying model with 40 estimators ...
The model accuracy on the test set: 77.05%
Trying model with 45 estimators ...
The model accuracy on the test set: 70.49%
Trying model with 50 estimators ...
The model accuracy on the test set: 78.69%
Trying model with 55 estimators ...
The model accuracy on the test set: 73.77%
Trying model with 60 estimators ...
The model accuracy on the test set: 75.41%
Trying model with 65 estimators ...
The model accuracy on the test set: 75.41%
Trying model with 70 estimators ...
The model accura

In [26]:
best_clf.score(x_test, y_test)

0.8032786885245902

In [27]:
# 6. Save the model and reload it 
import pickle
pickle.dump(best_clf, open('Random_forest_model_1.pkl', 'wb'))


In [28]:
# Load it 
loaded_model = pickle.load(open('Random_forest_model_1.pkl', 'rb'))
loaded_model.score(x_test, y_test)

0.8032786885245902

In [29]:
# To see the version of sklearn
import sklearn
sklearn.show_versions()


System:
    python: 3.6.10 |Anaconda, Inc.| (default, Jan  7 2020, 21:14:29)  [GCC 7.3.0]
executable: /home/ali/workspace/Machine-learning/env/bin/python
   machine: Linux-5.4.0-kali4-amd64-x86_64-with-debian-kali-rolling

Python dependencies:
       pip: 20.0.2
setuptools: 46.0.0.post20200309
   sklearn: 0.22.1
     numpy: 1.18.1
     scipy: 1.4.1
    Cython: None
    pandas: 1.0.2
matplotlib: 3.1.3
    joblib: 0.14.1

Built with OpenMP: True


## 1. Getting the data ready

Three main things we have to do:

1. Split the data into features and labels (usually `X` & `y`)
2. Filling (also called imputing) or disregarding missing values
3. Converting non-numerical values to numerical values (also called feature encoding)

In [39]:
# split data for x and y
x = heart_disease.drop('target', axis=1)
y = heart_disease['target']

In [40]:
x.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [42]:
y.tail()

298    0
299    0
300    0
301    0
302    0
Name: target, dtype: int64

In [44]:
# Split data into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [45]:
# Show the shapes of our train data and test data 
x_train.shape, x_test.shape, y_test.shape, y_train.shape

((242, 13), (61, 13), (61,), (242,))

In [46]:
# shape of all the data x = x_test + x_train
x.shape 

(303, 13)

### 1.1 Make sure all the data is numerical

In [49]:
# import the data
car_sales = pd.read_csv('car-sales-extended.csv')
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [51]:
# see the type of our data and see the size of it
car_sales.dtypes, len(car_sales)

(Make             object
 Colour           object
 Odometer (KM)     int64
 Doors             int64
 Price             int64
 dtype: object,
 1000)

In [52]:
# Split the data to x, y
x = car_sales.drop('Price', axis=1)
y = car_sales['Price']

In [53]:
x.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


In [54]:
y.tail()

995    32042
996     5716
997    31570
998     4001
999    12732
Name: Price, dtype: int64

In [55]:
# Split to training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [56]:
x_train.shape, x_test.shape

((800, 4), (200, 4))

In [60]:
# Build machine learning model 
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
# model.fit(x_train, y_train)
# model.score(x_test, y_test)
# Error ==> cann't convert string data to float

In [None]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder
transformer = ColumnTransformer([''])