<a href="https://colab.research.google.com/github/arad1367/Dresden_April2024_UniCourse/blob/main/7_ML_with_scikit_learn_supervised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Machine learning with scikit-learn (`Supervised learning`: `Regression` and `Classification`)
Steps:

* Data
* Preprocessing (missing values, string data, ...)
* Define dependend variable(target) and independent variables(features)
* Split data(train dataset, validation dataset, test dataset)
* Make the model
* Fit the model
* Make some prediction
* Accuracy of the model & metrics
* Improvement the model(hyperparameters tuning)

* Select best estimator: https://scikit-learn.org/stable/tutorial/machine_learning_map/

In [10]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Models for Regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Models for classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# other libraries
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Datasets from scikit-learn
from sklearn.datasets import load_wine

### 1. Supervised learning: Regression

* Regression is a supervised machine learning technique which is used to predict continuous values

In [None]:
# 1. Data
data_path = "/content/boston.csv"
df = pd.read_csv(data_path)
df.head(10)

# Explore data
df.info()
df.describe()

# 2. data preprocessing
# missing values
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

In [None]:
# 1. Data is ready

# 2. Preprocessing is done

# 3. Make X (features) and y (label)
X = df.drop("MEDV", axis=1)
y = df["MEDV"]

# 4. Split the data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)
(len(X_train) == len(y_train)) & (len(X_test) == len(y_test))
len(X_train), len(y_train)

# 5. Make the model for regression purpose
models = {
    "RFR" : RandomForestRegressor(),
    "SVR" : SVR()
}

RFR_model = models["RFR"]
SVR_model = models["SVR"]

# 6. Fit our regression models with train data
RFR_model.fit(X_train, y_train)
SVR_model.fit(X_train, y_train)

# 7. Make some prediction
y_pred_rfr = RFR_model.predict(X_test)
y_pred_svr = SVR_model.predict(X_test)

# 8. Accuracy of regression models --> R2
RFR_score = RFR_model.score(X_test, y_test)
SVR_score = SVR_model.score(X_test, y_test)
print(f"The accuracy of RFR model is: {RFR_score*100:0.2f}% -- The accuracy of SVR model is: {SVR_score*100:0.2f}%")

The accuracy of RFR model is: 88.24% -- The accuracy of SVR model is: 27.95%


In [None]:
# Make a function to check accuracy of regression models
def accuracy_of_reg_model(y_pred, y_true):
  """
  This model can evaluate the accuracy of regression models.
  Our criteria: r2_score, mae, mse
  """
  acc = {}
  acc["r2_score"] = round(r2_score(y_pred, y_true), 2)
  acc["mae"] = round(mean_absolute_error(y_pred, y_true), 2)
  acc["mse"] = round(mean_squared_error(y_pred, y_true), 2)
  return acc

In [None]:
accuracy_of_reg_model(y_pred=y_pred_rfr,
                      y_true=y_test)


{'r2_score': 0.86, 'mae': 2.07, 'mse': 8.63}

### Project: Write a function --> parameters are X, y, Model_name
* Function must return acc of model for r2, mse, mae

### Improvement the model(hyperparameters tuning)

In [None]:
kernel_list = ["linear", "poly", "rbf", "sigmoid"]

for parameter in kernel_list:
  svr_model = SVR(kernel=parameter)
  svr_model.fit(X_train, y_train)
  prediction = svr_model.predict(X_test)
  score = svr_model.score(X_test, y_test)
  print(f"Kernel = {parameter} --- accuracy: {score*100:0.2f}")

Kernel = linear --- accuracy: 59.86
Kernel = poly --- accuracy: 27.06
Kernel = rbf --- accuracy: 27.95
Kernel = sigmoid --- accuracy: 7.76


### 2. Supervised learning: Classification
* Classification is a branch of supervised learning, a type of machine learning algorithm that involves training a model on labeled data to identify and categorize new input data into one or more predefined classes.

In [2]:
# 1. data
wine_data = load_wine()
wine_data

{'data': array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
         1.065e+03],
        [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
         1.050e+03],
        [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
         1.185e+03],
        ...,
        [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
         8.350e+02],
        [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
         8.400e+02],
        [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
         5.600e+02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [8]:
# Make our df and X, y
X = pd.DataFrame(data = wine_data["data"],
                 columns=wine_data["feature_names"])
X.head()
len(X)

y = wine_data["target"]
y[:10]

df = X
df["target"] = y
df.tail()

df.info()

df["target"].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
 13  targe

target
1    71
0    59
2    48
Name: count, dtype: int64

In [9]:
# 1. Data is ready

# 2. Preprocessing is done

# 3. X, y are ready.

# 4. Split the data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

# 5. Make the model for regression purpose
models = {
    "RFC" : RandomForestClassifier(),
    "KNN" : KNeighborsClassifier()
}

RFC_model = models["RFC"]
KNN_model = models["KNN"]

# 6. Fit our regression models with train data
RFC_model.fit(X_train, y_train)
KNN_model.fit(X_train, y_train)

# 7. Make some prediction
y_pred_rfc = RFC_model.predict(X_test)
y_pred_knn = KNN_model.predict(X_test)

# 8. Accuracy of regression models --> R2
RFC_score = RFC_model.score(X_test, y_test)
KNN_score = KNN_model.score(X_test, y_test)
print(f"The accuracy of RFC model is: {RFC_score*100:0.2f}% -- The accuracy of KNN model is: {KNN_score*100:0.2f}%")

The accuracy of RFC model is: 100.00% -- The accuracy of KNN model is: 72.22%


In [12]:
confusion_matrix(y_test, y_pred_knn)

array([[12,  0,  2],
       [ 0, 11,  3],
       [ 2,  3,  3]])

In [15]:
from pprint import pprint

print(classification_report(y_test, y_pred_rfc))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00         8

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36

