In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Common AI models

In this first notebook you will practice AI with some common Machine Learning models. You're expected to already have some basic experience in playing around with Python. If you find yourself stranded on one of the assignments please put a question in the bootcamp-questions Slack channel. There are mentors there that are eager to help.

# Classic machine learning models

## Assignment 1
From the Sklearn library choose models of at least the following types, train them on the 6 imported datasets, evaluate their accuracy or R^2 and see which model works best on which dataset. (Note that there are both regression and classification sets)
* Tree
* Neural Network
* Neighbors
* Ensemble
* Naive Byes (classification only)
* Linear


In [None]:
from sklearn.datasets import load_iris, load_diabetes, load_digits, load_wine, load_breast_cancer
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Example of how to load a dataset
print(load_iris().DESCR)

X = load_iris().data
y = load_iris().target

# Create a train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Example of how to load/train a model and evaluate
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()             # Load
fitted_tree = tree.fit(X_train, y_train)    # Train
y_pred = fitted_tree.predict(X_test)        # Predict on unseen data

acc_score = accuracy_score(y_test, y_pred)  # Evaluate accuracy score
r2_score = r2_score(y_test, y_pred)         # Evaluate R^2 score

print(f'On the {load_iris.__name__} dataset the {DecisionTreeClassifier.__name__} reaches an accuracy score of {acc_score} and a R^2 score of {r2_score}')

In [48]:
from sklearn.datasets import load_iris, load_diabetes, load_digits, load_wine, load_breast_cancer
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression


dataSets = [load_iris, load_diabetes, load_digits, load_wine, load_breast_cancer]

for i in range(len(dataSets)):
  scores =[]
  module = []
  # Conducts the DecisionTree algorithm for the data sets
  X = dataSets[i]().data
  y = dataSets[i]().target

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

  tree = DecisionTreeClassifier()
  fitted_tree = tree.fit(X_train, y_train)
  y_pred = fitted_tree.predict(X_test)

  acc_score = accuracy_score(y_test, y_pred)
  r2 = r2_score(y_test, y_pred)

  scores.append(acc_score)
  module.append(DecisionTreeClassifier.__name__)

  print(f'On the {dataSets[i].__name__} dataset the {DecisionTreeClassifier.__name__} reaches an accuracy score of {acc_score} and a R^2 score of {r2}')


 # Conducts the Neural Network algorithm for the data sets

  if dataSets[i].__name__ == "load_iris":
    mlp = MLPClassifier(hidden_layer_sizes=(10,), activation='relu', solver='sgd', max_iter=500, random_state=0, early_stopping=True)
  else:
    mlp = MLPClassifier(hidden_layer_sizes=(10,), activation='relu', solver='sgd', max_iter=500, random_state=0)

  mlp.fit(X_train, y_train)
  y_pred = mlp.predict(X_test)
  r2 = r2_score(y_test, y_pred)
  acc_score = accuracy_score(y_test, y_pred)

  print(f'On the {dataSets[i].__name__} dataset the {MLPClassifier.__name__} reaches an accuracy score of {acc_score} and a R^2 score of {r2}')

  scores.append(acc_score)
  module.append(MLPClassifier.__name__)

    # Conducts the Neighbors algorithm for the data sets
  knn = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  knn.fit(X_train, y_train)

  y_pred = knn.predict(X_test)
  r2 = r2_score(y_test, y_pred)
  acc_score = accuracy_score(y_test, y_pred)

  print(f'On the {dataSets[i].__name__} dataset the {KNeighborsClassifier.__name__} reaches an accuracy score of {acc_score} and a R^2 score of {r2}')

  scores.append(acc_score)
  module.append(KNeighborsClassifier.__name__)

 # Conducts the Ensemble algorithm for the data set
  X_train, X_test, y_train, y_test, = train_test_split(X, y, test_size=0.2, random_state=42)
  rf = RandomForestClassifier(n_estimators=100, random_state=42)
  rf.fit(X_train, y_train)

  y_pred = rf.predict(X_test)
  r2 = r2_score(y_test, y_pred)
  acc_score = accuracy_score(y_test, y_pred)

  print(f'On the {dataSets[i].__name__} dataset the {RandomForestClassifier.__name__} reaches an accuracy score of {acc_score} and a R^2 score of {r2}')

  scores.append(acc_score)
  module.append(RandomForestClassifier.__name__)

 # Conducts the Naive Byes algorithm for the data set
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  nb = GaussianNB()
  nb.fit(X_train, y_train)

  y_pred = nb.predict(X_test)
  r2 = r2_score(y_test, y_pred)
  acc_score = accuracy_score(y_test, y_pred)

  print(f'On the {dataSets[i].__name__} dataset the {GaussianNB.__name__} reaches an accuracy score of {acc_score} and a R^2 score of {r2}')

  scores.append(acc_score)
  module.append(GaussianNB.__name__)

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  lr = LogisticRegression(max_iter=10000)
  lr.fit(X_train, y_train)

  y_pred = lr.predict(X_test)
  r2 = r2_score(y_test, y_pred)
  acc_score = accuracy_score(y_test, y_pred)

  print(f'On the {dataSets[i].__name__} dataset the {LogisticRegression.__name__} reaches an accuracy score of {acc_score} and a R^2 score of {r2}')

  scores.append(acc_score)
  module.append(LogisticRegression.__name__)
  
  topScore = min(scores, key=lambda x: abs(x-1.0))
  index = scores.index(topScore)
  topModule = module[index]

  print()
  print(f'The {topModule} module worked best for the {dataSets[i].__name__} dataset with a accuracy score of {topScore}')
  print()







On the load_iris dataset the DecisionTreeClassifier reaches an accuracy score of 0.98 and a R^2 score of 0.9712808730614589
On the load_iris dataset the MLPClassifier reaches an accuracy score of 0.52 and a R^2 score of 0.3107409534750145
On the load_iris dataset the KNeighborsClassifier reaches an accuracy score of 1.0 and a R^2 score of 1.0
On the load_iris dataset the RandomForestClassifier reaches an accuracy score of 1.0 and a R^2 score of 1.0
On the load_iris dataset the GaussianNB reaches an accuracy score of 1.0 and a R^2 score of 1.0
On the load_iris dataset the LogisticRegression reaches an accuracy score of 1.0 and a R^2 score of 1.0

The KNeighborsClassifier module worked best for the load_iris dataset with a accuracy score of 1.0

On the load_diabetes dataset the DecisionTreeClassifier reaches an accuracy score of 0.00684931506849315 and a R^2 score of -0.07747773215197507
On the load_diabetes dataset the MLPClassifier reaches an accuracy score of 0.0 and a R^2 score of -1

## Assignment 2
Use XGBoost running on GPU to predict the same datasets. You can activate GPU acceleration in the Runtime tab:
Runtime -> Change runtime type -> Select GPU from the dropdown

In [None]:
from sklearn.datasets import load_iris, load_diabetes, load_digits, load_wine, load_breast_cancer
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

dataSets = [load_iris, load_diabetes, load_digits, load_wine, load_breast_cancer]

for i in range(len(dataSets)):

  X = dataSets[i]().data
  y = dataSets[i]().target

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  model = XGBRegressor()

  model.fit(X_train, y_train)

  y_pred = model.predict(X_test)
  r2 = r2_score(y_test, y_pred)

  print(f'On the {dataSets[i].__name__} dataset {XGBRegressor.__name__} module reaches an R^2 score of {r2}')







On the load_iris dataset XGBRegressor module reaches an R^2 score of 0.9926678928287365
On the load_diabetes dataset XGBRegressor module reaches an R^2 score of 0.22857599305390852
On the load_digits dataset XGBRegressor module reaches an R^2 score of 0.8497827487365098
On the load_wine dataset XGBRegressor module reaches an R^2 score of 0.702832079593972
On the load_breast_cancer dataset XGBRegressor module reaches an R^2 score of 0.811670892353153
