# Classification

Classification is a task that requires the use of machine learning algorithms that learn how to assign a class label to examples from the problem domain. An easy to understand example is classifying emails as “spam” or “not spam.”

In this lab, you will use three different classification algorithms: Logistic Regression, Support Vector Machines (SVM), and Decision Tree.

## Three different algorithms with an example dataset: the [Iris dataset](https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn import svm, datasets
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

### Load the data

In [None]:
# Load the iris data set and keep only its first two features so that the
# samples can be drawn on a 2-D plane.
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target
print(y)

# Candidate panel labels. The plotting loop below builds its own titles
# ("ax[r,c]"), so this tuple is not read anywhere in this cell.
titles = (
    "[0,0]",
    "[0,1]",
    "[1,0]",
    "[1,1]",
)

fig, ax = plt.subplots(2, 2)
plt.subplots_adjust(wspace=0.4, hspace=0.4)

# Each panel adds one more scatter() option so their effect can be compared:
# defaults -> colour by class -> colormap + marker size -> black marker edges.
scatter_options = {
    (0, 0): {},
    (0, 1): {"c": y},
    (1, 0): {"c": y, "cmap": plt.cm.coolwarm, "s": 20},
    (1, 1): {"c": y, "cmap": plt.cm.coolwarm, "s": 20, "edgecolors": "k"},
}
for (row, col), options in scatter_options.items():
    panel = ax[row, col]
    panel.scatter(X[:, 0], X[:, 1], **options)
    panel.set_title(f"ax[{row},{col}]")

### Logistic Regression

In [None]:
# Fit logistic regression on the two selected iris features.
# C is passed as LogisticRegression's `C` argument (in scikit-learn this is the
# inverse of the regularization strength).
C = 1.0
LR = LogisticRegression(C=C)
LR.fit(X , y)

In [None]:
# Draw the decision regions of the fitted logistic-regression model and
# overlay the iris samples on top of them.
title = "Logistic Regression"

fig, ax = plt.subplots(1, 1)
plt.subplots_adjust(wspace=0.4, hspace=0.4)

# Options shared by the decision-region drawing call.
boundary_options = dict(
    response_method="predict",
    cmap=plt.cm.coolwarm,
    alpha=0.8,
    ax=ax,
    xlabel=iris.feature_names[0],
    ylabel=iris.feature_names[1],
)
disp = DecisionBoundaryDisplay.from_estimator(LR, X, **boundary_options)

# Samples coloured by their true class.
X0, X1 = X[:, 0], X[:, 1]
ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors="k")

# Hide the tick marks on both axes and label the figure.
ax.set_xticks(())
ax.set_yticks(())
ax.set_title(title)

plt.show()

### Support vector machines

In [None]:
# Train a linear-kernel support vector classifier on the two iris features.
C = 1.0  # regularization parameter handed to SVC
svmmodel = svm.SVC(C=C, kernel="linear")
svmmodel.fit(X, y)
# `model` refers to the same fitted estimator object as `svmmodel`
# (scikit-learn's fit() returns the estimator itself).
model = svmmodel

In [None]:
# Show the decision regions of the linear SVC together with the iris samples.
title = "SVC with linear kernel"

fig, ax = plt.subplots(1, 1)
plt.subplots_adjust(wspace=0.4, hspace=0.4)

# Axis labels come from the names of the two features kept in X.
x_name = iris.feature_names[0]
y_name = iris.feature_names[1]

disp = DecisionBoundaryDisplay.from_estimator(
    svmmodel,
    X,
    response_method="predict",
    cmap=plt.cm.coolwarm,
    alpha=0.8,
    ax=ax,
    xlabel=x_name,
    ylabel=y_name,
)

# Overlay the samples, coloured by their true class.
X0 = X[:, 0]
X1 = X[:, 1]
ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors="k")

# No tick marks; only the title identifies the model.
ax.set_xticks(())
ax.set_yticks(())
ax.set_title(title)

plt.show()

### Decision Tree Classifier

In [None]:
# fit the model

# A decision tree has no C regularization parameter, so none is set here.
# random_state=0 is passed so that re-running the cell rebuilds the same tree.
treemodel = DecisionTreeClassifier(random_state=0)
model = treemodel.fit(X, y)

In [None]:
# Visualize the regions learnt by the decision tree and the iris samples.
title = "Decision tree"

fig, ax = plt.subplots(1, 1)
plt.subplots_adjust(wspace=0.4, hspace=0.4)

# Colormap shared by the decision regions and the sample markers.
shared_cmap = plt.cm.coolwarm

disp = DecisionBoundaryDisplay.from_estimator(
    treemodel,
    X,
    response_method="predict",
    cmap=shared_cmap,
    alpha=0.8,
    ax=ax,
    xlabel=iris.feature_names[0],
    ylabel=iris.feature_names[1],
)

# Samples coloured by their true class, drawn over the regions.
X0, X1 = X[:, 0], X[:, 1]
ax.scatter(X0, X1, c=y, cmap=shared_cmap, s=20, edgecolors="k")

# Remove tick marks from both axes, then add the title.
for set_ticks in (ax.set_xticks, ax.set_yticks):
    set_ticks(())
ax.set_title(title)

plt.show()

## Three different algorithms with the CAMELS dataset
[CAMELS](https://ral.ucar.edu/solutions/products/camels) (Catchment Attributes and Meteorology for Large-sample Studies) is a community data set of daily forcing and hydrologic response data for 671 small- to medium-sized basins across the contiguous United States. In this lab, we will use the location (latitude, longitude) and elevation of the 671 catchments from the CAMELS dataset: the location is the input and the elevation is the target for the classification methods.

### Load the data from camels_topo.txt

In [None]:
# Read the semicolon-separated file camels_topo.txt into a numeric table.
with open('camels_topo.txt') as f:
    lines = f.readlines()

num_of_rows = len(lines)

# One array row per line after the first (the first line is not needed);
# only the first four numeric fields of every line are stored.
var = np.zeros((num_of_rows - 1, 4))

for row_idx, raw_line in enumerate(lines[1:]):
    numeric_fields = []
    for token in raw_line.split(';'):
        try:
            value = float(token)
        except ValueError:
            # Fields that cannot be parsed as a number are skipped.
            continue
        numeric_fields.append(value)
    var[row_idx, :] = numeric_fields[:4]


# Column order is catchment_idx, lat, lon, elev (see the DataFrame below):
# draw lon on the x-axis, lat on the y-axis, coloured by elevation.
fig, ax = plt.subplots(1, 1)
plt.subplots_adjust(wspace=0.4, hspace=0.4)
ax.scatter(
    var[:, 2],
    var[:, 1],
    c=var[:, 3],
    cmap=plt.cm.coolwarm,
    s=20,
    edgecolors="k",
)
ax.set_title("catchments with elevation(m)")

column_names = ['catchment_idx', 'lat', 'lon', 'elev']
var = pd.DataFrame(var, columns=column_names)

### Cut the elevation into 4 levels
Elevation is continuous data. Because we want to apply classification methods to it, we need to split the elevation values into several levels and attach a label to each level; these labels are artificial.

In [None]:
# Split the elevations into four equally populated bins (quartiles).
# labels=False yields integer bin indices instead of interval labels, and
# retbins=True additionally returns the bin edges.
elev_class, range_of_quantile = pd.qcut(
    var['elev'],
    q=4,
    labels=False,
    retbins=True,
)
var['elev_class'] = elev_class
print(range_of_quantile)
print(var['elev_class'])

### As shown in the above cell, we get 4 elevation levels; the interval edges are printed in the variable "range_of_quantile", and the levels are labeled 0, 1, 2, 3

In [None]:
# Scatter the catchments coloured by elevation class, then build the feature
# matrix X and the label vector y used by the classifiers below.
var = np.array(var)  # columns: catchment_idx, lat, lon, elev, elev_class
fig, ax = plt.subplots(1, 1)
plt.subplots_adjust(wspace=0.4, hspace=0.4)
sc = ax.scatter(
    var[:, 2],
    var[:, 1],
    c=var[:, 4],
    cmap=plt.cm.coolwarm,
    s=20,
    edgecolors="k",
)
ax.set_title("elevation class")
# The four classes are labelled 0-3, so those are the only meaningful ticks
# (the previous list also contained a non-existent class 4).
bounds = [0, 1, 2, 3]
plt.colorbar(sc, ticks=bounds)
X = var[:, [2, 1]]  # features in (lon, lat) order: lon is the first column
y = var[:, 4]       # target: elevation class
plt.show()

## Separate the data into training and test sets

In [None]:
# Hold out 20 % of the catchments as a test set. A fixed random_state makes
# the shuffle, and therefore every score reported below, reproducible from
# one run of the notebook to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=0
)
print(X_train.shape)

### Logistic Regression

In [None]:
# 5-fold cross-validation of logistic regression on the training split,
# followed by a final fit on the whole training split.
C = 1.0  # value passed as LogisticRegression's C argument
kf = KFold(n_splits=5)

LR_cv_scores = []
for train, test in kf.split(X_train):
    X_kf_train, X_kf_test = X_train[train], X_train[test]
    y_kf_train, y_kf_test = y_train[train], y_train[test]
    print(X_kf_train.shape)
    fold_model = LogisticRegression(C=C, max_iter=100000)
    fold_model.fit(X_kf_train, y_kf_train)
    # Score on the held-out fold; without this the cross-validation loop
    # produces no information at all.
    LR_cv_scores.append(fold_model.score(X_kf_test, y_kf_test))

print("validation score per fold :", np.round(LR_cv_scores, 3))
print("mean validation score : %.3f" % np.mean(LR_cv_scores))

# Final model: trained on all training data instead of silently keeping the
# model of the last fold (which had seen only 4/5 of the training split).
LR_camel = LogisticRegression(C=C, max_iter=100000)
LR_camel.fit(X_train, y_train)

LR_pred = LR_camel.predict(X_test)

### Testing performance

In [None]:
# Score of the logistic-regression model on the held-out test split.
LR_test_score = LR_camel.score(X_test, y_test)
print(f"testing score : {LR_test_score:.3f}")

In [None]:
# Top panel: decision regions of the fitted model plus its predictions for the
# test catchments. Bottom panel: the true elevation class of every catchment.
fig, ax = plt.subplots(2, 1, figsize=(6, 10))
plt.subplots_adjust(wspace=0.4, hspace=0.2)
print(X_train.shape)
disp = DecisionBoundaryDisplay.from_estimator(
    LR_camel,
    X,
    response_method="predict",
    cmap=plt.cm.coolwarm,
    alpha=0.8,
    ax=ax[0],
    # X was built as var[:, [2, 1]]: column 0 is longitude, column 1 latitude.
    xlabel='lon',
    ylabel='lat',
)

# plt.get_cmap is used because matplotlib.cm.get_cmap is no longer available
# in recent matplotlib releases.
cmap = plt.get_cmap('PiYG', 4)
bounds = [0, 1, 2, 3]  # the four elevation classes
# Pin the colour scale to the full class range so a class has the same colour
# in both panels even when the model never predicts some of the classes.
class_style = dict(cmap=cmap, vmin=bounds[0], vmax=bounds[-1], s=20, edgecolors="k")

ax[0].scatter(X[:, 0], X[:, 1])
sc1 = ax[0].scatter(X_test[:, 0], X_test[:, 1], c=LR_pred, **class_style)
ax[0].set_title("Our prediction on test data")
ax[1].set_title("True value")
sc2 = ax[1].scatter(X[:, 0], X[:, 1], c=y, **class_style)
plt.colorbar(sc1, ticks=bounds, ax=ax[0])
plt.colorbar(sc2, ticks=bounds, ax=ax[1])
plt.show()

### Support Vector Machine(SVM)

In [None]:
# 5-fold cross-validation of a linear SVC on the training split, followed by
# a final fit on the whole training split.
C = 1.0  # SVM regularization parameter
kf = KFold(n_splits=5)

SVC_cv_scores = []
for train, test in kf.split(X_train):
    X_kf_train, X_kf_test = X_train[train], X_train[test]
    y_kf_train, y_kf_test = y_train[train], y_train[test]
    fold_model = svm.SVC(kernel="linear", C=C)
    fold_model.fit(X_kf_train, y_kf_train)
    # Score on the held-out fold; without this the cross-validation loop
    # produces no information at all.
    SVC_cv_scores.append(fold_model.score(X_kf_test, y_kf_test))

print("validation score per fold :", np.round(SVC_cv_scores, 3))
print("mean validation score : %.3f" % np.mean(SVC_cv_scores))

# Final model: trained on all training data instead of silently keeping the
# model of the last fold (which had seen only 4/5 of the training split).
SVC_camel = svm.SVC(kernel="linear", C=C)
SVC_camel.fit(X_train, y_train)
SVC_pred = SVC_camel.predict(X_test)

### Testing performance

In [None]:
# Score of the linear SVC on the held-out test split.
print("testing score : {:.3f}".format(SVC_camel.score(X_test, y_test)))

In [None]:
# Top panel: decision regions of the fitted SVC plus its predictions for the
# test catchments. Bottom panel: the true elevation class of every catchment.
fig, ax = plt.subplots(2, 1, figsize=(6, 10))
plt.subplots_adjust(wspace=0.4, hspace=0.2)

disp = DecisionBoundaryDisplay.from_estimator(
    SVC_camel,
    X,
    response_method="predict",
    cmap=plt.cm.coolwarm,
    alpha=0.8,
    ax=ax[0],
    # X was built as var[:, [2, 1]]: column 0 is longitude, column 1 latitude.
    xlabel='lon',
    ylabel='lat',
)

# plt.get_cmap is used because matplotlib.cm.get_cmap is no longer available
# in recent matplotlib releases.
cmap = plt.get_cmap('PiYG', 4)
bounds = [0, 1, 2, 3]  # the four elevation classes
# Pin the colour scale to the full class range so a class has the same colour
# in both panels even when the model never predicts some of the classes.
class_style = dict(cmap=cmap, vmin=bounds[0], vmax=bounds[-1], s=20, edgecolors="k")

ax[0].scatter(X[:, 0], X[:, 1])
sc1 = ax[0].scatter(X_test[:, 0], X_test[:, 1], c=SVC_pred, **class_style)
ax[0].set_title("Our prediction on test data")
ax[1].set_title("True value")
sc2 = ax[1].scatter(X[:, 0], X[:, 1], c=y, **class_style)

plt.colorbar(sc1, ticks=bounds, ax=ax[0])
plt.colorbar(sc2, ticks=bounds, ax=ax[1])
plt.show()

### Decision tree

In [None]:
# 5-fold cross-validation of a decision tree on the training split, followed
# by a final fit on the whole training split. (The tree takes no C parameter,
# so the unused `C = 1.0` assignment was dropped.)
kf = KFold(n_splits=5)

DT_cv_scores = []
for train, test in kf.split(X_train):
    X_kf_train, X_kf_test = X_train[train], X_train[test]
    y_kf_train, y_kf_test = y_train[train], y_train[test]
    fold_model = DecisionTreeClassifier(random_state=0)
    fold_model.fit(X_kf_train, y_kf_train)
    # Score on the held-out fold; without this the cross-validation loop
    # produces no information at all.
    DT_cv_scores.append(fold_model.score(X_kf_test, y_kf_test))

print("validation score per fold :", np.round(DT_cv_scores, 3))
print("mean validation score : %.3f" % np.mean(DT_cv_scores))

# Final model: trained on all training data instead of silently keeping the
# model of the last fold (which had seen only 4/5 of the training split).
DT_camel = DecisionTreeClassifier(random_state=0)
DT_camel.fit(X_train, y_train)
DT_pred = DT_camel.predict(X_test)

### Testing performance

In [None]:
# Score of the decision tree on the held-out test split.
DT_test_score = DT_camel.score(X_test, y_test)
print("testing score : " + format(DT_test_score, ".3f"))

In [None]:
# Top panel: decision regions of the fitted tree plus its predictions for the
# test catchments. Bottom panel: the true elevation class of every catchment.
fig, ax = plt.subplots(2, 1, figsize=(6, 10))
plt.subplots_adjust(wspace=0.4, hspace=0.2)

disp = DecisionBoundaryDisplay.from_estimator(
    DT_camel,
    X,
    response_method="predict",
    cmap=plt.cm.coolwarm,
    alpha=0.8,
    ax=ax[0],
    # X was built as var[:, [2, 1]]: column 0 is longitude, column 1 latitude.
    xlabel='lon',
    ylabel='lat',
)

# plt.get_cmap is used because matplotlib.cm.get_cmap is no longer available
# in recent matplotlib releases.
cmap = plt.get_cmap('PiYG', 4)
bounds = [0, 1, 2, 3]  # the four elevation classes
# Pin the colour scale to the full class range so a class has the same colour
# in both panels even when the model never predicts some of the classes.
class_style = dict(cmap=cmap, vmin=bounds[0], vmax=bounds[-1], s=20, edgecolors="k")

ax[0].scatter(X[:, 0], X[:, 1])
# Colour the test points with the decision tree's own predictions (DT_pred);
# this cell previously plotted the SVC predictions by mistake.
sc1 = ax[0].scatter(X_test[:, 0], X_test[:, 1], c=DT_pred, **class_style)
ax[0].set_title("Our prediction on test data")
ax[1].set_title("True value")
sc2 = ax[1].scatter(X[:, 0], X[:, 1], c=y, **class_style)

plt.colorbar(sc1, ticks=bounds, ax=ax[0])
plt.colorbar(sc2, ticks=bounds, ax=ax[1])
plt.show()

## Apply pipeline to the support vector machine method
The purpose of the [pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html) is to assemble several steps that can be cross-validated together while setting different parameters.

Using a pipeline together with `cross_val_score` also lets us output the accuracy score of each fold.

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Chain feature standardization and a linear SVM into one estimator and
# cross-validate the chain as a whole on the training split.
# C is set here so the cell does not depend on a value left over from a loop
# in an earlier cell.
C = 1.0  # SVM regularization parameter
pipeline = make_pipeline(StandardScaler(), svm.SVC(kernel="linear", C=C))
scores = cross_val_score(pipeline, X=X_train, y=y_train, cv=5, n_jobs=1)
# For a classifier, an integer cv means StratifiedKFold with that many folds.

# Accuracy score of every fold.
print(f'Cross Validation accuracy scores: {scores}')

# Mean and standard deviation across the folds.
print(f'Cross Validation accuracy: {np.mean(scores):.3f} +/- {np.std(scores):.3f}')