# Cross Validation

## Decision Trees Regression

In [8]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing

from sklearn.datasets import load_iris

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings('ignore')

In [10]:
X, y = fetch_california_housing(return_X_y=True)
print("X has %d rows and %d columns"  %(X.shape[0],X.shape[1]))
print("y has %d rows"  %(y.shape[0]))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

print("X_train has %d rows and %d columns"  %(X_train.shape[0],X_train.shape[1]))
print("-----------------------------------")
print("The coefficient of determination for the test data is R2=%.2f"
      %(model.score(X_test, y_test)))
print("The coefficient of determination for the train data is R2=%.2f"
      %(model.score(X_train, y_train)))


X has 20640 rows and 8 columns
y has 20640 rows
X_train has 16512 rows and 8 columns
-----------------------------------
The coefficient of determination for the test data is R2=0.62
The coefficient of determination for the train data is R2=1.00


In [6]:
scores=cross_val_score(model, X_train, y_train, cv=5)
print("Cross validation scores: ", scores)
print("Score stats: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

Cross validation scores:  [0.60631632 0.62282709 0.62052433 0.60906436 0.60872188]
Score stats: 0.61 accuracy with a standard deviation of 0.01


## Decision Trees Classification

In [11]:
X, y = load_iris(return_X_y=True)
print("X has %d rows and %d columns"  %(X.shape[0],X.shape[1]))
print("y has %d rows"  %(y.shape[0]))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
print("The (mean) accuracy on the test set is %.2f" %(model.score(X_test, y_test)))
print("The (mean) accuracy on the train data is %.2f" %(model.score(X_train, y_train)))

X has 150 rows and 4 columns
y has 150 rows
The (mean) accuracy on the test set is 0.97
The (mean) accuracy on the train data is 1.00


<b> multi class problem 

In [12]:
set(y) # 3 labels

{0, 1, 2}

In [13]:
scores=cross_val_score(model, X_train, y_train, cv=5) # 5 cv foldings
scores

array([1.        , 0.91666667, 0.95833333, 0.91666667, 0.91666667])

In [14]:
print("Five-fold cv results: \n %0.2f mean accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

Five-fold cv results: 
 0.94 mean accuracy with a standard deviation of 0.03


In [15]:
y_test

array([1, 0, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 0, 1, 0, 0, 2, 2, 0, 0, 1, 1,
       0, 2, 2, 2, 0, 1, 2, 2])

In [46]:
y_pred = cross_val_predict(model, X_test, y_test, cv=5)
y_pred

array([2, 2, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 0, 2, 1, 0,
       0, 0, 0, 0, 2, 1, 1, 1])

### Models Comparision (3 classification estimators)

In [47]:
model1 = DecisionTreeClassifier()
model2 = LogisticRegression()
model3 = KNeighborsClassifier()

model_pipeline = [model1, model2, model3]
model_names = ['Classification Tree', 'Logistic Regression', 'KNN']
scores = {}
i=0
for model in model_pipeline:
    mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=5))
    scores[model_names[i]] = mean_score
    i = i+1
print(scores)

{'Classification Tree': 0.9416666666666668, 'Logistic Regression': 0.9583333333333333, 'KNN': 0.9333333333333333}


In [48]:
print("Comparing the 3 regression scores we find \n")

pd.DataFrame([scores], index=["score"])

Comparing the 3 regression scores we find 



Unnamed: 0,Classification Tree,Logistic Regression,KNN
score,0.941667,0.958333,0.933333
