# 1. Load dataset
#### In this assignment, you are expected to build a decision tree model that classifies a toy dataset.
#### You will need to read the data from the file (data.csv). It contains 15000 samples and two features for each sample.

In [3]:
import pandas as pd
import numpy as np
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,1.018255499889504426e+04,-3.718306912453772384e+02,1.000000000000000000e+02
0,-8493.323486,7009.446179,0.0
1,21322.088204,-390.558362,100.0
2,5473.925002,-1878.223941,0.0
3,-7422.54071,5291.351276,0.0
4,-9103.655795,3197.164389,0.0


# 2. Prepare dataset
#### Split the data into train and test sets.

In [4]:
from sklearn.model_selection import train_test_split
X = df.drop("1.000000000000000000e+02",axis=1).values
y = df["1.000000000000000000e+02"].values
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=1)

# 3. Modeling
#### Train a decision tree classifier on the data. You can use DecisionTreeClassifier. Use grid search to tune the hyperparameters.

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
dtc = DecisionTreeClassifier()
parameters = { 
    'max_depth' : [3,4,5,6,7],
    'criterion' :['gini', 'entropy']
}
clf = GridSearchCV(dtc, parameters)
clf.fit(X,y)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [3, 4, 5, 6, 7]})

#### Train the best model you found on the whole train set (do you need to?) and evaluate the model on the test set.

In [6]:
clf.best_params_

{'criterion': 'entropy', 'max_depth': 4}

In [7]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
dtc = DecisionTreeClassifier(criterion="entropy",max_depth=4)
dtc.fit(X_train,y_train)
predict_y = dtc.predict(X_test)
confusion_matrix(y_test,predict_y)
accuracy_score(y_test,predict_y)


0.8616666666666667

#### Generate 1,500 subsets of the training set, each containing 100 randomly chosen instances. You can use ShuffleSplit.

In [8]:
from sklearn.model_selection import ShuffleSplit
n_trees = 1500
n_instances = 100

mini_sets = []

rs = ShuffleSplit(n_splits=n_trees,test_size=len(X_train)- n_instances,random_state= 42)
for mini_train_index, min_test_index in rs.split(X_train):
    X_mini_train = X_train[mini_train_index]
    y_mini_train = y_train[mini_train_index]
    mini_sets.append((X_mini_train,y_mini_train))
    
    

#### Train one tree on each subset, using the best model you previously found. Evaluate the performance of the trees using the test set. Did you get lower or higher accuracy?

In [9]:
from sklearn.base import clone

models = [clone(dtc) for _ in range(n_trees)]
scores = np.zeros(n_trees)

for index,((x_subtrain,y_subtrain), model) in enumerate(zip(mini_sets,models)):
    model.fit(x_subtrain,y_subtrain)
    test_score = model.score(X_test,y_test)
    scores[index] = test_score

In [36]:
index = np.argmax(scores)
score = scores[index]
index,score
# even max score is lower than what we found above

(1072, 0.8613333333333333)

#### For each instance in the test set, predict its class using 1200 trees, and keep only the most frequent prediction. Evaluate these predictions. Did you get lower or higher accuracy?

In [37]:
from scipy.stats import mode
from sklearn.metrics import accuracy_score

n_trees = 1500
y_preds = [ tree.predict(X_test) for tree in models[:n_trees]]
y_preds = np.array(y_preds)

In [38]:
most, frequency = mode(y_preds)

In [39]:
accuracy_score(y_test,most.reshape(-1))

0.863

we got slightly better score