# 1. Load dataset
#### In this assignment, you are expected to build a decision tree model that classifies a toy dataset.
#### You will need to read the data from the file (data.csv). It contains 15000 samples and two features for each sample.

In [1]:
import pandas as pd
import numpy as np
df=pd.read_csv('data.csv')
df.head()

Unnamed: 0,1.018255499889504426e+04,-3.718306912453772384e+02,1.000000000000000000e+02
0,-8493.323486,7009.446179,0.0
1,21322.088204,-390.558362,100.0
2,5473.925002,-1878.223941,0.0
3,-7422.54071,5291.351276,0.0
4,-9103.655795,3197.164389,0.0


# 2. Prepare dataset
#### Split the data into train and test sets.

In [2]:
from sklearn.model_selection import train_test_split
X=df.iloc[:,:2]
y=df.iloc[:,2]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# 3. Modeling
#### Train a decision tree classifier on the data. You can use DecisionTreeClassifier. Use grid search to tune the hyperparameters.

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

params={'criterion':['gini','entropy'], 'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150], 'min_samples_leaf':[55,60,65,70,75]}
clf = GridSearchCV(DecisionTreeClassifier(), params, cv=5)
clf.fit(X_train, y_train)

clf.best_params_

{'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 55}

In [4]:
clf.best_score_

0.8614982263722147

#### Train the best model you found on the whole train set (do you need to?) and evaluate the model on the test set.

In [5]:
# we do not need: by default, GridSearchCV trains the best model found on the whole training set
# but 
from sklearn.metrics import accuracy_score

model = DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=70)
model.fit(X_train, y_train)
predicts=model.predict(X_test)
accuracy_score(y_test, predicts)

0.8541333333333333

#### Generate 1,500 subsets of the training set, each containing 100 randomly chosen instances. You can use ShuffleSplit.

In [6]:
from sklearn.model_selection import ShuffleSplit

rs = ShuffleSplit(n_splits=1500, random_state=42, train_size=100, test_size=100)

print(rs)

ShuffleSplit(n_splits=1500, random_state=42, test_size=100, train_size=100)


#### Train one tree on each subset, using the best model you previously found. Evaluate the performance of the trees using the test set. Did you get lower or higher accuracy?

In [7]:
n=0
acc_scores=list()
for train_index, test_index in rs.split(X):
#   print("TRAIN:", train_index.shape, "TEST:", test_index.shape)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model = DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=70)
    model.fit(X_train, y_train)
    
    predicts=model.predict(X_test)
    
    acc_scores.append(accuracy_score(y_test, predicts))
    
    n+=1
print(f'number of subsets:{n}')
 

number of subsets:1500


In [8]:
max(acc_scores)

0.66

#### For each instance in the test set, predict its class using 1200 trees, and keep only the most frequent prediction. Evaluate these predictions. Did you get lower or higher accuracy?

In [20]:
rs = ShuffleSplit(n_splits=1500, random_state=42, train_size=100, test_size=100)

model = DecisionTreeClassifier(criterion='entropy', max_depth=9, min_samples_leaf=55)
predicts_df=pd.DataFrame()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

for train_index, test_index in rs.split(X):
    
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    
    model.fit(X_train, y_train)
    predicts=model.predict(X_test)
    
    acc_scores.append(accuracy_score(y_test, predicts))
    

In [21]:
max(acc_scores)

0.66

In [19]:
predicts.shape

(3750,)