### Imports

In [60]:
import numpy as np
from sklearn import tree
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier



### Download Sonar Dataset

In [61]:
path = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data"
names = ['col_'+str(i) for i in np.arange(61)]
data = pd.read_csv(path, names = names)
data.fillna(method='ffill', inplace=True)
data["col_60"] = data["col_60"].apply(lambda x: 1 if x=='R' else 0)

# #TODO train, test split
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data.iloc[:,-1], test_size=0.33, random_state=58)

In [62]:
from sklearn.tree import DecisionTreeClassifier as DecisionTree
class RandomForest:
    def __init__(self, 
                 max_depth=4,
                 subsample_size=70,
                 tree_count=5,
                 criterion="gini"):
        self.trees = []
        self.subsample_size = subsample_size
        for i in range(tree_count):
            self.trees.append(DecisionTree(criterion=criterion, max_depth=max_depth))
    
    def fit(self, X, Y):
        for index, tree in enumerate(self.trees):
            #TODO bootstrap X_part and Y_part for each tree
            X_part = X.sample(n = self.subsample_size)
            Y_part = Y[X_part.index]
            self.trees[index].fit(X_part, Y_part)
    
    def predict(self, X):
        predictions = []
        for index, tree in enumerate(self.trees):
            predictions.append(self.trees[index].predict(X))
        #TODO combine and return y_pred
        y_pred = np.array(stats.mode(predictions).mode)[0]
        return y_pred


### Try on the sonar dataset

In [81]:
# TODO fit and predict on sonar dataset
# TODO compare results with Y_test and print the accuracy
model = RandomForest()
model.fit(X_train,y_train)

In [82]:
arr = model.predict(X_test)

In [83]:
accuracy_score(np.asarray(y_test), arr)

0.8260869565217391

### Try the Random Forest Implementation form Sklearn and compare results

In [109]:
clf = RandomForestClassifier(max_depth=4, random_state=0,min_samples_split=70,n_estimators=5)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=4, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=70,
                       min_weight_fraction_leaf=0.0, n_estimators=5,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [110]:
pred = clf.predict(X_test)

In [111]:
accuracy_score(np.asarray(y_test), pred)

0.782608695652174