In [15]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import OneHotEncoder 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.combine import SMOTEENN
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import re

In [16]:
filename = 'data/iris_train.csv'
df = pd.read_csv(filename, index_col = 0)
labels = df['labels']
df = df.drop(['labels'], axis = 1)

In [17]:
X_train, y_train = df, labels

In [112]:
def cross_validate(X_train, y_train):
    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1500, num = 10)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt', 'log2']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 7, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [False, True]

    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}

    classifier = RandomForestClassifier()
    rf_random = RandomizedSearchCV(estimator = classifier, param_distributions = random_grid, 
                                   n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = 3)

    rf_random.fit(X_train, y_train)
    print rf_random.best_estimator_

In [113]:
#cross_validate(df, labels)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


KeyboardInterrupt: 

In [114]:
classifier = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=80, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=7,
            min_weight_fraction_leaf=0.0, n_estimators=488, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
# classifier = RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
#             max_depth=90, max_features='auto', max_leaf_nodes=None,
#             min_impurity_decrease=0.0, min_impurity_split=None,
#             min_samples_leaf=1, min_samples_split=5,
#             min_weight_fraction_leaf=0.0, n_estimators=922, n_jobs=2,
#             oob_score=False, random_state=None, verbose=0,
#             warm_start=False)

In [125]:
X_train, X_test,y_train, y_test = train_test_split(df, labels, test_size = 0.3)

In [18]:
#sm = SMOTE(random_state = 42)
#X_train, y_train = sm.fit_sample(X_train, y_train)
print classification_report(y_train, y_train)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        37
Iris-versicolor       1.00      1.00      1.00        38
 Iris-virginica       1.00      1.00      1.00        35

    avg / total       1.00      1.00      1.00       110



In [19]:
pred = classifier.predict(X_test)
print (classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))


                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       1.00      1.00      1.00        12
 Iris-virginica       1.00      1.00      1.00        11

    avg / total       1.00      1.00      1.00        33

[[10  0  0]
 [ 0 12  0]
 [ 0  0 11]]


In [25]:
classifier.fit(X_train, y_train)
filename = 'data/iris_test.csv'
test_df = pd.read_csv(filename, index_col = 0)
pred_test = classifier.predict(test_df)

In [27]:
test_df['pred'] = pred_test

In [28]:
final_df = test_df['pred']

In [30]:
final_df.to_csv('iris_final.csv')

######################

In [103]:
def run_rf(df, labels, classifier):
    X_train, X_test,y_train, y_test = train_test_split(df, labels, test_size = 0.3)
    classifier.fit(X_train, y_train)
    pred = classifier.predict(X_test)
    
    return accuracy_score(y_test, pred)

In [106]:
importance = classifier.feature_importances_
thing = zip(list(X_train),(importance))
sorted(thing, key = lambda x: x[1])

[(array([6.3, 2.3, 4.4, 1.3]), 0.023022894921921857),
 (array([5.1, 2.5, 3. , 1.1]), 0.08464294664068044),
 (array([5.6, 2.5, 3.9, 1.1]), 0.39005395920273184),
 (array([5.1, 3.3, 1.7, 0.5]), 0.5022801992346658)]

In [None]:
correct = 0
for i in range(100):
    if i % 10 == 0:
        print "%d percent done" % i
    correct += run_rf(df, labels, classifier)
print correct/100
    

0 percent done
10 percent done
20 percent done
30 percent done
40 percent done
50 percent done
