In [1]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier

import decision_tree
import random_forest
import linear_regression
from naive_bayes import NaiveBayes
from nearest_neighbors import KNN

from sklearn_base import Pipeline
from sklearn_base import DummyTransformer
from sklearn_base import ModelSelector

from metrics import accuracy
from metrics import accuracy_wrapper

from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import datasets
import pickle

import time
import matplotlib.pyplot as plt

In [2]:
def load_dataset(filename):
    with open(filename, "rb") as f:
        return pickle.load(f)

In [3]:
df = load_dataset("citiesSmall.pkl")
X = df['X']
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=19)
X_train.shape

(300, 2)

In [7]:
pipeline = Pipeline([
    ('transform', DummyTransformer()),
    ('classify',  RandomForestClassifier())
])

# Fit and predict using the pipeline
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)
predictions


array([0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0], dtype=uint8)

In [8]:
model_selector = ModelSelector([
    ('clf1', DecisionTreeClassifier()),
    ('clf3', RandomForestClassifier()),
    ('clf3', KNeighborsClassifier(n_neighbors=7))
],
scoring=accuracy_wrapper)

model_selector.fit(X_train, y_train)
best_predictions = model_selector.predict(X_test)
print("Best Score:", model_selector.best_score)
print("Best Params:", model_selector.best_params)
print("Best Estimator:", model_selector.best_estimator_)
print("Model Scores:", model_selector.get_scores())

Best Score: 0.9199999999999999
Best Params: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
Best Estimator: RandomForestClassifier()
Model Scores: [('clf3', 0.9199999999999999), ('clf1', 0.9166666666666666), ('clf3', 0.8933333333333333)]
