# Introduction 
This Notebook will run through sevral models and model optimizing techniques. 
The dataset is the car evaluation dataset containing condition information on used car. This is a simple dataset just to demostrate how to evaluate models.

In [1]:
# Importing libraries for use later
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# REading in the dataset
data = pd.read_csv('assets/datasets/car.csv')
# Let's see what the data looks like
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,acceptability
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
# Let's take a look at the values in each columns
print data.buying.unique()
print data.maint.unique()
print data.lug_boot.unique()
print data.safety.unique()
print data.acceptability.unique()
print data.persons.unique()
print data.doors.unique()

['vhigh' 'high' 'med' 'low']
['vhigh' 'high' 'med' 'low']
['small' 'med' 'big']
['low' 'med' 'high']
['unacc' 'acc' 'vgood' 'good']
['2' '4' 'more']
['2' '3' '4' '5more']


In [4]:
# Let's encode our values 
data['buying']= data.buying.factorize()[0]
data['maint']= data.maint.factorize()[0]
data['lug_boot']= data.lug_boot.factorize()[0]
data['safety']= data.safety.factorize()[0]
data['acceptability']= data.acceptability.factorize()[0]
data['persons']= data.persons.factorize()[0]
data['doors']= data.doors.factorize()[0]
data.buying.unique()

array([0, 1, 2, 3])

In [5]:
# Creating x and y for modeling
X = data.drop('buying', axis=1)
y = data['buying']

# Modeling
We are going to run several classification models and compare the results

In [6]:
# Importing metrics to score models on
from sklearn.cross_validation import train_test_split, KFold
from sklearn.metrics import accuracy_score
# Accuracy score measures how many predictions our model correctly classified



In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=99, stratify=y)

In [8]:
def evaluate_model (model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print str((float(score) * 100)) + str('%')
# Creating Dictionary of model scores for easy comparison
model_score={}

In [9]:
# Importing Models to use
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [10]:
#Importing Gridsearch to better tune models by finding best parameters
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import KFold

### 1. KNN 

In [12]:
# KNN
model = KNN(n_neighbors=14)
evaluate_model(model)

22.7360308285%


In [13]:
# The score was bad using GridSearch we can find the best 'n' to use.

In [14]:
x = range(2,100)
# Creating the dictionary of parameters to check
params={'n_neighbors': x}
knngrid = GridSearchCV(KNN(),
                      params, n_jobs=-1,
                      cv=KFold(len(y), n_folds=5, shuffle=True))
knngrid.fit(X,y)

print knngrid.best_params_
print knngrid.best_score_

# Our score got better with an 'n'

{'n_neighbors': 97}
0.288773148148


In [15]:
# Lets see if bagging our aggregating KNN
bagknn = BaggingClassifier(KNN(n_neighbors=77))
evaluate_model(bagknn)
# It did get better

30.4431599229%


In [18]:
# So gridsearching helped us before lets see if grid searching
# our bagging classifier gets a better score.  

In [19]:
gsbagknn.fit(X,y)
print gsbagknn.best_params_
print gsbagknn.best_score_

{'max_features': 1.0, 'max_samples': 0.7, 'n_estimators': 20, 'bootstrap_features': False}
0.286458333333


#### So our best KNN score was that our bagged KNN of 30.% So lets try another model

## 2. Logistic Regression

In [20]:
# Again we'll start with a basic Logistic Regression
evaluate_model(LogisticRegression())

28.901734104%


In [27]:
# This is our Grid Searched Logistic Regression
lrparams ={'penalty':['l1', 'l2'],
           'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}
gslr = GridSearchCV(LogisticRegression(), 
                    lrparams, n_jobs=-1,
                        cv=KFold(len(y), n_folds=3, shuffle=True))
gslr.fit(X,y)

print gslr.best_params_
print gslr.best_score_
# Score did get better now we know what parameters will get us a better score.


{'penalty': 'l1', 'C': 100.0}
0.313657407407


In [29]:
# Logistic regression with bagging, to see if this helps
 
lrbag = BaggingClassifier(LogisticRegression())
gsbaglr = GridSearchCV(lrbag, bagparams,
                      n_jobs=5,
                        cv=KFold(len(y), n_folds=5, shuffle=True))
gsbaglr.fit(X,y)

print gsbaglr.best_params_
print gsbaglr.best_score_

# Okay our score got slightly better

{'max_features': 1.0, 'max_samples': 0.7, 'n_estimators': 10, 'bootstrap_features': False}
0.317708333333


#### For our Logistic Regression bagging gave us the highest score yet at 31.7%, only slightly better our bagged KNN

# 3. Support Vector Machines

In [39]:
#Let's see if SVM performs better
svm = SVC()
evaluate_model(svm)

26.3969171484%


In [43]:
# Grid Search parametes for our Support Vector Machine
params = {'C': [0.01, 0.1, 1.0, 10.0, 30.0, 100.0],
          'gamma': ['auto', 0.1, 1.0, 10.0],
          'kernel': ['linear', 'rbf']}
gssvm = GridSearchCV(svm, params,
                   n_jobs=-1, cv=KFold(len(y), n_folds=5, shuffle=True))
gssvm.fit(X,y)
print gssvm.best_params_
print gssvm.best_score_

{'kernel': 'linear', 'C': 1.0, 'gamma': 'auto'}
0.332175925926


In [None]:
#### Our GridSearch did even better with our Support vector machine. Having our highest score of 33%

## 7. Random Forest & Extra Trees

Let's see if Random Forest and Extra Trees perform better

In [46]:
# Lets try both of these models
rf = RandomForestClassifier()
evaluate_model(rf)
#This model Performed poorly

8.86319845857%
8.67052023121%


In [54]:
# Gridsearch parameters 
params = {'n_estimators':[3, 5, 10, 50],
          'criterion': ['gini', 'entropy'],
          'max_depth': [None, 3, 5],
          'min_samples_split': [2,5],
          'class_weight':[None, 'balanced']}


gsrf = GridSearchCV(RandomForestClassifier(n_jobs=-1),
                    params, n_jobs=-1,
                    cv=KFold(len(y), n_folds=5, shuffle=True))

gsrf.fit(X,y)

print gsrf.best_params_
print gsrf.best_score_

{'min_samples_split': 2, 'n_estimators': 50, 'criterion': 'entropy', 'max_depth': 3, 'class_weight': None}
0.310185185185


# Conclusion
This was great practice seeing how models can be tuned using bagging classfiers along with grid search.