In [28]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

from sklearn.cross_validation import cross_val_score, StratifiedKFold, train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [2]:
# Read the car dataset from its relative CSV path into the working frame.
df = pd.read_csv('../datasets/car.csv')

In [3]:
# Print the describe() summary of each column in turn.
for name in df:
    print(df[name].describe())

count     1728
unique       4
top        med
freq       432
Name: buying, dtype: object
count     1728
unique       4
top        med
freq       432
Name: maint, dtype: object
count     1728
unique       4
top          3
freq       432
Name: doors, dtype: object
count     1728
unique       3
top       more
freq       576
Name: persons, dtype: object
count     1728
unique       3
top        med
freq       576
Name: lug_boot, dtype: object
count     1728
unique       3
top        med
freq       576
Name: safety, dtype: object
count      1728
unique        4
top       unacc
freq       1210
Name: acceptability, dtype: object


In [10]:
# Integer-encode the 'acceptability' labels; `le` is kept so the encoding
# can be inspected or reversed later.
le = LabelEncoder().fit(df['acceptability'])
y = le.transform(df['acceptability'])

# Every remaining column is one-hot encoded to form the feature matrix.
X = pd.get_dummies(df.drop('acceptability', axis=1))

In [12]:
def plot_things(y, x, lims=(8, 10)):
    """Scatter one set of values against another with a diagonal reference line.

    Note the argument order: ``y`` is drawn along the horizontal axis
    (labelled 'Actual Y values') and ``x`` along the vertical axis
    (labelled 'Predicted Values').

    Parameters
    ----------
    y : sequence
        Values placed on the horizontal axis.
    x : sequence
        Values placed on the vertical axis.
    lims : (low, high) pair, optional
        Limits applied to both axes and the end points of the diagonal
        reference line.  Defaults to (8, 10), the window that used to be
        hard-coded, so existing calls draw the same figure.
    """
    low, high = lims
    plt.scatter(y, x)
    # Diagonal from (low, low) to (high, high): points on it have equal
    # horizontal and vertical values.
    plt.plot([low, high], [low, high])
    plt.xlim((low, high))
    plt.ylim((low, high))
    plt.xlabel('Actual Y values')
    plt.ylabel('Predicted Values')
    plt.show()

In [22]:
def get_cv(target):
    """Return the StratifiedKFold splitter used throughout the notebook.

    The splitter is built from ``target`` with 3 folds, shuffling enabled
    and a fixed ``random_state`` of 41.
    """
    fold_options = dict(n_folds=3, shuffle=True, random_state=41)
    return StratifiedKFold(target, **fold_options)

In [14]:
def score(model, data, target):
    """Fit ``model`` on (data, target) and return its score on that same data.

    Because the model is scored on the very samples it was just fitted to,
    the value is a training score rather than a held-out estimate.
    """
    model.fit(data, target)
    training_score = model.score(data, target)
    return training_score

In [44]:
# GridSearchCV?

In [36]:
def grid_search(model, params, cv):
    """Construct (without fitting) a GridSearchCV for ``model``.

    ``params`` is passed as the parameter grid and ``cv`` as the
    cross-validation strategy; every other option keeps its default.
    """
    search_options = {
        'estimator': model,
        'param_grid': params,
        'cv': cv,
    }
    return GridSearchCV(**search_options)

In [73]:
def evaluate_model(model, data, target, params=None, random_state=None):
    """Fit ``model`` on a stratified training split and print hold-out metrics.

    The data is divided with ``train_test_split(..., stratify=target)``.
    When ``params`` is truthy, ``grid_search`` is run over the folds from
    ``get_cv`` and its ``best_estimator_`` replaces ``model``; otherwise
    ``model`` is fitted directly on the training split.  The function then
    prints the mean cross-validated score on the training split, followed by
    the confusion matrix and classification report for the test split.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``.
    data : feature matrix
    target : labels
        Also used to stratify the train/test split.
    params : dict, optional
        Parameter grid; when given, a grid search selects the estimator.
    random_state : int, optional
        Seed forwarded to ``train_test_split`` so the split (and therefore
        the printed metrics) can be reproduced.  The default ``None`` keeps
        the previous unseeded behaviour.

    Returns
    -------
    The fitted estimator: the grid search's best estimator when ``params``
    was supplied, otherwise ``model`` itself.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        data, target, stratify=target, random_state=random_state)
    cv = get_cv(y_train)

    if params:
        grid = grid_search(model, params, cv)
        grid.fit(x_train, y_train)
        model = grid.best_estimator_
        print("Best Model after Grid Search:\n" + str(model))
    else:
        model.fit(x_train, y_train)

    # Cross-validation happens on the training split only; the test split is
    # reserved for the confusion matrix and report below.
    s = cross_val_score(model, x_train, y_train, cv=cv, n_jobs=-1)
    print("Mean score of the model is: {}".format(s.mean()))
    predictions = model.predict(x_test)

    matrix = confusion_matrix(y_test, predictions)
    # The trailing " \n" reproduces the spacing the original Python 2 print
    # statement emitted, so recorded outputs stay comparable.
    print("Confusion Matrix:\n" + str(matrix) + " \n")
    print("Classification Report:\n" + classification_report(y_test, predictions))
    return model

In [74]:
# Grid for k-NN: both weighting schemes, with k running from 1 up to (but
# not including) the number of feature columns in X.
params = {
    'weights': ['uniform', 'distance'],
    'n_neighbors': range(1, X.shape[1]),
}
# Keep the tuned estimator under `knn` for reuse in later cells.
knn = evaluate_model(KNeighborsClassifier(), X, y, params)

Best Model after Grid Search:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='distance')
Mean score of the model is: 0.909753729266
Confusion Matrix:
[[ 84   2  10   0]
 [  8   9   0   0]
 [  6   0 296   0]
 [  3   2   1  10]] 

Classification Report:
             precision    recall  f1-score   support

          0       0.83      0.88      0.85        96
          1       0.69      0.53      0.60        17
          2       0.96      0.98      0.97       302
          3       1.00      0.62      0.77        16

avg / total       0.93      0.93      0.92       431



In [75]:
# Bag the k-NN estimator held in `knn` and evaluate the ensemble without a
# grid search; the returned model is displayed as the cell output.
bknn = BaggingClassifier(knn)
evaluate_model(bknn, X, y)

Mean score of the model is: 0.893585459642
Confusion Matrix:
[[ 83   0  13   0]
 [  8   9   0   0]
 [  1   0 301   0]
 [  8   1   0   7]] 

Classification Report:
             precision    recall  f1-score   support

          0       0.83      0.86      0.85        96
          1       0.90      0.53      0.67        17
          2       0.96      1.00      0.98       302
          3       1.00      0.44      0.61        16

avg / total       0.93      0.93      0.92       431



BaggingClassifier(base_estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='distance'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=10, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [80]:
# Logistic regression tuned over inverse regularisation strength C and the
# penalty type.
log = LogisticRegression()
params = {
    # The grid previously listed 0.7 between .05 and .09, which broke the
    # ascending sweep and duplicated the later .7; the intended value is .07.
    'C': [.01, .03, .05, .07, .09, .1, .3, .5, .7, 1, 10, 50, 100],
    'penalty': ['l1', 'l2']
}
evaluate_model(log, X, y, params)

Best Model after Grid Search:
LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Mean score of the model is: 0.901337190607
Confusion Matrix:
[[ 78   2  15   1]
 [ 11   4   0   2]
 [ 15   0 287   0]
 [  5   0   0  11]] 

Classification Report:
             precision    recall  f1-score   support

          0       0.72      0.81      0.76        96
          1       0.67      0.24      0.35        17
          2       0.95      0.95      0.95       302
          3       0.79      0.69      0.73        16

avg / total       0.88      0.88      0.88       431



LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [79]:
# Baseline decision tree with default settings, evaluated without a grid
# search; the returned model is displayed as the cell output.
dt = DecisionTreeClassifier()
evaluate_model(dt, X, y)

Mean score of the model is: 0.942177323324
Confusion Matrix:
[[ 93   0   3   0]
 [  0  17   0   0]
 [  6   3 293   0]
 [  0   0   0  16]] 

Classification Report:
             precision    recall  f1-score   support

          0       0.94      0.97      0.95        96
          1       0.85      1.00      0.92        17
          2       0.99      0.97      0.98       302
          3       1.00      1.00      1.00        16

avg / total       0.97      0.97      0.97       431



DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [82]:
# Random forest with balanced class weights, using all cores; evaluated
# without a grid search.
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced')
evaluate_model(rf, X, y)

Mean score of the model is: 0.932135692776
Confusion Matrix:
[[ 87   0   8   1]
 [  6  10   0   1]
 [  2   0 300   0]
 [  0   0   0  16]] 

Classification Report:
             precision    recall  f1-score   support

          0       0.92      0.91      0.91        96
          1       1.00      0.59      0.74        17
          2       0.97      0.99      0.98       302
          3       0.89      1.00      0.94        16

avg / total       0.96      0.96      0.96       431



RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [83]:
# Extra-trees ensemble with balanced class weights, using all cores;
# evaluated without a grid search.
et = ExtraTreesClassifier(n_jobs=-1, class_weight='balanced')
evaluate_model(et, X, y)

Mean score of the model is: 0.93984647036
Confusion Matrix:
[[ 91   0   3   2]
 [  4  12   0   1]
 [  5   0 297   0]
 [  0   1   0  15]] 

Classification Report:
             precision    recall  f1-score   support

          0       0.91      0.95      0.93        96
          1       0.92      0.71      0.80        17
          2       0.99      0.98      0.99       302
          3       0.83      0.94      0.88        16

avg / total       0.96      0.96      0.96       431



ExtraTreesClassifier(bootstrap=False, class_weight='balanced',
           criterion='gini', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)