In [1]:
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import pandas as pd
import os
import seaborn as sns

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."

def image_path(fig_id):
    return os.path.join(PROJECT_ROOT_DIR, "images", fig_id)

def save_fig(fig_id, tight_layout=True):
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(image_path(fig_id) + ".png", format='png', dpi=300)

In [2]:
training = pd.read_csv('./training.csv')
test = pd.read_csv('./test.csv')

# before we drop and modify the traning and test datasets, make copies of them
df_training = training.copy()
df_test = test.copy()
numerical_features = training.select_dtypes(include = ['float64', 'int64', 'bool']).columns.values
categorical_features = training.select_dtypes(include = ['object']).columns.values

In [3]:
drop_list = ['PRIMEUNIT', 'AUCGUART', 'RefId', 'Model', 'Trim', 
             'SubModel', 'WheelType', 'BYRNO', 'VNZIP1']

for df in [df_training, df_test]:
    df.PurchDate = pd.to_datetime(df.PurchDate, format=r'%m/%d/%Y').astype(np.int64)//10**15
    
for df in [df_training, df_test]:
    # add engine feature
    df['Engine'] = df.SubModel.str.extract('\s+(\d+.\d+)L', expand = True).astype('float64')

# now we can start the drop list
for df in [df_training, df_test]:
    df.drop(drop_list, axis=1, inplace=True)

y = df_training['IsBadBuy'].copy()
df_training.drop(['IsBadBuy'], axis=1, inplace = True)

numerical_features = df_training.select_dtypes(include = ['float64', 'int64', 'bool']).columns.values
# not choose 'IsBadBuy'
categorical_features = df_training.select_dtypes(include = ['object']).columns.values

dict = {'Color': 'SILVER',
        'Transmission': 'AUTO',
        'Nationality': 'AMERICAN',
        'Size': 'MEDIUM',
        'TopThreeAmericanName': 'GM'}

for df in [df_training, df_test]:
    for feature in dict.keys():
        df[feature] = df[feature].str.upper()
        df[feature] = df[feature].fillna(dict[feature])

print(df_training.shape)
print(df_test.shape)
print(y.shape)

(72983, 25)
(48707, 25)
(72983,)


In [4]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder

def num_pipe(df):
    temp = Imputer(strategy = 'median').fit_transform(df[numerical_features])
    temp = pd.DataFrame(temp)
    temp = StandardScaler().fit_transform(temp)
    df[numerical_features] = pd.DataFrame(temp, columns = numerical_features)
    

def cat_pipe(df):
    temp = pd.get_dummies(df[categorical_features])
    res = pd.concat([df[numerical_features], temp], axis = 1)
    return res

In [5]:
num_pipe(df_training)
num_pipe(df_test)

onehot_training = cat_pipe(df_training)
onehot_test = cat_pipe(df_test)

# some color, state, color are different in training and test datasets, drop them
train_droplist = ['Make_HUMMER', 'Make_PLYMOUTH', 'Make_TOYOTA SCION', 'VNST_IA',
                  'VNST_ID','VNST_MA', 'VNST_MD','VNST_MI','VNST_NE']
test_droplist = ['Color_PINK', 'VNST_WI']
onehot_training.drop(train_droplist, axis = 1, inplace=True)
onehot_test.drop(test_droplist, axis = 1, inplace = True)

In [6]:
print(onehot_training.shape)
print(onehot_test.shape)

(72983, 119)
(48707, 119)


In [7]:
onehot_training[:10]

Unnamed: 0,PurchDate,VehYear,VehicleAge,WheelTypeID,VehOdo,MMRAcquisitionAuctionAveragePrice,MMRAcquisitionAuctionCleanPrice,MMRAcquisitionRetailAveragePrice,MMRAcquisitonRetailCleanPrice,MMRCurrentAuctionAveragePrice,...,VNST_OK,VNST_OR,VNST_PA,VNST_SC,VNST_TN,VNST_TX,VNST_UT,VNST_VA,VNST_WA,VNST_WV
0,-0.142534,0.379467,-0.687212,-0.909824,1.203528,0.823058,0.902005,0.994646,1.107445,0.543047,...,0,0,0,0,0,0,0,0,0,0
1,-0.142534,-0.775775,0.480877,-0.909824,1.515419,0.294555,0.370805,0.76048,0.803783,0.545105,...,0,0,0,0,0,0,0,0,0,0
2,-0.142534,-0.198154,-0.103168,1.01436,0.158244,-1.188989,-0.960135,-0.492422,-0.411749,-0.863122,...,0,0,0,0,0,0,0,0,0,0
3,-0.142534,-0.775775,0.480877,-0.909824,-0.403531,-1.720741,-1.726077,-1.216468,-1.229094,-1.76503,...,0,0,0,0,0,0,0,0,0,0
4,-0.142534,-0.198154,-0.103168,1.01436,-0.146308,-0.900161,-0.852131,-0.245264,-0.337901,-1.187496,...,0,0,0,0,0,0,0,0,0,0
5,-0.142534,-0.775775,0.480877,1.01436,0.655335,-0.905036,-0.905766,-0.56752,-0.376302,-0.585675,...,0,0,0,0,0,0,0,0,0,0
6,-0.142534,-0.775775,0.480877,1.01436,-0.423354,-1.284859,-1.225367,-0.715181,-0.400228,-1.297404,...,0,0,0,0,0,0,0,0,0,0
7,-0.142534,-0.198154,-0.103168,1.01436,-0.390635,-1.143898,-1.11369,-0.579878,-0.633292,-0.995671,...,0,0,0,0,0,0,0,0,0,0
8,-0.142534,0.957087,-1.271257,1.01436,-1.480161,0.027257,-0.036596,0.377068,0.228363,0.117409,...,0,0,0,0,0,0,0,0,0,0
9,-0.142534,0.957087,-1.271257,-0.909824,0.917222,0.697127,0.873718,1.025699,1.123987,1.249424,...,0,0,0,0,0,0,0,0,0,0


In [8]:
onehot_test[:10]

Unnamed: 0,PurchDate,VehYear,VehicleAge,WheelTypeID,VehOdo,MMRAcquisitionAuctionAveragePrice,MMRAcquisitionAuctionCleanPrice,MMRAcquisitionRetailAveragePrice,MMRAcquisitonRetailCleanPrice,MMRCurrentAuctionAveragePrice,...,VNST_OK,VNST_OR,VNST_PA,VNST_SC,VNST_TN,VNST_TX,VNST_UT,VNST_VA,VNST_WA,VNST_WV
0,-0.271831,-0.191584,-0.126587,-0.859108,0.952571,-0.449487,-0.371046,-0.835355,-0.750903,-0.498067,...,0,0,0,0,0,0,0,0,0,0
1,-0.271831,-0.191584,-0.126587,-0.859108,-0.669761,-0.665492,-0.629222,-1.01839,-0.976153,-0.60534,...,0,0,0,0,0,0,0,0,0,0
2,-0.271831,0.386289,-0.71318,-0.859108,-0.158296,1.674703,2.079962,0.962724,1.387625,1.968372,...,0,0,0,0,0,0,0,0,0,0
3,-0.271831,-1.925206,1.633191,-0.859108,1.125959,-1.457784,-1.418479,-1.688876,-1.664993,-1.31375,...,0,0,0,0,0,0,0,0,0,0
4,-0.271831,0.964163,-1.299772,-0.859108,0.128084,-0.457231,-0.386883,-0.842064,-0.764888,-0.453508,...,0,0,0,0,0,0,0,0,0,0
5,-0.271831,0.386289,-0.71318,-0.859108,1.344764,-0.727441,-0.766228,-1.070776,-1.09577,-0.805031,...,0,0,0,0,0,0,0,0,0,0
6,-0.271831,0.386289,-0.71318,-0.859108,0.019026,0.994897,1.12939,0.387109,0.558339,1.094515,...,0,0,0,0,0,0,0,0,0,0
7,-0.271831,-1.347332,1.046598,-0.859108,0.233621,-0.910842,-0.969895,-1.22602,-1.27341,-0.957276,...,0,0,0,0,0,0,0,0,0,0
8,-0.271831,-0.191584,-0.126587,-0.859108,-1.94097,-0.723773,-0.761808,-1.067582,-1.091902,-0.768311,...,0,0,0,0,0,0,0,0,0,0
9,-0.271831,0.386289,-0.71318,-0.859108,0.458017,0.882003,1.3953,0.291599,0.790432,1.434073,...,0,0,0,0,0,0,0,0,0,0


In [9]:
import pickle
onehot_training.to_pickle('my_df_training_onehot.pickle')

In [10]:
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, accuracy_score, f1_score, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

X_train, X_test, y_train, y_test = train_test_split(onehot_training, y, random_state = 24)



In [11]:
# try simple models
clfs = {'LogisticRegression':LogisticRegression(),
        #'SVC': SVC(), 
              'KNeighborsClassifier': KNeighborsClassifier(n_neighbors = 3),
              'GaussianNB': GaussianNB(), 'Perceptron': Perceptron(), 
              'LinearSVC': LinearSVC(), 'SGDClassifier': SGDClassifier(), 
              'DecisionTreeClassifier': DecisionTreeClassifier(),
              'RandomForestClassifier': RandomForestClassifier(n_estimators=100)}

for name, clf in clfs.items():
    clf.fit(X_train,y_train)
    ypred = clf.predict(X_test)
    precision_ = precision_score(ypred, y_test)
    accuracy_ = accuracy_score(ypred,y_test)
    f1_ = f1_score(ypred,y_test)
    print('%s classifier: precision = %.4f, accuracy = %.4f, f1 score = %.4f' 
          %(name, precision_, accuracy_, f1_))

DecisionTreeClassifier classifier: precision = 0.2276, accuracy = 0.7988, f1 score = 0.2181
SGDClassifier classifier: precision = 0.0996, accuracy = 0.8538, f1 score = 0.1438
GaussianNB classifier: precision = 0.9084, accuracy = 0.2504, f1 score = 0.2301
LinearSVC classifier: precision = 0.0004, accuracy = 0.8767, f1 score = 0.0009
Perceptron classifier: precision = 0.1169, accuracy = 0.8397, f1 score = 0.1524
LogisticRegression classifier: precision = 0.0044, accuracy = 0.8766, f1 score = 0.0088
KNeighborsClassifier classifier: precision = 0.1129, accuracy = 0.8508, f1 score = 0.1573
RandomForestClassifier classifier: precision = 0.0191, accuracy = 0.8778, f1 score = 0.0371


In [12]:
# undersample
from imblearn.under_sampling import RandomUnderSampler 
from imblearn.metrics import classification_report_imbalanced

X_resampled, y_resampled = RandomUnderSampler().fit_sample(X_train, y_train)

# try simple models
clfs = {'LogisticRegression':LogisticRegression(),'SVC': SVC(), 
              'KNeighborsClassifier': KNeighborsClassifier(n_neighbors = 3),
              'GaussianNB': GaussianNB(), 'Perceptron': Perceptron(), 
              'LinearSVC': LinearSVC(), 'SGDClassifier': SGDClassifier(), 
              'DecisionTreeClassifier': DecisionTreeClassifier(),
              'RandomForestClassifier': RandomForestClassifier(n_estimators=100)}

for name, clf in clfs.items():
    clf.fit(X_resampled, y_resampled)
    y_pred = clf.predict(X_test)
    print(clf)
    print(classification_report_imbalanced(y_test, y_pred))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
                   pre       rec       spe        f1       geo       iba       sup

          0       0.91      0.58      0.60      0.71      0.39      0.16     15996
          1       0.17      0.60      0.58      0.26      0.39      0.14      2250

avg / total       0.82      0.58      0.59      0.65      0.39      0.16     18246

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=F

In [13]:
# oversample
from imblearn.over_sampling import SMOTE
from imblearn.metrics import classification_report_imbalanced

X_resampled, y_resampled = SMOTE().fit_sample(X_train, y_train)

# try simple models
clfs = {'LogisticRegression':LogisticRegression(),'SVC': SVC(), 
              'KNeighborsClassifier': KNeighborsClassifier(n_neighbors = 3),
              'GaussianNB': GaussianNB(), 'Perceptron': Perceptron(), 
              'LinearSVC': LinearSVC(), 'SGDClassifier': SGDClassifier(), 
              'DecisionTreeClassifier': DecisionTreeClassifier(),
              'RandomForestClassifier': RandomForestClassifier(n_estimators=100)}

for name, clf in clfs.items():
    clf.fit(X_resampled, y_resampled)
    y_pred = clf.predict(X_test)
    print(clf)
    print(classification_report_imbalanced(y_test, y_pred))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.86      0.22      0.87      0.40      0.18     15996
          1       0.18      0.22      0.86      0.20      0.40      0.15      2250

avg / total       0.80      0.78      0.30      0.79      0.40      0.17     18246

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=F