In [1]:
#import libraries
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
import os
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Show which files are available in the input directory
input_files = os.listdir('../Input/')
print(input_files)

['application_test.csv', 'application_train.csv', 'bureau.csv', 'bureau_balance.csv', 'Cleaned_Aggregated_Features.csv', 'clean_manual.csv', 'clean_manual.csv.zip', 'credit_card_balance.csv', 'home-credit-default-risk-feature-tools.zip', 'HomeCredit_columns_description.csv', 'installments_payments.csv', 'POS_CASH_balance.csv', 'previous_application.csv', 'sample_submission.csv', 'test_app_bureau.csv', 'test_app_previous.csv', 'train_app_bureau.csv', 'train_app_previous.csv']


In [9]:
from sklearn.preprocessing import MinMaxScaler

try:
    from sklearn.preprocessing import Imputer
except ImportError:
    # Imputer was removed from sklearn.preprocessing in scikit-learn 0.22;
    # SimpleImputer accepts the same `strategy` argument and stands in under the old name
    from sklearn.impute import SimpleImputer as Imputer

In [3]:
# Read the training and testing tables from the input directory
train_path = '../Input/train_app_bureau.csv'
test_path = '../Input/test_app_bureau.csv'
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)


In [4]:
# A single encoder object, refit for each column it is applied to
le = LabelEncoder()

# Object-typed columns with at most two distinct values
# (unique() reports NaN as a value, so a missing entry counts towards the limit)
binary_cols = [
    col for col in train_df
    if train_df[col].dtype == 'object' and len(train_df[col].unique()) <= 2
]

for col in binary_cols:
    # Learn the mapping from the training data only, then apply it to both frames
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

# How many columns were label encoded
le_count = len(binary_cols)

print('%d columns were label encoded.' % le_count)

3 columns were label encoded.


In [5]:
# One-hot encode the remaining categorical columns, each frame on its own
train_df, test_df = pd.get_dummies(train_df), pd.get_dummies(test_df)

# Report the resulting dimensions of both frames
train_shape, test_shape = train_df.shape, test_df.shape
print('Training Features shape: ', train_shape)
print('Testing Features shape: ', test_shape)

Training Features shape:  (307511, 455)
Testing Features shape:  (48744, 451)


In [7]:
# Set the labels aside before aligning: the inner join keeps only the columns
# that exist in both frames, and the target is put back afterwards
train_labels = train_df['TARGET']

# Column-wise (axis=1) inner alignment of the two frames
train_df, test_df = train_df.align(test_df, axis=1, join='inner')

# Restore the target on the training frame
train_df['TARGET'] = train_labels

train_shape, test_shape = train_df.shape, test_df.shape
print('Training Features shape: ', train_shape)
print('Testing Features shape: ', test_shape)

Training Features shape:  (307511, 452)
Testing Features shape:  (48744, 451)


In [10]:
from sklearn.svm import SVC

# Predictors only: drop the target column when it is present, otherwise work on a copy
train = train_df.drop(columns=['TARGET']) if 'TARGET' in train_df else train_df.copy()

# Feature names, in column order
features = train.columns.tolist()

# Median imputation of missing values, then scaling of each feature to 0-1
imputer = Imputer(strategy='median')
scaler = MinMaxScaler(feature_range=(0, 1))

# The imputer is fitted on the training data and applied to both sets
train = imputer.fit_transform(train)
test = imputer.transform(test_df)

# Same again for the scaler: fit on training data, transform both
train = scaler.fit_transform(train)
test = scaler.transform(test)

# The source frames are no longer needed
del train_df, test_df

train_shape, test_shape = train.shape, test.shape
print('Training data shape: ', train_shape)
print('Testing data shape: ', test_shape)

Training data shape:  (307511, 451)
Testing data shape:  (48744, 451)


In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    """Binary classifier: logistic regression on features scaled by a log-count ratio.

    ``fit`` computes one weight per feature, ``log(p / q)``, where ``p`` is
    ``(sum of the feature over rows labelled 1 + 1) / (number of such rows + 1)``
    and ``q`` is the same quantity for rows labelled 0. Every row is multiplied
    by these weights before a ``LogisticRegression`` is trained on it, and the
    same scaling is applied to inputs at prediction time.

    Labels are expected to be 0 and 1. Both dense arrays and SciPy sparse
    matrices are accepted.

    Parameters
    ----------
    C : float, default=1.0
        Forwarded to ``LogisticRegression``.
    dual : bool, default=False
        Forwarded to ``LogisticRegression``. When true, the liblinear solver is
        requested explicitly because it is the one that implements the dual
        formulation.
    n_jobs : int, default=1
        Forwarded to ``LogisticRegression``.
    """

    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def _apply_ratio(self, x):
        """Return ``x`` with every row multiplied element-wise by the fitted ratio."""
        if sparse.issparse(x):
            return x.multiply(self._r)
        # ndarrays have no ``multiply`` method: broadcast the (1, n_features) ratio row instead
        return np.asarray(x) * self._r.toarray()

    def predict(self, x):
        """Predict a class label for each row of ``x``."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(self._apply_ratio(x))

    def predict_proba(self, x):
        """Return per-class probabilities for each row of ``x`` (columns follow ``classes_``)."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(self._apply_ratio(x))

    def fit(self, x, y):
        """Compute the log-count ratio from ``(x, y)`` and train the logistic regression.

        ``y`` may be a pandas Series, a list or an array; returns ``self``.
        """
        # check_X_y flattens Series/lists/arrays itself, so no `.values` is required.
        # Requesting 'csr' converts other sparse formats so the boolean row masks below work.
        x, y = check_X_y(x, y, accept_sparse='csr')

        def pr(x, y_i, y):
            # Smoothed per-feature total over the rows of class y_i
            p = x[y == y_i].sum(0)
            return (p + 1) / ((y == y_i).sum() + 1)

        # Stored as a 1 x n_features sparse row in both the dense and the sparse case
        self._r = sparse.csr_matrix(np.log(pr(x, 1, y) / pr(x, 0, y)))
        x_nb = self._apply_ratio(x)

        # The dual formulation is only available with liblinear; otherwise keep the library default
        solver_kwargs = {'solver': 'liblinear'} if self.dual else {}
        self._clf = LogisticRegression(
            C=self.C, dual=self.dual, n_jobs=self.n_jobs, **solver_kwargs
        ).fit(x_nb, y)
        self.classes_ = self._clf.classes_
        return self

In [None]:
# Train the NB-SVM baseline and score the test set.
# dual=False: scikit-learn recommends the primal formulation when n_samples > n_features
# (307511 rows against 451 columns here), and it works with every solver.
clf = NbSvmClassifier(C=1, dual=False, n_jobs=-1)

# The classifier scales its input through the sparse-matrix `.multiply` method,
# so the dense arrays are handed over as CSR matrices
clf.fit(sparse.csr_matrix(train), train_labels)
preds = clf.predict_proba(sparse.csr_matrix(test))[:, 1]

# `test` is a scaled NumPy array at this point, so the ids are re-read from the source file.
# None of the preprocessing steps reorders or drops rows, so they line up with `preds`.
test_ids = pd.read_csv('../Input/test_app_bureau.csv', usecols=['SK_ID_CURR'])

# Submission dataframe (assign builds a new frame instead of writing into a slice)
submit = test_ids.assign(TARGET=preds)

# Save the submission to csv file, creating the output directory if needed
os.makedirs('../Submissions', exist_ok=True)
submit.to_csv('../Submissions/NBSVM_baseline.csv', index=False)