## Data Processing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.utils import shuffle
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics, cross_validation



In [2]:
## gini coefficient
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
 
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

In [3]:
train = pd.read_csv('train.csv')
train_label = train['target']
train_id = train['id']
test = pd.read_csv('test.csv')
test_id = test['id']

In [4]:
nFolds = 10
kfold = StratifiedKFold(n_splits = nFolds, shuffle = True)

In [5]:
y = train['target'].values
#since ps_car_03_cat and ps_car_05_cathas 
#a lot of missing data 
data_feature = train.drop(['id','target',
            'ps_car_03_cat', 'ps_car_05_cat'],axis = 1)
feature_names = data_feature.columns.tolist()

mean_imp = Imputer(missing_values = -1, strategy = 'mean'
                   , axis = 0)
mode_imp = Imputer(missing_values = -1, strategy = 'most_frequent'
                   ,axis = 0)
train['ps_reg_03'] = mean_imp.fit_transform(train[['ps_reg_03']])
train['ps_car_12'] = mean_imp.fit_transform(train[['ps_car_12']])
train['ps_car_14'] = mean_imp.fit_transform(train[['ps_car_14']])
train['ps_car_11'] = mode_imp.fit_transform(train[['ps_car_11']])
cat_features = [i for i in feature_names if 
                ('cat' in i and '_1' not in i )]
train = pd.get_dummies(train, columns = cat_features, 
                       drop_first=True)

In [6]:
selector = VarianceThreshold(threshold=.01)
selector.fit(train.drop(['id', 'target'], axis=1)) # Fit to train without id and target variables
f = np.vectorize(lambda x : not x) # Function to toggle boolean array elements
v = train.drop(['id', 'target'], axis=1).columns[f(selector.get_support())]

In [7]:
print('{} variables have too low variance.'.format(len(v)))
print('These variables are {}'.format(list(v)))

24 variables have too low variance.
These variables are ['ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_car_10_cat', 'ps_car_12', 'ps_car_14', 'ps_ind_05_cat_2', 'ps_ind_05_cat_5', 'ps_car_01_cat_0', 'ps_car_01_cat_1', 'ps_car_01_cat_2', 'ps_car_04_cat_3', 'ps_car_04_cat_4', 'ps_car_04_cat_5', 'ps_car_04_cat_6', 'ps_car_04_cat_7', 'ps_car_06_cat_2', 'ps_car_06_cat_5', 'ps_car_06_cat_8', 'ps_car_06_cat_12', 'ps_car_06_cat_16', 'ps_car_06_cat_17', 'ps_car_09_cat_4']


In [None]:
from sklearn import metrics, cross_validation
for c in [10**1,10**2,10**3,30,150]:
    print(c)
    logreg=LogisticRegression(class_weight = 'balanced',C=c)
    predicted = cross_validation.cross_val_predict(logreg,train.drop(['id','target'],axis =1),y,cv=10)
    print(gini_normalized(y, predicted))
    print(metrics.classification_report(y, predicted) )

10
0.185623674704
             precision    recall  f1-score   support

          0       0.97      0.63      0.76    573518
          1       0.05      0.56      0.10     21694

avg / total       0.94      0.63      0.74    595212

100
0.185625812962
             precision    recall  f1-score   support

          0       0.97      0.63      0.76    573518
          1       0.05      0.56      0.10     21694

avg / total       0.94      0.63      0.74    595212

1000


In [53]:
logreg.fit(train.drop(['id','target'],axis =1),y)
logreg.predict(test.drop(['id'],axis = 1))


array([0, 1, 0, ..., 0, 0, 0])

In [44]:
print("gini score is: ", gini_normalized(y, predicted))

gini score is:  0.185795070237


In [54]:
prediction = logreg.predict(test.drop(['id'],axis = 1))
pd.DataFrame({'id': test_id, 'target': prediction}).to_csv('pred_avg1.csv', index=False)

In [50]:
data_feature = train.drop(['id','target',
            'ps_car_03_cat', 'ps_car_05_cat'],axis = 1)
kf = kfold.split(data_feature, train_label)
alg=LogisticRegression(class_weight = 'balanced')
pradication=[]
for t1,t2 in kf:
    train_data = train.drop(['id', 'target'],axis = 1).iloc[t1]
    train_prad = y[t1]
    alg.fit(train_data,train_prad)
    test_prad = alg.predict(train.drop(['id', 'target'],axis = 1).iloc[t2])
    pradication.append(test_prad)
pradication=np.concatenate(pradication,axis=0)

In [51]:
accuracy=sum(pradication == train['target'])/len(pradication)
print("The accuracy of train data is :",accuracy)

The accuracy of train data is : 0.612973192745


In [52]:
print("gini score is: ", gini_normalized(y, pradication))

gini score is:  -0.00100961770412


In [42]:
data_feature1 = test.drop(['id',
            'ps_car_03_cat', 'ps_car_05_cat'],axis = 1)
feature_names1 = data_feature1.columns.tolist()

mean_imp = Imputer(missing_values = -1, strategy = 'mean'
                   , axis = 0)
mode_imp = Imputer(missing_values = -1, strategy = 'most_frequent'
                   ,axis = 0)
test['ps_reg_03'] = mean_imp.fit_transform(test[['ps_reg_03']])
test['ps_car_12'] = mean_imp.fit_transform(test[['ps_car_12']])
test['ps_car_14'] = mean_imp.fit_transform(test[['ps_car_14']])
test['ps_car_11'] = mode_imp.fit_transform(test[['ps_car_11']])
cat_features = [i for i in feature_names1 if 
                ('cat' in i and '_1' not in i )]
test = pd.get_dummies(test, columns = cat_features, 
                       drop_first=True)
prediction = alg.predict(test.drop(["id"],axis = 1))
pd.DataFrame({'id': test_id, 'target': prediction}).to_csv('pred_avg.csv', index=False)

202087