In [1]:
"""
import gc
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import matthews_corrcoef
from operator import itemgetter

def get_date_features():
    directory = '../input/'
    trainfile = 'train_date.csv'
    
    for i, chunk in enumerate(pd.read_csv(directory + trainfile,
                                          chunksize=1,
                                          low_memory=False)):
        features = list(chunk.columns)
        break

    seen = np.zeros(52)
    rv = []
    for f in features:
        if f == 'Id' or 'S24' in f or 'S25' in f:
            rv.append(f)
            continue
            
        station = int(f.split('_')[1][1:])
#        print(station)
        
        if seen[station]:
            continue
        
        seen[station] = 1
        rv.append(f)
        
    return rv
        
usefuldatefeatures = get_date_features()

def get_mindate():
    directory = '../input/'
    trainfile = 'train_date.csv'
    testfile = 'test_date.csv'
    
    features = None
    subset = None
    
    for i, chunk in enumerate(pd.read_csv(directory + trainfile,
                                          usecols=usefuldatefeatures,
                                          chunksize=50000,
                                          low_memory=False)):
        print(i)
        
        if features is None:
            features = list(chunk.columns)
            features.remove('Id')
        
        df_mindate_chunk = chunk[['Id']].copy()
        df_mindate_chunk['mindate'] = chunk[features].min(axis=1).values
        
        if subset is None:
            subset = df_mindate_chunk.copy()
        else:
            subset = pd.concat([subset, df_mindate_chunk])
            
        del chunk
        gc.collect()

    for i, chunk in enumerate(pd.read_csv(directory + testfile,
                                          usecols=usefuldatefeatures,
                                          chunksize=50000,
                                          low_memory=False)):
        print(i)
        
        df_mindate_chunk = chunk[['Id']].copy()
        df_mindate_chunk['mindate'] = chunk[features].min(axis=1).values
        subset = pd.concat([subset, df_mindate_chunk])
        
        del chunk
        gc.collect()      
        
    return subset


df_mindate = get_mindate()

df_mindate.sort_values(by=['mindate', 'Id'], inplace=True)

df_mindate['mindate_id_diff'] = df_mindate.Id.diff()

midr = np.full_like(df_mindate.mindate_id_diff.values, np.nan)
midr[0:-1] = -df_mindate.mindate_id_diff.values[1:]

df_mindate['mindate_id_diff_reverse'] = midr

def mcc(tp, tn, fp, fn):
    sup = tp * tn - fp * fn
    inf = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if inf == 0:
        return 0
    else:
        return sup / np.sqrt(inf)


def eval_mcc(y_true, y_prob, show=False):
    idx = np.argsort(y_prob)
    y_true_sort = y_true[idx]
    n = y_true.shape[0]
    nump = 1.0 * np.sum(y_true)  # number of positive
    numn = n - nump  # number of negative
    tp = nump
    tn = 0.0
    fp = numn
    fn = 0.0
    best_mcc = 0.0
    best_id = -1
    mccs = np.zeros(n)
    for i in range(n):
        if y_true_sort[i] == 1:
            tp -= 1.0
            fn += 1.0
        else:
            fp -= 1.0
            tn += 1.0
        new_mcc = mcc(tp, tn, fp, fn)
        mccs[i] = new_mcc
        if new_mcc >= best_mcc:
            best_mcc = new_mcc
            best_id = i
    if show:
        best_proba = y_prob[idx[best_id]]
        y_pred = (y_prob > best_proba).astype(int)
        return best_proba, best_mcc, y_pred
    else:
        return best_mcc


def mcc_eval(y_prob, dtrain):
    y_true = dtrain.get_label()
    best_mcc = eval_mcc(y_true, y_prob)
    return 'MCC', best_mcc


def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()


def get_importance(gbm, features):
    create_feature_map(features)
    importance = gbm.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=itemgetter(1), reverse=True)
    return importance


def LeaveOneOut(data1, data2, columnName, useLOO=False):
    grpOutcomes = data1.groupby(columnName)['Response'].mean().reset_index()
    grpCount = data1.groupby(columnName)['Response'].count().reset_index()
    grpOutcomes['cnt'] = grpCount.Response
    if(useLOO):
        grpOutcomes = grpOutcomes[grpOutcomes.cnt > 1]
    grpOutcomes.drop('cnt', inplace=True, axis=1)
    outcomes = data2['Response'].values
    x = pd.merge(data2[[columnName, 'Response']], grpOutcomes,
                 suffixes=('x_', ''),
                 how='left',
                 on=columnName,
                 left_index=True)['Response']
    if(useLOO):
        x = ((x*x.shape[0])-outcomes)/(x.shape[0]-1)
        #  x = x + np.random.normal(0, .01, x.shape[0])
    return x.fillna(x.mean())


def GrabData():
    directory = '../input/'
    trainfiles = ['train_categorical.csv',
                  'train_date.csv',
                  'train_numeric.csv']
    testfiles = ['test_categorical.csv',
                 'test_date.csv',
                 'test_numeric.csv']

    cols = [['Id',
             'L1_S24_F1559', 'L3_S32_F3851',
             'L1_S24_F1827', 'L1_S24_F1582',
             'L3_S32_F3854', 'L1_S24_F1510',
             'L1_S24_F1525'],
            ['Id',
             'L3_S30_D3496', 'L3_S30_D3506',
             'L3_S30_D3501', 'L3_S30_D3516',
             'L3_S30_D3511'],
            ['Id',
             'L1_S24_F1846', 'L3_S32_F3850',
             'L1_S24_F1695', 'L1_S24_F1632',
             'L3_S33_F3855', 'L1_S24_F1604',
             'L3_S29_F3407', 'L3_S33_F3865',
             'L3_S38_F3952', 'L1_S24_F1723',
             'Response']]
    traindata = None
    testdata = None
    for i, f in enumerate(trainfiles):
        print(f)
        subset = None
        for i, chunk in enumerate(pd.read_csv(directory + f,
                                              usecols=cols[i],
                                              chunksize=50000,
                                              low_memory=False)):
            print(i)
            if subset is None:
                subset = chunk.copy()
            else:
                subset = pd.concat([subset, chunk])
            del chunk
            gc.collect()
        if traindata is None:
            traindata = subset.copy()
        else:
            traindata = pd.merge(traindata, subset.copy(), on="Id")
        del subset
        gc.collect()
    del cols[2][-1]  # Test doesn't have response!
    for i, f in enumerate(testfiles):
        print(f)
        subset = None
        for i, chunk in enumerate(pd.read_csv(directory + f,
                                              usecols=cols[i],
                                              chunksize=50000,
                                              low_memory=False)):
            print(i)
            if subset is None:
                subset = chunk.copy()
            else:
                subset = pd.concat([subset, chunk])
            del chunk
            gc.collect()
        if testdata is None:
            testdata = subset.copy()
        else:
            testdata = pd.merge(testdata, subset.copy(), on="Id")
        del subset
        gc.collect()
        
    traindata = traindata.merge(df_mindate, on='Id')
    testdata = testdata.merge(df_mindate, on='Id')
        
    testdata['Response'] = 0  # Add Dummy Value
    visibletraindata = traindata[::2]
    blindtraindata = traindata[1::2]
    print(blindtraindata.columns)
    for i in range(2):
        for col in cols[i][1:]:
            print(col)
            blindtraindata.loc[:, col] = LeaveOneOut(visibletraindata,
                                                     blindtraindata,
                                                     col, False).values
            testdata.loc[:, col] = LeaveOneOut(visibletraindata,
                                               testdata, col, False).values
    del visibletraindata
    gc.collect()
    testdata.drop('Response', inplace=True, axis=1)
    return blindtraindata, testdata


def Train():
    train, test = GrabData()
    print('Train:', train.shape)
    print('Test', test.shape)
    features = list(train.columns)
    features.remove('Response')
    features.remove('Id')
    print(features)
    num_rounds = 50
    params = {}
    params['objective'] = "binary:logistic"
    params['eta'] = 0.021
    params['max_depth'] = 7
    params['colsample_bytree'] = 0.82
    params['min_child_weight'] = 3
    params['base_score'] = 0.005
    params['silent'] = True

    print('Fitting')
    trainpredictions = None
    testpredictions = None

    dvisibletrain = \
        xgb.DMatrix(train[features],
                    train.Response,
                    silent=True)
    dtest = \
        xgb.DMatrix(test[features],
                    silent=True)

    folds = 1
    for i in range(folds):
        print('Fold:', i)
        params['seed'] = i
        watchlist = [(dvisibletrain, 'train'), (dvisibletrain, 'val')]
        clf = xgb.train(params, dvisibletrain,
                        num_boost_round=num_rounds,
                        evals=watchlist,
                        early_stopping_rounds=20,
                        feval=mcc_eval,
                        maximize=True
                        )
        limit = clf.best_iteration+1
        # limit = clf.best_ntree_limit
        predictions = \
            clf.predict(dvisibletrain, ntree_limit=limit)

        best_proba, best_mcc, y_pred = eval_mcc(train.Response,
                                                predictions,
                                                True)
        print('tree limit:', limit)
        print('mcc:', best_mcc)
        print(matthews_corrcoef(train.Response,
                                y_pred))
        if(trainpredictions is None):
            trainpredictions = predictions
        else:
            trainpredictions += predictions
        predictions = clf.predict(dtest, ntree_limit=limit)
        if(testpredictions is None):
            testpredictions = predictions
        else:
            testpredictions += predictions
        imp = get_importance(clf, features)
        print('Importance array: ', imp)

    best_proba, best_mcc, y_pred = eval_mcc(train.Response,
                                            trainpredictions/folds,
                                            True)
    print(matthews_corrcoef(train.Response,
                            y_pred))

    submission = pd.DataFrame({"Id": train.Id,
                               "Prediction": trainpredictions/folds,
                               "Response": train.Response})
    submission[['Id',
                'Prediction',
                'Response']].to_csv('rawtrainxgbsubmission'+str(folds)+'.csv',
                                    index=False)

    submission = pd.DataFrame({"Id": test.Id.values,
                               "Response": testpredictions/folds})
    submission[['Id', 'Response']].to_csv('rawxgbsubmission'+str(folds)+'.csv',
                                          index=False)
    y_pred = (testpredictions/folds > .08).astype(int)
    submission = pd.DataFrame({"Id": test.Id.values,
                               "Response": y_pred})
    submission[['Id', 'Response']].to_csv('xgbsubmission'+str(folds)+'.csv',
                                          index=False)

if __name__ == "__main__":
    print('Started')
    Train()
    print('Finished')
"""

'import gc\nimport numpy as np\nimport pandas as pd\nimport xgboost as xgb\nfrom sklearn.cross_validation import StratifiedKFold\nfrom sklearn.metrics import matthews_corrcoef\nfrom operator import itemgetter\n\n# per raddar, all date features except for stations 24+25 are identical\n\ndef get_date_features():\n    directory = \'../input/\'\n    trainfile = \'train_date.csv\'\n    \n    for i, chunk in enumerate(pd.read_csv(directory + trainfile,\n                                          chunksize=1,\n                                          low_memory=False)):\n        features = list(chunk.columns)\n        break\n\n    seen = np.zeros(52)\n    rv = []\n    for f in features:\n        if f == \'Id\' or \'S24\' in f or \'S25\' in f:\n            rv.append(f)\n            continue\n            \n        station = int(f.split(\'_\')[1][1:])\n#        print(station)\n        \n        if seen[station]:\n            continue\n        \n        seen[station] = 1\n        rv.append(f)\n  

In [2]:
import gc
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import matthews_corrcoef
from operator import itemgetter

def get_date_features():
    directory = '../data/'
    trainfile = 'train_date.csv'
    
    for i, chunk in enumerate(pd.read_csv(directory + trainfile,
                                          chunksize=1,
                                          low_memory=False)):
        features = list(chunk.columns)
        break

    seen = np.zeros(52)
    rv = []
    for f in features:
        if f == 'Id' or 'S24' in f or 'S25' in f:
            rv.append(f)
            continue
            
        station = int(f.split('_')[1][1:])
        print(station)
        
        if seen[station]:
            continue
        
        seen[station] = 1
        rv.append(f)
        
    return rv
        
usefuldatefeatures = get_date_features()


0
0
0
0
0
0
0
0
0
0
0
0
1
1
2
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
3
4
4
5
5
6
6
6
6
6
7
7
7
7
7
8
8
8
8
9
9
9
9
9
9
9
9
9
9
9
9
9
10
10
10
10
10
10
10
10
10
10
10
10
10
11
11
11
11
11
11
11
11
11
11
11
11
11
12
12
12
12
12
12
12
12
12
12
12
12
13
13
14
14
14
14
14
14
14
14
14
15
15
15
15
15
15
15
15
15
16
16
17
17
18
18
18
18
18
19
19
19
19
19
20
20
20
20
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
22
22
22
22
22
22
22
22
22
22
22
22
22
22
22
23
23
23
23
23
23
23
23
23
23
23
23
23
23
23
26
26
26
26
26
26
26
26
26
26
26
26
26
26
26
26
26
26
26
26
26
26
26
26
26
26
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
28
28
28
28
28
28
28
28
28
28
28
28
28
28
28
28
28
28
28
28
28
28
28
28
28
28
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
30
3

In [3]:
usefuldatefeatures

['Id',
 'L0_S0_D1',
 'L0_S1_D26',
 'L0_S2_D34',
 'L0_S3_D70',
 'L0_S4_D106',
 'L0_S5_D115',
 'L0_S6_D120',
 'L0_S7_D137',
 'L0_S8_D145',
 'L0_S9_D152',
 'L0_S10_D216',
 'L0_S11_D280',
 'L0_S12_D331',
 'L0_S13_D355',
 'L0_S14_D360',
 'L0_S15_D395',
 'L0_S16_D423',
 'L0_S17_D432',
 'L0_S18_D437',
 'L0_S19_D454',
 'L0_S20_D462',
 'L0_S21_D469',
 'L0_S22_D543',
 'L0_S23_D617',
 'L1_S24_D677',
 'L1_S24_D681',
 'L1_S24_D685',
 'L1_S24_D689',
 'L1_S24_D693',
 'L1_S24_D697',
 'L1_S24_D702',
 'L1_S24_D707',
 'L1_S24_D712',
 'L1_S24_D716',
 'L1_S24_D721',
 'L1_S24_D725',
 'L1_S24_D730',
 'L1_S24_D735',
 'L1_S24_D739',
 'L1_S24_D743',
 'L1_S24_D748',
 'L1_S24_D753',
 'L1_S24_D758',
 'L1_S24_D763',
 'L1_S24_D768',
 'L1_S24_D772',
 'L1_S24_D777',
 'L1_S24_D782',
 'L1_S24_D787',
 'L1_S24_D792',
 'L1_S24_D797',
 'L1_S24_D801',
 'L1_S24_D804',
 'L1_S24_D807',
 'L1_S24_D809',
 'L1_S24_D811',
 'L1_S24_D813',
 'L1_S24_D815',
 'L1_S24_D818',
 'L1_S24_D822',
 'L1_S24_D826',
 'L1_S24_D831',
 'L1_S24_D836',


In [53]:

def get_mindate():
    directory = '../data/'
    trainfile = 'train_date.csv'
    testfile = 'test_date.csv'
    
    features = None
    subset = None
    
    for i, chunk in enumerate(pd.read_csv(directory + trainfile,
                                          usecols=usefuldatefeatures,
                                          chunksize=50000,
                                          low_memory=False)):
        print(i)
        
        if features is None:
            features = list(chunk.columns)
            features.remove('Id')
        
        df_mindate_chunk = chunk[['Id']].copy()
        df_mindate_chunk['mindate'] = chunk[features].min(axis=1).values
        df_mindate_chunk['maxdate'] = chunk[features].max(axis=1).values 
        df_mindate_chunk['diff'] = df_mindate_chunk['maxdate'] - df_mindate_chunk['mindate']
        
        if subset is None:
            subset = df_mindate_chunk.copy()
        else:
            subset = pd.concat([subset, df_mindate_chunk])
            
        del chunk
        gc.collect()

    for i, chunk in enumerate(pd.read_csv(directory + testfile,
                                          usecols=usefuldatefeatures,
                                          chunksize=50000,
                                          low_memory=False)):
        print(i)
        
        df_mindate_chunk = chunk[['Id']].copy()
        df_mindate_chunk['mindate'] = chunk[features].min(axis=1).values
        df_mindate_chunk['maxdate'] = chunk[features].max(axis=1).values 
        df_mindate_chunk['diff'] = df_mindate_chunk['maxdate'] - df_mindate_chunk['mindate']
        subset = pd.concat([subset, df_mindate_chunk])
        
        del chunk
        gc.collect()      
        
    return subset


df_mindate_0 = get_mindate()



0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23


In [54]:
df_mindate_0

Unnamed: 0,Id,mindate,maxdate,diff
0,4,82.24,87.29,5.05
1,6,1313.12,1315.75,2.63
2,7,1618.70,1624.42,5.72
3,9,1149.20,1154.16,4.96
4,11,602.64,606.02,3.38
5,13,1331.66,1339.73,8.07
6,14,1662.63,1664.04,1.41
7,16,791.22,804.36,13.14
8,18,517.64,518.08,0.44
9,23,156.27,157.89,1.62


In [55]:
df_mindate_0.sort_values(by=['mindate', 'Id'], inplace=True)
df_mindate_1 = df_mindate_0
df_mindate_1['mindate_id_diff'] = df_mindate_1.Id.diff()

midr = np.full_like(df_mindate_1.mindate_id_diff.values, np.nan)
midr[0:-1] = -df_mindate_1.mindate_id_diff.values[1:]

df_mindate_1['mindate_id_diff_reverse'] = midr
df_mindate = df_mindate_1 

df_mindate

Unnamed: 0,Id,mindate,maxdate,diff,mindate_id_diff,mindate_id_diff_reverse
5472,510783,0.00,1.61,1.61,,-140759.0
25556,651542,0.00,1.53,1.53,140759.0,543349.0
4038,108193,0.01,1.61,1.60,-543349.0,-322219.0
15108,430412,0.01,1.53,1.52,322219.0,-13085.0
21628,443497,0.01,1.53,1.52,13085.0,-69868.0
6783,513365,0.01,1.53,1.52,69868.0,-3711.0
8658,517076,0.01,1.53,1.52,3711.0,-3140.0
10251,520216,0.01,1.52,1.51,3140.0,-1046.0
10490,521262,0.01,1.53,1.52,1046.0,-53267.0
37271,574529,0.01,1.52,1.51,53267.0,-6182.0


In [None]:

def mcc(tp, tn, fp, fn):
    sup = tp * tn - fp * fn
    inf = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if inf == 0:
        return 0
    else:
        return sup / np.sqrt(inf)


def eval_mcc(y_true, y_prob, show=False):
    idx = np.argsort(y_prob)
    y_true_sort = y_true[idx]
    n = y_true.shape[0]
    nump = 1.0 * np.sum(y_true)  # number of positive
    numn = n - nump  # number of negative
    tp = nump
    tn = 0.0
    fp = numn
    fn = 0.0
    best_mcc = 0.0
    best_id = -1
    mccs = np.zeros(n)
    for i in range(n):
        if y_true_sort[i] == 1:
            tp -= 1.0
            fn += 1.0
        else:
            fp -= 1.0
            tn += 1.0
        new_mcc = mcc(tp, tn, fp, fn)
        mccs[i] = new_mcc
        if new_mcc >= best_mcc:
            best_mcc = new_mcc
            best_id = i
    if show:
        best_proba = y_prob[idx[best_id]]
        y_pred = (y_prob > best_proba).astype(int)
        return best_proba, best_mcc, y_pred
    else:
        return best_mcc


def mcc_eval(y_prob, dtrain):
    y_true = dtrain.get_label()
    best_mcc = eval_mcc(y_true, y_prob)
    return 'MCC', best_mcc


def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()


def get_importance(gbm, features):
    create_feature_map(features)
    importance = gbm.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=itemgetter(1), reverse=True)
    return importance


def LeaveOneOut(data1, data2, columnName, useLOO=False):
    grpOutcomes = data1.groupby(columnName)['Response'].mean().reset_index()
    grpCount = data1.groupby(columnName)['Response'].count().reset_index()
    grpOutcomes['cnt'] = grpCount.Response
    #if(useLOO):
    #    grpOutcomes = grpOutcomes[grpOutcomes.cnt > 1]
    if(useLOO): 
        grpOutcomes = grpOutcomes[grpOutcomes.cnt > 30]
    else: 
        grpOutcomes = grpOutcomes[grpOutcomes.cnt >= 30]
        
    grpOutcomes.drop('cnt', inplace=True, axis=1)
    outcomes = data2['Response'].values
    x = pd.merge(data2[[columnName, 'Response']], grpOutcomes,
                 suffixes=('x_', ''),
                 how='left',
                 on=columnName,
                 left_index=True)['Response']
    if(useLOO):
        x = ((x*x.shape[0])-outcomes)/(x.shape[0]-1)
        #  x = x + np.random.normal(0, .01, x.shape[0])
    return x.fillna(x.mean())


def GrabData():
    directory = '../data/'
    trainfiles = ['train_categorical.csv',
                  'train_date.csv',
                  'train_numeric.csv']
    testfiles = ['test_categorical.csv',
                 'test_date.csv',
                 'test_numeric.csv']

    cols = [['Id',
             'L1_S24_F1559', 'L3_S32_F3851',
             'L1_S24_F1827', 'L1_S24_F1582',
             'L3_S32_F3854', 'L1_S24_F1510',
             'L1_S24_F1525'],
            ['Id',
             'L3_S30_D3496', 'L3_S30_D3506',
             'L3_S30_D3501', 'L3_S30_D3516',
             'L3_S30_D3511',
             #added this piece for data vars
             'L0_S2_D34', 'L0_S3_D70', 'L0_S7_D137', 'L3_S29_D3474',
             #'L3_S32_D3852', 'L3_S33_D3856', 'L3_S34_D3875', 'L3_S35_D3886',
             #'L3_S49_D4208'
            ],
            ['Id',
             'L1_S24_F1846', 'L3_S32_F3850',
             'L1_S24_F1695', 'L1_S24_F1632',
             'L3_S33_F3855', 'L1_S24_F1604',
             'L3_S29_F3407', 
             'L3_S33_F3865',
             'L3_S38_F3952', 'L1_S24_F1723',
             #added this piece for numeric vars
             'L0_S0_F18', 'L0_S0_F20', 
             'L0_S4_F104', 
             'L0_S10_F264', 
                            
             'L3_S29_F3327', 'L3_S29_F3339', 'L3_S29_F3342',
             'L3_S29_F3382', 
               
             'L3_S30_F3704',
             'L3_S30_F3754', 'L3_S30_F3759',
             'L3_S33_F3857', 'L3_S33_F3859',
             'L3_S36_F3920',
             
             'Response']]
    traindata = None
    testdata = None
    for i, f in enumerate(trainfiles):
        print(f)
        subset = None
        for i, chunk in enumerate(pd.read_csv(directory + f,
                                              usecols=cols[i],
                                              chunksize=50000,
                                              low_memory=False)):
            print(i)
            if subset is None:
                subset = chunk.copy()
            else:
                subset = pd.concat([subset, chunk])
            del chunk
            gc.collect()
        if traindata is None:
            traindata = subset.copy()
        else:
            traindata = pd.merge(traindata, subset.copy(), on="Id")
        del subset
        gc.collect()
    del cols[2][-1]  # Test doesn't have response!
    for i, f in enumerate(testfiles):
        print(f)
        subset = None
        for i, chunk in enumerate(pd.read_csv(directory + f,
                                              usecols=cols[i],
                                              chunksize=50000,
                                              low_memory=False)):
            print(i)
            if subset is None:
                subset = chunk.copy()
            else:
                subset = pd.concat([subset, chunk])
            del chunk
            gc.collect()
        if testdata is None:
            testdata = subset.copy()
        else:
            testdata = pd.merge(testdata, subset.copy(), on="Id")
        del subset
        gc.collect()
        
    traindata = traindata.merge(df_mindate, on='Id')
    testdata = testdata.merge(df_mindate, on='Id')
        
    testdata['Response'] = 0  # Add Dummy Value
    visibletraindata = traindata[::2]
    blindtraindata = traindata[1::2]
    print(blindtraindata.columns)
    for i in range(2):
        for col in cols[i][1:]:
            print(col)
            blindtraindata.loc[:, col] = LeaveOneOut(visibletraindata,
                                                     blindtraindata,
                                                     col, False).values
            testdata.loc[:, col] = LeaveOneOut(visibletraindata,
                                               testdata, col, False).values
    del visibletraindata
    gc.collect()
    testdata.drop('Response', inplace=True, axis=1)
    return blindtraindata, testdata


In [82]:
"""
def Train():
    train, test = GrabData()
    print('Train:', train.shape)
    print('Test', test.shape)
    features = list(train.columns)
    features.remove('Response')
    features.remove('Id')
    print(features)
    num_rounds = 50
    params = {}
    params['objective'] = "binary:logistic"
    params['eta'] = 0.021
    params['max_depth'] = 7
    params['colsample_bytree'] = 0.82
    params['min_child_weight'] = 3
    params['base_score'] = 0.005
    params['silent'] = True

    print('Fitting')
    trainpredictions = None
    testpredictions = None

    dvisibletrain = \
        xgb.DMatrix(train[features],
                    train.Response,
                    silent=True)
    dtest = \
        xgb.DMatrix(test[features],
                    silent=True)

    folds = 1
    for i in range(folds):
        print('Fold:', i)
        params['seed'] = i
        watchlist = [(dvisibletrain, 'train'), (dvisibletrain, 'val')]
        clf = xgb.train(params, dvisibletrain,
                        num_boost_round=num_rounds,
                        evals=watchlist,
                        early_stopping_rounds=20,
                        feval=mcc_eval,
                        maximize=True
                        )
        limit = clf.best_iteration+1
        # limit = clf.best_ntree_limit
        predictions = \
            clf.predict(dvisibletrain, ntree_limit=limit)

        best_proba, best_mcc, y_pred = eval_mcc(train.Response,
                                                predictions,
                                                True)
        print('tree limit:', limit)
        print('mcc:', best_mcc)
        print(matthews_corrcoef(train.Response,
                                y_pred))
        if(trainpredictions is None):
            trainpredictions = predictions
        else:
            trainpredictions += predictions
        predictions = clf.predict(dtest, ntree_limit=limit)
        if(testpredictions is None):
            testpredictions = predictions
        else:
            testpredictions += predictions
        imp = get_importance(clf, features)
        print('Importance array: ', imp)

    best_proba, best_mcc, y_pred = eval_mcc(train.Response,
                                            trainpredictions/folds,
                                            True)
    print(matthews_corrcoef(train.Response,
                            y_pred))

    submission = pd.DataFrame({"Id": train.Id,
                               "Prediction": trainpredictions/folds,
                               "Response": train.Response})
    submission[['Id',
                'Prediction',
                'Response']].to_csv('rawtrainxgbsubmission'+str(folds)+'.csv',
                                    index=False)

    submission = pd.DataFrame({"Id": test.Id.values,
                               "Response": testpredictions/folds})
    submission[['Id', 'Response']].to_csv('rawxgbsubmission'+str(folds)+'.csv',
                                          index=False)
    y_pred = (testpredictions/folds > .08).astype(int)
    submission = pd.DataFrame({"Id": test.Id.values,
                               "Response": y_pred})
    submission[['Id', 'Response']].to_csv('xgbsubmission'+str(folds)+'.csv',
                                          index=False)

if __name__ == "__main__":
    print('Started')
    Train()
    print('Finished')
    """

'\ndef Train():\n    train, test = GrabData()\n    print(\'Train:\', train.shape)\n    print(\'Test\', test.shape)\n    features = list(train.columns)\n    features.remove(\'Response\')\n    features.remove(\'Id\')\n    print(features)\n    num_rounds = 50\n    params = {}\n    params[\'objective\'] = "binary:logistic"\n    params[\'eta\'] = 0.021\n    params[\'max_depth\'] = 7\n    params[\'colsample_bytree\'] = 0.82\n    params[\'min_child_weight\'] = 3\n    params[\'base_score\'] = 0.005\n    params[\'silent\'] = True\n\n    print(\'Fitting\')\n    trainpredictions = None\n    testpredictions = None\n\n    dvisibletrain =         xgb.DMatrix(train[features],\n                    train.Response,\n                    silent=True)\n    dtest =         xgb.DMatrix(test[features],\n                    silent=True)\n\n    folds = 1\n    for i in range(folds):\n        print(\'Fold:\', i)\n        params[\'seed\'] = i\n        watchlist = [(dvisibletrain, \'train\'), (dvisibletrain, \'val\

In [None]:
#def Train(train, test):
def Train():
    train, test = GrabData()
    print('Train:', train.shape)
    print('Test', test.shape)
    features = list(train.columns)
    features.remove('Response')
    features.remove('Id')
    print(features)
    num_rounds = 50
    params = {}
    params['objective'] = "binary:logistic"
    params['eta'] = 0.021
    params['max_depth'] = 7
    params['colsample_bytree'] = 0.82
    params['min_child_weight'] = 3
    params['base_score'] = 0.005
    params['silent'] = True

    print('Fitting')
    trainpredictions = None
    testpredictions = None

    dvisibletrain = \
        xgb.DMatrix(train[features],
                    train.Response,
                    silent=True)
    dtest = \
        xgb.DMatrix(test[features],
                    silent=True)

    folds = 1
    for i in range(folds):
        print('Fold:', i)
        params['seed'] = i
        watchlist = [(dvisibletrain, 'train'), (dvisibletrain, 'val')]
        clf = xgb.train(params, dvisibletrain,
                        num_boost_round=num_rounds,
                        evals=watchlist,
                        early_stopping_rounds=20,
                        feval=mcc_eval,
                        maximize=True
                        )
        limit = clf.best_iteration+1
        # limit = clf.best_ntree_limit
        predictions = \
            clf.predict(dvisibletrain, ntree_limit=limit)

        best_proba, best_mcc, y_pred = eval_mcc(train.Response.values,
                                                predictions,
                                                True)
        print('tree limit:', limit)
        print('mcc:', best_mcc)
        print(matthews_corrcoef(train.Response,
                                y_pred))
        if(trainpredictions is None):
            trainpredictions = predictions
        else:
            trainpredictions += predictions
        predictions = clf.predict(dtest, ntree_limit=limit)
        if(testpredictions is None):
            testpredictions = predictions
        else:
            testpredictions += predictions
        imp = get_importance(clf, features)
        print('Importance array: ', imp)

    best_proba, best_mcc, y_pred = eval_mcc(train.Response.values,
                                            trainpredictions/folds,
                                            True)
    print(matthews_corrcoef(train.Response,
                            y_pred))

    submission = pd.DataFrame({"Id": train.Id,
                               "Prediction": trainpredictions/folds,
                               "Response": train.Response})
    submission[['Id',
                'Prediction',
                'Response']].to_csv('rawtrainxgbsubmission'+str(folds)+'.csv',
                                    index=False)

    submission = pd.DataFrame({"Id": test.Id.values,
                               "Response": testpredictions/folds})
    submission[['Id', 'Response']].to_csv('rawxgbsubmission'+str(folds)+'.csv',
                                          index=False)
    #y_pred = (testpredictions/folds > .08).astype(int)
    y_pred = (testpredictions/folds > .08).astype(int)
    submission = pd.DataFrame({"Id": test.Id.values,
                               "Response": y_pred})
    submission[['Id', 'Response']].to_csv('xgbsubmission'+str(folds)+'.csv',
                                          index=False)

if __name__ == "__main__":
    print('Started')
    Train()
    print('Finished')
 


In [84]:
train, test = GrabData()


train_categorical.csv
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
train_date.csv
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
train_numeric.csv
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
test_categorical.csv
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
test_date.csv
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
test_numeric.csv
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
Index(['Id', 'L1_S24_F1510', 'L1_S24_F1525', 'L1_S24_F1559', 'L1_S24_F1582',
       'L1_S24_F1827', 'L3_S32_F3851', 'L3_S32_F3854', 'L0_S2_D34',
       'L0_S3_D70', 'L0_S7_D137', 'L3_S29_D3474', 'L3_S30_D3496',
       'L3_S30_D3501', 'L3_S30_D3506', 'L3_S30_D3511', 'L3_S30_D3516',
       'L3_S32_D3852', 'L3_S33_D3856', 'L3_S34_D3875', 'L3_S35_D3886',
       'L3_S49_D4208', 'L0_S0_F18', 'L0_S0_F20', 'L0_S2_F44', 'L0_S3_F72',
       'L0_S4_F104', 'L0_S9_F160', 'L0_S10_F224', 'L0_S10_F264', 'L0_S15_F418',
       'L1_S2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


L3_S32_F3851
L1_S24_F1827
L1_S24_F1582
L3_S32_F3854
L1_S24_F1510
L1_S24_F1525
L3_S30_D3496
L3_S30_D3506
L3_S30_D3501
L3_S30_D3516
L3_S30_D3511
L0_S2_D34
L0_S3_D70
L0_S7_D137
L3_S29_D3474
L3_S32_D3852
L3_S33_D3856
L3_S34_D3875
L3_S35_D3886
L3_S49_D4208


In [85]:
if __name__ == "__main__":
    print('Started')
    Train(train, test)
    print('Finished')
    

Started
Train: (591873, 70)
Test (1183748, 69)
['L1_S24_F1510', 'L1_S24_F1525', 'L1_S24_F1559', 'L1_S24_F1582', 'L1_S24_F1827', 'L3_S32_F3851', 'L3_S32_F3854', 'L0_S2_D34', 'L0_S3_D70', 'L0_S7_D137', 'L3_S29_D3474', 'L3_S30_D3496', 'L3_S30_D3501', 'L3_S30_D3506', 'L3_S30_D3511', 'L3_S30_D3516', 'L3_S32_D3852', 'L3_S33_D3856', 'L3_S34_D3875', 'L3_S35_D3886', 'L3_S49_D4208', 'L0_S0_F18', 'L0_S0_F20', 'L0_S2_F44', 'L0_S3_F72', 'L0_S4_F104', 'L0_S9_F160', 'L0_S10_F224', 'L0_S10_F264', 'L0_S15_F418', 'L1_S24_F1134', 'L1_S24_F1498', 'L1_S24_F1514', 'L1_S24_F1609', 'L1_S24_F1723', 'L1_S24_F1844', 'L1_S24_F1846', 'L1_S25_F2136', 'L2_S26_F3117', 'L3_S29_F3327', 'L3_S29_F3339', 'L3_S29_F3342', 'L3_S29_F3354', 'L3_S29_F3376', 'L3_S29_F3382', 'L3_S29_F3479', 'L3_S30_F3494', 'L3_S30_F3554', 'L3_S30_F3569', 'L3_S30_F3604', 'L3_S30_F3704', 'L3_S30_F3709', 'L3_S30_F3754', 'L3_S30_F3759', 'L3_S30_F3774', 'L3_S30_F3784', 'L3_S32_F3850', 'L3_S33_F3855', 'L3_S33_F3857', 'L3_S33_F3859', 'L3_S33_F3865', 'L3

In [80]:
Importance_array = ('maxdate', 177), ('mindate', 173), ('diff', 126), ('mindate_id_diff_reverse', 114), ('mindate_id_diff', 111), ('L3_S33_F3859', 97), ('L3_S33_F3857', 72), ('L3_S30_F3704', 72), ('L3_S32_F3854', 71), ('L3_S32_F3850', 71), ('L0_S0_F20', 68), ('L0_S4_F104', 67), ('L3_S30_F3754', 62), ('L3_S29_F3327', 57), ('L3_S29_F3339', 50), ('L1_S24_F1846', 50), ('L3_S30_F3759', 50), ('L3_S30_F3604', 50), ('L0_S0_F18', 48), ('L3_S29_F3382', 46), ('L3_S30_F3709', 44), ('L3_S29_F3342', 43), ('L3_S36_F3920', 43), ('L1_S24_F1514', 38), ('L3_S33_F3865', 38), ('L3_S38_F3952', 36), ('L0_S2_F44', 35), ('L3_S30_F3554', 34), ('L1_S24_F1844', 31), ('L3_S33_F3855', 30), ('L0_S9_F160', 27), ('L1_S24_F1723', 26), ('L3_S30_F3569', 25), ('L0_S3_F72', 24), ('L3_S29_F3376', 24), ('L1_S24_F1498', 23), ('L3_S29_F3354', 22), ('L3_S29_F3479', 22), ('L3_S30_F3774', 22), ('L1_S24_F1609', 22), ('L3_S30_F3784', 21), ('L2_S26_F3117', 21), ('L3_S30_F3494', 20), ('L0_S1_F28', 18), ('L3_S30_F3534', 18), ('L1_S24_F1604', 17), ('L3_S30_F3794', 14), ('L3_S29_F3373', 13), ('L1_S24_F1632', 13), ('L3_S29_F3315', 13), ('L1_S24_F1565', 13), ('L3_S29_F3321', 12), ('L3_S30_F3509', 12), ('L1_S24_F1783', 12), ('L1_S24_F1695', 12), ('L3_S29_F3407', 11), ('L0_S13_F354', 10), ('L0_S9_F175', 10), ('L0_S2_F60', 7), ('L1_S25_F2126', 7), ('L0_S11_F290', 7), ('L3_S29_F3461', 6), ('L0_S15_F406', 5), ('L0_S10_F249', 4), ('L1_S25_F2176', 4), ('L1_S24_F897', 2), ('L0_S16_F426', 1)
Importance_array

(('maxdate', 177),
 ('mindate', 173),
 ('diff', 126),
 ('mindate_id_diff_reverse', 114),
 ('mindate_id_diff', 111),
 ('L3_S33_F3859', 97),
 ('L3_S33_F3857', 72),
 ('L3_S30_F3704', 72),
 ('L3_S32_F3854', 71),
 ('L3_S32_F3850', 71),
 ('L0_S0_F20', 68),
 ('L0_S4_F104', 67),
 ('L3_S30_F3754', 62),
 ('L3_S29_F3327', 57),
 ('L3_S29_F3339', 50),
 ('L1_S24_F1846', 50),
 ('L3_S30_F3759', 50),
 ('L3_S30_F3604', 50),
 ('L0_S0_F18', 48),
 ('L3_S29_F3382', 46),
 ('L3_S30_F3709', 44),
 ('L3_S29_F3342', 43),
 ('L3_S36_F3920', 43),
 ('L1_S24_F1514', 38),
 ('L3_S33_F3865', 38),
 ('L3_S38_F3952', 36),
 ('L0_S2_F44', 35),
 ('L3_S30_F3554', 34),
 ('L1_S24_F1844', 31),
 ('L3_S33_F3855', 30),
 ('L0_S9_F160', 27),
 ('L1_S24_F1723', 26),
 ('L3_S30_F3569', 25),
 ('L0_S3_F72', 24),
 ('L3_S29_F3376', 24),
 ('L1_S24_F1498', 23),
 ('L3_S29_F3354', 22),
 ('L3_S29_F3479', 22),
 ('L3_S30_F3774', 22),
 ('L1_S24_F1609', 22),
 ('L3_S30_F3784', 21),
 ('L2_S26_F3117', 21),
 ('L3_S30_F3494', 20),
 ('L0_S1_F28', 18),
 ('L3_

In [86]:
Importance_array_2 = ('mindate', 203), ('maxdate', 191), ('mindate_id_diff_reverse', 119), ('mindate_id_diff', 119), ('diff', 114), ('L3_S33_F3857', 96), ('L3_S33_F3859', 96), ('L0_S0_F20', 93), ('L1_S24_F1846', 73), ('L3_S32_F3854', 73), ('L3_S32_F3850', 69), ('L3_S30_F3704', 67), ('L0_S4_F104', 62), ('L1_S24_F1844', 59), ('L3_S30_F3754', 56), ('L3_S36_F3920', 52), ('L3_S29_F3342', 52), ('L3_S30_F3759', 51), ('L0_S0_F18', 51), ('L3_S29_F3339', 46), ('L3_S29_F3382', 44), ('L3_S29_F3327', 42), ('L3_S33_F3865', 42), ('L3_S30_F3604', 39), ('L3_S38_F3952', 38), ('L1_S24_F1514', 38), ('L3_S30_F3784', 37), ('L3_S30_F3709', 35), ('L0_S2_F44', 35), ('L1_S24_F1723', 34), ('L3_S29_F3376', 33), ('L3_S30_F3569', 30), ('L1_S24_F1609', 29), ('L3_S30_F3554', 28), ('L1_S24_F1498', 26), ('L3_S30_F3494', 24), ('L2_S26_F3117', 24), ('L0_S9_F160', 23), ('L3_S29_F3479', 23), ('L3_S33_F3855', 21), ('L0_S3_F72', 21), ('L3_S30_F3774', 20), ('L3_S29_F3354', 15), ('L1_S25_F2136', 11), ('L0_S15_F418', 5), ('L0_S10_F224', 1)
Importance_array_2

(('mindate', 203),
 ('maxdate', 191),
 ('mindate_id_diff_reverse', 119),
 ('mindate_id_diff', 119),
 ('diff', 114),
 ('L3_S33_F3857', 96),
 ('L3_S33_F3859', 96),
 ('L0_S0_F20', 93),
 ('L1_S24_F1846', 73),
 ('L3_S32_F3854', 73),
 ('L3_S32_F3850', 69),
 ('L3_S30_F3704', 67),
 ('L0_S4_F104', 62),
 ('L1_S24_F1844', 59),
 ('L3_S30_F3754', 56),
 ('L3_S36_F3920', 52),
 ('L3_S29_F3342', 52),
 ('L3_S30_F3759', 51),
 ('L0_S0_F18', 51),
 ('L3_S29_F3339', 46),
 ('L3_S29_F3382', 44),
 ('L3_S29_F3327', 42),
 ('L3_S33_F3865', 42),
 ('L3_S30_F3604', 39),
 ('L3_S38_F3952', 38),
 ('L1_S24_F1514', 38),
 ('L3_S30_F3784', 37),
 ('L3_S30_F3709', 35),
 ('L0_S2_F44', 35),
 ('L1_S24_F1723', 34),
 ('L3_S29_F3376', 33),
 ('L3_S30_F3569', 30),
 ('L1_S24_F1609', 29),
 ('L3_S30_F3554', 28),
 ('L1_S24_F1498', 26),
 ('L3_S30_F3494', 24),
 ('L2_S26_F3117', 24),
 ('L0_S9_F160', 23),
 ('L3_S29_F3479', 23),
 ('L3_S33_F3855', 21),
 ('L0_S3_F72', 21),
 ('L3_S30_F3774', 20),
 ('L3_S29_F3354', 15),
 ('L1_S25_F2136', 11),
 ('