In [1]:
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.grid_search import GridSearchCV
import numpy as np
import math
import scipy
from sklearn.feature_extraction import DictVectorizer
import csv

In [2]:
def make_dict(file, dct):
    reader = csv.DictReader(open(file))
    for row in reader:
        key = int(row.pop('segment'))
        if key in dct:
            if row['id'] in dct[key]:
                dct[key][row['id']].update({row['key']:1})
            else:
                dct[key].update({row['id']:{row['key']:1}})
        else:
            dct[key] = {row['id']: {row['key']:1}}
    return dct

In [3]:
dv = DictVectorizer()

In [4]:
def make_x_y(dct, seg1, seg2, validate=False):
    y = [1]*len(dct[seg1])+[0]*len(dct[seg2])
    values = list(dct[seg1].values())+list(dct[seg2].values())
    if validate:
        x = dv.transform(values)
    else:
        x = dv.fit_transform(values)
    return x, y

In [93]:
model = LogisticRegressionCV(Cs=list(np.linspace(.0001, 10, 30)), n_jobs=-1, max_iter=100, class_weight='balanced', fit_intercept=False, refit=True)

In [89]:
def logit(xtr, xte, ytr, yte, v_x, v_y):
    logit = model.fit(xtr, ytr)
    score = logit.score(xte, yte)
    
    preds = logit.predict(xte)
    probs = logit.predict_proba(xte)
    val_preds = logit.predict(v_x)
    val_probs = logit.predict_proba(v_x)
    cm = confusion_matrix(yte, preds)
    
    print('score:', score)
    print('roc_auc:', roc_auc_score(yte, probs[:, 1]))
    print('confusion matrix:\n', cm)

    print('validation score:', logit.score(v_x, v_y))
    print('validation roc_auc:', roc_auc_score(v_y, val_probs[:, 1]))
    print('validation cm:\n', confusion_matrix(v_y, val_preds))
    
    return probs, val_probs

In [82]:
def dec(prob, y_test):
    p = pd.DataFrame(prob)
    p['decile'] = pd.qcut(p[1], 10, labels=False)
    p['true_pos'] = y_test
    
    p_thresh = p[[1, 'decile']].groupby('decile').min().reset_index()
    p_outcome = p[['true_pos', 'decile']].groupby('decile').sum().reset_index()
    
    dec_prob = pd.merge(p_thresh, p_outcome, on='decile').sort_values(by='decile', ascending=False)
    return dec_prob

In [None]:
rfc = RandomForestClassifier(n_jobs=-1, max_features= 'sqrt', n_estimators=50, oob_score = True, warm_start=True, verbose=1) 
param_grid = {'n_estimators': [50, 100, 200, 300, 500], 'max_features': ['auto', 'sqrt', 'log2']}
cv_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv = 3)

def rf(xtr, xte, ytr, yte, v_x, v_y):
    rf = cv_rfc.fit(xtr, ytr)
    rf_score = rf.score(xte, yte)
    preds = rf.predict_proba(xte)
    rf_cm = confusion_matrix(yte, rf.predict(xte))
    val_preds = rf.predict_proba(v_x)
    
    print('rf_score:', rf_score)
    print('confusion matrix:\n', rf_cm)
    
    print('validation score:', rf.score(v_x, v_y))
    print('validation cm:\n', confusion_matrix(v_y, rf.predict(v_x)))
    return preds, val_preds

## first time data

In [8]:
third_train = {}
third_train = make_dict('training_12_19.csv', third_train)

In [9]:
third_val = {}
third_val = make_dict('testing_12_19.csv', third_val)

In [16]:
len(third_train[1]), len(third_train[2])

(34339, 40000)

In [17]:
len(third_val[1]), len(third_val[2])

(41732, 40000)

### 3pa model

In [37]:
x, y = make_x_y(third_train, 1, 2)

In [38]:
xtr, xte, ytr, yte = train_test_split(x, y, test_size=0.2, stratify=y)

In [39]:
val_x, val_y = make_x_y(third_val, 1, 2, validate=True)

In [50]:
prob, val_prob = logit(xtr, xte, ytr, yte, val_x, val_y)

score: 0.598466505246
roc_auc: 0.648197146185
confusion matrix:
 [[3824 4176]
 [1794 5074]]
validation score: 0.584153085695
validation roc_auc: 0.623097874233
validation cm:
 [[19205 20795]
 [13193 28539]]


In [83]:
dec(prob, yte)

Unnamed: 0,decile,1,true_pos
9,9,0.600508,953
8,8,0.571504,904
7,7,0.555091,833
6,6,0.546752,798
5,5,0.533643,704
4,4,0.506891,722
3,3,0.469537,662
2,2,0.417177,528
1,1,0.342099,450
0,0,0.101201,314


In [84]:
dec(val_prob, val_y)

Unnamed: 0,decile,1,true_pos
9,9,0.592402,5479
8,8,0.566209,5256
7,7,0.553764,5038
6,6,0.546479,4040
5,5,0.532073,4329
4,4,0.501484,4251
3,3,0.45435,4149
2,2,0.390982,3678
1,1,0.304864,3225
0,0,0.052536,2287


### census model

In [10]:
census_train = pd.read_csv('census_training_12_19.csv')

In [11]:
def census_make_dict(file, dct, data):
    reader = csv.DictReader(open(file))
    for row in reader:
        key = int(row.pop('segment'))
        if data == 'total':
            if key in dct:
                if row['id'] in dct[key]:
                    dct[key][row['id']].update({row['category']:row['total']})
                else:
                    dct[key].update({row['id']:{row['category']:row['total']}})
            else:
                dct[key] = {row['id']:{row['category']:row['total']}}
        elif data == 'bool':
            if key in dct:
                if row['id'] in dct[key]:
                    dct[key][row['id']].update({row['category']:1})
                else:
                    dct[key].update({row['id']:{row['category']:1}})
            else:
                dct[key] = {row['id']: {row['category']:1}}
    return dct

In [13]:
c = {}
c = census_make_dict('census_training_12_19.csv', c, data='total')

In [15]:
cb = {}
cb = census_make_dict('census_training_12_19.csv', c, data='bool')

In [14]:
cval = {}
cval = census_make_dict('census_testing_12_19.csv', c, data='total')

In [16]:
cbval = {}
cbval = census_make_dict('census_testing_12_19.csv', c, data='bool')

#### total visits

In [85]:
cx, cy = make_x_y(c, 1, 2)

In [86]:
cxtr, cxte, cytr, cyte = train_test_split(cx, cy, test_size=0.2, stratify=cy)

In [87]:
cval_x, cval_y = make_x_y(cval, 1, 2, validate=True)

In [90]:
cpred, cval_pred = logit(cxtr, cxte, cytr, cyte, cval_x, cval_y)

score: 0.561407219402
roc_auc: 0.595384050172
confusion matrix:
 [[4447 4053]
 [2168 3516]]
validation score: 0.566823655489
validation roc_auc: 0.603638993553
validation cm:
 [[22388 20113]
 [10607 17810]]


In [91]:
dec(cpred, cyte)

Unnamed: 0,decile,1,true_pos
9,9,0.581008,756
8,8,0.56657,680
7,7,0.54871,652
6,6,0.529881,654
5,5,0.507538,593
4,4,0.483958,551
3,3,0.457855,511
2,2,0.428861,452
1,1,0.394997,458
0,0,0.307722,377


In [92]:
dec(cval_pred, cval_y)

Unnamed: 0,decile,1,true_pos
9,9,0.581204,3817
8,8,0.566866,3517
7,7,0.550032,3311
6,6,0.530692,3199
5,5,0.50825,2990
4,4,0.483897,2754
3,3,0.457239,2561
2,2,0.428475,2333
1,1,0.394919,2119
0,0,0.305972,1816


#### boolean

In [94]:
cbx, cby = make_x_y(cb, 1, 2)

In [95]:
cbxtr, cbxte, cbytr, cbyte = train_test_split(cbx, cby, test_size=0.2, stratify=cby)

In [96]:
cbval_x, cbval_y = make_x_y(cbval, 1, 2, validate=True)

In [97]:
cbpred, cbval_pred = logit(cbxtr, cbxte, cbytr, cbyte, cbval_x, cbval_y)

score: 0.567752397067
roc_auc: 0.612939106677
confusion matrix:
 [[4348 4152]
 [1979 3705]]
validation score: 0.576186581686
validation roc_auc: 0.625907270166
validation cm:
 [[22090 20411]
 [ 9645 18772]]


In [98]:
dec(cbpred, cbyte)

Unnamed: 0,decile,1,true_pos
9,9,0.62195,769
8,8,0.596414,729
7,7,0.573718,674
6,6,0.54811,651
5,5,0.519101,590
4,4,0.484274,540
3,3,0.443424,519
2,2,0.394538,446
1,1,0.329785,445
0,0,0.068295,321


In [99]:
dec(cbval_pred, cbval_y)

Unnamed: 0,decile,1,true_pos
9,9,0.620542,4036
8,8,0.59657,3626
7,7,0.573361,3472
6,6,0.547408,3222
5,5,0.517975,2974
4,4,0.483052,2778
3,3,0.443223,2507
2,2,0.395077,2274
1,1,0.330198,2037
0,0,0.041673,1491


### combined model

In [17]:
from collections import defaultdict

In [18]:
def combine_dicts(dct1, dct2, seg):
    d = defaultdict(dict)
    for a, b in list(dct1[seg].items()) + list(dct2[seg].items()):
        d[a].update(b)
    return d

#### 3pa with total visits census

In [19]:
d1 = combine_dicts(third_train, c, 1)
d2 = combine_dicts(third_train, c, 2)

In [20]:
dval1 = combine_dicts(third_val, cval, 1)
dval2 = combine_dicts(third_val, cval, 2)

In [None]:
y = [1]*len(d1)+[0]*len(d2)
v = list(d1.values())+list(d2.values())
x = dv.fit_transform(v)

In [24]:
y_val = [1]*len(dval1)+[0]*len(dval2)
v_val = list(dval1.values())+list(dval2.values())
x_val = dv.transform(v_val)

In [27]:
dxtr, dxte, dytr, dyte = train_test_split(x, y, test_size=0.2, stratify=y)

In [28]:
dpred, dval_pred = logit(dxtr, dxte, dytr, dyte, x_val, y_val)

score: 0.62203128017
roc_auc: 0.660867262795
confusion matrix:
 [[7767 4446]
 [3384 5119]]
validation score: 0.628055524831
validation roc_auc: 0.661932432985
validation cm:
 [[35607 25816]
 [14537 32532]]


In [29]:
dec(dpred, dyte)

Unnamed: 0,decile,1,true_pos
9,9,0.797485,1319
8,8,0.688246,1204
7,7,0.602413,1087
6,6,0.541524,945
5,5,0.475948,888
4,4,0.414934,820
3,3,0.353534,694
2,2,0.286408,593
1,1,0.184349,503
0,0,5.6e-05,450


In [30]:
dec(dval_pred, y_val)

Unnamed: 0,decile,1,true_pos
9,9,0.917629,6196
8,8,0.783923,6471
7,7,0.675399,6490
6,6,0.5920656,6083
5,5,0.531257,5358
4,4,0.4441436,4781
3,3,0.3624798,3699
2,2,0.2806156,3083
1,1,0.1758767,2158
0,0,5.052747e-07,2750


#### 3pa with boolean census

In [100]:
db1 = combine_dicts(third_train, cb, 1)
db2 = combine_dicts(third_train, cb, 2)

In [101]:
dbval1 = combine_dicts(third_val, cbval, 1)
dbval2 = combine_dicts(third_val, cbval, 2)

In [102]:
y = [1]*len(db1)+[0]*len(db2)
v = list(db1.values())+list(db2.values())
x = dv.fit_transform(v)

In [103]:
y_val = [1]*len(dbval1)+[0]*len(dbval2)
v_val = list(dbval1.values())+list(dbval2.values())
x_val = dv.transform(v_val)

In [104]:
dbxtr, dbxte, dbytr, dbyte = train_test_split(x, y, test_size=0.2, stratify=y)

In [105]:
dbpred, dbval_pred = logit(dbxtr, dbxte, dbytr, dbyte, x_val, y_val)

score: 0.620390036687
roc_auc: 0.651669007463
confusion matrix:
 [[7840 4373]
 [3491 5012]]
validation score: 0.620939792796
validation roc_auc: 0.652466038751
validation cm:
 [[33522 27901]
 [13224 33845]]


In [106]:
dec(dbpred, dbyte)

Unnamed: 0,decile,1,true_pos
9,9,0.810624,1262
8,8,0.692913,1218
7,7,0.601971,1108
6,6,0.538708,938
5,5,0.468009,869
4,4,0.403224,794
3,3,0.339329,693
2,2,0.267745,608
1,1,0.152078,492
0,0,1e-05,521


In [107]:
dec(dbval_pred, y_val)

Unnamed: 0,decile,1,true_pos
9,9,0.9830537,5821
8,8,0.879524,5944
7,7,0.7400507,6514
6,6,0.6301693,6381
5,5,0.5575015,5547
4,4,0.4696905,5072
3,3,0.3786764,3953
2,2,0.2928698,3133
1,1,0.1896739,2249
0,0,4.109132e-07,2455


### top cookie combined model

In [109]:
test_lookup = pd.read_csv('lookup_testing_12_19.csv')
train_lookup = pd.read_csv('lookup_training_12_19.csv')

In [112]:
thirdpa_train = pd.read_csv('training_12_19.csv')

In [155]:
thirdpa_val = pd.read_csv('testing_12_19.csv')

In [176]:
df = pd.merge(thirdpa_train[thirdpa_train.segment==1], train_lookup, on='id')

In [178]:
top = list(df.groupby(['fed_id', 'id']).sum()['segment'].groupby(level=0, group_keys=False).nlargest(1).reset_index()['id'])

In [179]:
top1 = [str(i) for i in top]

In [180]:
top_3pa = dict((k, third_train[1][k]) for k in top1)

In [177]:
dfval = pd.merge(thirdpa_val[thirdpa_val.segment==1], test_lookup, on='id')

In [181]:
topval = list(dfval.groupby(['fed_id', 'id']).sum()['segment'].groupby(level=0, group_keys=False).nlargest(1).reset_index()['id'])

In [183]:
topval1 = [str(i) for i in topval]

In [186]:
top_3pa_val = dict((k, third_val[1][k]) for k in topval1)

In [208]:
d = defaultdict(dict)
for a, b in list(top_3pa.items()) + list(c[1].items()):
    if a in top_3pa.keys():
        d[a].update(b)

In [209]:
dval = defaultdict(dict)
for a, b in list(top_3pa_val.items()) + list(cval[1].items()):
    if a in top_3pa_val.keys():
        dval[a].update(b)

In [211]:
y = [1]*len(d)+[0]*len(d2)
values = list(d.values())+list(d2.values())
x = dv.fit_transform(values)

In [212]:
y_val = [1]*len(dval)+[0]*len(dval2)
values = list(dval.values()) + list(dval2.values())
x_val = dv.transform(values)

In [215]:
xtr, xte, ytr, yte = train_test_split(x, y, test_size=0.2, stratify=y)

In [216]:
preds, vpreds = logit(xtr, xte, ytr, yte, x_val, y_val)

score: 0.713219547018
roc_auc: 0.742170948014
confusion matrix:
 [[9114 3098]
 [1321 1876]]
validation score: 0.706094916887
validation roc_auc: 0.653498851699
validation cm:
 [[47461 13962]
 [ 9218  8228]]


In [217]:
dec(preds, yte)

Unnamed: 0,decile,1,true_pos
9,9,0.8823841,729
8,8,0.6972095,566
7,7,0.5327399,490
6,6,0.413054,353
5,5,0.2639293,343
4,4,0.1007694,323
3,3,0.0217097,212
2,2,0.007987071,81
1,1,0.004034765,35
0,0,6.971271e-09,65


In [219]:
dec(vpreds, y_val)

Unnamed: 0,decile,1,true_pos
9,9,0.8947813,3330
8,8,0.666855,2824
7,7,0.4738812,2415
6,6,0.3554303,1665
5,5,0.1353289,1832
4,4,0.01228963,1935
3,3,0.004639481,600
2,2,0.002764503,260
1,1,0.0008919072,497
0,0,3.179288e-14,2088
