In [376]:
# ----- standard modules -----

import sys

from __future__ import division

# ----- libraries -----

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [411]:
# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [377]:
# Paths of the two input CSV files (resolved relative to the working directory).
custFile1 = 'mcf1.csv'
custFile2 = 'mcf2.csv'

In [408]:
def munge(sample, labelColumn='WINETAST', idColumn='CUSTID', verbose=True):
    """Split a raw sample frame into a float feature frame and a label series.

    Parameters
    ----------
    sample : pandas.DataFrame
        Raw data containing ``idColumn``, ``labelColumn`` and the predictors.
    labelColumn : str
        Name of the column returned as ``y`` (default ``'WINETAST'``).
    idColumn : str
        Name of the identifier column excluded from the features
        (default ``'CUSTID'``).
    verbose : bool
        When true (the default) print ``x.head()`` after cleaning.

    Returns
    -------
    (x, y) : (pandas.DataFrame, pandas.Series)
        ``x`` holds every remaining column cast to float64, minus any column
        that contains at least one missing value; ``y`` is
        ``sample[labelColumn]`` unchanged.

    Raises
    ------
    ValueError
        If a cell other than ``'.'`` cannot be converted to a float.
    """
    # Features are everything except the identifier and the label.
    x = sample.drop([idColumn, labelColumn], axis=1)

    # A lone '.' is treated as a missing value.  astype() does the same
    # per-cell float conversion as applymap(np.float64), but vectorised and
    # without relying on the deprecated DataFrame.applymap.
    x = x.replace('.', np.nan).astype(np.float64)

    # NOTE(review): columns are dropped if *any* row is missing, so the
    # surviving column set depends on the sample passed in -- a frame munged
    # separately may not have the same columns as the one a model was fit on.
    x = x.dropna(axis=1)

    if verbose:
        print(x.head())

    y = sample[labelColumn]

    return x, y


def regress(x, y, **kwargs):
    """Fit and return a ``LogisticRegression`` on features ``x`` and labels ``y``.

    Any keyword arguments are forwarded unchanged to the
    ``LogisticRegression`` constructor.
    """
    classifier = LogisticRegression(**kwargs)
    classifier.fit(x, y)
    return classifier


def get_coef(model, x, sort=False):
    """Tabulate the model's coefficients next to their feature names.

    Parameters
    ----------
    model : object with a ``coef_`` attribute
        Either a flat vector of coefficients or a 2-D array whose first row
        is used (the original code also read only the first row).
    x : pandas.DataFrame
        Frame whose column names label the coefficients, in order.
    sort : bool
        When true, order rows by ``Value`` from largest to smallest.

    Returns
    -------
    pandas.DataFrame
        Columns ``Label`` and ``Value``, one row per feature.

    Raises
    ------
    ValueError
        If the number of coefficients differs from the number of columns
        (previously the longer side was silently truncated by ``zip``).
    """
    # atleast_2d lets a 1-D coef_ work too; row 0 holds the coefficients.
    values = np.atleast_2d(model.coef_)[0]

    coef = pd.DataFrame(
        {'Label': list(x.columns), 'Value': values},
        columns=['Label', 'Value'],
    )

    if sort:
        coef = coef.sort_values(by='Value', ascending=False)

    return coef


In [379]:
# Load both input files and stack them into one training sample.
sample1 = pd.read_csv(custFile1)
sample2 = pd.read_csv(custFile2)
# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own row labels.
sample = pd.concat([sample1, sample2], ignore_index=True)

x, y = munge(sample)

# Errors on class 1 are weighted nine times as heavily as errors on class 0.
class_weight = {0: 0.1, 1: 0.9}

model = regress(x, y, solver='newton-cg', max_iter=1000, class_weight=class_weight)

   AGE  AGEGRP1  AGEGRP2  AGEGRP3  AGEGRP4  AGEGRP5  AGEGRP6  SEX  EDUC  \
0   27        0        1        0        0        0        0    0    19   
1   70        0        0        0        0        0        1    0    18   
2   31        0        1        0        0        0        0    0    16   
3   37        0        0        1        0        0        0    0    18   
4   48        0        0        0        1        0        0    0    15   

   INCOME    ...     PERCDR  PERCSR  PERCDW  PERCSW  PERCDE  PERCEX  TOTLCOMP  \
0   30649    ...          0       0       0       0     100       0         0   
1  164417    ...         35      20       0      35      10      25         0   
2   29648    ...          0      50       0      50       0       0         0   
3   26188    ...        100       0       0       0       0       0         0   
4  130141    ...         52      30      11       4       3      12         1   

   TOTWCOMP  TOTPCOMM  FACEBOOK  
0         0         0       

In [380]:
# Mean accuracy of the fitted model on the same data it was trained on (in-sample).
model.score(x,y)

0.88849442472123608

In [452]:
# Coefficient table, sorted from the largest to the smallest value.
get_coef(model, x, True)

Unnamed: 0,Label,Value
19,WINEMAGS,1.927829
18,GOURMAGS,1.039522
21,HOUSMAGS,0.519607
15,INCGRP6,0.339545
20,STYLMAGS,0.271499
25,COMPMAGS,0.259702
4,AGEGRP4,0.197466
2,AGEGRP2,0.195918
30,TOTCAT,0.188015
44,PERCSR,0.156255


In [382]:
#get_coef(model, x, False).to_csv('coef.csv', float_format='{:f}'.format, encoding='utf-8')

In [383]:
# Share of rows the model predicts as class 1.  Computed inline so this cell
# does not depend on a variable that is only assigned further down the notebook.
model.predict(x).mean()

0.18085904295214761

In [384]:
# Precision of the class-1 predictions on the training data.  Computed inline so
# this cell does not depend on a variable that is only assigned further down.
metrics.precision_score(y, model.predict(x))

0.42410837710810062

In [448]:
def calculate_profit(model, x, y, _custPop=None, costPerCall=3.20,
                     testPromotionCost=24000, profitPerResponse=25):
    """Estimate the net profit of calling every customer the model selects.

    The model's predicted-positive rate and precision on ``(x, y)`` are
    extrapolated to a population of ``_custPop`` customers.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``
    x, y : features and true labels used to measure the predicted-positive
        rate and the precision.
    _custPop : number or None
        Population size to extrapolate to; ``None`` means 280000.
    costPerCall : number
        Cost of one call (default 3.20).
    testPromotionCost : number
        Fixed cost added to the calling cost (default 24000).
    profitPerResponse : number
        Gross profit earned per successful call (default 25).

    Returns
    -------
    float
        ``profitPerResponse * precision * calls
        - (calls * costPerCall + testPromotionCost)``
        where ``calls = predicted-positive rate * population``.
    """
    predicted = model.predict(x)

    # Fraction of rows predicted as 1, and how many of those are truly 1.
    predictedAcceptRate = predicted.mean()
    precision = metrics.precision_score(y, predicted)

    # Explicit None check so that a population of 0 is honoured.
    custPop = _custPop if _custPop is not None else 280000

    numCalls = predictedAcceptRate * custPop
    totalCallCost = numCalls * costPerCall
    totalCost = totalCallCost + testPromotionCost

    numResponses = precision * numCalls
    grossProfit = profitPerResponse * numResponses

    netProfit = grossProfit - totalCost
    return netProfit

In [386]:
# Net-profit estimate for the default population size, measured on the training data.
calculate_profit(model, x, y)

350877.14385719283

In [387]:
def find_best_model(x, y, linParams):
    """Grid-search the class-0 weight and keep the most profitable model.

    For every value ``i`` of ``np.linspace(*linParams)`` a model is fitted
    with ``class_weight={0: i, 1: 1 - i}`` and scored with
    ``calculate_profit`` on the same ``(x, y)`` it was fitted on.  Each
    weight/profit pair is printed as it is evaluated.

    Parameters
    ----------
    x, y : features and labels used both for fitting and for scoring.
    linParams : sequence
        Positional arguments for ``np.linspace`` (start, stop[, num]).

    Returns
    -------
    The fitted model with the highest profit, or ``None`` when the grid
    is empty.
    """
    # float('-inf') compares below every real profit on both Python 2 and 3;
    # sys.maxint, used previously, does not exist on Python 3.
    maxProfit = float('-inf')
    bestModel = None

    for i in np.linspace(*linParams):
        class_weight = {0: i, 1: 1 - i}
        model = regress(x, y, solver='newton-cg', max_iter=1000, class_weight=class_weight)
        profit = calculate_profit(model, x, y)

        print(class_weight, profit)

        if profit > maxProfit:
            maxProfit = profit
            bestModel = model

    return bestModel

In [388]:
#bestModel = find_best_model(x, y, (0.09, 0.11, 10+1))
#calculate_profit(bestModel, x, y)

In [389]:
# Show the full parameter set the model was constructed with.
model.get_params()

{'C': 1.0,
 'class_weight': {0: 0.1, 1: 0.9},
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 1000,
 'multi_class': 'ovr',
 'penalty': 'l2',
 'random_state': None,
 'solver': 'newton-cg',
 'tol': 0.0001,
 'verbose': 0}

In [390]:
# Predicted labels on the training data, the fraction predicted as 1,
# and the precision of those predictions against the true labels.
predicted = model.predict(x)
predictedAcceptRate = predicted.mean()
precision = metrics.precision_score(y, predicted)

In [391]:
# Display the predicted-positive rate, then the precision.
print(predictedAcceptRate)
print(precision)

0.180859042952
0.424108377108


In [392]:
# First ten feature rows.
x.head(10)

Unnamed: 0,AGE,AGEGRP1,AGEGRP2,AGEGRP3,AGEGRP4,AGEGRP5,AGEGRP6,SEX,EDUC,INCOME,...,PERCDR,PERCSR,PERCDW,PERCSW,PERCDE,PERCEX,TOTLCOMP,TOTWCOMP,TOTPCOMM,FACEBOOK
0,27,0,1,0,0,0,0,0,19,30649,...,0,0,0,0,100,0,0,0,0,0
1,70,0,0,0,0,0,1,0,18,164417,...,35,20,0,35,10,25,0,0,0,0
2,31,0,1,0,0,0,0,0,16,29648,...,0,50,0,50,0,0,0,0,0,0
3,37,0,0,1,0,0,0,0,18,26188,...,100,0,0,0,0,0,0,0,0,0
4,48,0,0,0,1,0,0,0,15,130141,...,52,30,11,4,3,12,1,0,0,0
5,60,0,0,0,0,1,0,1,17,149972,...,0,100,0,0,0,0,0,0,0,0
6,67,0,0,0,0,0,1,0,20,107571,...,59,23,0,9,9,0,0,0,0,0
7,60,0,0,0,0,1,0,0,12,131288,...,52,28,13,5,2,15,0,0,0,0
8,56,0,0,0,0,1,0,1,20,75083,...,0,0,0,0,100,0,0,0,0,0
9,74,0,0,0,0,0,1,1,20,83762,...,33,67,0,0,0,0,0,0,0,0


In [393]:
# Raw decision scores for the first ten rows; a positive score corresponds to a prediction of 1.
model.decision_function(x.head(10))

array([-5.43987388, -0.43201208, -5.96429442, -9.81467201,  0.22586001,
       -1.65126692,  1.94110873,  1.72003775, -5.294323  , -3.03026437])

In [394]:
# Predicted labels for the same ten rows.
model.predict(x.head(10))

array([0, 0, 0, 0, 1, 0, 1, 1, 0, 0])

In [395]:
# Intercept term of the fitted decision function.
model.intercept_

array([-19.47604984])

In [422]:
def construct_decision_equation(model, x, y):
    """Render the model's linear decision function as a readable string.

    The string starts with the intercept in parentheses and appends one
    `` + (coefficient * feature)`` term per row of ``get_coef(model, x)``,
    in that table's order.  ``y`` is accepted but not used.
    """
    pieces = ['({})'.format(model.intercept_[0])]

    table = get_coef(model, x)

    pieces.extend(
        ' + ({} * {})'.format(table.at[row, 'Value'], table.at[row, 'Label'])
        for row in table.index
    )

    return ''.join(pieces)

In [423]:
# Decision function of the fitted model written out as intercept + sum of (coefficient * feature).
construct_decision_equation(model, x, y)

'(-19.4760498402) + (0.00323642809063 * AGE) + (-0.129337782167 * AGEGRP1) + (0.195917525412 * AGEGRP2) + (-0.234886978876 * AGEGRP3) + (0.19746565346 * AGEGRP4) + (0.0663924279262 * AGEGRP5) + (-0.0962625158959 * AGEGRP6) + (-0.399035861315 * SEX) + (-0.157619919721 * EDUC) + (5.17004482356e-06 * INCOME) + (-0.0305505558763 * INCGRP1) + (-0.266046084692 * INCGRP2) + (-0.103478438587 * INCGRP3) + (0.00408439103736 * INCGRP4) + (0.0557339914639 * INCGRP5) + (0.339545026514 * INCGRP6) + (-3.19513244721 * KIDHOME) + (-1.88275438836 * TEENHOME) + (1.03952152631 * GOURMAGS) + (1.92782909236 * WINEMAGS) + (0.271499009826 * STYLMAGS) + (0.519606890683 * HOUSMAGS) + (0.0249994469138 * SPRTMAGS) + (-0.326484625943 * TRAVMAGS) + (-0.37307193912 * CULTMAGS) + (0.259702303498 * COMPMAGS) + (-0.00281925146687 * FIRSTPUR) + (-0.000668390005902 * LASTPUR) + (0.0333776297608 * NPURCH) + (-0.00799168299087 * TBOTTLES) + (0.188014579598 * TOTCAT) + (-0.00236609418628 * TOTCREV) + (0.00221061370842 * TOT

In [None]:
# Feature/label split for each input file on its own.
# NOTE(review): munge() drops columns containing any missing value per call, so
# x1/x2 only line up with the training features if the same columns survive in
# each file -- confirm before passing them to `model`.
x1, y1 = munge(sample1)
x2, y2 = munge(sample2)

In [420]:
# Profit for the first file, extrapolated to a population of 10000 instead of the default.
# NOTE(review): the recorded output was produced before calculate_profit was last
# redefined (lower execution count) and is inconsistent with the current formula; re-run.
calculate_profit(model, x1, y1, 10000)

358779.20160000003

In [421]:
# Profit for the second file, extrapolated to a population of 10000 instead of the default.
# NOTE(review): the recorded output was produced before calculate_profit was last
# redefined (lower execution count) and is inconsistent with the current formula; re-run.
calculate_profit(model, x2, y2, 10000)

366324.95409540954

In [439]:
# Remove the AGEGRP*/INCGRP* indicator columns before looking at pairwise correlations.
groupColumns = [
    'AGEGRP1', 'AGEGRP2', 'AGEGRP3', 'AGEGRP4', 'AGEGRP5', 'AGEGRP6',
    'INCGRP1', 'INCGRP2', 'INCGRP3', 'INCGRP4', 'INCGRP5', 'INCGRP6',
]
xDegrouped = x.drop(groupColumns, axis=1)

corr = xDegrouped.corr()
# Blank out the self-correlations by position (the diagonal) rather than by
# value: replacing every 1 would also hide two different columns that are
# perfectly correlated, which is exactly what this table is meant to reveal.
corr = corr.where(~np.eye(len(corr), dtype=bool))
corr

Unnamed: 0,AGE,SEX,EDUC,INCOME,KIDHOME,TEENHOME,GOURMAGS,WINEMAGS,STYLMAGS,HOUSMAGS,...,PERCDR,PERCSR,PERCDW,PERCSW,PERCDE,PERCEX,TOTLCOMP,TOTWCOMP,TOTPCOMM,FACEBOOK
AGE,,0.002619,0.004008,0.7109,-0.646689,-0.219803,0.410975,0.221585,0.362207,0.267592,...,-0.034101,0.094689,0.088678,-0.100478,-0.014312,0.109337,0.105599,0.114501,0.041707,-0.115318
SEX,0.002619,,-0.000184,0.001138,0.002303,-0.003635,0.002184,-0.001291,0.435966,0.390781,...,-0.005965,0.00561,0.006305,-0.002653,0.001294,0.0064,0.002032,-0.000674,0.008251,0.00481
EDUC,0.004008,-0.000184,,0.002187,0.007665,-0.005426,0.137069,0.31053,-0.000258,0.000696,...,0.002287,-0.008159,0.001824,0.003582,0.001768,-0.006638,0.106542,0.100368,-0.013074,0.004056
INCOME,0.7109,0.001138,0.002187,,-0.47981,-0.155058,0.506764,0.264022,0.481214,0.332192,...,-0.047264,0.110663,0.106584,-0.115206,-0.007902,0.137107,0.133814,0.142149,0.030944,-0.137645
KIDHOME,-0.646689,0.002303,0.007665,-0.47981,,0.150782,-0.342302,-0.203127,-0.276748,-0.235332,...,0.037588,-0.08895,-0.073766,0.080994,0.013125,-0.100116,-0.091494,-0.087854,-0.041996,0.110675
TEENHOME,-0.219803,-0.003635,-0.005426,-0.155058,0.150782,,-0.217249,-0.122305,-0.155992,-0.155848,...,0.021599,-0.050395,-0.058333,0.052619,0.011207,-0.065858,-0.056391,-0.056513,-0.0211,0.075065
GOURMAGS,0.410975,0.002184,0.137069,0.506764,-0.342302,-0.217249,,0.502898,0.283683,0.303636,...,-0.058934,0.12303,0.120368,-0.123963,-0.004227,0.135017,0.116408,0.138891,-0.044032,-0.153698
WINEMAGS,0.221585,-0.001291,0.31053,0.264022,-0.203127,-0.122305,0.502898,,0.147261,0.156403,...,-0.073336,0.154527,0.075233,-0.108547,-0.009584,0.089212,0.114708,0.119341,-0.030177,-0.138094
STYLMAGS,0.362207,0.435966,-0.000258,0.481214,-0.276748,-0.155992,0.283683,0.147261,,0.58544,...,-0.031393,0.062509,0.063532,-0.062493,-0.001705,0.082559,0.077651,0.072658,0.00773,-0.064607
HOUSMAGS,0.267592,0.390781,0.000696,0.332192,-0.235332,-0.155848,0.303636,0.156403,0.58544,,...,-0.031239,0.040872,0.075783,-0.047523,0.000827,0.08778,0.063898,0.062371,-0.021289,-0.072857


In [434]:
# For every column, print its largest correlation with any other column
# (NaN entries are skipped by max()).
# NOTE(review): the recorded output lists columns (CUSTID, AGEGRP*, INCGRP*) that the
# current `corr` no longer contains -- it comes from an earlier execution; re-run.
for column in corr.columns:
    print(column, corr[column].max())

('CUSTID', 0.015946771099111374)
('AGE', 0.71089979036539142)
('AGEGRP1', 0.071807003763905169)
('AGEGRP2', 0.46032555132893938)
('AGEGRP3', 0.39160481270662262)
('AGEGRP4', 0.19351752061846086)
('AGEGRP5', 0.36208645844158166)
('AGEGRP6', 0.69187483283043771)
('SEX', 0.43596587644947765)
('EDUC', 0.47755133769366165)
('INCOME', 0.79284778499548181)
('INCGRP1', 0.08791983613111469)
('INCGRP2', 0.46032555132893938)
('INCGRP3', 0.17461744141881402)
('INCGRP4', 0.084590477643812972)
('INCGRP5', 0.17578750037256149)
('INCGRP6', 0.79284778499548181)
('KIDHOME', 0.44556162839251423)
('TEENHOME', 0.35807402127772125)
('GOURMAGS', 0.5755663541030962)
('WINEMAGS', 0.50289759928980771)
('STYLMAGS', 0.58544041588393714)
('HOUSMAGS', 0.58544041588393714)
('SPRTMAGS', 0.27296260293959329)
('TRAVMAGS', 0.42614206522796738)
('CULTMAGS', 0.47755133769366165)
('COMPMAGS', 0.34922757052109193)
('FIRSTPUR', 0.99805967521013161)
('LASTPUR', 0.3216766762034024)
('NPURCH', 0.94584656749503593)
('TBOTTLES', 

In [440]:
# Save the correlation table to corr.csv in the working directory.
corr.to_csv('corr.csv')

In [449]:
# Evaluate the model on the first file alone: fraction predicted as 1,
# precision, and profit at the default population size.
predicted1 = model.predict(x1)
predictedAcceptRate1 = predicted1.mean()
precision1 = metrics.precision_score(y1, predicted1)

print(predictedAcceptRate1)
print(precision1)
# calculate_profit recomputes the prediction and precision internally.
print(calculate_profit(model, x1, y1))

0.1771
0.427442123094
347218.4


In [451]:
# Evaluate the model on the second file alone: fraction predicted as 1,
# precision, and profit at the default population size.
predicted2 = model.predict(x2)
predictedAcceptRate2 = predicted2.mean()
precision2 = metrics.precision_score(y2, predicted2)

print(predictedAcceptRate2)
print(precision2)
# calculate_profit recomputes the prediction and precision internally.
print(calculate_profit(model, x2, y2))

0.184618461846
0.42091007584
354536.253625
