In [1]:
import seaborn as sns
import matplotlib.pyplot as plt

ImportError: /usr/lib64/libxcb-xlib.so.0: undefined symbol: _xcb_lock_io

In [37]:
import pandas as pd
import numpy as np
import itertools
import random
import csv
import json
import re
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score,cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score,make_scorer
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn.metrics import confusion_matrix

In [38]:
# Folder that holds the input feature matrices and receives every output file.
DATA_DIR = '/data/rstudio/iberia/data/'

# Label column(s) to model; each entry is handled independently downstream.
TARGET_COLS = ['CAT_BRONZE']

# Integer selectors (55..653) of the columns used as model features.
# Original author note: "con ss" (Spanish: "with ss") - presumably the
# SS-per-store columns are included in this range; TODO confirm.
FEATURES = range(55, 654)

In [39]:
# Load the pipe-separated training feature matrix, indexed by CLIENT_ID.
data = pd.read_csv(
    DATA_DIR + 'feature_matrix_extra_all_w_derivates_weightedaveragefinalSSperstore_def_binarybronze.csv',
    sep='|',
    index_col='CLIENT_ID',
    encoding='utf-8'
)

In [40]:
# Keep only the rows whose USABLE flag equals True.
data = data.loc[data['USABLE'] == True]

## Data Preparation

Convert all features to float and fill NAs in the \_CLOSEST and ind\_ variables.

In [41]:
# Select the feature columns by position. FEATURES holds integers, and plain
# `data[<ints>]` is a label lookup on current pandas, so `.iloc` states the
# positional intent explicitly. `.copy()` gives X_all its own storage so the
# `.loc` assignments below never write into a slice of `data`
# (SettingWithCopyWarning).
X_all = data.iloc[:, list(FEATURES)].copy()

# Positions (within X_all) of the columns whose name ends in _CLOSEST and of
# the columns whose name starts with ind_.
closest=list()
ind=list()
for i, name in enumerate(X_all.columns):
    if re.match('^.*_CLOSEST$', name) is not None:
        closest.append(i)
    if re.match('^ind_.*', name) is not None:
        ind.append(i)

# Fill NA (should be only in _closest) with each column's own max value
nan_cols_closest = X_all.iloc[:, closest].columns[pd.isnull(X_all.iloc[:, closest]).sum() > 0].tolist()
# Mean of the per-column maxima; not used by the fill below, kept so the
# variable stays available to any later cell that reads it.
max_mean = X_all[nan_cols_closest].max().mean()
X_all.loc[:,nan_cols_closest] = X_all[nan_cols_closest].fillna(X_all[nan_cols_closest].max())
# `np.float` was only an alias of the builtin and no longer exists in recent
# NumPy releases; the builtin `float` yields the same float64 dtype.
X_all = X_all.astype(float)

# Fill NA (should be only in ind_) with 0
# (the frame is already float here, so no second cast is needed afterwards)
nan_cols_ind = X_all.iloc[:, ind].columns[pd.isnull(X_all.iloc[:, ind]).sum() > 0].tolist()
X_all.loc[:,nan_cols_ind] = X_all[nan_cols_ind].fillna(0)

# Repair boolean variables
X_all.loc[:,['IS_KIOSK','COAST','MEGA_CITY']] = X_all[['IS_KIOSK','COAST','MEGA_CITY']].astype('bool')

Add TYPE_ESTABLISHMENT_NIELSEN

In [42]:
# Encode the establishment type as integer codes and append it to the features.
# The fitted encoder stays available as `le_establ`.
le_establ = preprocessing.LabelEncoder()
X_all['TYPE_ESTABLISHMENT_NIELSEN'] = le_establ.fit_transform(data['TYPE_ESTABLISHMENT_NIELSEN'])

In [43]:
# Training subset: rows flagged IN_NIELSEN, features and targets side by side.
X_nielsen = X_all.loc[data['IN_NIELSEN'] == True]
Y_nielsen = data.loc[data['IN_NIELSEN'] == True, TARGET_COLS]

## Models definitions

In [44]:
# 10-fold splitter; it is the default `cv` of the cross-validation helpers below.
kf = KFold(n_splits=10)

# Random forest used for feature ranking and for classification.
rf_class = RandomForestClassifier(
    n_estimators=100,
    random_state=123
)
# Class-weighted variants that were tried as alternatives:
#rf_class = RandomForestClassifier(n_estimators=100, random_state=123, class_weight='balanced_subsample')
#rf_class = RandomForestClassifier(n_estimators=100, random_state=123, class_weight={1: 0.2, 2: 0.5})

## Functions

__Feature Selection__

In [27]:
def feature_selection(X,Y,target_cols,model):
    """Rank the columns of ``X`` by model importance, once per target column.

    For every name in ``target_cols`` the model is fitted on the rows of ``X``
    that share the target's index, and its ``feature_importances_`` are paired
    with the column names of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names label the importances.
    Y : pandas.DataFrame
        Holds one column per entry of ``target_cols``.
    target_cols : iterable of str
        Target columns to process.
    model : estimator
        Object exposing ``fit(X, y)`` and ``feature_importances_``.
        It is refitted in place for every target.

    Returns
    -------
    (dict, dict)
        Both map target name -> list of ``(importance, column_name)`` tuples
        sorted in descending order. The first keeps only the entries whose
        importance is strictly above twice the mean importance; the second
        keeps every column.
    """
    column_names = X.columns.tolist()
    selected = {}
    ranked = {}

    for target in target_cols:
        labels = Y[target]
        model.fit(X.loc[labels.index, :], labels)

        importances = model.feature_importances_
        # Selection cut-off: twice the average importance.
        cutoff = 2 * np.mean(importances)

        ranking = sorted(zip(importances, column_names), reverse=True)
        ranked[target] = ranking
        selected[target] = [pair for pair in ranking if pair[0] > cutoff]

    return selected, ranked

In [28]:
# Rank the features for every target: `features_rel` keeps the entries above the
# importance cut-off, `features_rel_all` keeps the full ranking.
features_rel, features_rel_all = feature_selection(
    X_nielsen, Y_nielsen, TARGET_COLS, rf_class
)
features_rel

{'CAT_BRONZE': [(0.070941644638310361, 'TYPE_ESTABLISHMENT_NIELSEN'),
  (0.014006159766126314, u'LANGUAGE_SCHOOL_CLOSEST'),
  (0.011349214061642078, u'FOODKIDS_CLOSEST'),
  (0.010489020848390894, u'DRIVING_SCHOOL_CLOSEST'),
  (0.010435401056536784, u'RAIL_STATION_CLOSEST'),
  (0.010240337367545116, u'IS_KIOSK'),
  (0.0099069749457466535, u'DRINKING_WATER_CLOSEST'),
  (0.0098957915306923955, u'USERS_OVER50_022016'),
  (0.0098466426312491286, u'VISITS_NOCHE_SUM'),
  (0.0097462192084006964, u'SUBWAY_ENTRANCE_CLOSEST'),
  (0.0097410576543069533, u'MUSIC_SCHOOL_CLOSEST'),
  (0.0096607097158421477, u'GOLF_COURSE_CLOSEST'),
  (0.0096425463441631824, u'ACCOMMODATION_CLOSEST'),
  (0.0096201409866053262, u'PITCH_CLOSEST'),
  (0.009594153070974068, u'TOURIST_ATTRACTION_CLOSEST'),
  (0.0095796846456824986, u'PARKING_CLOSEST'),
  (0.0094214969754085158, u'SCHOOL_CLOSEST'),
  (0.009397177323811505, u'VISITS_NOCHE_062015'),
  (0.0093421961340092955, u'SWIMMING_POOL_CLOSEST'),
  (0.0093374523086541313

In [44]:
# Persist the complete importance ranking (every feature, every target) as JSON.
with open(
    DATA_DIR + 'features_importance_POS_category_classification_all_weight.json',
    'w'
) as json_file:
    json.dump(features_rel_all, json_file)

__Classification__

In [45]:
def category_classification_score(X,Y,target_cols,model,features=False,cv=kf):
    """Cross-validate ``model`` separately on every target column.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are aligned to each target through its index.
    Y : pandas.DataFrame
        Holds one column per entry of ``target_cols``.
    target_cols : iterable of str
        Target columns to score.
    model : estimator
        Passed unchanged to ``cross_val_score``.
    features : False, None or dict, optional
        ``False`` (default) or ``None`` uses every column of ``X``. Otherwise a
        mapping target name -> sequence of ``(importance, column_name)`` pairs
        (the shape returned by ``feature_selection``); only the named columns
        are used for that target.
    cv : cross-validation splitter, optional
        Defaults to the notebook-level ``kf``.

    Returns
    -------
    dict
        Target name -> per-fold scores returned by ``cross_val_score``.
    """
    scores = dict()

    for col in target_cols:
        # Single-argument print calls behave the same on Python 2 and 3.
        print(col)

        y = Y[col]
        X_pred = X.loc[y.index,:]

        # Identity checks instead of `features != False`: the comparison was
        # only ever meant to detect the "no selection" sentinel.
        if features is not None and features is not False:
            feat = [f[1] for f in features[col]]
            X_pred = X_pred[feat]

        scores[col] = cross_val_score(model, X_pred, y, cv=cv)

        print(scores[col])
        print(np.mean(scores[col]))

    return scores

In [46]:
# Per-fold cross-validation scores for every target, using all features.
class_scores = category_classification_score(
    X_nielsen, Y_nielsen, TARGET_COLS, rf_class, cv=kf
)

CAT_BRONZE
[ 0.59036145  0.5873494   0.6214178   0.61689291  0.61387632  0.53242836
  0.54600302  0.58823529  0.53846154  0.56561086]
0.580063693689


__Predictions__

In [12]:
def category_classification_prediction(X_train,Y_train,X_test,Y_test,target_cols,model,features=False):
    """Fit ``model`` on the training data and predict the test rows, per target.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature matrices for fitting and for prediction.
    Y_train, Y_test : pandas.DataFrame
        Each holds one column per entry of ``target_cols``. ``Y_test`` is not
        used for fitting; its columns are returned unchanged for comparison.
    target_cols : iterable of str
        Target columns to process.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; refitted in place
        for every target.
    features : False, None or dict, optional
        ``False`` (default) or ``None`` uses every column. Otherwise a mapping
        target name -> sequence of ``(importance, column_name)`` pairs; only
        the named columns are used for both fitting and prediction.

    Returns
    -------
    (dict, dict)
        ``targets`` maps target name -> ``Y_test[target]``;
        ``preds`` maps target name -> the output of ``model.predict``.
    """
    targets = dict()
    preds = dict()

    for col in target_cols:
        # Single-argument print call behaves the same on Python 2 and 3.
        print(col)

        y = Y_train[col]
        X_pred = X_train.loc[y.index,:]
        X_test_aux = X_test

        # Identity checks instead of `features != False`: the comparison was
        # only ever meant to detect the "no selection" sentinel.
        if features is not None and features is not False:
            feat = [f[1] for f in features[col]]
            X_pred = X_pred[feat]
            X_test_aux = X_test[feat]

        model.fit(X_pred,y)

        targets[col] = Y_test[col]
        preds[col] = model.predict(X_test_aux)

    return targets,preds

In [163]:
data_topredict = pd.read_csv(DATA_DIR + 'feature_matrix_extra_all_w_derivates_weightedaveragefinalSSperstore_def.csv',sep='|',index_col='CLIENT_ID',encoding='utf-8')

data_topredict = data_topredict[data_topredict.USABLE == True]

# Select the feature columns by position. FEATURES holds integers, and plain
# `frame[<ints>]` is a label lookup on current pandas, so `.iloc` states the
# positional intent explicitly. `.copy()` gives X_topredict its own storage so
# the assignments below never write into a slice of `data_topredict`.
X_topredict = data_topredict.iloc[:, list(FEATURES)].copy()
Y_topredict = data_topredict[TARGET_COLS]

# Positions (within X_topredict) of the columns whose name ends in _CLOSEST and
# of the columns whose name starts with ind_.
closest=list()
ind=list()
for i, name in enumerate(X_topredict.columns):
    if re.match('^.*_CLOSEST$', name) is not None:
        closest.append(i)
    if re.match('^ind_.*', name) is not None:
        ind.append(i)

# Fill NA (should be only in _closest) with each column's own max value
nan_cols_closest = X_topredict.iloc[:, closest].columns[pd.isnull(X_topredict.iloc[:, closest]).sum() > 0].tolist()
# Mean of the per-column maxima; not used by the fill below, kept so the
# variable stays available to any later cell that reads it.
max_mean = X_topredict[nan_cols_closest].max().mean()
X_topredict.loc[:,nan_cols_closest] = X_topredict[nan_cols_closest].fillna(X_topredict[nan_cols_closest].max())
# `np.float` was only an alias of the builtin and no longer exists in recent
# NumPy releases; the builtin `float` yields the same float64 dtype.
X_topredict = X_topredict.astype(float)

# Fill NA (should be only in ind_) with 0
# (the frame is already float here, so no second cast is needed afterwards)
nan_cols_ind = X_topredict.iloc[:, ind].columns[pd.isnull(X_topredict.iloc[:, ind]).sum() > 0].tolist()
X_topredict.loc[:,nan_cols_ind] = X_topredict[nan_cols_ind].fillna(0)

# Repair boolean variables
X_topredict.loc[:,['IS_KIOSK','COAST','MEGA_CITY']] = X_topredict[['IS_KIOSK','COAST','MEGA_CITY']].astype('bool')

# Add TYPE ESTABLISHMENT using the encoder that was fitted on the training
# data (`le_establ`). Refitting a fresh encoder here could assign different
# integer codes to the same category whenever the two files do not contain
# exactly the same set of values, silently corrupting this feature at
# prediction time. `transform` raises ValueError on a category the training
# data never contained, which surfaces the mismatch instead of hiding it.
X_topredict['TYPE_ESTABLISHMENT_NIELSEN'] = le_establ.transform(data_topredict.TYPE_ESTABLISHMENT_NIELSEN)

In [None]:
# Fit on the Nielsen subset, predict every row of X_topredict and export the
# predictions joined with the known target columns.
targets, preds = category_classification_prediction(
    X_nielsen, Y_nielsen, X_topredict, Y_topredict, TARGET_COLS, rf_class
)
Y_pred = pd.DataFrame(preds, index=X_topredict.index)
Y_pred.columns = [name + '_pred' for name in Y_pred.columns]
Y_pred.join(Y_topredict).to_csv(
    DATA_DIR + 'POS_category_classification_prediction_all_weight.csv',
    sep=';',
    encoding='utf-8'
)

In [77]:
# Store each target's predictions in a '<target>_pred' column of data_topredict,
# aligned on the index of the corresponding target series.
for col in TARGET_COLS:
    pred_column = col + '_pred'
    row_labels = targets[col].index
    data_topredict.loc[row_labels, pred_column] = preds[col]

In [78]:
# Compare predictions with the known values, restricted to rows where the
# target is present.
for col in TARGET_COLS:
    # Single-argument print calls behave the same on Python 2 and 3.
    print(col)
    results = data_topredict[data_topredict[col].notnull()][[col,col + '_pred']]
    # NOTE(review): r2_score is a regression metric, while TARGET_COLS now holds
    # 'CAT_BRONZE', which looks like a class label - confirm whether
    # accuracy_score / confusion_matrix should be reported here instead.
    print(r2_score(results[col],results[col + '_pred']))
    results = results.sort_values(col)

TOTAL_CHIPS_LESS_50
0.107432147647
TOTAL_CHIPS_MORE_50
0.30273039743
TOTAL_SNACKS_LESS_50
0.257690411939
TOTAL_SNACKS_MORE_50
0.214973831111


In [None]:
def classification_cross_val_predict(X,Y,target_cols,model,features=False,cv=kf,sample_weight=None):
    """Produce out-of-fold predictions of ``model`` for every target column.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are aligned to each target through its index.
    Y : pandas.DataFrame
        Holds one column per entry of ``target_cols``.
    target_cols : iterable of str
        Target columns to process.
    model : estimator
        Passed unchanged to ``cross_val_predict``.
    features : False, None or dict, optional
        ``False`` (default) or ``None`` uses every column of ``X``. Otherwise a
        mapping target name -> sequence of ``(importance, column_name)`` pairs;
        only the named columns are used for that target.
    cv : cross-validation splitter, optional
        Defaults to the notebook-level ``kf``.
    sample_weight : array-like or None, optional
        When given, forwarded to the estimator's ``fit`` through
        ``fit_params={'sample_weight': ...}``. Presumably needs one weight per
        row of ``X`` so that it can be split per fold - verify against the
        installed scikit-learn version. ``None`` (default) fits unweighted.

    Returns
    -------
    (dict, dict)
        ``targets`` maps target name -> ``Y[target]``;
        ``preds`` maps target name -> the output of ``cross_val_predict``.
    """
    targets = dict()
    preds = dict()

    for col in target_cols:
        # Single-argument print call behaves the same on Python 2 and 3.
        print(col)

        y = Y[col]
        X_pred = X.loc[y.index,:]

        # Identity checks instead of `features != False`: the comparison was
        # only ever meant to detect the "no selection" sentinel.
        if features is not None and features is not False:
            feat = [f[1] for f in features[col]]
            X_pred = X_pred[feat]

        targets[col] = y
        # The weights used to come from a global `w` that is not defined
        # anywhere in the notebook (NameError on a fresh kernel); they are now
        # an explicit, optional argument.
        if sample_weight is None:
            preds[col] = cross_val_predict(model, X_pred, y, cv=cv)
        else:
            preds[col] = cross_val_predict(model, X_pred, y, cv=cv, fit_params={'sample_weight': sample_weight})

    return targets,preds

In [None]:
# Out-of-fold predictions for the Nielsen subset, exported joined with the
# known target columns.
targets, preds = classification_cross_val_predict(
    X_nielsen, Y_nielsen, TARGET_COLS, rf_class, cv=kf
)
Y_pred = pd.DataFrame(preds, index=X_nielsen.index)
Y_pred.columns = [name + '_pred' for name in Y_pred.columns]
Y_pred.join(Y_nielsen).to_csv(
    DATA_DIR + 'POS_category_classification_prediction_all_weight_w_cross_valid.csv',
    sep=';',
    encoding='utf-8'
)