#### Train RandomForest Model to deliver 1st generation of prediction
    * feature: demographic informaiton + binary varialbe (about if session data available)
    * algorithm: random-forest

In [1]:
import sys
import os
import glob
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

%matplotlib inline
%precision 4

'%.4f'

In [34]:
root_dir = os.getcwd()
data_dir = os.path.join(root_dir, 'data')
output_dir = os.path.join(root_dir, 'output')
images_dir = os.path.join(root_dir, 'images')
model_dir = os.path.join(root_dir, 'models')

In [301]:
from os.path import join

train_user = pd.read_csv(join(data_dir, 'train_users.csv'), header=0)
test_user  = pd.read_csv(join(data_dir, 'test_users.csv'), header=0)
session_df = pd.read_csv(join(data_dir, 'sessions.csv'), header=0)

In [4]:
# create session availabilit indicator
session_uid = session_df['user_id'].unique()

In [5]:
train_user['session_ind'] = train_user['id'].isin(session_uid)
train_user['session_ind'] = train_user['session_ind'].apply(lambda x: 1 if x==True else 0)
print("{} ({}) of users found session records".format(sum(train_user['session_ind']), train_user.shape[0]))

31202 (171239) of users found session records


In [6]:
test_user['session_ind'] = test_user['id'].isin(session_uid)
test_user['session_ind'] = test_user['session_ind'].apply(lambda x: 1 if x==True else 0)
print("{} ({}) of users found session records".format(sum(test_user['session_ind']), test_user.shape[0]))

43408 (43673) of users found session records


In [7]:
train_dest_ind = train_user['country_destination'].apply(lambda x: 1 if x!='NDF' else 0)
pd.crosstab(train_user['session_ind'], train_dest_ind, margins=True)

country_destination,0,1,All
session_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,80163,59874,140037
1,18989,12213,31202
All,99152,72087,171239


### conditional probability of booked trip on session_id

* p(booked | session_ind=False) = 0.428
* p(booked | session_indx=True) = 0.391

### conclusion: if having session record is irrelevant on if customer booked a trip

In [8]:
# chi-sqaure test on correlation between destination country and lanaguage preferrence
from scipy.stats import chi2_contingency

dest_lang_crosstab = pd.crosstab(train_user['country_destination'], train_user['language'])
chi2, p, dof, ex  = chi2_contingency(dest_lang_crosstab, correction=True)

print("chi-square:{}, p-value:{}".format(chi2, p))

chi-square:753.5918275937361, p-value:3.246969993693663e-51


### Conclusion: 
strong correlation between session_record availibility and booked trip, P-value << 0.05% rejected the null hypothesis: (two variable are independent), the alternative hypothesis (destination country is correlated with language perferrence) is accepted.

In [9]:
pd.crosstab(train_dest_ind, train_user['gender'], margins=True)

gender,-unknown-,FEMALE,MALE,OTHER,All
country_destination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,50829,25960,22265,98,99152
1,23179,26290,22475,143,72087
All,74008,52250,44740,241,171239


In [10]:
# Gender: OTHER's influence on country_destination
print("ratio: {}".format( (143/72087.0) / (98/99152.0) ))

ratio: 2.007032885150398


In [11]:
# age outlier
age_lt_15 = train_user['age'].apply(lambda x: 1 if x<15 else 0)
age_gt_80 = train_user['age'].apply(lambda x: 1 if x>80 else 0)
age_gt_100 = train_user['age'].apply(lambda x: 1 if x>100 else 0)

pd.crosstab(train_dest_ind, age_lt_15, margins=True)

age,0,1,All
country_destination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,99108,44,99152
1,72072,15,72087
All,171180,59,171239


In [12]:
pd.crosstab(train_dest_ind, age_gt_80, margins=True)

age,0,1,All
country_destination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,98007,1145,99152
1,70869,1218,72087
All,168876,2363,171239


In [13]:
pd.crosstab(train_dest_ind, age_gt_100, margins=True)

age,0,1,All
country_destination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,98261,891,99152
1,70978,1109,72087
All,169239,2000,171239


In [14]:
x_ctg_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', \
               'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser', 'session_ind']
x_num_feats = ['age']
x_feats     = x_ctg_feats + x_num_feats
y_feat      = 'country_destination'

In [15]:
# LabelEncoder: convert string-based categorical to integers
# OneHotEncoder: re-encode integers with a set of binary indicators
# DictVectorizer: convert dictionary (feat_name: value) to vectorization, do not transfrom numeric value, but, record 
#            categorical vector
data = train_user.ix[1:10, ['gender', 'signup_method']]
data

Unnamed: 0,gender,signup_method
1,MALE,facebook
2,FEMALE,basic
3,FEMALE,facebook
4,-unknown-,basic
5,-unknown-,basic
6,FEMALE,basic
7,FEMALE,basic
8,-unknown-,basic
9,-unknown-,basic
10,FEMALE,basic


In [16]:
data.T.to_dict().values()

dict_values([{'signup_method': 'facebook', 'gender': 'MALE'}, {'signup_method': 'basic', 'gender': 'FEMALE'}, {'signup_method': 'facebook', 'gender': 'FEMALE'}, {'signup_method': 'basic', 'gender': '-unknown-'}, {'signup_method': 'basic', 'gender': '-unknown-'}, {'signup_method': 'basic', 'gender': 'FEMALE'}, {'signup_method': 'basic', 'gender': 'FEMALE'}, {'signup_method': 'basic', 'gender': '-unknown-'}, {'signup_method': 'basic', 'gender': '-unknown-'}, {'signup_method': 'basic', 'gender': 'FEMALE'}])

In [17]:
from sklearn.feature_extraction import DictVectorizer

feat_names = data.columns
data_dict = data.T.to_dict().values()

vectorizer = DictVectorizer(sparse=False)
train_data = vectorizer.fit_transform(data_dict)
feat_names = vectorizer.get_feature_names()

### Pre-processing Features
* encode categorical variables with OneHotEncoder
* populate missing value with Median (numeric), Mode (Categorical)

In [18]:
x_cat_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', \
               'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', \
               'session_ind']
               #'first_browser', 'session_ind']
# first_affiliate_tracked
x_num_feats = ['age']
y_feat      = 'country_destination'

In [19]:
# handling Missing value
# first_affiliate_tracked
nan_idx = train_user['first_affiliate_tracked'].isnull()
train_user.loc[nan_idx, 'first_affiliate_tracked'] = 'untracked'

nan_idx = test_user['first_affiliate_tracked'].isnull()
test_user.loc[nan_idx, 'first_affiliate_tracked'] = 'untracked'

# age
nan_idx = train_user['age'].isnull()
age_median = train_user['age'].quantile(.5)
train_user.loc[nan_idx, 'age'] = age_median

nan_idx = test_user['age'].isnull()
test_user.loc[nan_idx, 'age'] = age_median

In [20]:
# numeric variable, data tranformation
def intAge(x):
    int10 = (x-x%10)/10
    if int10 < 2:
        res = 1
    elif int10 >= 8:
        res = 8
    else:
        res = int10
    return int10

train_user['age_int10'] = train_user['age'].apply(intAge)
test_user['age_int10'] = test_user['age'].apply(intAge)

# handling categorical variable which was encoded by integers
singup_flow_freqs = train_user['signup_flow'].value_counts() / train_user.shape[0]
insign_signups = np.array(singup_flow_freqs.index)[singup_flow_freqs.values < 0.01].tolist()

def signupFlowRecoder(x, infreq_types=None):
    if infreq_types!=None:
        res = -1 if x in infreq_types else x
    else:
        res = x
    return str(res)

train_user['signup_flow'] = train_user['signup_flow'].apply(lambda x: signupFlowRecoder(x, insign_signups))
test_user['signup_flow'] = test_user['signup_flow'].apply(lambda x: signupFlowRecoder(x, insign_signups))

In [21]:
# categorical variables transformation
from sklearn.feature_extraction import DictVectorizer

cat_train_dict = train_user[x_cat_feats].T.to_dict().values()
cat_test_dict = test_user[x_cat_feats].T.to_dict().values()

# training vectorizer 
vectorizer    = DictVectorizer(sparse=False)
cat_train_vec = vectorizer.fit_transform(cat_train_dict)
cat_test_vec  = vectorizer.transform(cat_test_dict)

# retrieve transformed feature names
vec_featnames = vectorizer.get_feature_names()

In [31]:
# convert dependent variable
from sklearn.preprocessing import LabelBinarizer

LabelEncoder = LabelBinarizer()
y = LabelEncoder.fit_transform(train_user[y_feat])

In [23]:
# complete predictor x
cat_train_df = pd.DataFrame(cat_train_vec, columns=vec_featnames)
train_frames = [cat_train_df, train_user['age_int10']]
x_train_df = pd.concat(train_frames, axis=1)
#x_train_df.set_index(train_user['id'])

cat_test_df = pd.DataFrame(cat_test_vec, columns=vec_featnames)
test_frames = [cat_test_df, test_user['age_int10']]
x_test_df = pd.concat(test_frames, axis=1)

In [24]:
# rename test data to avoid confusion in cross_validation
x_new_df = x_test_df

#### develop first RandomForest Model

In [32]:
from sklearn.cross_validation import train_test_split

random_seed = 1234

# split data to Development and Test
x_dev, x_test, y_dev, y_test = train_test_split(x_train_df, y, test_size=0.2, 
                                                    random_state=random_seed )

# split data to Training and Validation
x_train, x_valid, y_train, y_valid = train_test_split(x_dev, y_dev, test_size=0.3, 
                                                    random_state=random_seed)

In [33]:
# develop random forest
from sklearn.ensemble import RandomForestClassifier as RF

# parameter configuration
n_trees = 100
max_features = int( round( np.sqrt( x_train.shape[1] ) * 2 )) # try more features at each split
max_features = 'auto'
n_jobs = 2

# turn off deprecated function warning
import warnings
warnings.filterwarnings("ignore")

rf = RF( n_estimators = n_trees, max_features = max_features, n_jobs = n_jobs )
rf.fit( x_train, y_train )

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [277]:
def getPredLabel(x, class_labels):
    """get label information from binary indicators"""
    idx = [i for i, val in enumerate(x) if val==1][0]
    label = class_labels[idx]
    return label

def getUserClassProb(row_idx, multiclass_prob):
    """ return a particular row's probability production
        generated by multi_class_estimator.predict_proba()
    
    Parameters:
    ----------
    row_idx: <integer> the row_index where values 
    multiclass_prob: <numpy.array> output of 
        sklearn.multi_class_estimator.predict_proba()
    
    Return:
    ------
    res: <vector-like, float> 
    """
    return [col[row_idx][1] for col in multiclass_prob]

def getTopKLabel(probas, all_labels, K=5, weights=None):
    """ extract top k most prossible labels
    
    Parameters:
    ----------
    probas: <vector-like, float>, the vector of predcited 
         likelihood for each possible labels/classes
    all_labels: <vector-like, label>, the vector of all
         possible labels which can be assigned to sample
    K: <integer> the number of labels returned as prediction
    weights: <vector-like> the weights can be assigned to
         every label's prediction
    
    Returns:
    --------
    res: <vector-like> a vector of predicted label of size
         defined by K
    """
    import numpy as np
    
    if len(probas) != len(all_labels):
        raise ValueError("all_lables's length is different from probas'")
    
    if weights == None:
        weighted_prob = probas
    else: 
        if len(weights) != len(all_labels):
            raise ValueError("all_lables's length is different from weights'")
        weighted_prob = np.array(probas) * np.array(weights)
        
    predDict = [(a, b) for a, b in zip(class_labels, weighted_prob)]
    sortedPred = sorted(predDict, key=lambda x:x[1], reverse=True)
    res = [x[0] for i,x in enumerate(sortedPred) if i < K]
    
    return res

def calDCG(pred_labels, true_label):
    """ calculate discounted Cumulative Gain
        
    Parameters:
    -----------
    pred_labels: <vector-like, list> sorted prediction labels
    true_label: <value> string or integer denotes the true label
    
    Returns:
    --------
    score: <float> NDCG score
    """
    score = 0
    for i, label in enumerate(pred_labels):
        power = 1 if label == true_label else 0
        score += (2**power-1) / np.log2(i+1+1) # 0-based index should be adjusted up by 1
        if power == 1:
            break
    return score


def calNDCG(pred_prob, true_labels, all_labels, K, weights=None, return_prediction=False):
    """ calcualte Normalized Discrete Comulative Gain
    
    Paramters:
    ---------
    pred_prob: <matrix-like, float> probabilities <n_samples, n_classes>
    true_labels: <vector-like, integer or string> the list of true labels of samples 
    all_labels: <vecotor-like, integer or string> all possible classes
    K: <integer> the number of prediction can be made for a sample
    weights: <vector-like, float> the weights assigned to adjust class-wise
        probability piror to prediciton ranking
    return_prediction: <boolean> return_prediction==True, it returns 
        predicted labels per sample in list structure
        
    Returns:
    -------
    NDCG: <numeric> 
    pred_labels_list: <list, list> 
    """
    
    tot_DCG = 0
    DCG_list = []
    pred_labels_list = []
    counter = 1
    
    tot_rows = pred_prob[0].shape[0]
    
    if tot_rows != len(true_labels):
        raise ValueError("pred_prob matrix is not compatible with true_labels !")
    
    for row_idx in np.arange(tot_rows):
        the_true_label  = true_labels[row_idx]
        the_proba_list  = getUserClassProb(row_idx, pred_prob)
        the_pred_labels = getTopKLabel(the_proba_list, all_labels, K=5, weights=weights)
        the_DCG = calDCG(the_pred_labels, the_true_label)
    
        DCG_list.append(the_DCG)
        pred_labels_list.append(the_pred_labels)
    
        tot_DCG += the_DCG
        counter += 1

    NDCG = tot_DCG / counter
    
    if return_prediction == True:
        return (NDCG, pred_labels_list)
    else:
        return NDCG

In [278]:
# train score
valid_pred_prob  = rf.predict_proba(x_train)
true_labels = [getPredLabel(row, class_labels) for row in y_train]
res = calNDCG(valid_pred_prob, true_labels, class_labels, K=5, return_prediction=False)
res

0.8477

In [279]:
# validation score
valid_pred_prob  = rf.predict_proba(x_valid)
true_labels = [getPredLabel(row, class_labels) for row in y_valid]
res = calNDCG(valid_pred_prob, true_labels, class_labels, K=5, return_prediction=False)
res

0.8004

In [280]:
# test score
valid_pred_prob  = rf.predict_proba(x_test)
true_labels = [getPredLabel(row, class_labels) for row in y_test]
res = calNDCG(valid_pred_prob, true_labels, class_labels, K=5, return_prediction=False)
res

0.7995

In [296]:
# re-train model with all train data
rf.fit( x_dev, y_dev )
new_pred_prob = rf.predict_proba(x_new_df)
probs = [getUserClassProb(row_idx, new_pred_prob) for row_idx in range(x_new_df.shape[0])]
predictions = [getTopKLabel(prob, class_labels, K=5) for prob in probs]

In [304]:
# format recommendation(prediction) to the submission format
counter = 0
recom_pairs = []
for uid, countries in zip(test_user['id'], predictions):
    for country in countries:
        recom_pairs.append({"id":uid, "country":country}) 
recom_pairs = pd.DataFrame(recom_pairs)
recom_pairs = recom_pairs[["id", "country"]]

In [305]:
# output submission
recom_pairs.to_csv(join(output_dir, "initial_recommendation_submission.csv"), 
                   sep=',', header=True, encoding ='utf-8', index=False)

In [299]:
from sklearn.externals import joblib
model_file = join(model_dir, 'randomForest_allTrain_20151128.pkl')
joblib.dump(rf, model_file) 

['/Users/beingzy/Dropbox/kaggle/airbnb/models/randomForest_allTrain_20151128.pkl',
 '/Users/beingzy/Dropbox/kaggle/airbnb/models/randomForest_allTrain_20151128.pkl_01.npy',
 '/Users/beingzy/Dropbox/kaggle/airbnb/models/randomForest_allTrain_20151128.pkl_02.npy',
 '/Users/beingzy/Dropbox/kaggle/airbnb/models/randomForest_allTrain_20151128.pkl_03.npy',
 '/Users/beingzy/Dropbox/kaggle/airbnb/models/randomForest_allTrain_20151128.pkl_04.npy',
 '/Users/beingzy/Dropbox/kaggle/airbnb/models/randomForest_allTrain_20151128.pkl_05.npy',
 '/Users/beingzy/Dropbox/kaggle/airbnb/models/randomForest_allTrain_20151128.pkl_06.npy',
 '/Users/beingzy/Dropbox/kaggle/airbnb/models/randomForest_allTrain_20151128.pkl_07.npy',
 '/Users/beingzy/Dropbox/kaggle/airbnb/models/randomForest_allTrain_20151128.pkl_08.npy',
 '/Users/beingzy/Dropbox/kaggle/airbnb/models/randomForest_allTrain_20151128.pkl_09.npy',
 '/Users/beingzy/Dropbox/kaggle/airbnb/models/randomForest_allTrain_20151128.pkl_10.npy',
 '/Users/beingzy/

In [None]:
# bins age, inspired by https://www.kaggle.com/gshguru/airbnb-recruiting-new-user-bookings/fisrtscript/run/111618/code