Note: to sort the list X by the corresponding values in the list Y, use `[x for (y, x) in sorted(zip(Y, X))]`.
Source: http://stackoverflow.com/questions/6618515/sorting-list-based-on-values-from-another-list

In [1]:
import numpy as np, pandas as pd
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier

# Seed NumPy's global random number generator.
np.random.seed(0)

# Read the three raw CSV inputs: training users, test users and the
# sessions log.
df_train = pd.read_csv('../input/train_users_2.csv')
df_test = pd.read_csv('../input/test_users.csv')
sessions = pd.read_csv('../input/sessions.csv')

# Keep the ids of the test users aside (as a Series, in the order read).
id_test = df_test.loc[:, 'id']

In [2]:
# Number of training rows; used later to split the stacked frame back into
# its train and test parts.
piv_train = len(df_train)

# Stack train on top of test with a fresh index, then discard
# date_first_booking.
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_all = df_all.drop(['date_first_booking'], axis=1)

##### Feature engineering #####
# date_account_created: split the '-'-separated string into integer fields
# and keep the first three as year / month / day.
dac = np.vstack(
    df_all['date_account_created']
    .astype(str)
    .map(lambda stamp: [int(field) for field in stamp.split('-')])
    .values
)
df_all['dac_year'] = dac[:, 0]
df_all['dac_month'] = dac[:, 1]
df_all['dac_day'] = dac[:, 2]
df_all = df_all.drop(['date_account_created'], axis=1)

# timestamp_first_active: cut the digit string at fixed offsets
# (4 + 2 + 2 + 2 + 2 + 2 characters); only the first three fields are kept
# as features.
tfa = np.vstack(
    df_all['timestamp_first_active']
    .astype(str)
    .map(lambda stamp: [int(stamp[lo:hi])
                        for lo, hi in ((0, 4), (4, 6), (6, 8), (8, 10), (10, 12), (12, 14))])
    .values
)
df_all['tfa_year'] = tfa[:, 0]
df_all['tfa_month'] = tfa[:, 1]
df_all['tfa_day'] = tfa[:, 2]
df_all = df_all.drop(['timestamp_first_active'], axis=1)

In [3]:
#Parsing sessions
# Group the sessions log by user_id and reduce every other column with
# np.count_nonzero. 'action' and 'secs_elapsed' are renamed to 'session_count'
# and 'secs_tot'; the three remaining descriptive columns are dropped.
# NOTE(review): np.count_nonzero counts non-zero/truthy values rather than rows;
# presumably meant as "number of logged actions per user" - confirm.
sessions_parsed = sessions.groupby(['user_id'],as_index=False).aggregate(np.count_nonzero).rename(columns={'action':'session_count','secs_elapsed':'secs_tot'}).drop(['action_type','action_detail','device_type'],axis=1)
# Replace secs_tot with the per-user sum of secs_elapsed.
# NOTE(review): the right-hand side comes from a second, separate groupby and is
# aligned by index label, so this relies on both groupbys yielding users in the
# same order - confirm.
sessions_parsed['secs_tot'] = sessions.groupby(['user_id'],as_index=False).aggregate(np.sum)['secs_elapsed']
# Total seconds divided by the session count computed above.
sessions_parsed['secs_avg'] = sessions_parsed['secs_tot']/sessions_parsed['session_count']
#Merging sessions_parsed to df_all
# Left join on df_all.id == sessions_parsed.user_id; the redundant user_id key is
# dropped afterwards. Users without a match get NaN in the session columns
# (later cells test session_count.isnull() to find them).
df_all = df_all.merge(sessions_parsed, how='left', left_on='id', right_on='user_id').drop(['user_id'], axis=1)

In [4]:
# Undo the stacking: the first piv_train rows are the training users (their id
# is no longer needed), the rest are the test users (which carry no target).
df_train = df_all.iloc[:piv_train].drop(['id'], axis=1)
df_test = df_all.iloc[piv_train:].drop(['country_destination'], axis=1)

# Split the training users into those with session data (df_train_s) and
# those without (df_train_); the latter lose the session columns.
train_has_sessions = df_train['session_count'].notnull()
df_train_s = df_train[train_has_sessions]
df_train_ = df_train[~train_has_sessions].drop(['session_count', 'secs_tot', 'secs_avg'], axis=1)

# Same split for the test users.
test_has_sessions = df_test['session_count'].notnull()
df_test_s = df_test[test_has_sessions]
df_test_ = df_test[~test_has_sessions].drop(['session_count', 'secs_tot', 'secs_avg'], axis=1)

In [5]:
# Row counts of the four partitions, space-separated on one line. Written as a
# single-argument print(...) call so it produces the same text on Python 2 and
# Python 3 (the bare `print a, b` statement is a SyntaxError on Python 3).
print(' '.join(str(len(part)) for part in (df_train_s, df_train_, df_test_s, df_test_)))

73815 139636 61668 428


The sessions data is available for most of the users in the test set, while it is available for only about 1/3 of the users in the training set.

In [6]:
# Training partitions: take the target column out as a NumPy array, then
# remove it from the feature frame.
labels_s = df_train_s.country_destination.values
df_train_s = df_train_s.drop('country_destination', axis=1)

labels_ = df_train_.country_destination.values
df_train_ = df_train_.drop('country_destination', axis=1)

# Test partitions: same treatment for the id column, which is needed later
# to build the submission but must not be used as a feature.
id_test_s = df_test_s.id.values
df_test_s = df_test_s.drop('id', axis=1)

id_test_ = df_test_.id.values
df_test_ = df_test_.drop('id', axis=1)

In [16]:
def more_feature_engring(df=None):
    """Fill missing values, sanitise ``age`` and one-hot encode categoricals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``age`` column and every column listed in
        ``ohe_feats`` below. The ``None`` default is only a placeholder; a
        frame must be supplied.

    Returns
    -------
    pandas.DataFrame
        A new frame in which missing values are -1, ages below 14 or above
        100 are -1, and each categorical column is replaced by its
        ``<column>_<value>`` indicator columns appended at the end, in the
        order of ``ohe_feats``.
    """
    # Missing values become the sentinel -1 in every column.
    filled = df.fillna(-1)

    # Ages outside the 14..100 range are mapped to the same -1 sentinel.
    ages = filled['age'].values
    out_of_range = (ages < 14) | (ages > 100)
    filled['age'] = np.where(out_of_range, -1, ages)

    # Categorical columns to expand into indicator columns.
    ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
    indicator_blocks = [pd.get_dummies(filled[feat], prefix=feat) for feat in ohe_feats]

    # Non-categorical columns first, then one indicator block per feature.
    return pd.concat([filled.drop(ohe_feats, axis=1)] + indicator_blocks, axis=1)

# One-hot encode each partition. pd.get_dummies only creates indicator columns
# for the category values present in the frame it is given, so a test partition
# can come back with a different set (or order) of columns than the training
# partition its model is fitted on. Each test partition is therefore re-indexed
# onto its training partition's columns: indicators missing from the test data
# become all-zero columns, and indicators never seen in training are dropped.
df_train_s = more_feature_engring(df_train_s)
df_train_ = more_feature_engring(df_train_)
df_test_s = more_feature_engring(df_test_s).reindex(columns=df_train_s.columns, fill_value=0)
df_test_ = more_feature_engring(df_test_).reindex(columns=df_train_.columns, fill_value=0)

In [17]:
# Feature matrices of the two training partitions.
X_s = df_train_s.values
X_ = df_train_.values

# Encode the country labels with ONE encoder fitted on the labels of both
# partitions. Calling fit_transform once per partition refits `le` on the
# second call, so y_s and y_ could use different label -> integer mappings,
# and a later le.inverse_transform would decode the first model's classes
# with the second partition's mapping.
# NOTE(review): this assumes every class occurs in both partitions, so that
# each model's predict_proba columns correspond to le.classes_ - confirm.
le = LabelEncoder()
le.fit(np.concatenate((labels_s, labels_)))
y_s = le.transform(labels_s)
y_ = le.transform(labels_)

# Feature matrices of the two test partitions.
X_test_s = df_test_s.values
X_test_ = df_test_.values

In [19]:
# Classifiers
# Hyper-parameters common to both models; only the number of trees differs.
shared_xgb_params = dict(
    max_depth=6,
    learning_rate=0.25,
    objective='multi:softprob',
    subsample=0.6,
    colsample_bytree=0.6,
    seed=0,
)

# Model for the users that have session features.
xgb_s = XGBClassifier(n_estimators=43, **shared_xgb_params)
xgb_s.fit(X_s, y_s)
y_pred_s = xgb_s.predict_proba(X_test_s)

# Model for the users without session features.
xgb_ = XGBClassifier(n_estimators=40, **shared_xgb_params)
xgb_.fit(X_, y_)
y_pred_ = xgb_.predict_proba(X_test_)

In [45]:
# Concatenate ids and class probabilities of the two test partitions in the
# same order (session users first). id_test_s and id_test_ were extracted with
# `.values` and are therefore already NumPy arrays; calling `.values` on them
# again raises AttributeError.
x_pred = np.append(id_test_s, id_test_)
y_pred = np.vstack((y_pred_s, y_pred_))

In [71]:
# Put the stacked predictions back into the order of id_test, so that row k of
# y_predd holds the class probabilities of the k-th test id.
# A dict lookup replaces the per-id np.where scan (quadratic in the number of
# test users) and the Python-2-only dict.iteritems().
# NOTE(review): the earlier version indexed rows by id_test.index labels; this
# uses positions, which is the same thing for the default 0..n-1 index - confirm
# id_test was not re-indexed.
first_row_of = {}
for row, uid in enumerate(x_pred):
    # setdefault keeps the first occurrence of an id, matching the earlier
    # np.where(x_pred == uid)[0][0] lookup.
    first_row_of.setdefault(uid, row)

# A missing id raises KeyError here instead of silently leaving a row unset.
row_order = np.array([first_row_of[uid] for uid in id_test.values], dtype=np.intp)
y_predd = y_pred[row_order]
x_predd = id_test.values

In [73]:
# For every test user keep the 5 classes with the highest probability,
# most probable first.
ids = []  # user ids, each repeated 5 times
cts = []  # decoded country labels, 5 per user
for i, idx in enumerate(id_test.values):
    ids.extend([idx] * 5)
    # argsort is ascending, so reverse it to rank classes best-first.
    ranked_classes = np.argsort(y_predd[i])[::-1]
    cts.extend(le.inverse_transform(ranked_classes)[:5].tolist())

In [74]:
#Generate submission
# `ids` and `cts` (built in the previous cell) already cover every test user,
# 5 rows each, in the order of id_test. The per-partition names ids_s / cts_s /
# ids_ / cts_ are never defined in this notebook and raise NameError on a fresh
# kernel, so the submission frame is built directly from `ids` and `cts`.
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.head(50)

Unnamed: 0,id,country
0,5uwns89zht,NDF
1,5uwns89zht,US
2,5uwns89zht,IT
3,5uwns89zht,other
4,5uwns89zht,ES
5,jtl0dijy2j,NDF
6,jtl0dijy2j,other
7,jtl0dijy2j,US
8,jtl0dijy2j,IT
9,jtl0dijy2j,ES


In [75]:
# Write the submission frame to disk as CSV without the row-index column.
sub.to_csv('../output/sub.csv',index=False)