In [None]:
import pandas as pd
import numpy as np

from datetime import datetime, date

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 150)

* ** id**: user id
* ** date_account_created**: the date of account creation
* ** timestamp_first_active**: timestamp of the first activity, note that it can be earlier than date_account_created or date_first_booking because a user can search before signing up
* ** date_first_booking**: date of first booking
* ** signup_flow**: the page a user came to signup up from
* ** language**: international language preference
* ** affiliate_channel**: what kind of paid marketing
* ** affiliate_provider**: where the marketing is e.g. google, craigslist, other
* ** first_affiliate_tracked**: whats the first marketing the user interacted with before the signing up
* ** country_destination**: this is the target variable you are to predict

In [2]:
# len(test_data_df['id'].unique()) + len(train_data_df['id'].unique())

-----------------------------------------------------
## Train

In [3]:
_train_data_df = pd.read_csv('./assets/train_users_2.csv/train_users_2.csv')
print _train_data_df.shape
_train_data_df.info()

(213451, 16)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213451 entries, 0 to 213450
Data columns (total 16 columns):
id                         213451 non-null object
date_account_created       213451 non-null object
timestamp_first_active     213451 non-null int64
date_first_booking         88908 non-null object
gender                     213451 non-null object
age                        125461 non-null float64
signup_method              213451 non-null object
signup_flow                213451 non-null int64
language                   213451 non-null object
affiliate_channel          213451 non-null object
affiliate_provider         213451 non-null object
first_affiliate_tracked    207386 non-null object
signup_app                 213451 non-null object
first_device_type          213451 non-null object
first_browser              213451 non-null object
country_destination        213451 non-null object
dtypes: float64(1), int64(2), object(13)
memory usage: 26.1+ MB


In [4]:
_train_data_df.isnull().sum()

id                              0
date_account_created            0
timestamp_first_active          0
date_first_booking         124543
gender                          0
age                         87990
signup_method                   0
signup_flow                     0
language                        0
affiliate_channel               0
affiliate_provider              0
first_affiliate_tracked      6065
signup_app                      0
first_device_type               0
first_browser                   0
country_destination             0
dtype: int64

### Fixing date columns

In [5]:
train_data_df = _train_data_df.copy()
print train_data_df.shape
train_data_df.head()

(213451, 16)


Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [6]:
print type(train_data_df['date_account_created'][0])
print type(train_data_df['timestamp_first_active'][0])
print type(train_data_df['date_first_booking'][2])

<type 'str'>
<type 'numpy.int64'>
<type 'str'>


In [7]:
print train_data_df['date_account_created'][0]
print train_data_df['timestamp_first_active'][0]
print train_data_df['date_first_booking'][2]

2010-06-28
20090319043255
2010-08-02


In [12]:
train_data_df['date_account_created'] = pd.to_datetime(train_data_df['date_account_created'])
train_data_df['year_acc_created'] = train_data_df.date_account_created.dt.year
train_data_df['month_acc_created'] = train_data_df.date_account_created.dt.month
train_data_df['day_acc_created'] = train_data_df.date_account_created.dt.day
train_data_df['dayofyear_acc_created'] = train_data_df.date_account_created.dt.dayofyear

train_data_df['date_first_booking'] = pd.to_datetime(train_data_df['date_first_booking'])
train_data_df['year_1st_booking'] = train_data_df.date_first_booking.dt.year
train_data_df['year_1st_booking'] = train_data_df.date_first_booking.dt.month
train_data_df['year_1st_booking'] = train_data_df.date_first_booking.dt.day
train_data_df['year_1st_booking'] = train_data_df.date_first_booking.dt.dayofyear

tmstmp = train_data_df['timestamp_first_active'].astype(str)
train_data_df['timestamp_first_active'] = tmstmp.apply(lambda x: datetime.strptime(x, "%Y%m%d%H%M%S"))
train_data_df['year_1st_tmstmp'] = train_data_df.timestamp_first_active.dt.year
train_data_df['month_1st_tmstmp'] = train_data_df.timestamp_first_active.dt.month
train_data_df['day_1st_tmstmp'] = train_data_df.timestamp_first_active.dt.day
train_data_df['hour_1st_tmstmp'] = train_data_df.timestamp_first_active.dt.hour
train_data_df['dayofyear_1st_tmstmp'] = train_data_df.timestamp_first_active.dt.dayofyear

------------------

In [13]:
train_data_df.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination,year_acc_created,month_acc_created,day_acc_created,dayofyear_acc_created,year_1st_booking,year_1st_tmstmp,month_1st_tmstmp,day_1st_tmstmp,hour_1st_tmstmp,dayofyear_1st_tmstmp
0,gxn3p5htnn,2010-06-28,2009-03-19 04:32:55,NaT,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF,2010,6,28,179,,2009,3,19,4,78
1,820tgsjxq7,2011-05-25,2009-05-23 17:48:09,NaT,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF,2011,5,25,145,,2009,5,23,17,143
2,4ft3gnwmtx,2010-09-28,2009-06-09 23:12:47,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US,2010,9,28,271,214.0,2009,6,9,23,160
3,bjjt8pjhuk,2011-12-05,2009-10-31 06:01:29,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other,2011,12,5,339,252.0,2009,10,31,6,304
4,87mebub9p4,2010-09-14,2009-12-08 06:11:05,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US,2010,9,14,257,49.0,2009,12,8,6,342


In [14]:
train_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213451 entries, 0 to 213450
Data columns (total 26 columns):
id                         213451 non-null object
date_account_created       213451 non-null datetime64[ns]
timestamp_first_active     213451 non-null datetime64[ns]
date_first_booking         88908 non-null datetime64[ns]
gender                     213451 non-null object
age                        125461 non-null float64
signup_method              213451 non-null object
signup_flow                213451 non-null int64
language                   213451 non-null object
affiliate_channel          213451 non-null object
affiliate_provider         213451 non-null object
first_affiliate_tracked    207386 non-null object
signup_app                 213451 non-null object
first_device_type          213451 non-null object
first_browser              213451 non-null object
country_destination        213451 non-null object
year_acc_created           213451 non-null int64
month_acc_created

In [15]:
columns = train_data_df.columns
columns[5:15]

Index([u'age', u'signup_method', u'signup_flow', u'language',
       u'affiliate_channel', u'affiliate_provider', u'first_affiliate_tracked',
       u'signup_app', u'first_device_type', u'first_browser'],
      dtype='object')

In [16]:
for col in columns[4:15]:
    print col, 'unique values:'
    print len(train_data_df[col].unique())
    print col, "unique values and count"
    print train_data_df[col].value_counts()
    print '----------------------------------------------------------'

gender unique values:
4
gender unique values and count
-unknown-    95688
FEMALE       63041
MALE         54440
OTHER          282
Name: gender, dtype: int64
----------------------------------------------------------
age unique values:
128
age unique values and count
30.0      6124
31.0      6016
29.0      5963
28.0      5939
32.0      5855
27.0      5738
33.0      5527
26.0      5044
34.0      5029
35.0      4860
25.0      4459
36.0      4083
37.0      3694
38.0      3384
24.0      3220
39.0      2998
40.0      2766
41.0      2538
23.0      2462
42.0      2243
45.0      2149
44.0      2137
43.0      2056
46.0      1875
22.0      1702
47.0      1646
48.0      1469
50.0      1387
51.0      1337
49.0      1331
52.0      1210
105.0     1131
19.0      1102
53.0      1098
54.0      1019
55.0      1011
21.0       982
56.0       941
57.0       915
58.0       823
59.0       780
60.0       734
2014.0     710
61.0       680
18.0       669
62.0       593
63.0       573
64.0       549
20.0       5

In [None]:
# dict(zip(*np.unique(train_data_df['first_browser'],return_counts=True)))
# zip(*np.unique(d,return_counts=True))

In [None]:
# train_data_df['affiliate_provider'].mask()

In [17]:
def change_to_other(cell, _list):
    if cell in _list:
        cell = 'Other'

### Cleaning age column and creating age buckets

In [18]:
train_data_df.age.fillna(-1,inplace=True)
age_group = []
age_bckts = [21,25,28,32,37,45,55,65,75]
for cell in train_data_df.age:
#     print cell
    if cell< 0:
        aggp = '-unknown-'
#         age_group.append(aggp)
    elif (cell < 15) or (cell > 1900):
        if (cell< 15) or (cell>1998):#data is from 2014, minimum age for travel is assumed 15.
            aggp = 'Mistake'
#             print aggp
#             age_group.append(aggp)
#             break
        else:
            cell == 2014 - cell
    elif cell<=21:
            aggp = '15-21'
    elif cell>75:
            aggp = '75+'
    else:
        for i, agbkt in enumerate(age_bckts):
            if cell<=agbkt:
                aggp = str(age_bckts[i-1])+'-'+str(agbkt)
                break
    age_group.append(aggp)
train_data_df['age_bckts'] = age_group
# CC = [22,36,76,54,44,21,11,2,-1,-2,1923,2012,88,23]

In [19]:
train_data_df.age_bckts.unique()

array(['-unknown-', '37-45', '55-65', '45-55', '32-37', '28-32', '25-28',
       '15-21', '21-25', '65-75', 'Mistake', '75+'], dtype=object)

### Reducing number of features of some columns

#### Browser

In [20]:
brow_cnt = zip(train_data_df['first_browser'].value_counts().index, train_data_df['first_browser'].value_counts().values)
brow_other = [x[0] for x in brow_cnt if x[1]<150]
train_data_df.first_browser = train_data_df.first_browser.apply(lambda x: 'Other' if x in brow_other else x)

In [21]:
train_data_df.first_browser.unique()

array(['Chrome', 'IE', 'Firefox', 'Safari', '-unknown-', 'Mobile Safari',
       'Chrome Mobile', 'Other', 'Android Browser', 'AOL Explorer', 'Opera'], dtype=object)

#### Affiliate provider 

In [22]:
affprov_cnt = zip(train_data_df['affiliate_provider'].value_counts().index,
                  train_data_df['affiliate_provider'].value_counts().values)
affprov_other = [x[0] for x in affprov_cnt if x[1]<100]
train_data_df.affiliate_provider = train_data_df.affiliate_provider.apply(lambda x: 'Other' if x in affprov_other else x)

In [23]:
train_data_df.affiliate_provider.unique()

array(['direct', 'google', 'other', 'craigslist', 'facebook', 'vast',
       'bing', 'meetup', 'facebook-open-graph', 'email-marketing', 'yahoo',
       'padmapper', 'gsp', 'Other'], dtype=object)

In [24]:
train_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213451 entries, 0 to 213450
Data columns (total 27 columns):
id                         213451 non-null object
date_account_created       213451 non-null datetime64[ns]
timestamp_first_active     213451 non-null datetime64[ns]
date_first_booking         88908 non-null datetime64[ns]
gender                     213451 non-null object
age                        213451 non-null float64
signup_method              213451 non-null object
signup_flow                213451 non-null int64
language                   213451 non-null object
affiliate_channel          213451 non-null object
affiliate_provider         213451 non-null object
first_affiliate_tracked    207386 non-null object
signup_app                 213451 non-null object
first_device_type          213451 non-null object
first_browser              213451 non-null object
country_destination        213451 non-null object
year_acc_created           213451 non-null int64
month_acc_created

In [30]:
y = train_data_df['country_destination']
train_data_df.drop('country_destination',axis=1,inplace=True)

In [31]:
users = train_data_df['id']
train_data_df.drop('id',axis=1,inplace=True)

In [32]:
train_data_df.columns

Index([u'date_account_created', u'timestamp_first_active',
       u'date_first_booking', u'gender', u'age', u'signup_method',
       u'signup_flow', u'language', u'affiliate_channel',
       u'affiliate_provider', u'first_affiliate_tracked', u'signup_app',
       u'first_device_type', u'first_browser', u'year_acc_created',
       u'month_acc_created', u'day_acc_created', u'dayofyear_acc_created',
       u'year_1st_booking', u'year_1st_tmstmp', u'month_1st_tmstmp',
       u'day_1st_tmstmp', u'hour_1st_tmstmp', u'dayofyear_1st_tmstmp',
       u'age_bckts'],
      dtype='object')

In [38]:
X = train_data_df.copy()
X.drop(['date_account_created', 'timestamp_first_active', 'date_first_booking'],axis=1,inplace=True)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213451 entries, 0 to 213450
Data columns (total 22 columns):
gender                     213451 non-null object
age                        213451 non-null float64
signup_method              213451 non-null object
signup_flow                213451 non-null int64
language                   213451 non-null object
affiliate_channel          213451 non-null object
affiliate_provider         213451 non-null object
first_affiliate_tracked    207386 non-null object
signup_app                 213451 non-null object
first_device_type          213451 non-null object
first_browser              213451 non-null object
year_acc_created           213451 non-null int64
month_acc_created          213451 non-null int64
day_acc_created            213451 non-null int64
dayofyear_acc_created      213451 non-null int64
year_1st_booking           88908 non-null float64
year_1st_tmstmp            213451 non-null int64
month_1st_tmstmp           213451 non-null i

In [39]:
_X = X.dtypes.index

In [51]:
_x = [col for col in X.columns if X[col].dtype=='O']

In [52]:
_X = X[_x]

In [54]:
_X_ = pd.get_dummies(_X,dummy_na=True)

In [56]:
_X_

Unnamed: 0,gender_-unknown-,gender_FEMALE,gender_MALE,gender_OTHER,gender_nan,signup_method_basic,signup_method_facebook,signup_method_google,signup_method_nan,language_ca,language_cs,language_da,language_de,language_el,language_en,language_es,language_fi,language_fr,language_hr,language_hu,language_id,language_is,language_it,language_ja,language_ko,language_nl,language_no,language_pl,language_pt,language_ru,language_sv,language_th,language_tr,language_zh,language_nan,affiliate_channel_api,affiliate_channel_content,affiliate_channel_direct,affiliate_channel_other,affiliate_channel_remarketing,affiliate_channel_sem-brand,affiliate_channel_sem-non-brand,affiliate_channel_seo,affiliate_channel_nan,affiliate_provider_Other,affiliate_provider_bing,affiliate_provider_craigslist,affiliate_provider_direct,affiliate_provider_email-marketing,affiliate_provider_facebook,affiliate_provider_facebook-open-graph,affiliate_provider_google,affiliate_provider_gsp,affiliate_provider_meetup,affiliate_provider_other,affiliate_provider_padmapper,affiliate_provider_vast,affiliate_provider_yahoo,affiliate_provider_nan,first_affiliate_tracked_linked,first_affiliate_tracked_local ops,first_affiliate_tracked_marketing,first_affiliate_tracked_omg,first_affiliate_tracked_product,first_affiliate_tracked_tracked-other,first_affiliate_tracked_untracked,first_affiliate_tracked_nan,signup_app_Android,signup_app_Moweb,signup_app_Web,signup_app_iOS,signup_app_nan,first_device_type_Android Phone,first_device_type_Android Tablet,first_device_type_Desktop (Other),first_device_type_Mac Desktop,first_device_type_Other/Unknown,first_device_type_SmartPhone (Other),first_device_type_Windows Desktop,first_device_type_iPad,first_device_type_iPhone,first_device_type_nan,first_browser_-unknown-,first_browser_AOL Explorer,first_browser_Android Browser,first_browser_Chrome,first_browser_Chrome Mobile,first_browser_Firefox,first_browser_IE,first_browser_Mobile Safari,first_browser_Opera,first_browser_Other,first_browser_Safari,first_browser_nan,age_bckts_-unknown-,age_bckts_15-21,age_bckts_21-25,age_bckts_25-28,age_bckts_28-32,age_bckts_32-37,age_bckts_37-45,age_bckts_45-55,age_bckts_55-65,age_bckts_65-75,age_bckts_75+,age_bckts_Mistake,age_bckts_nan
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [57]:
_X_.shape

(213451, 107)