In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
longtime_feat = pd.read_csv('feat_input/longtime_feat.csv')
encoded_feat = pd.read_csv('feat_input/feat_stats_encoding.csv')

### Create dataframe from csv file

In [3]:
feat_basic_csv = pd.read_csv('feat_input/basic_feat.csv')
feat_basic_csv.features = feat_basic_csv.features.apply(lambda x: re.findall(r"[\w'|\ ]+", x))

In [4]:
feat_csv = feat_basic_csv.merge(longtime_feat, on='listing_id')
feat_csv = feat_csv.merge(encoded_feat, on='listing_id')
#feat_csv = feat_csv.fillna(-1)

### Create dataframe from json file

In [5]:
feat_basic_json = pd.read_json('feat_input/basic_feat.json')

In [6]:
feat_basic_json.display_address = feat_basic_json.display_address.replace(r'\r$', '', regex=True)
feat_basic_json.street_address = feat_basic_json.street_address.replace(r'\r$', '', regex=True)

In [7]:
feat_json = feat_basic_json.merge(longtime_feat, on='listing_id')
feat_json = feat_json.merge(encoded_feat, on='listing_id')
#feat_json = feat_json.replace('', np.nan)
#feat_json = feat_json.fillna(-1)

### Reset index

In [8]:
feat_csv = feat_csv.sort_values('listing_id', ascending=True).reset_index()
feat_json = feat_json.sort_values('listing_id', ascending=True).reset_index()

### Compare two dataframes

In [9]:
for a in feat_csv.columns.values:
    if a not in ['index']:
        if type(feat_json[a][0]) == np.float64:
            print(a, sum(abs(1 - feat_csv[a] / feat_json[a]) > 1e-4))
        else:
            print(a, sum(feat_json[a] != feat_csv[a]))

bathrooms 0
bedrooms 0
building_id 0
display_address 350
features 107521
interest_level 0
latitude 0
listing_id 0
longitude 0
manager_id 0
price 0
street_address 23
feature_num 124011
photo_num 124011
desc_wordcount 0
distance_city 0
day_of_year 0
created_month 0
created_day 0
created_hour 0
day_of_week 0
price_bed_rt 0
price_bath_rt 0
price_room_rt 0
bed_bath_rt 0
bed_bath_dif 0
bed_bath_sum 0
bed_room_rt 0
time_stamp 0
img_sizes_mean 0
jwd_type_low_than_num 0
jwd_type_all 0
jwd_type_rt 0
building_zero_num 0
bathrooms_size_manager_id 0
bathrooms_mean_manager_id 0
bathrooms_std_manager_id 0
bathrooms_median_manager_id 0
bathrooms_max_manager_id 0
bathrooms_min_manager_id 0
bedrooms_size_manager_id 0
bedrooms_mean_manager_id 0
bedrooms_std_manager_id 0
bedrooms_median_manager_id 0
bedrooms_max_manager_id 0
bedrooms_min_manager_id 0
latitude_size_manager_id 0
latitude_mean_manager_id 0
latitude_std_manager_id 0
latitude_median_manager_id 0
latitude_max_manager_id 0
latitude_min_manager_i

### To see where the difference comes from

In [10]:
diff_street_address = feat_csv[['street_address']].loc[feat_csv.street_address != feat_json.street_address]
sum(diff_street_address.street_address.isnull()) / len(diff_street_address)

1.0

In [11]:
diff_street_address_json = feat_json[['street_address']].loc[feat_csv.street_address != feat_json.street_address]
sum(diff_street_address_json.street_address == '') / len(diff_street_address_json)

1.0

In [12]:
diff_display_address = feat_csv[['display_address']].loc[feat_csv.display_address != feat_json.display_address]
sum(diff_display_address.display_address.isnull()) / len(diff_display_address)

1.0

### Check features column

In [13]:
feat_csv.features.head()

0           [Doorman,  Elevator,  Laundry In Building]
1    [Common Outdoor Space,  Cats Allowed,  Private...
2    [Cats Allowed,  Dogs Allowed,  No Fee,  Doorma...
3    [Common Outdoor Space,  Cats Allowed,  Private...
4    [Cats Allowed,  Dogs Allowed,  No Fee,  Laundr...
Name: features, dtype: object

In [14]:
feat_json.features.head()

0             [Doorman, Elevator, Laundry In Building]
1    [Common Outdoor Space, Cats Allowed, Private O...
2    [Cats Allowed, Dogs Allowed, No Fee, Doorman, ...
3    [Common Outdoor Space, Cats Allowed, Private O...
4    [Cats Allowed, Dogs Allowed, No Fee, Laundry I...
Name: features, dtype: object

### Continue to work on features column

In [15]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

In [16]:
feat_csv.features = feat_csv.features.apply(lambda x: [stemmer.stem(y) for y in x])
feat_csv.features = feat_csv.features.apply(lambda x: ['_'.join(['feature'] + y.split()) for y in x])
feat_csv.features = feat_csv.features.apply(lambda x: ' '.join(x))

In [17]:
feat_json.features = feat_json.features.apply(lambda x: [stemmer.stem(y) for y in x])
feat_json.features = feat_json.features.apply(lambda x: ['_'.join(['feature'] + y.split()) for y in x])
feat_json.features = feat_json.features.apply(lambda x: ' '.join(x))

In [18]:
for a in feat_csv.columns.values:
    if a in ['display_address', 'street_address', 'features']:
        if type(feat_json[a][0]) == np.float64:
            print(a, sum(abs(1 - feat_csv[a] / feat_json[a]) > 1e-4))
        else:
            print(a, sum(feat_json[a] != feat_csv[a]))

display_address 350
features 32377
street_address 23


In [19]:
diff_features_csv = feat_csv.features.loc[feat_csv.features != feat_json.features]
diff_features_json = feat_json.features.loc[feat_csv.features != feat_json.features]

In [20]:
pd.set_option('display.max_colwidth', -1)

In [21]:
diff_features = pd.concat([diff_features_csv, diff_features_json], axis=1)
diff_features.columns = ['features_csv', 'features_json']

In [22]:
diff_features.head()

Unnamed: 0,features_csv,features_json
14,feature_pre feature_war feature_dogs_allow feature_cats_allow,feature_pre-war feature_dogs_allow feature_cats_allow
15,feature_pre feature_war,feature_pre-war
16,feature_pre feature_war feature_dogs_allow feature_cats_allow,feature_pre-war feature_dogs_allow feature_cats_allow
17,feature_doorman feature_fitness_cent feature_pre feature_war,feature_doorman feature_fitness_cent feature_pre-war
19,feature_pre feature_war feature_dogs_allow feature_cats_allow,feature_pre-war feature_dogs_allow feature_cats_allow


### Create Term-Document matrix using sklearn CountVectorizer

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
vector_csv = CountVectorizer(max_features = 100)

In [24]:
vector_csv.fit_transform(diff_features_csv.iloc[:10])

<10x7 sparse matrix of type '<class 'numpy.int64'>'
	with 39 stored elements in Compressed Sparse Row format>

In [25]:
vector_csv.get_feature_names()

['feature_cats_allow',
 'feature_dogs_allow',
 'feature_doorman',
 'feature_fitness_cent',
 'feature_no_fe',
 'feature_pre',
 'feature_war']

In [26]:
vector_json = CountVectorizer(max_features = 100, token_pattern=r'[\w-]+')
vector_json.fit_transform(diff_features_json.iloc[:10])
vector_json.get_feature_names()

['feature_cats_allow',
 'feature_dogs_allow',
 'feature_doorman',
 'feature_fitness_cent',
 'feature_no_fe',
 'feature_pre-war']

In [27]:
diff_features.head(10)

Unnamed: 0,features_csv,features_json
14,feature_pre feature_war feature_dogs_allow feature_cats_allow,feature_pre-war feature_dogs_allow feature_cats_allow
15,feature_pre feature_war,feature_pre-war
16,feature_pre feature_war feature_dogs_allow feature_cats_allow,feature_pre-war feature_dogs_allow feature_cats_allow
17,feature_doorman feature_fitness_cent feature_pre feature_war,feature_doorman feature_fitness_cent feature_pre-war
19,feature_pre feature_war feature_dogs_allow feature_cats_allow,feature_pre-war feature_dogs_allow feature_cats_allow
20,feature_fitness_cent feature_pre feature_war feature_no_fe feature_dogs_allow feature_cats_allow,feature_fitness_cent feature_pre-war feature_no_fe feature_dogs_allow feature_cats_allow
21,feature_pre feature_war feature_dogs_allow feature_cats_allow,feature_pre-war feature_dogs_allow feature_cats_allow
22,feature_pre feature_war,feature_pre-war
25,feature_pre feature_war,feature_pre-war
28,feature_doorman feature_fitness_cent feature_pre feature_war feature_no_fe feature_dogs_allow feature_cats_allow,feature_doorman feature_fitness_cent feature_pre-war feature_no_fe feature_dogs_allow feature_cats_allow


### Dump to file

In [28]:
feat_csv.to_csv('feat_csv.csv', index=False)
feat_json.to_csv('feat_json.csv', index=False)

In [29]:
feat_csv_re = pd.read_csv('feat_csv.csv')
feat_json_re = pd.read_csv('feat_json.csv')

In [30]:
for a in feat_csv_re.columns.values:
    if a not in ['index']:
        if type(feat_json_re[a][0]) == np.float64:
            print(a, sum(abs(1 - feat_csv_re[a] / feat_json_re[a]) > 1e-4))
        else:
            print(a, sum(feat_json_re[a] != feat_csv_re[a]))

bathrooms 0
bedrooms 0
building_id 0
display_address 350
features 40512
interest_level 0
latitude 0
listing_id 0
longitude 0
manager_id 0
price 0
street_address 23
feature_num 124011
photo_num 124011
desc_wordcount 0
distance_city 0
day_of_year 0
created_month 0
created_day 0
created_hour 0
day_of_week 0
price_bed_rt 0
price_bath_rt 0
price_room_rt 0
bed_bath_rt 0
bed_bath_dif 0
bed_bath_sum 0
bed_room_rt 0
time_stamp 0
img_sizes_mean 0
jwd_type_low_than_num 0
jwd_type_all 0
jwd_type_rt 0
building_zero_num 0
bathrooms_size_manager_id 0
bathrooms_mean_manager_id 0
bathrooms_std_manager_id 0
bathrooms_median_manager_id 0
bathrooms_max_manager_id 0
bathrooms_min_manager_id 0
bedrooms_size_manager_id 0
bedrooms_mean_manager_id 0
bedrooms_std_manager_id 0
bedrooms_median_manager_id 0
bedrooms_max_manager_id 0
bedrooms_min_manager_id 0
latitude_size_manager_id 0
latitude_mean_manager_id 0
latitude_std_manager_id 0
latitude_median_manager_id 0
latitude_max_manager_id 0
latitude_min_manager_id

### Define xgb cv 

In [31]:
import xgboost as xgb
def xgb_cv(dtrain, num_rounds = 50000, early_stop_rounds=250):
    print('Start xgboost cross-validation')
    params = {'booster': 'gbtree',
              'objective': 'multi:softprob',
              'eval_metric': 'mlogloss',
              'gamma': 1,
              'min_child_weight': 1.5,
              'max_depth': 5,
              'lambda': 10,
              'subsample': 0.7,
              'colsample_bytree': 0.7,
              'colsample_bylevel': 0.7,
              'eta': 0.03,
              'tree_method': 'exact',
              'seed': 36683,
              'nthread': 4,
              'num_class': 3,
              'silent': 1
              }
    xgb2cv = xgb.cv(params=params,
                    dtrain=dtrain,
                    num_boost_round=num_rounds,
                    nfold=5,
                    stratified=True,
                    verbose_eval=50,
                    early_stopping_rounds=early_stop_rounds)
    return xgb2cv

### Train Csv dataframe using xgb cv

In [32]:
from sklearn.preprocessing import LabelEncoder
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for f in categorical:
    if feat_csv[f].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(feat_csv[f].values))
        feat_csv[f] = lbl.transform(list(feat_csv[f].values))

In [33]:
feat_csv.drop(['features'], axis=1, inplace=True)

In [34]:
train_csv = feat_csv[feat_csv.interest_level != -1].copy()
test_csv = feat_csv[feat_csv.interest_level == -1].copy()
y_train_csv=train_csv["interest_level"]
x_train_csv = train_csv.drop(['interest_level'], axis=1)
x_train_csv.columns.values

array(['index', 'bathrooms', 'bedrooms', 'building_id', 'display_address',
       'latitude', 'listing_id', 'longitude', 'manager_id', 'price',
       'street_address', 'feature_num', 'photo_num', 'desc_wordcount',
       'distance_city', 'day_of_year', 'created_month', 'created_day',
       'created_hour', 'day_of_week', 'price_bed_rt', 'price_bath_rt',
       'price_room_rt', 'bed_bath_rt', 'bed_bath_dif', 'bed_bath_sum',
       'bed_room_rt', 'time_stamp', 'img_sizes_mean',
       'jwd_type_low_than_num', 'jwd_type_all', 'jwd_type_rt',
       'building_zero_num', 'bathrooms_size_manager_id',
       'bathrooms_mean_manager_id', 'bathrooms_std_manager_id',
       'bathrooms_median_manager_id', 'bathrooms_max_manager_id',
       'bathrooms_min_manager_id', 'bedrooms_size_manager_id',
       'bedrooms_mean_manager_id', 'bedrooms_std_manager_id',
       'bedrooms_median_manager_id', 'bedrooms_max_manager_id',
       'bedrooms_min_manager_id', 'latitude_size_manager_id',
       'latitude_

In [35]:
dtrain = xgb.DMatrix(x_train_csv, label=y_train_csv)
_ = xgb_cv(dtrain)

Start xgboost cross-validation
[0]	train-mlogloss:1.07898+0.000224968	test-mlogloss:1.0792+0.000239993
[50]	train-mlogloss:0.660347+0.000898065	test-mlogloss:0.669866+0.00233874
[100]	train-mlogloss:0.571462+0.00120799	test-mlogloss:0.588125+0.00372488
[150]	train-mlogloss:0.538569+0.00105192	test-mlogloss:0.561891+0.00444706
[200]	train-mlogloss:0.519681+0.00108435	test-mlogloss:0.549629+0.00491667
[250]	train-mlogloss:0.505739+0.00096267	test-mlogloss:0.542111+0.00497954
[300]	train-mlogloss:0.494032+0.000932057	test-mlogloss:0.536921+0.004927
[350]	train-mlogloss:0.484013+0.000933687	test-mlogloss:0.533374+0.00484482
[400]	train-mlogloss:0.474908+0.000888991	test-mlogloss:0.530611+0.00484593
[450]	train-mlogloss:0.466726+0.000694355	test-mlogloss:0.528531+0.00474222
[500]	train-mlogloss:0.45887+0.000708136	test-mlogloss:0.526894+0.00471079
[550]	train-mlogloss:0.451602+0.000650004	test-mlogloss:0.525526+0.00466927
[600]	train-mlogloss:0.444675+0.000568017	test-mlogloss:0.52429+0.004

### Train Json dataframe using xgb cv

In [36]:
from sklearn.preprocessing import LabelEncoder
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for f in categorical:
    if feat_json[f].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(feat_json[f].values))
        feat_json[f] = lbl.transform(list(feat_json[f].values))

In [37]:
feat_json.drop(['features'], axis=1, inplace=True)

In [38]:
train_json = feat_json[feat_json.interest_level != -1].copy()
test_json = feat_json[feat_json.interest_level == -1].copy()
y_train_json=train_json["interest_level"]
x_train_json = train_json.drop(['interest_level'], axis=1)
x_train_json.columns.values

array(['index', 'bathrooms', 'bed_bath_dif', 'bed_bath_rt', 'bed_bath_sum',
       'bed_room_rt', 'bedrooms', 'building_id', 'created_day',
       'created_hour', 'created_month', 'day_of_week', 'day_of_year',
       'desc_wordcount', 'display_address', 'distance_city', 'feature_num',
       'img_sizes_mean', 'latitude', 'listing_id', 'longitude',
       'manager_id', 'photo_num', 'price', 'price_bath_rt', 'price_bed_rt',
       'price_room_rt', 'street_address', 'time_stamp',
       'jwd_type_low_than_num', 'jwd_type_all', 'jwd_type_rt',
       'building_zero_num', 'bathrooms_size_manager_id',
       'bathrooms_mean_manager_id', 'bathrooms_std_manager_id',
       'bathrooms_median_manager_id', 'bathrooms_max_manager_id',
       'bathrooms_min_manager_id', 'bedrooms_size_manager_id',
       'bedrooms_mean_manager_id', 'bedrooms_std_manager_id',
       'bedrooms_median_manager_id', 'bedrooms_max_manager_id',
       'bedrooms_min_manager_id', 'latitude_size_manager_id',
       'latitude_

In [39]:
dtrain = xgb.DMatrix(x_train_json, label=y_train_json)
_ = xgb_cv(dtrain)

Start xgboost cross-validation
[0]	train-mlogloss:1.07886+0.00010134	test-mlogloss:1.07907+6.76917e-05
[50]	train-mlogloss:0.660458+0.000738978	test-mlogloss:0.669706+0.00260449
[100]	train-mlogloss:0.571889+0.000987022	test-mlogloss:0.588327+0.00383684
[150]	train-mlogloss:0.539055+0.000985395	test-mlogloss:0.561892+0.00449936
[200]	train-mlogloss:0.520114+0.0011274	test-mlogloss:0.549455+0.00470958
[250]	train-mlogloss:0.506089+0.00101914	test-mlogloss:0.541969+0.00485209
[300]	train-mlogloss:0.494484+0.000928464	test-mlogloss:0.536831+0.00482069
[350]	train-mlogloss:0.484541+0.000901724	test-mlogloss:0.533252+0.00480363
[400]	train-mlogloss:0.475523+0.000755502	test-mlogloss:0.530449+0.00478235
[450]	train-mlogloss:0.467138+0.000626529	test-mlogloss:0.528261+0.00472326
[500]	train-mlogloss:0.459415+0.000559225	test-mlogloss:0.526565+0.00467987
[550]	train-mlogloss:0.452237+0.000469055	test-mlogloss:0.52522+0.00466411
[600]	train-mlogloss:0.445416+0.000543329	test-mlogloss:0.523964+0

The comparison here makes more sense, and it shows no difference for training from a json file or csv file!