In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
import random
from math import exp
import xgboost as xgb
from nltk.stem import PorterStemmer
import re
#import distance

In [2]:
# Seed both the stdlib and the NumPy generators with the same value before any work is done.
for seed_rng in (random.seed, np.random.seed):
    seed_rng(321)

# Read the train and test listings from JSON files in the working directory.
X_train, X_test = (pd.read_json(path) for path in ("train.json", "test.json"))

In [3]:
# Encode the target as integers; the unlabeled test rows get the sentinel -1.
interest_level_map = {'low': 0, 'medium': 1, 'high': 2}
X_train['interest_level'] = X_train['interest_level'].apply(lambda level: interest_level_map[level])
X_test['interest_level'] = -1

#remove some noise
#ulimit = np.percentile(X_train.price.values, 99)
# Cap training prices at 13000. A single .loc call performs the masked assignment
# directly on X_train; the earlier chained `X_train['price'].ix[...] = ...` form
# raised SettingWithCopyWarning, and `.ix` no longer exists in current pandas.
X_train.loc[X_train['price'] > 13000, 'price'] = 13000

# --- disabled experiment: stem and clean the description text ---
#modify description
#stemmer = PorterStemmer()
#def clean(x):
#    regex = re.compile('[^a-zA-Z ]')
#    # For user clarity, broken it into three steps
#    i = regex.sub(' ', x).lower()
#    i = i.split(" ")
#    i= [stemmer.stem(l) for l in i]
#    i= " ".join([l.strip() for l in i if (len(l)>2) ]) # Keeping words that have length greater than 2
#    return i

#X_train['description'] = X_train.description.apply(lambda x: clean(x))
#X_test['description'] = X_test.description.apply(lambda x: clean(x))

#add features
# Rewrite every listing's feature list as one string in which each multi-word
# feature is a single underscore-joined, lower-cased token, then fit the
# vectorizer (capped at 150 tokens) on train and test together.
feature_transform = CountVectorizer(stop_words='english', max_features=150)
for frame in (X_train, X_test):
    frame['features'] = frame['features'].apply(
        lambda items: " ".join(item.lower().replace(" ", "_") for item in items))
feature_transform.fit(list(X_train['features']) + list(X_test['features']))

# --- disabled experiment: bag-of-words over the description text ---
#featured_transform = CountVectorizer(stop_words='english', max_features=200)
#X_train['description'] = X_train["description"].apply(lambda x: " ".join(["_".join(i.lower().split(" ")) for i in x]))
#X_test['description'] = X_test["description"].apply(lambda x: " ".join(["_".join(i.lower().split(" ")) for i in x]))
#featured_transform.fit(list(X_train['description']) + list(X_test['description']))

# Class counts; transform_data() turns them into the prior columns pred0_*.
train_size = len(X_train)
low_count = int((X_train['interest_level'] == 0).sum())
medium_count = int((X_train['interest_level'] == 1).sum())
high_count = int((X_train['interest_level'] == 2).sum())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [25]:
def find_objects_with_only_one_record(feature_name):
    """Return the values of ``feature_name`` seen exactly once in train + test.

    Reads the module-level ``X_train`` and ``X_test`` frames. The result is a
    DataFrame with the column ``feature_name`` (the value) and the column
    ``index`` (its occurrence count, always 1 in the returned rows).
    """
    stacked = pd.concat([frame[feature_name].reset_index()
                         for frame in (X_train, X_test)])
    occurrences = stacked.groupby(feature_name, as_index=False).count()
    seen_once = occurrences['index'] == 1
    return occurrences[seen_once]

In [26]:
def categorical_average(variable, y, pred_0, feature_name):
    """Add a smoothed target-mean encoding of ``variable`` to train and test.

    A new column ``feature_name`` is written into the module-level ``X_train``
    and ``X_test`` frames. Training rows are encoded out-of-fold (5 stratified
    folds on ``interest_level``), so a row's own target never contributes to
    its value; test rows are encoded from the full training set.

    For a category with ``cnt`` rows and target mean ``avgY`` the value is

        (1 - beta) * avgY + beta * prior,   beta = 1 / (g + exp((cnt - k) / f))

    where ``prior`` is the row's ``pred_0`` value. Categories absent from the
    fitting rows fall back to the prior. The result is finally multiplied by
    ``1 + (u - 0.5) * r_k`` with ``u`` drawn from ``np.random.uniform``.

    Parameters
    ----------
    variable : str
        Name of the categorical column to encode.
    y : str
        Name of the target column in ``X_train`` that is averaged.
    pred_0 : str
        Name of the column holding the prior used for shrinkage.
    feature_name : str
        Name of the column to create in both frames.

    Notes
    -----
    Reads the module-level settings ``lambda_val`` (if not None it is used as
    a fixed ``beta``), ``k``, ``f``, ``g`` and ``r_k``. Returns None.
    """
    def compute_beta(cnt):
        # Counts of 200 or more are treated as infinite so exp() cannot
        # overflow; beta is then exactly 0 (no shrinkage toward the prior).
        capped = cnt if cnt < 200 else float('inf')
        return 1.0 / (g + exp((capped - k) / f))

    def calculate_average(sub1, sub2):
        """Encode the rows of ``sub2`` with per-category statistics of ``sub1``."""
        # One groupby pass instead of four; 'sumy' was never read, and its
        # null-fill reused a mask that was already empty, so it is dropped.
        stats = (sub1.groupby(variable)['y']
                     .agg(['mean', 'count'])
                     .rename(columns={'mean': 'avgY', 'count': 'cnt'})
                     .reset_index())

        # A left merge keeps sub2's row order, so the output lines up with it.
        tmp = sub2[[variable, 'pred_0']].merge(stats, how='left', on=variable)
        tmp['cnt'] = tmp['cnt'].fillna(0.0)

        if lambda_val is not None:
            beta = lambda_val
        else:
            beta = tmp['cnt'].apply(compute_beta)

        adj_avg = (1.0 - beta) * tmp['avgY'] + beta * tmp['pred_0']
        # Unseen categories have avgY = NaN, hence adj_avg = NaN: use the prior.
        adj_avg = adj_avg.fillna(tmp['pred_0'])

        noise = np.random.uniform(size=len(tmp))
        adj_avg = adj_avg * (1 + (noise - 0.5) * r_k)
        return adj_avg.values

    # Loop-invariant view of the training data, built once.
    train_rows = pd.DataFrame(data={variable: X_train[variable],
                                    'y': X_train[y],
                                    'pred_0': X_train[pred_0]})

    #cv for training set
    # Results are collected by position in a float buffer. This is independent
    # of X_train's index labels and avoids writing floats into an int column.
    out_of_fold = np.full(len(X_train), -999.0)
    k_fold = StratifiedKFold(5)
    for (train_index, cv_index) in k_fold.split(np.zeros(len(X_train)),
                                                X_train['interest_level'].values):
        out_of_fold[cv_index] = calculate_average(train_rows.iloc[train_index],
                                                  train_rows.iloc[cv_index])
    X_train[feature_name] = out_of_fold

    #for test set
    test_rows = pd.DataFrame(data={variable: X_test[variable],
                                   'pred_0': X_test[pred_0]})
    X_test[feature_name] = calculate_average(train_rows, test_rows)

In [27]:
def transform_data(X):
    """Return a feature-engineered copy of a raw listings frame.

    The input frame is not modified. The returned frame has a fresh 0..n-1
    index and, compared with ``X``:

    * ``features`` is replaced by one count column per token of the fitted
      module-level ``feature_transform`` vectorizer;
    * ``num_photos``, ``num_description_words`` and three price ratios are
      added, and ``created`` is parsed to datetime;
    * ``low`` / ``medium`` / ``high`` indicator columns are derived from
      ``interest_level`` and the constant prior columns ``pred0_*`` are added
      from the module-level ``low_count``, ``medium_count``, ``high_count``
      and ``train_size``;
    * ``display_address`` and ``street_address`` are lower-cased and stripped;
    * ``manager_id``, ``building_id`` and ``display_address`` values listed in
      the module-level ``managers_with_one_lot``, ``buildings_with_one_lot``
      and ``addresses_with_one_lot`` frames are replaced by ``"-1"``.

    Parameters
    ----------
    X : pandas.DataFrame
        Listings frame containing at least the columns used above.

    Returns
    -------
    pandas.DataFrame
    """
    #add features
    feat_sparse = feature_transform.transform(X["features"])
    vocabulary = feature_transform.vocabulary_
    # vocabulary_ maps token -> column index, so ordering the tokens by that
    # index labels every column correctly. The matrix is densified in one call
    # rather than one Series per row.
    token_columns = sorted(vocabulary, key=vocabulary.get)
    X1 = pd.DataFrame(feat_sparse.toarray(), columns=token_columns)
    # drop() returns a new frame, so the caller's X keeps its 'features' column.
    X = pd.concat([X.drop('features', axis=1).reset_index(drop=True), X1], axis=1)

    # --- disabled experiment: bag-of-words over the description text ---
    #featd_sparse = featured_transform.transform(X["description"])
    #vocabularyd = featured_transform.vocabulary_
    #del X['description']
    #X2 = pd.DataFrame([ pd.Series(featd_sparse[i].toarray().ravel()) for i in np.arange(featd_sparse.shape[0]) ])
    #X2.columns = list(sorted(vocabularyd.keys()))
    #X = pd.concat([X.reset_index(), X2.reset_index()], axis = 1)
    #del X['index']

    X["num_photos"] = X["photos"].apply(len)
    X['created'] = pd.to_datetime(X["created"])
    X["num_description_words"] = X["description"].apply(lambda text: len(text.split(" ")))

    # Rows with a zero denominator come out as inf (or NaN for 0/0); this is
    # left as in the original.
    # NOTE(review): newer XGBoost releases may reject inf inputs - confirm
    # against the installed version before upgrading.
    X['price_per_bed'] = X['price'] / X['bedrooms']
    X['price_per_bath'] = X['price'] / X['bathrooms']
    X['price_per_room'] = X['price'] / (X['bathrooms'] + X['bedrooms'])

    # One 0/1 indicator per class (all zeros when interest_level is -1).
    for level_code, level_name in enumerate(('low', 'medium', 'high')):
        X[level_name] = (X['interest_level'] == level_code).astype('int64')

    for address_col in ('display_address', 'street_address'):
        X[address_col] = X[address_col].str.lower().str.strip()

    # Class frequencies of the training set, used as priors by categorical_average().
    X['pred0_low'] = low_count * 1.0 / train_size
    X['pred0_medium'] = medium_count * 1.0 / train_size
    X['pred0_high'] = high_count * 1.0 / train_size

    # Collapse values that occur only once into a single "-1" bucket.
    singletons = (('manager_id', managers_with_one_lot),
                  ('building_id', buildings_with_one_lot),
                  ('display_address', addresses_with_one_lot))
    for column, once_only in singletons:
        X.loc[X[column].isin(once_only[column].values), column] = "-1"

    return X



In [28]:
def normalize_high_cordiality_data():
    """Run ``categorical_average`` for the two high-cardinality id columns.

    For ``building_id`` and then ``manager_id`` it is called once with the
    ``medium`` target and once with the ``high`` target, creating the columns
    ``<column>_mean_medium`` and ``<column>_mean_high``.
    """
    targets = (("medium", "pred0_medium", "_mean_medium"),
               ("high", "pred0_high", "_mean_high"))
    for column in ("building_id", "manager_id"):
        for target, prior, suffix in targets:
            categorical_average(column, target, prior, column + suffix)


In [29]:
def transform_categorical_data():
    """Integer-encode the string-valued id and address columns in place.

    For each column a ``LabelEncoder`` is fitted on the train and test values
    together, then the column is overwritten in the module-level ``X_train``
    and ``X_test`` frames with the encoded values.
    """
    for column in ('building_id', 'manager_id',
                   'display_address', 'street_address'):
        train_values = X_train[column]
        test_values = X_test[column]
        encoder = LabelEncoder().fit(list(train_values) + list(test_values))
        X_train[column] = encoder.transform(train_values.values)
        X_test[column] = encoder.transform(test_values.values)
                  


In [30]:
def remove_columns(X):
    """Delete a fixed list of columns from ``X`` in place.

    The columns are removed one at a time, in order; a missing column raises
    ``KeyError`` at that point. Returns None.
    """
    unwanted = ("photos", "pred0_high", "pred0_low", "pred0_medium",
                "description", "low", "medium", "high",
                "interest_level", "created")
    for name in unwanted:
        X.pop(name)

In [4]:


# Values that occur only once across train + test; transform_data() replaces them with "-1".
(managers_with_one_lot,
 buildings_with_one_lot,
 addresses_with_one_lot) = [find_objects_with_only_one_record(column)
                            for column in ('manager_id', 'building_id', 'display_address')]

# Settings read as globals by categorical_average().
lambda_val = None        # fixed blend weight; None means derive it from the category count
k, f, g = 5.0, 1.0, 1.0  # beta = 1 / (g + exp((count - k) / f))
r_k = 0.01               # scale of the multiplicative noise

print("Starting transformations")
X_train = transform_data(X_train)
X_test = transform_data(X_test)
# Keep the labels before remove_columns() deletes 'interest_level'.
y = X_train['interest_level'].values

print("Normalizing high cordiality data...")
normalize_high_cordiality_data()
transform_categorical_data()

for frame in (X_train, X_test):
    remove_columns(frame)

print("Start fitting...")

param = {
    'objective': 'multi:softprob',
    'eta': 0.02,
    'max_depth': 6,
    'silent': 1,
    'num_class': 3,
    'eval_metric': "mlogloss",
    'min_child_weight': 3,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'seed': 321,
    'nthread': 8,
}
num_rounds = 1000

xgtrain = xgb.DMatrix(X_train, label=y)
clf = xgb.train(param, xgtrain, num_rounds)

print("Fitted")

Starting transformations
Normalizing high cordiality data...
Start fitting...
Fitted


In [6]:
# Predict class probabilities for the test listings; column i of preds is class i.
xgtest = xgb.DMatrix(X_test)
preds = clf.predict(xgtest)

# Submission frame: listing id plus one probability column per class.
sub = pd.DataFrame(data={'listing_id': X_test['listing_id'].values})
for class_index, class_name in enumerate(('low', 'medium', 'high')):
    sub[class_name] = preds[:, class_index]

In [7]:
# Preview the first rows only instead of rendering the whole prediction frame.
sub.head(10)

Unnamed: 0,listing_id,low,medium,high
0,7142618,0.512393,0.417390,0.070217
1,7210040,0.970653,0.018977,0.010370
2,7103890,0.799325,0.167566,0.033109
3,7143442,0.490528,0.441748,0.067724
4,6860601,0.705644,0.236680,0.057677
5,6840081,0.982719,0.016648,0.000633
6,6922337,0.755328,0.219923,0.024748
7,6913616,0.382950,0.561937,0.055112
8,6937820,0.538775,0.398548,0.062678
9,6893933,0.506407,0.446319,0.047274


In [12]:
# Row position(s) whose predicted 'high' probability equals the column maximum.
high_scores = sub['high']
np.where(high_scores == high_scores.max())[0]

array([47039])

In [23]:
# Order the predictions by the 'high' probability, largest first, and display them.
test = sub.sort_values(by='high', ascending=False)
test

Unnamed: 0,listing_id,low,medium,high
47039,7003491,0.002174,0.028384,0.969441
46724,7064124,0.002554,0.039303,0.958143
55607,7062719,0.004393,0.039656,0.955951
34949,7145462,0.003329,0.043314,0.953357
218,6859181,0.006610,0.040721,0.952670
1628,6822270,0.009890,0.039303,0.950807
2635,6862847,0.006874,0.045020,0.948106
24728,7140894,0.016496,0.038533,0.944971
34237,7130355,0.016500,0.038963,0.944537
52693,6958284,0.011740,0.046108,0.942152


In [24]:
# Order the predictions by the 'medium' probability, largest first, and display them.
test = sub.sort_values(by='medium', ascending=False)
test

Unnamed: 0,listing_id,low,medium,high
15098,6865084,0.092703,0.875543,0.031754
65737,6859085,0.093659,0.874692,0.031649
6615,6859523,0.100454,0.867195,0.032351
73323,6843212,0.101798,0.865080,0.033122
12715,6851933,0.082253,0.818785,0.098962
47701,6969861,0.161311,0.812580,0.026110
13501,7099225,0.138876,0.803990,0.057134
7978,6918337,0.069976,0.798449,0.131575
55178,7049683,0.145306,0.796132,0.058562
11290,7217292,0.094006,0.793617,0.112377
