This is a notebook for CV statistics.

In [21]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Input data: read the train and test listings from the JSON files in ../input.
train_df, test_df = (
    pd.read_json(json_path)
    for json_path in ('../input/train.json', '../input/test.json')
)

## Now add some basic features.
- price per bed
- bedroom + bathroom
- num of photos
- num of features
- len of descriptions
- time created

In [30]:
#basic features
def _add_basic_features(df):
    """Add the hand-crafted numeric and date features to ``df`` in place.

    Columns read: "price", "bedrooms", "bathrooms", "photos", "features",
    "description" and "created".

    Columns written: "price_t", "room_sum", "num_photos", "num_features",
    "num_description_words", "created" (converted to datetime),
    "created_month", "created_day", "day_of_week" and "created_hour".

    Parameters
    ----------
    df : pandas dataframe
        The listings to enrich. It is modified in place.

    Returns
    -------
    pandas dataframe
        The same ``df`` object, for convenience.
    """
    # price per bedroom. A listing with 0 bedrooms would give price / 0 = inf,
    # so the zero counts are masked first and the ratio becomes NaN (missing)
    # for those rows instead of an infinite value.
    bedrooms_nonzero = df["bedrooms"].where(df["bedrooms"] != 0)
    df["price_t"] = df["price"] / bedrooms_nonzero
    df["room_sum"] = df["bedrooms"] + df["bathrooms"]

    # count of photos #
    df["num_photos"] = df["photos"].apply(len)

    # count of "features" #
    df["num_features"] = df["features"].apply(len)

    # count of words present in description column #
    df["num_description_words"] = df["description"].apply(lambda x: len(x.split(" ")))

    # created
    df["created"] = pd.to_datetime(df["created"])
    df["created_month"] = df["created"].dt.month
    df["created_day"] = df["created"].dt.day
    df["day_of_week"] = df["created"].dt.weekday
    df["created_hour"] = df["created"].dt.hour

    return df


# The same feature construction is applied to both data sets.
train_df = _add_basic_features(train_df)
test_df = _add_basic_features(test_df)

# Columns fed to the model. "room_sum" is computed above but is not part of
# this list.
features_to_use=["bathrooms", "bedrooms", "latitude", "longitude", "price","price_t","num_photos", "num_features", "num_description_words","listing_id","created_month","created_day","created_hour","day_of_week"]

## Define the XGB function (it comes from "XGB starter in python" by SRK)

In [35]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=4000):
    """Train a 3-class XGBoost model and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y : training matrix and its labels, passed to ``xgb.DMatrix``.
    test_X : matrix to predict on.
    test_y : optional labels for ``test_X``. When given, ``test_X`` is used as
        a validation set: both sets are watched during training and early
        stopping with a patience of 20 rounds is enabled.
    feature_names : accepted for interface compatibility; not used here.
    seed_val : value of the ``seed`` booster parameter.
    num_rounds : number of boosting rounds requested from ``xgb.train``.

    Returns
    -------
    (pred_test_y, model) : the output of ``model.predict`` on ``test_X`` and
        the trained booster.
    """
    booster_params = {
        'objective': 'multi:softprob',
        'eta': 0.03,
        'max_depth': 6,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }
    param_list = list(booster_params.items())
    dtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is None:
        # No labels available: plain training for the requested rounds.
        dtest = xgb.DMatrix(test_X)
        model = xgb.train(param_list, dtrain, num_rounds)
    else:
        # Labels available: monitor both sets and stop early on the test set.
        dtest = xgb.DMatrix(test_X, label=test_y)
        eval_sets = [(dtrain, 'train'), (dtest, 'test')]
        model = xgb.train(param_list, dtrain, num_rounds, eval_sets, early_stopping_rounds=20)

    return model.predict(dtest), model

## CV statistics, only change here
The following code chunk builds the frequency-ratio table for each unique manager_id in the training data. The effect of cross-validation is taken into account when constructing it.

In [1]:
#Our feature construction class will inherit from these two base classes of sklearn.
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

class manager_skill(BaseEstimator, TransformerMixin):
    """
    Adds the column "manager_skill" to the dataset, based on the Kaggle kernel
    "Improve Perfomances using Manager features" by den3b. The function should
    be usable in scikit-learn pipelines.
    
    Parameters
    ----------
    threshold : Minimum count of rental listings a manager must have in order
                to get his "own" score, otherwise the mean is assigned.

    Attributes
    ----------
    mapping_ : pandas dataframe
        contains the manager_skill per manager id (index: manager_id,
        single column "manager_skill"). Set by fit.
        
    mean_skill_ : float
        The mean skill of managers with at least as many listings as the 
        threshold. Set by fit; also used by transform for unseen managers.
    """
    def __init__(self, threshold = 5):
        
        self.threshold = threshold
        
    def _reset(self):
        """Reset internal data-dependent state of the transformer, if necessary.
        
        __init__ parameters are not touched.
        """
        # Checking one attribute is enough, because they are all set together
        # in fit
        if hasattr(self, 'mapping_'):
            
            self.mapping_ = {}
            self.mean_skill_ = 0.0
        
    def fit(self, X,y):
        """Compute the skill values per manager for later use.
        
        Parameters
        ----------
        X : pandas dataframe, shape [n_samples, n_features]
            The rental data. It has to contain a column named "manager_id".
            
        y : pandas series or numpy array, shape [n_samples]
            The corresponding target values with encoding:
            low: 0.0
            medium: 1.0
            high: 2.0
            The skill formula below relies on this ordering of the three
            classes.

        Returns
        -------
        self
        """        
        self._reset()
        
        # A pandas series is aligned with X on its index labels. Anything else
        # (numpy array, list) carries no labels, so it is attached to X's
        # index row by row; otherwise get_dummies would produce a 0..n-1 index
        # that does not line up with X in the concat below.
        if isinstance(y, pd.Series):
            target = y
        else:
            target = pd.Series(np.asarray(y), index = X.index)
        
        # Per-manager fraction of listings in each of the three classes.
        temp = pd.concat([X.manager_id, pd.get_dummies(target)], axis = 1).groupby('manager_id').mean()
        temp.columns = ['low_frac', 'medium_frac', 'high_frac']
        
        # Number of listings per manager. size() counts rows directly and does
        # not depend on which other columns X has or on missing values in them.
        temp['count'] = X.groupby('manager_id').size()
        
        # Expected class code per manager: 2 * P(high) + 1 * P(medium).
        temp['manager_skill'] = temp['high_frac']*2 + temp['medium_frac']
        
        mean = temp.loc[temp['count'] >= self.threshold, 'manager_skill'].mean()
        if pd.isna(mean):
            # No manager reaches the threshold: fall back to the mean over all
            # managers instead of propagating NaN into every row.
            mean = temp['manager_skill'].mean()
        
        # Managers with too few listings get the mean instead of a noisy score.
        temp.loc[temp['count'] < self.threshold, 'manager_skill'] = mean
        
        self.mapping_ = temp[['manager_skill']]
        self.mean_skill_ = mean
            
        return self
        
    def transform(self, X):
        """Add manager skill to a new matrix.
        
        Parameters
        ----------
        X : pandas dataframe, shape [n_samples, n_features]
            Input data, has to contain "manager_id".

        Returns
        -------
        pandas dataframe
            A new dataframe equal to X plus the column "manager_skill".
            Managers not seen during fit receive ``mean_skill_``.
        """        
        # Drop a stale "manager_skill" column (e.g. from an earlier transform)
        # so the merge does not produce suffixed duplicate columns.
        X = X.drop(columns = ['manager_skill'], errors = 'ignore')
        X = pd.merge(left = X, right = self.mapping_, how = 'left', left_on = 'manager_id', right_index = True)
        # Plain assignment instead of a chained inplace fillna, which is not
        # guaranteed to modify X under pandas copy-on-write.
        X['manager_skill'] = X['manager_skill'].fillna(self.mean_skill_)
        
        return X

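You can use this class just like any other scikit-learn transformer: fit it on the training part of the data and then transform the data you want to score, e.g. `manager_skill(threshold=5).fit(X_dev, y_dev).transform(X_val)`.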

## High cardinality categorical features
In the code chunk below, categorical features are converted into numerical values. Each unique category is represented by a unique integer, and the number of distinct levels in the data defines the range of the corresponding numerical feature.

In [25]:
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for col in categorical:
    # Only object-dtype columns are encoded and added to the feature list.
    if train_df[col].dtype != 'object':
        continue
    train_values = list(train_df[col].values)
    test_values = list(test_df[col].values)
    # Fit on train + test together so every level seen in either set has a code.
    encoder = preprocessing.LabelEncoder().fit(train_values + test_values)
    train_df[col] = encoder.transform(train_values)
    test_df[col] = encoder.transform(test_values)
    features_to_use.append(col)

## TF-IDF encoding of the "features" in data

This is actually only a sparse term-document count matrix: the code uses a `CountVectorizer`, so no TF-IDF weighting is applied.

In [26]:
def _join_feature_tokens(tokens):
    """Turn a list of feature phrases into one space-separated string.

    Spaces inside a phrase are replaced by underscores so that each phrase
    stays a single token, e.g. ["Cats Allowed", "No Fee"] ->
    "Cats_Allowed No_Fee".

    A value that is already a string is returned unchanged. This keeps the
    cell idempotent: iterating over an already-joined string would otherwise
    split it into single characters and corrupt the column on a re-run.
    """
    if isinstance(tokens, str):
        return tokens
    return " ".join(phrase.replace(" ", "_") for phrase in tokens)


train_df['features'] = train_df["features"].apply(_join_feature_tokens)
test_df['features'] = test_df["features"].apply(_join_feature_tokens)
print(train_df["features"].head())
# Vocabulary is learned on the training set only (at most 200 terms) and then
# applied to the test set.
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

10                                                         
10000     Doorman Elevator Fitness_Center Cats_Allowed D...
100004    Laundry_In_Building Dishwasher Hardwood_Floors...
100007                               Hardwood_Floors No_Fee
100013                                              Pre-War
Name: features, dtype: object


In [31]:
# Dense columns used by the model; 'manager_skill' is excluded if present.
these_features = [name for name in features_to_use if name != 'manager_skill']

# Stack the dense columns next to the sparse term counts, stored as CSR.
train_dense = train_df[these_features]
test_dense = test_df[these_features]
train_X = sparse.hstack([train_dense, tr_sparse]).tocsr()
test_X = sparse.hstack([test_dense, te_sparse]).tocsr()

# Integer class codes for the target.
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array([target_num_map[level] for level in train_df['interest_level']])
print(train_X.shape, test_X.shape)

(49352, 214) (74659, 214)


Without the CV statistics, SRK's starter scores 0.5480; with the CV statistics the score is 0.5346. In practice, you
need to turn down the learning rate and turn up run_num (the number of boosting rounds).

In [36]:
cv_scores = []
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
row_ids = range(train_X.shape[0])
for dev_index, val_index in kf.split(row_ids):
    dev_X = train_X[dev_index, :]
    val_X = train_X[val_index, :]
    dev_y = train_y[dev_index]
    val_y = train_y[val_index]
    # Passing val_y makes runXGB watch the validation fold and stop early.
    preds, model = runXGB(dev_X, dev_y, val_X, val_y)
    fold_loss = log_loss(val_y, preds)
    cv_scores.append(fold_loss)
    print(cv_scores)
    # Only the first of the five folds is evaluated.
    break

[0]	train-mlogloss:1.08051	test-mlogloss:1.08086
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:1.06232	test-mlogloss:1.06287
[2]	train-mlogloss:1.04654	test-mlogloss:1.04734
[3]	train-mlogloss:1.03011	test-mlogloss:1.03114
[4]	train-mlogloss:1.01571	test-mlogloss:1.01688
[5]	train-mlogloss:1.00169	test-mlogloss:1.00305
[6]	train-mlogloss:0.987965	test-mlogloss:0.989586
[7]	train-mlogloss:0.974221	test-mlogloss:0.97611
[8]	train-mlogloss:0.961459	test-mlogloss:0.963587
[9]	train-mlogloss:0.949447	test-mlogloss:0.951798
[10]	train-mlogloss:0.937766	test-mlogloss:0.940336
[11]	train-mlogloss:0.925806	test-mlogloss:0.928688
[12]	train-mlogloss:0.91472	test-mlogloss:0.917781
[13]	train-mlogloss:0.904131	test-mlogloss:0.907444
[14]	train-mlogloss:0.893512	test-mlogloss:0.897178
[15]	train-mlogloss:0.883769	test-mlogloss:0.887716
[16]	train-mlogloss:0.874808	test-mloglos

In [21]:
# Fit on the full training set (no validation labels, so no early stopping)
# and predict on the test set.
preds, model = runXGB(train_X, train_y, test_X, num_rounds=1000)

# One column per class, in the order of the integer class codes 0, 1, 2.
out_df = pd.DataFrame(preds, columns=["high", "medium", "low"])
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("xgb_starter2.csv", index=False)

If you find this helpful, please give me a vote. If you have a problem, you can write your question in the comments and I will reply to you soon.

Edit: Many people use my script and change the parameters. Changing the learning rate and run_num blindly is the wrong way to do it. When you adjust the learning rate etc., you need to adjust the best run_num according to the cross-validation; otherwise, you will overfit.