In [33]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [34]:
train_df = pd.read_json('../data/train.json', convert_dates = ['created'])
test_df = pd.read_json('../data/test.json', convert_dates = ['created'])

In [35]:
features_to_use  = ["bathrooms", "bedrooms", "latitude", "longitude", "price"]
# count of photos #
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

# count of words present in description column #
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

# convert the created column to datetime object so as to extract more features 
train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])

# Let us extract some features like year, month, day, hour from date columns #
train_df["created_year"] = train_df["created"].dt.year
test_df["created_year"] = test_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
test_df["created_month"] = test_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour

# adding all these new features to use list #
features_to_use.extend(["num_photos", "num_features", "num_description_words","created_year", "created_month", "created_day", "listing_id", "created_hour"])

In [36]:
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for f in categorical:
        if train_df[f].dtype=='object':
            #print(f)
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            features_to_use.append(f)

In [37]:
train_df['features'] = train_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
test_df['features'] = test_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
print(train_df["features"].head())
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

10                                                         
10000     Doorman Elevator Fitness_Center Cats_Allowed D...
100004    Laundry_In_Building Dishwasher Hardwood_Floors...
100007                               Hardwood_Floors No_Fee
100013                                              Pre-War
Name: features, dtype: object


In [40]:
train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()
test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()

target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
#X_train, X_test, y_train, y_test = model_selection.train_test_split(train_X, train_y, test_size=0.30)

In [55]:
gbc = ensemble.GradientBoostingClassifier(n_estimators=1000)

In [20]:
gbc.fit(X_train, y_train)
y_pred = gbc.predict_proba(X_test.toarray())
log_loss(y_test, y_pred)

0.55463934895769862

In [44]:
abc = ensemble.AdaBoostClassifier(n_estimators=1000)

In [22]:
abc.fit(X_train, y_train)
y_pred_abc = abc.predict_proba(X_test.toarray())
log_loss(y_test, y_pred)

0.55463934895769862

In [23]:
rfc = ensemble.RandomForestClassifier(n_estimators=1000)

In [48]:
y_pred_abc.shape
test_df.shape
train_df.shape

(49352, 22)

In [24]:
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict_proba(X_test.toarray())
log_loss(y_test, y_pred)

0.55463934895769862

In [28]:
y = abc.predict_proba(X_test.toarray())

In [57]:
rfc.fit(train_X, train_y)
y_pred_fin = rfc.predict_proba(test_X.toarray())
out_df = pd.DataFrame(y_pred_fin)
out_df.columns = ['high', 'medium', 'low']
out_df['listing_id'] = test_df.listing_id.values
out_df.to_csv('submission_rfc2.csv', index=False)

In [39]:

# index_label = {u'high': 0, u'medium': 2, u'low': 1}
# fin = pd.DataFrame()
# fin["listing_id"] = test_df["listing_id"]
# for label in ["high", "medium", "low"]:
#     fin[label] = y[:, index_label[label]]
# fin.to_csv("submission_rf.csv", index=False)

In [49]:
out_df.shape

(14806, 3)

In [50]:
test_df.shape

(74659, 21)

In [51]:
train_df.shape

(49352, 22)

In [52]:
test_X.shape

(74659, 217)