In [1]:
import numpy as np
import pandas as pd

import re

import gensim
from gensim.models.doc2vec import Doc2Vec
from gensim.test.utils import get_tmpfile

from sklearn.feature_extraction.text import  TfidfVectorizer, CountVectorizer

In [2]:
# Read the zipped training JSON, then move the original index into a regular
# column named 'rec_id' so that every row gets a fresh positional index.
ori_train = (
    pd.read_json(
        'data/train.json.zip',
        orient='columns',
        convert_dates=['created'],
        compression='zip',
    )
    .reset_index()
    .rename(columns={'index': 'rec_id'})
)

In [3]:
# Working handle for the training data. This binds a second name to the same
# DataFrame object as ori_train; no copy is made at this point.
train = ori_train

In [4]:
# Read the zipped test JSON and expose its original index as the 'rec_id'
# column, mirroring the way the training frame is loaded.
test = (
    pd.read_json(
        'data/test.json.zip',
        orient='columns',
        convert_dates=['created'],
        compression='zip',
    )
    .reset_index()
    .rename(columns={'index': 'rec_id'})
)

# missing values & outliers

In [5]:
# Remove outliers with implausibly high prices.
train = train[train['price'] <= 20000]
# Keep only listings whose coordinates fall inside the expected bounding box.
# NaN coordinates fail every comparison, so rows with missing coordinates are
# dropped by the same mask.
train = train[(train['latitude'] <= 41.5) & (train['latitude'] >= 40) & (train['longitude'] >= -80) & (train['longitude'] <= -70)]
# Boolean indexing returns a slice of the parent frame; take an explicit copy
# before adding a column so the assignment is not a chained assignment on a
# view (SettingWithCopyWarning / silently lost writes).
train = train.copy()
# Drop listings where bathrooms exceed bedrooms by more than 1.5.
train['diff_rooms'] = train['bedrooms'] - train['bathrooms']
# Later cells add many columns to `train`, so end with an independent frame too.
train = train[train['diff_rooms'] >= -1.5].copy()

In [6]:
# Row counts after filtering; train_len is the split point used later to cut
# the stacked train+test frames back apart.
train_len, test_len = len(train), len(test)
train_len, test_len

(49165, 74659)

# derived features

In [7]:
# Stack the identifier columns of train and test (train rows first) so that
# both sets can be encoded with one shared mapping.
id_columns = ['manager_id', 'building_id']
train_id = train[id_columns]
test_id = test[id_columns]
data_id = pd.concat([train_id, test_id], axis=0)

In [8]:
# Encode manager_id and building_id as integers: each label is mapped to its
# position in the sorted array of unique labels over train + test.
# NOTE(review): the previous comment mentioned replacing missing values with a
# uniform "unknown" value; nothing in this cell does that. Confirm whether it
# is still required.
manager_labels = np.unique(data_id['manager_id'])
building_labels = np.unique(data_id['building_id'])
class_mapping = dict(zip(manager_labels, range(len(manager_labels))))
class_mapping_1 = dict(zip(building_labels, range(len(building_labels))))
data_id['manager_id'] = data_id['manager_id'].map(class_mapping)
data_id['building_id'] = data_id['building_id'].map(class_mapping_1)

# text features

In [9]:
# Stack the free-text columns of train and test (train rows first) so the
# vectorizers below are fitted on a single shared vocabulary.
text_columns = ['description', 'features']
train_text = train[text_columns]
test_text = test[text_columns]
data_text = pd.concat([train_text, test_text], axis=0)

## description

### doc2vec 

In [10]:
def clean_text(text):
    """Normalise a free-text listing description to space-separated letters.

    Steps, in order: strip HTML line-break tags, expand the "br" abbreviation
    and common English contractions, spell out a few symbols, replace every
    remaining non-letter character with a space, and collapse whitespace.

    Parameters
    ----------
    text : str
        Raw description text.

    Returns
    -------
    str
        Cleaned text containing only ASCII letters and single spaces.
    """
    # HTML line breaks carry no content; drop them first so that "<br />" is
    # not mistaken for the "br" (bedroom) abbreviation below.
    text = re.sub(r"<\s*br\s*/?\s*>", " ", text, flags=re.IGNORECASE)
    # acronym: a standalone "br" ("2br", "2 br") means bedroom. The lookarounds
    # keep words such as "library" intact, and the padded replacement stops
    # the result from fusing with the following word.
    text = re.sub(r"(?<![a-zA-Z])br(?![a-zA-Z])", " bedroom ", text)
    text = re.sub(r"n\'t", " not ", text)
    text = re.sub(r"cannot", "can not ", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'ve ", " have ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    # symbol
    text = re.sub(r"&", " and ", text)
    text = re.sub(r"\|", " or ", text)
    text = re.sub(r"=", " equal ", text)
    text = re.sub(r"\+", " plus ", text)
    text = re.sub(r"\$", " dollar ", text)
    # others: anything that is not an ASCII letter becomes a separator
    text = re.sub(r"[^a-zA-Z]", " ", text)
    # extra \s: collapse runs of whitespace and trim both ends
    text = " ".join(text.split())

    return text

In [11]:
# data_text['description_words'] = data_text['description'].apply(clean_text)

In [12]:
# TaggededDocument = gensim.models.doc2vec.TaggedDocument

In [13]:
# def addTag(descr):
#     train=[]
#     for i, doc in enumerate(descr):
#         word_list = doc.split(' ')
#         document = TaggededDocument(word_list, tags=[i])
#         train.append(document)
#     return train

In [14]:
# taggedDoc = addTag(list(data_text['description_words']))

In [15]:
# doc2vec = Doc2Vec(taggedDoc, vector_size=100, min_count = 10, window = 5, sample = 1e-5, workers=4)
# doc2vec.train(taggedDoc, total_examples=doc2vec.corpus_count, epochs=10)

In [16]:
# doc2vec.save('model_descrip.model') 

### tfidf 

In [17]:
# TF-IDF over the raw descriptions: word n-grams (1 to 3) built from tokens of
# at least five word characters, English stop words removed, capped at the 80
# most frequent terms that occur in at least 10 documents.
desc_tfidf = TfidfVectorizer(min_df=10, max_features=80, strip_accents='unicode', lowercase=True,
                             analyzer='word', token_pattern=r'\w{5,}', ngram_range=(1, 3),
                             sublinear_tf=True, stop_words='english')
desc_tfidf_fit = desc_tfidf.fit_transform(data_text['description'])
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. Fall back to the old name on releases that predate it.
if hasattr(desc_tfidf, 'get_feature_names_out'):
    desc_name = list(desc_tfidf.get_feature_names_out())
else:
    desc_name = list(desc_tfidf.get_feature_names())

In [18]:
# Densify the description TF-IDF matrix and attach one 'desc_<term>' column
# per vocabulary term to data_text.
ar = desc_tfidf_fit.toarray()
for i, name in enumerate(desc_name):
    dname = 'desc_' + name
    # Column i of the dense matrix holds the weights for this term, in the same
    # row order as data_text, so it can be assigned positionally as a whole
    # instead of being copied cell by cell in Python.
    data_text[dname] = ar[:, i]

In [19]:
# Copy every description TF-IDF column back to train and test: the first
# train_len stacked rows belong to train, the remainder to test.
for name in desc_name:
    dname = 'desc_' + name
    stacked_column = data_text[dname]
    train[dname] = stacked_column.iloc[:train_len]
    test[dname] = stacked_column.iloc[train_len:]

## feature 

In [20]:
def word_to_phrase(flist):
    """Collapse each multi-word feature into a single token.

    Underscores are treated as spaces, surrounding whitespace is trimmed, and
    all remaining space characters are removed, so 'Hardwood Floors' becomes
    'HardwoodFloors'. Returns a new list; an empty input gives an empty list.
    """
    return [
        re.sub('[_]', ' ', feature).strip().replace(' ', '')
        for feature in flist
    ]

In [21]:
def list_to_string(flist):
    """Join the strings in flist into one space-separated string."""
    return ' '.join(flist)

In [22]:
# Turn each listing's feature list into single-token phrases, then flatten the
# phrases into one space-separated string per listing for the vectorizer.
phrase_lists = data_text['features'].apply(word_to_phrase)
data_text['features_phr'] = phrase_lists
data_text['features_phr_str'] = phrase_lists.apply(list_to_string)

In [23]:
# TF-IDF over the flattened feature phrases: tokens of at least three word
# characters, English stop words removed, capped at the 80 most frequent terms
# that occur in at least 10 documents.
phr_tfidf = TfidfVectorizer(min_df=10, max_features=80, strip_accents='unicode', lowercase=True,
                            token_pattern=r'\w{3,}', stop_words='english')
phr_tfidf_fit = phr_tfidf.fit_transform(data_text['features_phr_str'])
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. Fall back to the old name on releases that predate it.
if hasattr(phr_tfidf, 'get_feature_names_out'):
    phr_names = list(phr_tfidf.get_feature_names_out())
else:
    phr_names = list(phr_tfidf.get_feature_names())

In [24]:
# Densify the feature-phrase TF-IDF matrix and attach one 'ft_<term>' column
# per vocabulary term to data_text.
ar_feature = phr_tfidf_fit.toarray()
for i, name in enumerate(phr_names):
    fname = 'ft_' + name
    # Read from ar_feature: the previous version indexed `ar`, the description
    # matrix, so every ft_* column silently held description weights.
    data_text[fname] = ar_feature[:, i]

In [25]:
# Copy every feature-phrase TF-IDF column back to train and test: the first
# train_len stacked rows belong to train, the remainder to test.
for name in phr_names:
    fname = 'ft_' + name
    stacked_column = data_text[fname]
    train[fname] = stacked_column.iloc[:train_len]
    test[fname] = stacked_column.iloc[train_len:]

# add new features to train and test

In [26]:
# Encoded ids for train are the first train_len rows of the stacked id frame;
# photo_num is the number of entries in each listing's photo list.
train_codes = data_id.iloc[:train_len]
train['manager_id_num'] = train_codes['manager_id']
train['building_id_num'] = train_codes['building_id']
train['photo_num'] = train['photos'].apply(len)

In [27]:
# Encoded ids for test are the rows after the first train_len of the stacked
# id frame; photo_num is the number of entries in each listing's photo list.
test_codes = data_id.iloc[train_len:]
test['manager_id_num'] = test_codes['manager_id']
test['building_id_num'] = test_codes['building_id']
test['photo_num'] = test['photos'].apply(len)

In [28]:
# Keep the flattened feature string on both frames (train rows come first in
# the stacked text frame).
flattened_features = data_text['features_phr_str']
train['clean_feat'] = flattened_features.iloc[:train_len]
test['clean_feat'] = flattened_features.iloc[train_len:]

In [29]:
# Persist the engineered frames as JSON.
for frame, out_path in (
    (train, 'data/new_modified_train.json'),
    (test, 'data/new_modified_test.json'),
):
    frame.to_json(out_path)

## additional data 

In [41]:
def haversine(df, stations=None):
    """Distance in metres from one listing to its nearest subway station.

    Uses the haversine great-circle formula with an Earth radius of 6371 km.

    Parameters
    ----------
    df : mapping-like (for example one DataFrame row)
        Must provide 'latitude' and 'longitude' in decimal degrees.
    stations : pandas.DataFrame, optional
        Station table with 'lat' and 'lon' columns in decimal degrees. When
        omitted, the notebook-level `subway` frame is used, so existing
        `frame.apply(haversine, axis=1)` calls keep working.

    Returns
    -------
    float
        Smallest distance, in metres, over all stations. Stations with missing
        coordinates are ignored.
    """
    if stations is None:
        # Looked up at call time, so the table may be defined after this cell.
        stations = subway
    lat1 = stations['lat']
    lon1 = stations['lon']
    lat2 = df['latitude']
    lon2 = df['longitude']
    # The sign of the differences is irrelevant: they are only used inside sin()**2.
    dlat = np.deg2rad(lat2 - lat1)
    dlon = np.deg2rad(lon2 - lon1)
    a = np.sin(dlat / 2) ** 2 + np.cos(np.deg2rad(lat1)) * np.cos(np.deg2rad(lat2)) * np.sin(dlon / 2) ** 2
    c = 2 * np.arcsin(np.sqrt(a))
    earth_radius_km = 6371
    distances_m = c * earth_radius_km * 1000
    # Series.min() is vectorised and skips NaN; the builtin min() iterates in
    # Python and returns an order-dependent result when a NaN is present.
    return distances_m.min()

In [42]:
# Load the subway station table and reduce it to one row per distinct
# coordinate pair, with the short column names that haversine() reads.
subway = (
    pd.read_csv('data/subway.csv')[['Station Latitude', 'Station Longitude']]
    .drop_duplicates()
    .rename(columns={'Station Latitude': 'lat', 'Station Longitude': 'lon'})
)

In [43]:
# Distance in metres from every training listing to its nearest subway station.
# haversine() only reads 'latitude' and 'longitude', so apply it to those two
# columns instead of building a full mixed-dtype row Series per listing.
train['distance_subway'] = train[['latitude', 'longitude']].apply(haversine, axis=1)

In [45]:
# Distance in metres from every test listing to its nearest subway station.
# haversine() only reads 'latitude' and 'longitude', so apply it to those two
# columns instead of building a full mixed-dtype row Series per listing.
test['distance_subway'] = test[['latitude', 'longitude']].apply(haversine, axis=1)

In [47]:
# Persist the frames again, this time including the distance_subway column
# (overwrites the files written earlier).
for frame, out_path in (
    (train, 'data/new_modified_train.json'),
    (test, 'data/new_modified_test.json'),
):
    frame.to_json(out_path)