In [1]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LinearRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR
from nltk.corpus import wordnet
from datetime import datetime, date
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
%matplotlib inline

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ponomarevandrew/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
# Load the labelled listings and the unlabelled listings to be scored.
train, test = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')
# Show (rows, columns) of the labelled frame as the cell output.
train.shape

(51815, 43)

In [3]:
# Hold out 30% of the labelled rows for validation; the fixed seed keeps the
# split identical across re-runs.
train_X, valid_X = train_test_split(
    train,
    test_size=0.3,
    random_state=42,
)

In [4]:
# Free-text columns; dropped before modelling in the cells below.
long_text_cols = [
    'name', 'summary', 'space', 'description',
    'neighborhood_overview', 'notes', 'transit', 'access',
    'interaction', 'house_rules', 'host_about', 'amenities',
]

# Categorical columns; one-hot encoded, then the raw columns are dropped.
categoric_cols = [
    'host_response_time', 'property_type', 'bed_type', 'cancellation_policy',
    'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',
    'is_location_exact', 'require_guest_profile_picture',
    'require_guest_phone_verification', 'room_type',
]

# Numeric columns; missing values are imputed with a per-column median.
number_cols = [
    'latitude', 'longitude', 'accommodates',
    'bathrooms', 'bedrooms', 'beds',
    'guests_included', 'extra_people', 'minimum_nights',
]

# Columns that receive bespoke handling inside features_transform.
special_cols = [
    'host_since',
    'place_price',
    'security_deposit',
    'cleaning_fee',
]

In [5]:
# Mean listing price per neighbourhood, computed on the training split only.
# Joined back onto each frame as the 'place_price' feature.
agg_place = (
    train_X
    .groupby('neighbourhood_cleansed')['price']
    .mean()
    .reset_index()
    .rename(columns={'price': 'place_price'})
)

In [6]:
# Per-column medians used to impute missing numeric values.
# NOTE(review): these are taken from the full `train` frame (validation rows
# included), whereas agg_place is computed from train_X only - confirm this
# is intended.
num_col_medians = {name: train[name].median() for name in number_cols}

In [7]:
# Replace missing categorical values with the literal string 'None' so that
# "missing" becomes its own category for the encoder.
# Assign the filled columns back explicitly: the chained form
# `train[col].fillna(..., inplace=True)` operates on a temporary under pandas
# Copy-on-Write and would leave `train` unchanged.
train[categoric_cols] = train[categoric_cols].fillna('None')

# Categories unseen at fit time are encoded as all-zero rows at transform time
# instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(train[categoric_cols]);

In [8]:
def features_transform(df, default_place_price=115):
    """Turn a raw listings frame into a frame with imputed and one-hot features.

    Relies on notebook-level objects prepared in earlier cells: ``agg_place``,
    ``num_col_medians``, ``number_cols``, ``categoric_cols`` and the fitted
    ``encoder``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings. Must contain ``host_since`` (``'%Y-%m-%d'`` strings or
        missing), ``neighbourhood_cleansed``, ``security_deposit``,
        ``cleaning_fee``, the six columns dropped below, and every column in
        ``number_cols`` and ``categoric_cols``. The input is not modified.
    default_place_price : float, default 115
        Value used for ``place_price`` when a row's neighbourhood is absent
        from ``agg_place``.
        NOTE(review): 115 is presumably close to the overall mean price -
        confirm.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, on a fresh 0..n-1 index (the
        merge discards the original index). Text and raw categorical columns
        are still present; callers drop them afterwards. One-hot columns are
        appended on the right.
    """
    new_df = df.copy()

    # Host tenure in whole 30-day periods as of today; missing dates fall back
    # to 2018-01-01. Note the value depends on the day the notebook is run.
    today = date.today()
    host_since = new_df['host_since'].fillna('2018-01-01')
    new_df['host_since'] = host_since.apply(
        lambda s: (today - datetime.strptime(s, '%Y-%m-%d').date()).days // 30
    )

    # Attach the neighbourhood mean price computed on the training split.
    new_df = new_df.merge(agg_place, how='left', on='neighbourhood_cleansed')

    new_df = new_df.drop(
        columns=['experiences_offered', 'host_id', 'host_response_rate',
                 'neighbourhood_cleansed', 'zipcode', 'square_feet']
    )

    # Impute everything in a single non-chained fillna. The chained
    # `new_df[col].fillna(..., inplace=True)` form acts on a temporary under
    # pandas Copy-on-Write and silently leaves the NaNs in place.
    fill_values = {
        'place_price': default_place_price,
        'security_deposit': 0,
        'cleaning_fee': 0,
    }
    fill_values.update({col: num_col_medians[col] for col in number_cols})
    fill_values.update({col: 'None' for col in categoric_cols})
    new_df = new_df.fillna(value=fill_values)

    encoded = encoder.transform(new_df[categoric_cols]).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on old versions that lack get_feature_names_out().
    if hasattr(encoder, 'get_feature_names_out'):
        dummy_names = encoder.get_feature_names_out()
    else:
        dummy_names = encoder.get_feature_names()

    # Share new_df's index explicitly so concat pairs rows positionally rather
    # than depending on both frames happening to carry a default RangeIndex.
    df_dumm = pd.DataFrame(encoded, columns=dummy_names, index=new_df.index)
    return pd.concat([new_df, df_dumm], axis=1)

In [9]:
# Columns that never reach the model: free text, raw categoricals (their
# one-hot expansion is kept) and the target itself.
non_feature_cols = long_text_cols + categoric_cols + ['price']

X_train_num = features_transform(train_X).drop(columns=non_feature_cols)
# Build the validation matrix from the held-out split. It was previously built
# from train_X, which made it a copy of the training matrix.
X_valid_num = features_transform(valid_X).drop(columns=non_feature_cols)

In [10]:
# Same feature pipeline for the frame to be scored; it carries no 'price'
# column, so only text and raw categorical columns are removed.
test_features = features_transform(test)
X_test_num = test_features.drop(columns=long_text_cols + categoric_cols)


In [11]:
# Scaled linear-regression baseline (defined here; not fitted in the cells
# shown below).
pipeline = Pipeline([('liner', StandardScaler()),
                     ('clf', LinearRegression())])

# Seed the forest so cross-validation scores and the submission are
# reproducible across kernel restarts (same seed as the train/valid split).
forest = RandomForestRegressor(n_estimators=50, random_state=42)

In [12]:
# 3-fold cross-validated negative MSE of the forest on the training split
# (values closer to zero are better).
cvs = cross_val_score(
    estimator=forest,
    X=X_train_num,
    y=train_X['price'],
    scoring='neg_mean_squared_error',
    cv=3,
)

In [13]:
# Fit the forest on the whole training split; the fitted estimator is the
# cell output.
forest.fit(X=X_train_num, y=train_X['price'])


RandomForestRegressor(n_estimators=50)

In [15]:
# Predict using exactly the columns, in the same order, that the forest was
# fitted on. This keeps the cell re-runnable: after the first run X_test_num
# also holds 'price', which must not be passed back in as a feature.
X_test_num['price'] = forest.predict(X_test_num[X_train_num.columns])

In [16]:
# Write the submission file: listing id and predicted price, no index column.
X_test_num.to_csv('answer_1.csv', columns=['id', 'price'], index=False)
