#### Config

In [211]:
import pandas as pd
import numpy as np
import os

In [212]:
DATA_DIR = '/home/bfortuner/workplace/russian_housing/data'
TRAIN_CSV = os.path.join(DATA_DIR, 'train.csv')
TEST_CSV = os.path.join(DATA_DIR, 'test.csv')
MACRO_CSV = os.path.join(DATA_DIR, 'macro.csv')

TRAIN_DF = pd.read_csv(TRAIN_CSV, header=0)
TEST_DF = pd.read_csv(TEST_CSV, header=0)
MACRO_DF = pd.read_csv(MACRO_CSV, header=0)

#### Timestamps

In [213]:
print(TRAIN_DF[['timestamp']].min())
print(TRAIN_DF[['timestamp']].max()) 
print(TEST_DF[['timestamp']].min())
print(TEST_DF[['timestamp']].max())

timestamp    2011-08-20
dtype: object
timestamp    2015-06-30
dtype: object
timestamp    2015-07-01
dtype: object
timestamp    2016-05-30
dtype: object


#### Look at columns

#### Get Features

In [231]:
features = TRAIN_DF[['timestamp', 'full_sq']]
features.sort_values(by='timestamp').head(3)
labels = TRAIN_DF[['price_doc']]
features['timestamp'] = features['timestamp'].astype('datetime64[ns]').view('int64')
#features['timestamp'] = pd.to_numeric(features['timestamp'], errors='raise')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [232]:
def norm_arr(arr, mean, std):
    return (arr - mean) / std

def denorm_arr(arr, mean, std):
    return (arr * std) + mean

FEATURES_MEAN = features.mean()
FEATURES_STD = features.std()
LABELS_MEAN = labels.mean()
LABELS_STD =  labels.std()

features = norm_arr(features, FEATURES_MEAN, FEATURES_STD)
labels = norm_arr(labels, LABELS_MEAN, LABELS_STD)

In [233]:
from sklearn.model_selection import train_test_split
VALIDATION_SIZE = int(len(features)*.3)
train_dset = features[:-VALIDATION_SIZE]
train_labels = labels[:-VALIDATION_SIZE]
val_dset = features[-VALIDATION_SIZE:]
val_labels = labels[-VALIDATION_SIZE:]
print(train_dset.shape)
print(val_dset.shape)
print(train_labels.shape)
print(val_labels.shape)
assert len(train_dset)+len(val_dset) == len(features)

(21330, 2)
(9141, 2)
(21330, 1)
(9141, 1)


In [234]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()

# fit the model to the training data (learn the coefficients)
linreg.fit(train_dset, train_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [235]:
# Intercept and Coefficients
print (linreg.intercept_)
print (linreg.coef_)

# pair the feature names with the coefficients
feature_names = ['timestamp', 'full_sq']


[-0.01582131]
[[ 0.09314353  0.24868134]]


In [236]:
price_predictions = linreg.predict(val_dset)
price_predictions[0:3]   #numpy array

array([[ 0.15674766],
       [ 0.17636414],
       [ 0.17010164]])

In [237]:
from sklearn import metrics

# calculate MAE by hand
# (10 + 0 + 20 + 10) / 4.

print(metrics.mean_absolute_error(val_labels, price_predictions))
print(metrics.mean_squared_error(val_labels, price_predictions))
print(np.sqrt(metrics.mean_squared_error(val_labels, price_predictions)))
print(np.sqrt( np.mean(( np.log(price_predictions+1) - np.log(val_labels+1) )**2) ))

0.598734833686
1.0388522366
1.01924101007
price_doc    0.751829
dtype: float64




#### Make Submission

In [238]:
test_features = TEST_DF[['timestamp', 'full_sq']]
test_ids = TEST_DF[['id']].astype(int)
test_features['timestamp'] = test_features['timestamp'].astype('datetime64[ns]').view('int64')
test_features = norm_arr(test_features, FEATURES_MEAN, FEATURES_STD)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [239]:
test_predictions = linreg.predict(test_features)

In [240]:
test_predictions = denorm_arr(test_predictions, np.expand_dims(LABELS_MEAN,1), np.expand_dims(LABELS_STD,1))

In [241]:
submission_arr = np.concatenate((test_ids.astype('int'), test_predictions), axis=1)

In [242]:
submission_df = pd.DataFrame(submission_arr,columns=['id','price_doc'])

In [228]:
submission_df['id'] = submission_df['id'].astype('int')

In [229]:
submission_df.to_csv('submission.csv',index=False)