In [None]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
import datetime
import time

In [None]:
# Load the raw property attributes from the backup CSV.
properties_csv = '../data/properties_2016_backup.csv'
house_df = pd.read_csv(properties_csv)

In [None]:
# Keep only the columns with fewer than 2,900,000 missing values;
# anything sparser than that is dropped from the working frame.
nan = house_df.isnull().sum().loc[lambda missing: missing < 2900000]
nan_feature = list(nan.index)
house_nan_df = house_df.loc[:, nan_feature]

In [None]:
# Delete categorical features.
#
# The columns below are category codes / identifiers rather than numeric
# measurements, so they are excluded from the numeric feature set.
cate_feature = ['airconditioningtypeid','fips','heatingorsystemtypeid','pooltypeid7','propertycountylandusecode','propertylandusetypeid',
                'propertyzoningdesc','rawcensustractandblock','regionidcity','regionidcounty','regionidneighborhood','regionidzip',
                'censustractandblock']

# Build a *new* list instead of calling nan_feature.remove(...):
#   * in-place removal also rewrote nan_feature, because both names were
#     bound to the same list object;
#   * running the cell a second time raised ValueError, since the entries
#     had already been removed.
# A set makes the membership test cheap and the comprehension keeps the
# original column order.
cate_feature_set = set(cate_feature)
nan_cate_feature = [name for name in nan_feature if name not in cate_feature_set]

house_nan_cate_df = house_nan_df.loc[:,nan_cate_feature]

In [None]:
# Delete highly correlated features.
#
# These columns are treated as redundant with other retained columns and are
# removed before modelling.
highcorr_feature = ['finishedsquarefeet12','finishedsquarefeet15', 'finishedsquarefeet50','taxvaluedollarcnt']

# Derive a fresh list rather than mutating nan_cate_feature in place: the
# original `.remove()` loop changed the previous stage's list through an alias
# and raised ValueError if this cell was executed twice.
nan_cate_co_feature = [name for name in nan_cate_feature if name not in highcorr_feature]

house_nan_co_cate_df = house_nan_cate_df.loc[:,nan_cate_co_feature]

# Rename the dataframe after deleting many features; every later cell works
# on house_new_df.
house_new_df = house_nan_co_cate_df

In [None]:
# Missing values per column: total rows minus the non-null count of each column.
len(house_new_df) - house_new_df.count()

In [None]:
# A missing pool count is replaced by zero.
nan_zero = 'poolcnt'
pool_counts = house_new_df[nan_zero]
house_new_df[nan_zero] = pool_counts.fillna(0)

In [None]:
# Summary statistics of the pool-count column after the zero fill.
house_new_df.loc[:, 'poolcnt'].describe()

In [None]:
# Numeric features whose missing values are replaced by the column mean.
#
# 'parcelid' is intentionally NOT part of this list: it is the key used to
# join the property table to the training and submission tables, and
# mean-imputing an identifier would fabricate parcel ids instead of exposing
# the missing key.
nan_mean = ['bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid','calculatedbathnbr', 'finishedfloor1squarefeet',
       'calculatedfinishedsquarefeet', 'fireplacecnt', 'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'latitude', 'longitude',
       'lotsizesquarefeet', 'roomcnt', 'threequarterbathnbr', 'unitcnt', 'yearbuilt', 'numberofstories', 'structuretaxvaluedollarcnt',
       'assessmentyear', 'landtaxvaluedollarcnt', 'taxamount']

# fillna with a Series of means aligns on column name, so every column is
# filled with its own mean.
house_new_df[nan_mean] = house_new_df[nan_mean].fillna(house_new_df[nan_mean].mean())

In [None]:
# Summary statistics after the zero/mean imputation above, to check the filled columns.
house_new_df.describe()

In [None]:
# Columns to standardise (z-score): subtract the column mean, divide by the
# column standard deviation.
cols_to_norm = ['bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr', 'finishedfloor1squarefeet',
                'calculatedfinishedsquarefeet', 'fireplacecnt', 'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'latitude', 'longitude',
                'lotsizesquarefeet', 'poolcnt', 'roomcnt', 'threequarterbathnbr', 'unitcnt', 'yearbuilt', 'numberofstories',
                'structuretaxvaluedollarcnt', 'assessmentyear', 'landtaxvaluedollarcnt', 'taxamount']

# One column at a time, so only a single column-sized temporary is alive.
for feature_name in cols_to_norm:
    column = house_new_df[feature_name]
    centered = column - column.mean()
    house_new_df[feature_name] = centered / column.std()

In [None]:
# Summary statistics after standardisation; the scaled columns should show mean ~0 and std ~1.
house_new_df.describe()

In [None]:
# Load the training table from disk.
train_csv = '../data/train_2016_v2.csv'
train_df = pd.read_csv(train_csv)

In [None]:
# Display the training table read from ../data/train_2016_v2.csv.
train_df

In [None]:
# Attach the prepared property features to every training row, matching on
# parcelid; a left join keeps all rows of train_df.
join_df = pd.merge(train_df, house_new_df, on='parcelid', how='left')

In [None]:
# Display the training rows joined with the property features.
join_df

In [None]:
# Training data set: prediction target.
# The regression target is the 'logerror' column of the joined frame.
target = join_df['logerror']

In [None]:
# Training data set: attributes.
#
# Select the predictors by name instead of by position: `iloc[:, 2:]` only
# worked while 'parcelid' and 'logerror' happened to be the first two columns,
# and would silently leak the target into the features if the column order
# changed. `drop` also returns an independent frame, so the later
# transactiondate conversion cannot write back into join_df.
# The result keeps 'transactiondate' followed by the property features.
train = join_df.drop(['parcelid', 'logerror'], axis=1)

In [None]:
# Convert transactiondate ('YYYY-MM-DD' strings) into float epoch seconds.
#
# The values are still produced by time.mktime (local-time seconds), so they
# match what date_transform generates for the prediction rows. Compared with
# the former row-by-row loop this version:
#   * does not shadow the builtin `str`;
#   * does not rely on DataFrame.set_value, which was removed in pandas 1.0;
#   * does not assume the row labels are 0..n-1;
#   * parses each distinct date once instead of once per row.
date_strings = train['transactiondate']
epoch_seconds = {
    date_string: time.mktime(datetime.datetime.strptime(date_string, '%Y-%m-%d').timetuple())
    for date_string in date_strings.unique()
}

# assign() returns a new frame, so no write goes through a slice of join_df.
train = train.assign(transactiondate=date_strings.map(epoch_seconds).astype(float))

In [None]:
# Baseline: ordinary least squares on the training attributes.
# fit() returns the estimator itself, so construction and fitting can be chained.
lr = linear_model.LinearRegression().fit(train, target)

In [None]:
# Models
#
# Alternatives tried earlier (kept for reference):
#   linear_model.LinearRegression()
#   linear_model.Ridge(alpha=1.0)
#
# The estimator is fitted on `train`, the attribute frame whose
# transactiondate column has been converted to float. The previous code fitted
# on `feature`, a name whose only definition is commented out, so the cell
# raised NameError on a fresh kernel.
model = RandomForestRegressor(max_depth=6, random_state=0)
model = model.fit(train, target)

In [None]:
# Show the fitted estimator (its repr lists the configured hyper-parameters).
model

In [None]:
# Load the submission template; its ParcelId column lists the parcels to predict.
sample_submission_csv = "../data/sample_submission.csv"
sample = pd.read_csv(sample_submission_csv)

In [None]:
# Pull the ParcelId column out of the submission template as a Series.
sample1 = sample['ParcelId']

In [None]:
# sample1 is a Series at this point, so it has a `name`, not `columns`;
# asking for `.columns` raised AttributeError and stopped a top-to-bottom run.
sample1.name

In [None]:
# Rename the Series so it becomes a 'parcelid' column once converted to a frame.
# Assigning to `.columns` on a Series only attached a stray attribute and left
# the name 'ParcelId' unchanged.
sample1 = sample1.rename('parcelid')

In [None]:
# Promote the ParcelId Series to a single-column DataFrame.
# NOTE(review): not re-runnable as written; once sample1 is a DataFrame a second
# to_frame() call is expected to fail.
sample1 = sample1.to_frame()

In [None]:
# Inspect the column labels of sample1 after the to_frame() conversion.
sample1.columns

In [None]:
# Repeat of the previous inspection: show sample1's column labels again.
sample1.columns

In [None]:
# Build the prediction frame: one row per parcel listed in the submission
# template, carrying the prepared property features.
#
# The double-bracket selection yields a one-column DataFrame directly, and the
# column is renamed to 'parcelid' so it matches the join key of house_new_df.
# (The former mid-cell `sample1.describe()` was removed: its result was
# neither displayed nor stored.)
sample1 = sample[['ParcelId']].rename(columns={'ParcelId': 'parcelid'})

# Left join keeps every requested parcel, in template order.
df_test = sample1.merge(house_new_df, on='parcelid', how='left')

In [None]:
# The identifier is not a model input; remove it from the prediction frame
# in place (same effect as `del` on the column).
df_test.drop('parcelid', axis=1, inplace=True)

In [None]:
def date_transform(df_test):
    """Convert the 'transactiondate' column to float epoch seconds and return the frame as an array.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Frame containing a 'transactiondate' column of 'YYYY-MM-DD' strings.
        The column is overwritten in place with float seconds since the epoch,
        computed with ``time.mktime`` (local time).

    Returns
    -------
    numpy.ndarray
        The values of ``df_test`` after the conversion, columns in frame order.

    Raises
    ------
    ValueError
        If a date string does not match the '%Y-%m-%d' format.
    """
    dates = df_test['transactiondate']

    # Parse each distinct date once; prediction frames repeat a single date
    # for every row, so this replaces a per-row strptime/set_value loop.
    epoch_by_date = {
        date_string: time.mktime(datetime.datetime.strptime(date_string, '%Y-%m-%d').timetuple())
        for date_string in dates.unique()
    }

    # Series.map works for any index, unlike the removed set_value(num, ...)
    # which treated the enumeration position as a row label.
    df_test['transactiondate'] = dates.map(epoch_by_date).astype(float)

    # `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    test_matrix = df_test.values
    return test_matrix

In [None]:
# Build the submission: one prediction column per requested month.
result = sample1.copy()
result.columns = ['ParcelId']

# Submission column name -> transaction date fed to the model for that month
# (the 15th of each month is used as the date for the whole month).
prediction_dates = [
    ('201610', '2016-10-15'),
    ('201611', '2016-11-15'),
    ('201612', '2016-12-15'),
    ('201710', '2017-10-15'),
    ('201711', '2017-11-15'),
    ('201712', '2017-12-15'),
]

for column_name, transaction_date in prediction_dates:
    # Work on a copy so df_test itself never gains a transactiondate column.
    month_features = df_test.copy()
    # transactiondate goes first so the column order matches the training frame.
    month_features.insert(0, 'transactiondate', transaction_date)
    month_matrix = date_transform(month_features)
    # Appending in list order gives the layout
    # ParcelId, 201610, 201611, 201612, 201710, 201711, 201712.
    result[column_name] = model.predict(month_matrix)
    # Release the per-month copies before building the next ones.
    del month_features, month_matrix

output_csv = '../predictions/sample4.csv'  # need to change filename per run
result.to_csv(output_csv, index=False, header=True)