In [34]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import seaborn as sns
from collections import Counter
from dateutil.relativedelta import relativedelta
import datetime
import copy
import pdb

In [2]:
pd.set_option('max_columns', None)

In [31]:
start = time.time()
df_train = pd.read_csv("raw_data/train_data.csv", parse_dates=[1])
df_test = pd.read_csv("raw_data/test_data.csv", parse_dates=[1])
end = time.time()
print("Train set loaded in ", end - start, " seconds.")

Train set loaded in  24.63033938407898  seconds.


In [None]:
df_train.hist(figsize=(20, 20))

In [None]:
df_test.hist(figsize=(20, 20))

In [4]:
def get_cross_val_splits(df_train, train_inds, val_inds):
    train_data= df_train.iloc[train_inds]
    X_train = train_data.loc[:, ~train_data.columns.isin(['srch_id','rel'])]
    y_train = train_data.loc[:, train_data.columns.isin(['rel'])]
    train_groups = train_data.groupby('srch_id').size().to_frame('size')['size'].to_numpy()
    
    val_data= df_train.iloc[val_inds]
    #We need to keep the id for later predictions
    X_val = val_data.loc[:, ~val_data.columns.isin(['srch_id','rel'])]
    X_val_with_srch_id = val_data.loc[:, ~val_data.columns.isin(['rel'])]
    y_val = val_data.loc[:, val_data.columns.isin(['rel'])]
    eval_groups = X_val_with_srch_id.groupby('srch_id').size().to_frame('size')['size'].to_numpy()
    
    return X_train, y_train, train_groups, X_val, y_val, eval_groups 

In [None]:
# Train and Test Data have literally the same distribution 
# Is it safe to say that model overfitted to the trainset will do well on the test set?
from sklearn.model_selection import GroupShuffleSplit
gss = GroupShuffleSplit(test_size=0.1, n_splits=5, random_state = 7)
for train_inds, val_inds in gss.split(df_train, groups=df_train['srch_id']):
    X_train, y_train, train_groups, X_val, y_val, eval_groups = get_cross_val_splits(df_train, train_inds, val_inds)
    print(len(X_val))
    print(len(X_train))
    X_val.hist(figsize=(20, 20))
    
    

In [None]:
# Todo add mean prop_id numeric features 

In [41]:
prop_columns = [feature for feature in df_train.columns if 'prop' in feature]
prop_columns.append('price_usd')
df_train_subset = df_train[df_test.columns]
full_data = pd.concat([df_test, df_train_subset])
full_data = full_data[prop_columns]
prop_columns.remove('prop_id')
property_df = full_data.groupby('prop_id')[prop_columns].agg(np.mean).reset_index()
property_df['prop_review_score'].fillna(-1, inplace=True)
x_train = property_df[~property_df['prop_location_score2'].isnull()]
x_test = property_df[property_df['prop_location_score2'].isnull()]
y_train = x_train['prop_location_score2']
x_train.drop(columns=['prop_location_score2'], inplace=True)
x_test.drop(columns=['prop_location_score2'], inplace=True)
from xgboost import XGBRegressor
model = XGBRegressor(n_estimators=500, objective='reg:squarederror', max_depth=7, eta=0.2, subsample=0.7, colsample_bytree=0.8, verbose=1)
model.fit(x_train, y_train, eval_set=[(x_train, y_train)], eval_metric='rmse')
preds = model.predict(x_test)
preds[preds < 0] = 0
x_train['prop_location_score2'] = y_train
x_test['prop_location_score2'] = preds
filled_prop_df = pd.concat([x_train, x_test])
filled_prop_df.to_csv('preprocessed_data/filled_prop_df.csv', index=False)

Parameters: { "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-rmse:0.32919
[1]	validation_0-rmse:0.27702
[2]	validation_0-rmse:0.23762


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


[3]	validation_0-rmse:0.20851
[4]	validation_0-rmse:0.18773
[5]	validation_0-rmse:0.17228
[6]	validation_0-rmse:0.16200
[7]	validation_0-rmse:0.15444
[8]	validation_0-rmse:0.14969
[9]	validation_0-rmse:0.14605
[10]	validation_0-rmse:0.14369
[11]	validation_0-rmse:0.14201
[12]	validation_0-rmse:0.14098
[13]	validation_0-rmse:0.13987
[14]	validation_0-rmse:0.13918
[15]	validation_0-rmse:0.13863
[16]	validation_0-rmse:0.13820
[17]	validation_0-rmse:0.13794
[18]	validation_0-rmse:0.13765
[19]	validation_0-rmse:0.13715
[20]	validation_0-rmse:0.13669
[21]	validation_0-rmse:0.13640
[22]	validation_0-rmse:0.13608
[23]	validation_0-rmse:0.13595
[24]	validation_0-rmse:0.13583
[25]	validation_0-rmse:0.13563
[26]	validation_0-rmse:0.13553
[27]	validation_0-rmse:0.13541
[28]	validation_0-rmse:0.13532
[29]	validation_0-rmse:0.13521
[30]	validation_0-rmse:0.13496
[31]	validation_0-rmse:0.13485
[32]	validation_0-rmse:0.13470
[33]	validation_0-rmse:0.13462
[34]	validation_0-rmse:0.13456
[35]	validation

[263]	validation_0-rmse:0.11636
[264]	validation_0-rmse:0.11632
[265]	validation_0-rmse:0.11625
[266]	validation_0-rmse:0.11617
[267]	validation_0-rmse:0.11613
[268]	validation_0-rmse:0.11607
[269]	validation_0-rmse:0.11600
[270]	validation_0-rmse:0.11595
[271]	validation_0-rmse:0.11587
[272]	validation_0-rmse:0.11582
[273]	validation_0-rmse:0.11578
[274]	validation_0-rmse:0.11570
[275]	validation_0-rmse:0.11563
[276]	validation_0-rmse:0.11555
[277]	validation_0-rmse:0.11549
[278]	validation_0-rmse:0.11546
[279]	validation_0-rmse:0.11540
[280]	validation_0-rmse:0.11537
[281]	validation_0-rmse:0.11527
[282]	validation_0-rmse:0.11521
[283]	validation_0-rmse:0.11518
[284]	validation_0-rmse:0.11510
[285]	validation_0-rmse:0.11503
[286]	validation_0-rmse:0.11495
[287]	validation_0-rmse:0.11490
[288]	validation_0-rmse:0.11485
[289]	validation_0-rmse:0.11478
[290]	validation_0-rmse:0.11473
[291]	validation_0-rmse:0.11464
[292]	validation_0-rmse:0.11460
[293]	validation_0-rmse:0.11457
[294]	va

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [14]:
prop_columns

['prop_country_id',
 'prop_starrating',
 'prop_review_score',
 'prop_brand_bool',
 'prop_location_score1',
 'prop_location_score2',
 'prop_log_historical_price',
 'price_usd']

In [None]:
def add_normalised_prop_features(df):
    df_clone = copy.deepcopy(df)
    filled_prop_df = pd.read_csv('preprocessed_data/filled_prop_df.csv')
    selector = pd.DataFrame(filled_prop_df.groupby('prop_id')['price_usd'].mean())
    selector.columns =  ['price_usd_mean']
    df_clone = pd.merge(df_clone, selector, how='left', on='prop_id')
    
    selector = pd.DataFrame(filled_prop_df.groupby('prop_id')['prop_location_score2'].mean())
    selector.columns =  ['prop_location_score2_new']
    df_clone = pd.merge(df_clone, selector, how='left', on='prop_id')
    
    df_clone.drop(columns=['prop_location_score2'], inplace=True)
    
    return df_clone
    
    
    
    
    
    

In [28]:
df_test = add_normalised_prop_features(df_test)

NameError: name 'add_normalised_prop_features' is not defined

In [45]:
filled_prop_df = pd.read_csv('preprocessed_data/filled_prop_df.csv')

TypeError: read_csv() got an unexpected keyword argument 'index'

In [40]:
filled_prop_df.head()

Unnamed: 0.1,Unnamed: 0,prop_id,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_log_historical_price,price_usd,prop_location_score2
0,1,2,219,0,4.0,1,0.69,3.3,96.272727,0.0361
1,2,3,219,3,3.5,1,0.69,4.165122,109.026098,0.035055
2,3,4,109,5,4.5,1,4.88,4.965,468.611818,0.1155
3,5,6,158,3,0.0,0,2.3,3.664,46.498,0.018763
4,6,7,31,0,5.0,0,6.49,4.374286,139.577143,0.1247


In [39]:
filled_prop_df_new.head()

Unnamed: 0,prop_id,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_log_historical_price,price_usd,prop_location_score2
1,2,219,0,4.0,1,0.69,3.74,97.149655,0.06679
2,3,219,3,3.5,1,0.69,4.140465,117.853895,0.026578
3,4,109,5,4.5,1,4.88,4.914048,485.960952,0.1155
5,6,158,3,0.0,0,2.3,3.592353,47.25,0.018408
6,7,31,0,5.0,0,6.49,4.035,148.463571,0.1247
