In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from math import sqrt

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
%matplotlib inline

In [38]:
train_df = pd.read_csv("train.csv", low_memory=False)
test_df = pd.read_csv("test.csv", low_memory=False)
data = pd.read_csv("data_set.csv", low_memory = False)

In [39]:
train_df = train_df[['id','precio']]
train_df['precio'].value_counts()

1500000.0    3041
2500000.0    3036
1200000.0    2909
3500000.0    2726
1300000.0    2480
             ... 
2704600.0       1
1352250.0       1
938260.0        1
2704460.0       1
2327000.0       1
Name: precio, Length: 15520, dtype: int64

In [40]:
data.shape

(300000, 12)

In [41]:
test_df.shape

(60000, 22)

In [42]:
train_df.shape

(240000, 2)

In [43]:
features = pd.merge(train_df, data, on='id', how='inner')
features = features.fillna(0)
features['precio'].value_counts()

1500000.0    3041
2500000.0    3036
1200000.0    2909
3500000.0    2726
1300000.0    2480
             ... 
2704600.0       1
1352250.0       1
938260.0        1
2704460.0       1
2327000.0       1
Name: precio, Length: 15520, dtype: int64

In [44]:
labels = features['precio']
features = features.drop(['id','precio'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.25)
print("Train: ",len(x_train),"Test: ",len(x_test))

Train:  180000 Test:  60000


In [45]:
neighbors = list(range(1,30))
train_results = []
test_results = []
best = [9999999999999,1]
mas_val = []
for n in neighbors:
    model = KNeighborsRegressor(n_neighbors = n)

    model.fit(x_train, y_train)
    pred=model.predict(x_test)
    error = sqrt(mean_absolute_error(y_test,pred))
    mas_val.append(error)
    print('MAS value for k= ' , n , 'is:', error)
    if best[0] > error:
            best[0] = error
            best[1] = n


MAS value for k=  1 is: 1094.45137533987
MAS value for k=  2 is: 1044.430449942296
MAS value for k=  3 is: 1024.900213928708
MAS value for k=  4 is: 1016.3442093068994
MAS value for k=  5 is: 1009.0793818228574
MAS value for k=  6 is: 1004.866039640221
MAS value for k=  7 is: 1002.7355115608691
MAS value for k=  8 is: 1000.0745269957802
MAS value for k=  9 is: 998.4767521867534
MAS value for k=  10 is: 997.7785344270208
MAS value for k=  11 is: 996.4956158101188
MAS value for k=  12 is: 996.0035114807299
MAS value for k=  13 is: 995.4400684068045
MAS value for k=  14 is: 995.4215249713011
MAS value for k=  15 is: 995.1267005825048
MAS value for k=  16 is: 994.9121014567066
MAS value for k=  17 is: 994.3950722695288
MAS value for k=  18 is: 994.3886956581806
MAS value for k=  19 is: 994.3714729040846
MAS value for k=  20 is: 994.3619324253451
MAS value for k=  21 is: 994.286006196534
MAS value for k=  22 is: 993.7962803864023
MAS value for k=  23 is: 994.1517295540768
MAS value for k=  

In [46]:
min(mas_val)

993.7962803864023

In [47]:
best

[993.7962803864023, 22]

In [48]:
set_test = pd.merge(data, test_df[['id']],on = "id", how = "inner")
ids = set_test["id"]
set_test = set_test.drop(columns=['id'])

In [49]:
set_test = set_test.fillna(0)

In [50]:
set_test.head()

Unnamed: 0,habitaciones,garages,banos,anio_x,gimnasio,usosmultiples,piscina,anio_y,qty_of_extras,metroscubiertos,metrostotales
0,3.0,0.0,4.0,2013,0.0,0.0,0.0,2013,0.0,300.0,0.0
1,1.0,1.0,1.0,2015,0.0,0.0,0.0,2015,0.0,67.0,67.0
2,2.0,1.0,2.0,2015,0.0,0.0,0.0,2015,0.0,87.0,100.0
3,2.0,2.0,2.0,2015,0.0,0.0,0.0,2015,0.0,86.0,86.0
4,2.0,1.0,1.0,2013,0.0,0.0,0.0,2013,0.0,80.0,76.0


In [51]:
model = KNeighborsRegressor(n_neighbors=16)
model.fit(x_train, y_train)

preds_kaggel_knn = model.predict(set_test)

In [52]:
df_kaggel_knn = pd.DataFrame(preds_kaggel_knn)
df_kaggel_knn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 1 columns):
0    60000 non-null float64
dtypes: float64(1)
memory usage: 468.9 KB


In [53]:
df_kaggel_knn = pd.DataFrame(df_kaggel_knn)

In [54]:
df_kaggel_knn.head()

Unnamed: 0,0
0,5593125.0
1,1049375.0
2,1083270.0
3,2224310.0
4,570187.5


In [55]:
# df_kaggel_knn = df_kaggel_knn.to_frame()# df_kagg 
df_kaggel_knn["id"] = ids
df_kaggel_knn.rename(columns = {0: 'target'},inplace = True) 
df_kaggel_knn = df_kaggel_knn[['id','target']]

In [56]:
df_kaggel_knn.head()

Unnamed: 0,id,target
0,4941,5593125.0
1,51775,1049375.0
2,115253,1083270.0
3,299321,2224310.0
4,173570,570187.5


In [57]:
df_kaggel_knn.to_csv('ResultadosKNN.csv',index=False)

In [58]:
df_kaggel_knn.shape

(60000, 2)

In [59]:
df_kaggel_knn.head()

Unnamed: 0,id,target
0,4941,5593125.0
1,51775,1049375.0
2,115253,1083270.0
3,299321,2224310.0
4,173570,570187.5
