In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from math import sqrt

from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
%matplotlib inline

In [2]:
train_df = pd.read_csv("train.csv", low_memory=False)
test_df = pd.read_csv("test.csv", low_memory=False)
data = pd.read_csv("data_set.csv", low_memory = False)

In [3]:
train_df = train_df[['id','precio']]
train_df['precio'].value_counts()

1500000.0    3041
2500000.0    3036
1200000.0    2909
3500000.0    2726
1300000.0    2480
             ... 
2704600.0       1
1352250.0       1
938260.0        1
2704460.0       1
2327000.0       1
Name: precio, Length: 15520, dtype: int64

In [4]:
data.shape

(300000, 81)

In [5]:
test_df.shape

(60000, 22)

In [6]:
train_df.shape

(240000, 2)

In [7]:
features = pd.merge(train_df, data, on='id', how='inner')
features = features.fillna(0)
features['precio'].value_counts()

1500000.0    3041
2500000.0    3036
1200000.0    2909
3500000.0    2726
1300000.0    2480
             ... 
2704600.0       1
1352250.0       1
938260.0        1
2704460.0       1
2327000.0       1
Name: precio, Length: 15520, dtype: int64

In [8]:
labels = features['precio']
features = features.drop(['id','precio'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.25)
print("Train: ",len(x_train),"Test: ",len(x_test))

Train:  180000 Test:  60000


In [9]:
metrics = ['euclidean','manhattan','cosine','jaccard','hamming']
neighbors = list(range(1,30, 2))
train_results = []
test_results = []
best = [9999999999999,1, 1]
mas_val = []
for m in metrics:
    for n in neighbors:
        model = KNeighborsRegressor(n_neighbors = n, metric=m)

        model.fit(x_train, y_train)
        pred=model.predict(x_test)
        error = sqrt(mean_absolute_error(y_test,pred))
        mas_val.append(error)
        print('MAS value for m= ' , m , 'is:', error)
        if best[0] > error:
                best[0] = error
                best[1] = m
                best[2] = n


MAS value for m=  euclidean is: 965.8301160745955
MAS value for m=  euclidean is: 907.5134501298957
MAS value for m=  euclidean is: 895.5144810814991
MAS value for m=  euclidean is: 891.1687324773228
MAS value for m=  euclidean is: 889.7746253291502
MAS value for m=  euclidean is: 890.7552485306211
MAS value for m=  euclidean is: 891.5909247864396
MAS value for m=  euclidean is: 893.1665329632293
MAS value for m=  euclidean is: 894.5608585802988
MAS value for m=  euclidean is: 896.0850968753499
MAS value for m=  euclidean is: 897.8190598638247
MAS value for m=  euclidean is: 899.8360770710134
MAS value for m=  euclidean is: 901.5836013633641
MAS value for m=  euclidean is: 903.2791067860326
MAS value for m=  euclidean is: 905.2138573084364
MAS value for m=  manhattan is: 941.1235291572161
MAS value for m=  manhattan is: 888.1587584873427
MAS value for m=  manhattan is: 875.8820255871602
MAS value for m=  manhattan is: 874.4827655942847
MAS value for m=  manhattan is: 872.866706738969
M

In [10]:
min(mas_val)

872.866706738969

In [11]:
best

[872.866706738969, 'manhattan', 9]

In [12]:
set_test = pd.merge(data, test_df[['id']],on = "id", how = "inner")
ids = set_test["id"]
set_test = set_test.drop(columns=['id'])

In [13]:
set_test = set_test.fillna(0)

In [14]:
set_test.head()

Unnamed: 0,days_to_today,año,antiguedad,centroscomercialescercanos,escuelascercanas,habitaciones,garages,banos,anio_x,Apartamento,...,Quintana Roo,San luis Potosí,Sinaloa,Sonora,Tabasco,Tamaulipas,Tlaxcala,Veracruz,Yucatán,Zacatecas
0,2304,2013,29.0,0.0,0.0,3.0,0.0,4.0,2013,0,...,0,0,0,0,0,0,0,0,0,0
1,1478,2015,0.0,0.0,0.0,1.0,1.0,1.0,2015,1,...,0,0,0,0,0,0,0,0,1,0
2,1625,2015,0.0,1.0,0.0,2.0,1.0,2.0,2015,1,...,0,0,0,0,0,0,0,0,0,0
3,1683,2015,2.0,0.0,0.0,2.0,2.0,2.0,2015,1,...,0,0,0,0,0,0,0,0,0,0
4,2278,2013,10.0,1.0,1.0,2.0,1.0,1.0,2013,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
model = KNeighborsRegressor(n_neighbors=9, metric="manhattan")
model.fit(features, labels)

preds_kaggel_knn = model.predict(set_test)

In [16]:
df_kaggel_knn = pd.DataFrame(preds_kaggel_knn)
df_kaggel_knn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 1 columns):
0    60000 non-null float64
dtypes: float64(1)
memory usage: 468.9 KB


In [17]:
df_kaggel_knn = pd.DataFrame(df_kaggel_knn)

In [18]:
df_kaggel_knn.head()

Unnamed: 0,0
0,3676667.0
1,1301667.0
2,2136667.0
3,1257778.0
4,625222.2


In [19]:
# df_kaggel_knn = df_kaggel_knn.to_frame()# df_kagg 
df_kaggel_knn["id"] = ids
df_kaggel_knn.rename(columns = {0: 'target'},inplace = True) 
df_kaggel_knn = df_kaggel_knn[['id','target']]

In [20]:
df_kaggel_knn.head()

Unnamed: 0,id,target
0,4941,3676667.0
1,51775,1301667.0
2,115253,2136667.0
3,299321,1257778.0
4,173570,625222.2


In [21]:
df_kaggel_knn.to_csv('ResultadosKNN-mn.csv',index=False)

In [22]:
df_kaggel_knn.shape

(60000, 2)

In [23]:
df_kaggel_knn.head()

Unnamed: 0,id,target
0,4941,3676667.0
1,51775,1301667.0
2,115253,2136667.0
3,299321,1257778.0
4,173570,625222.2
