In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestRegressor

%matplotlib inline

In [2]:
train_df = pd.read_csv("train.csv", low_memory=False)
test_df = pd.read_csv("test.csv", low_memory=False)
data = pd.read_csv("data_set.csv", low_memory = False)

In [3]:
train_df = train_df[['id','precio']]
train_df['precio'].value_counts()

1500000.0    3041
2500000.0    3036
1200000.0    2909
3500000.0    2726
1300000.0    2480
             ... 
2704600.0       1
1352250.0       1
938260.0        1
2704460.0       1
2327000.0       1
Name: precio, Length: 15520, dtype: int64

In [4]:
features = pd.merge(train_df, data, on='id', how='inner')
features = features.fillna(0)
features['precio'].value_counts()

1500000.0    3041
2500000.0    3036
1200000.0    2909
3500000.0    2726
1300000.0    2480
             ... 
2704600.0       1
1352250.0       1
938260.0        1
2704460.0       1
2327000.0       1
Name: precio, Length: 15520, dtype: int64

In [5]:
labels = features['precio']
features = features.drop(['id','precio'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.25)
print("Train: ",len(x_train),"Test: ",len(x_test))

Train:  180000 Test:  60000


In [9]:
def grid_search():
    n_estimators = [10,20,30,40,50,60,70,80,90,100]
    min_samples_split = [10,20,30,40,50,60,70,80,90,100]
    n_jobs = [-10,-5,5,10,15,20]
    maximos = []
    max_score = 0
    for n_est in n_estimators:
        for min_sam in min_samples_split:
            for n_j in n_jobs:
                c=RandomForestRegressor(n_estimators=n_est,min_samples_split=min_sam,n_jobs=n_j,random_state=0)
                dt=c.fit(x_train,y_train)
                score=c.score(x_test,y_test)*100
                if max_score < score:
                    maximos = [n_est,min_sam,n_j]
                    max_score = score
                print("n_estimators : "+ str(n_est)+"; min_samples_split: "+str(min_sam)+"; n_job: "+str(n_j)+"; SCORE: "+ str(score))
    print("El mejor score fue: "+str(max_score))
    return maximos

In [10]:
mejores_hiperparmetros = grid_search()

n_estimators : 10; min_samples_split: 10; n_job: -10; SCORE: 75.88355543857811
n_estimators : 10; min_samples_split: 10; n_job: -5; SCORE: 75.88355543857811
n_estimators : 10; min_samples_split: 10; n_job: 5; SCORE: 75.88355543857811
n_estimators : 10; min_samples_split: 10; n_job: 10; SCORE: 75.88355543857811
n_estimators : 10; min_samples_split: 10; n_job: 15; SCORE: 75.88355543857811
n_estimators : 10; min_samples_split: 10; n_job: 20; SCORE: 75.88355543857811
n_estimators : 10; min_samples_split: 20; n_job: -10; SCORE: 75.68125945402274
n_estimators : 10; min_samples_split: 20; n_job: -5; SCORE: 75.68125945402274
n_estimators : 10; min_samples_split: 20; n_job: 5; SCORE: 75.68125945402275
n_estimators : 10; min_samples_split: 20; n_job: 10; SCORE: 75.68125945402274
n_estimators : 10; min_samples_split: 20; n_job: 15; SCORE: 75.68125945402274
n_estimators : 10; min_samples_split: 20; n_job: 20; SCORE: 75.68125945402275
n_estimators : 10; min_samples_split: 30; n_job: -10; SCORE: 75.

n_estimators : 20; min_samples_split: 80; n_job: 10; SCORE: 74.32381746763824
n_estimators : 20; min_samples_split: 80; n_job: 15; SCORE: 74.32381746763824
n_estimators : 20; min_samples_split: 80; n_job: 20; SCORE: 74.32381746763824
n_estimators : 20; min_samples_split: 90; n_job: -10; SCORE: 74.00784852482956
n_estimators : 20; min_samples_split: 90; n_job: -5; SCORE: 74.00784852482956
n_estimators : 20; min_samples_split: 90; n_job: 5; SCORE: 74.00784852482956
n_estimators : 20; min_samples_split: 90; n_job: 10; SCORE: 74.00784852482956
n_estimators : 20; min_samples_split: 90; n_job: 15; SCORE: 74.00784852482956
n_estimators : 20; min_samples_split: 90; n_job: 20; SCORE: 74.00784852482956
n_estimators : 20; min_samples_split: 100; n_job: -10; SCORE: 73.76057670910747
n_estimators : 20; min_samples_split: 100; n_job: -5; SCORE: 73.76057670910747
n_estimators : 20; min_samples_split: 100; n_job: 5; SCORE: 73.76057670910747
n_estimators : 20; min_samples_split: 100; n_job: 10; SCORE: 

n_estimators : 40; min_samples_split: 60; n_job: -10; SCORE: 75.13343848223056
n_estimators : 40; min_samples_split: 60; n_job: -5; SCORE: 75.13343848223056
n_estimators : 40; min_samples_split: 60; n_job: 5; SCORE: 75.13343848223056
n_estimators : 40; min_samples_split: 60; n_job: 10; SCORE: 75.13343848223056
n_estimators : 40; min_samples_split: 60; n_job: 15; SCORE: 75.13343848223056
n_estimators : 40; min_samples_split: 60; n_job: 20; SCORE: 75.13343848223056
n_estimators : 40; min_samples_split: 70; n_job: -10; SCORE: 74.78749518665097
n_estimators : 40; min_samples_split: 70; n_job: -5; SCORE: 74.78749518665097
n_estimators : 40; min_samples_split: 70; n_job: 5; SCORE: 74.78749518665097
n_estimators : 40; min_samples_split: 70; n_job: 10; SCORE: 74.78749518665097
n_estimators : 40; min_samples_split: 70; n_job: 15; SCORE: 74.78749518665097
n_estimators : 40; min_samples_split: 70; n_job: 20; SCORE: 74.78749518665097
n_estimators : 40; min_samples_split: 80; n_job: -10; SCORE: 74.

n_estimators : 60; min_samples_split: 30; n_job: 10; SCORE: 76.39855259306978
n_estimators : 60; min_samples_split: 30; n_job: 15; SCORE: 76.39855259306978
n_estimators : 60; min_samples_split: 30; n_job: 20; SCORE: 76.39855259306978
n_estimators : 60; min_samples_split: 40; n_job: -10; SCORE: 75.94293262551929
n_estimators : 60; min_samples_split: 40; n_job: -5; SCORE: 75.94293262551929
n_estimators : 60; min_samples_split: 40; n_job: 5; SCORE: 75.94293262551929
n_estimators : 60; min_samples_split: 40; n_job: 10; SCORE: 75.94293262551929
n_estimators : 60; min_samples_split: 40; n_job: 15; SCORE: 75.94293262551929
n_estimators : 60; min_samples_split: 40; n_job: 20; SCORE: 75.94293262551929
n_estimators : 60; min_samples_split: 50; n_job: -10; SCORE: 75.547348635456
n_estimators : 60; min_samples_split: 50; n_job: -5; SCORE: 75.547348635456
n_estimators : 60; min_samples_split: 50; n_job: 5; SCORE: 75.54734863545599
n_estimators : 60; min_samples_split: 50; n_job: 10; SCORE: 75.54734

n_estimators : 80; min_samples_split: 10; n_job: -5; SCORE: 77.54581287493768
n_estimators : 80; min_samples_split: 10; n_job: 5; SCORE: 77.54581287493768
n_estimators : 80; min_samples_split: 10; n_job: 10; SCORE: 77.54581287493768
n_estimators : 80; min_samples_split: 10; n_job: 15; SCORE: 77.54581287493768
n_estimators : 80; min_samples_split: 10; n_job: 20; SCORE: 77.54581287493768
n_estimators : 80; min_samples_split: 20; n_job: -10; SCORE: 77.01637656504955
n_estimators : 80; min_samples_split: 20; n_job: -5; SCORE: 77.01637656504955
n_estimators : 80; min_samples_split: 20; n_job: 5; SCORE: 77.01637656504955
n_estimators : 80; min_samples_split: 20; n_job: 10; SCORE: 77.01637656504955
n_estimators : 80; min_samples_split: 20; n_job: 15; SCORE: 77.01637656504955
n_estimators : 80; min_samples_split: 20; n_job: 20; SCORE: 77.01637656504955
n_estimators : 80; min_samples_split: 30; n_job: -10; SCORE: 76.47876454335609
n_estimators : 80; min_samples_split: 30; n_job: -5; SCORE: 76.4

n_estimators : 90; min_samples_split: 80; n_job: 20; SCORE: 74.5473750423428
n_estimators : 90; min_samples_split: 90; n_job: -10; SCORE: 74.2255846748596
n_estimators : 90; min_samples_split: 90; n_job: -5; SCORE: 74.2255846748596
n_estimators : 90; min_samples_split: 90; n_job: 5; SCORE: 74.2255846748596
n_estimators : 90; min_samples_split: 90; n_job: 10; SCORE: 74.2255846748596
n_estimators : 90; min_samples_split: 90; n_job: 15; SCORE: 74.2255846748596
n_estimators : 90; min_samples_split: 90; n_job: 20; SCORE: 74.2255846748596
n_estimators : 90; min_samples_split: 100; n_job: -10; SCORE: 73.93149918784012
n_estimators : 90; min_samples_split: 100; n_job: -5; SCORE: 73.93149918784012
n_estimators : 90; min_samples_split: 100; n_job: 5; SCORE: 73.93149918784012
n_estimators : 90; min_samples_split: 100; n_job: 10; SCORE: 73.93149918784012
n_estimators : 90; min_samples_split: 100; n_job: 15; SCORE: 73.93149918784012
n_estimators : 90; min_samples_split: 100; n_job: 20; SCORE: 73.93

In [11]:
mejores_hiperparmetros


[100, 10, -10]

In [13]:
c=RandomForestRegressor(n_estimators=mejores_hiperparmetros[0],
                         min_samples_split=mejores_hiperparmetros[1],
                         n_jobs=mejores_hiperparmetros[2],random_state=0)
dt=c.fit(x_train,y_train)

In [14]:
c.score(x_test,y_test)*100


77.58980171074016

In [15]:
set_test = pd.merge(data, test_df[['id']],on = "id", how = "inner")
ids = set_test["id"]
set_test = set_test.drop(columns=['id'])

In [16]:
set_test = set_test.fillna(0)

In [17]:
preds_kaggel = c.predict(set_test)

In [18]:
df_kaggel = pd.DataFrame(preds_kaggel)
df_kaggel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 1 columns):
0    60000 non-null float64
dtypes: float64(1)
memory usage: 468.9 KB


In [19]:
df_kaggel = pd.DataFrame(df_kaggel)
df_kaggel.head()

Unnamed: 0,0
0,6376671.0
1,1039948.0
2,2725980.0
3,1302461.0
4,623164.5


In [20]:
# df_kaggel_knn = df_kaggel_knn.to_frame()# df_kagg 
df_kaggel["id"] = ids
df_kaggel.rename(columns = {0: 'target'},inplace = True) 
df_kaggel = df_kaggel[['id','target']]

In [21]:
df_kaggel.to_csv('ResultadosRandomForest.csv',index=False)

In [22]:
df_kaggel.shape

(60000, 2)

In [23]:
importances = list(c.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(features, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: metroscubiertos      Importance: 0.41
Variable: metrostotales        Importance: 0.09
Variable: Distrito Federal     Importance: 0.07
Variable: Apartamento          Importance: 0.06
Variable: idzona               Importance: 0.06
Variable: days_to_today        Importance: 0.05
Variable: banos                Importance: 0.05
Variable: antiguedad           Importance: 0.03
Variable: habitaciones         Importance: 0.03
Variable: lat                  Importance: 0.02
Variable: lng                  Importance: 0.02
Variable: garages              Importance: 0.01
Variable: Casa                 Importance: 0.01
Variable: Edificio             Importance: 0.01
Variable: Terreno              Importance: 0.01
Variable: Edo. de México       Importance: 0.01
Variable: Jalisco              Importance: 0.01
Variable: Nuevo León           Importance: 0.01
Variable: año                  Importance: 0.0
Variable: centroscomercialescercanos Importance: 0.0
Variable: escuelascercanas     Impor