## Base

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset and keep it as a plain NumPy array for the helpers below.
# 'Unnamed: 0' is presumably a leftover index column from an earlier export -- TODO confirm.
# `axis` is passed by keyword: recent pandas releases no longer accept it positionally.
tabla = np.array(pd.read_csv("economia7.csv").drop('Unnamed: 0', axis=1))
pd.DataFrame(tabla).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,3358406000.0,3637845000.0,1347350000.0,1239080000.0,108.320559,0.071111,14517.635341,39348.639719,42622.666667,15786.17284,-1.046262,25735.714754,24723.111892,2196543000.0,2110118000.0
1,2954188000.0,3257308000.0,1206410000.0,1089944000.0,110.260684,-12.036015,12629.713786,34231.611819,37744.00927,13979.262693,-13.004332,22388.956865,21671.375749,1932167000.0,1870240000.0
2,2743171000.0,3065955000.0,1135539000.0,1012090000.0,111.766847,-7.142991,11602.142227,31446.47897,35146.738046,13017.310387,-8.136143,20567.359344,20151.309866,1794152000.0,1757859000.0
3,2693974000.0,3050780000.0,1129918000.0,993938500.0,113.244595,-1.793436,11275.280156,30560.551094,34608.172248,12817.841573,-2.817256,19987.924139,19987.924139,1761975000.0,1761975000.0
4,2802259000.0,3252725000.0,1204713000.0,1033890000.0,116.075112,4.019542,11607.745344,31461.665675,36519.163794,13525.61622,2.94862,20577.292107,20946.700835,1832799000.0,1865702000.0


## Train Test Split

In [17]:
def splitter(tabla, y_indicator, test_size=0.2, random_state=None):
    """Split a 2-D array into train/test features and target.

    Parameters
    ----------
    tabla : 2-D NumPy array holding the features and the target column.
    y_indicator : index of the column used as the target ``y``.
    test_size : proportion of rows held out for testing (default 0.2,
        the value that was previously hard-coded).
    random_state : seed forwarded to ``train_test_split``. The default
        ``None`` keeps the original unseeded shuffling; pass an int to make
        the split repeatable across runs.

    Returns
    -------
    (X_train, X_test, y_train, y_test)
    """
    from sklearn.model_selection import train_test_split
    # X is every column except the target; y is the target column itself.
    X = np.delete(tabla, y_indicator, axis=1)
    y = tabla[:, y_indicator]
    # Split X and y into their train and test parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

## Métricas de error
### R2 Score - Coeficiente de Determinación
El mejor valor posible es 1.0

In [18]:
from sklearn.metrics import r2_score
def r2(y_true, y_predict):
    """Return the R2 score of ``y_predict`` against ``y_true``.

    Delegates directly to ``sklearn.metrics.r2_score``.
    """
    score = r2_score(y_true, y_predict)
    return score

## Decision Tree Regressor

In [19]:
def DTR(X_train, X_test, y_train, y_test, random_state=None):
    """Fit a decision-tree regressor and print its R2 score on the test split.

    Parameters
    ----------
    X_train, y_train : data the estimator is fitted on.
    X_test, y_test : data the estimator is evaluated on.
    random_state : seed forwarded to ``DecisionTreeRegressor``. The default
        ``None`` keeps the original unseeded behaviour; pass an int for
        repeatable results.

    Returns
    -------
    (y_test, y_predict) : the unchanged test targets and the predictions
        made for ``X_test``.
    """
    from sklearn.tree import DecisionTreeRegressor
    estimator = DecisionTreeRegressor(random_state=random_state)
    estimator.fit(X_train, y_train)
    y_predict = estimator.predict(X_test)
    # One pre-formatted argument prints the same text with the Python 2
    # print statement and the Python 3 print function.
    print("R2 score: %s" % r2(y_test, y_predict))
    return y_test, y_predict

In [29]:
# Baseline run on the raw table, with column 0 as the target.
X_train, X_test, y_train, y_test = splitter(tabla, 0)
y_test, y_predict = DTR(X_train, X_test, y_train, y_test)
# Show true values and predictions side by side.
pd.DataFrame({'Test': y_test, 'Predict': y_predict}, columns=['Test', 'Predict'])

R2 score: 0.990054374212


Unnamed: 0,Test,Predict
0,1194000000.0,1186000000.0
1,0.0,0.0
2,362282800000.0,431273000000.0
3,23970100000.0,25372800000.0
4,22937810000000.0,23933860000000.0
5,13279400000000.0,12763880000000.0
6,88765000000.0,88759000000.0
7,11675440000000.0,12282460000000.0
8,9451700000.0,25372800000.0
9,3293180000.0,3107000000.0


## Modificamos los valores nulos

In [20]:
def imputador(X):
    """Return a copy of ``X`` where zero entries are replaced by column means.

    Zeros are treated as missing values; each one is replaced by the mean of
    the non-missing values in the same column.

    Parameters
    ----------
    X : 2-D array-like of numeric values.

    Returns
    -------
    The imputed array.
    """
    try:
        # Current scikit-learn API; it always imputes column-wise.
        from sklearn.impute import SimpleImputer
        impute = SimpleImputer(missing_values=0, strategy='mean')
    except ImportError:
        # Older scikit-learn releases only ship the legacy Imputer class.
        from sklearn.preprocessing import Imputer
        impute = Imputer(missing_values=0, strategy='mean', axis=0)
    X_imputado = impute.fit_transform(X)
    return X_imputado

In [21]:
# Impute zeros across the whole table, target column included.
# NOTE(review): this runs before the train/test split, so the column means
# also use rows that later end up in the test set.
tabla_imputada = imputador(tabla)

In [22]:
# Same experiment as the baseline, now on the imputed table (target = column 0).
X_train, X_test, y_train, y_test = splitter(tabla_imputada, 0)
y_test, y_predict = DTR(X_train, X_test, y_train, y_test)
# Preview the first rows of true values next to predictions.
pd.DataFrame({'Test': y_test, 'Predict': y_predict}, columns=['Test', 'Predict']).head()

R2 score: 0.999874807582


Unnamed: 0,Test,Predict
0,7893019000.0,7898250000.0
1,844508100000.0,837791000000.0
2,318637000000.0,352584000000.0
3,25372800000.0,23970100000.0
4,16729620000000.0,16729620000000.0


## Normalizamos los datos

In [23]:
def normalizar_datos(X):
    """Fit a ``StandardScaler`` on ``X`` and return the transformed data."""
    from sklearn.preprocessing import StandardScaler
    return StandardScaler().fit_transform(X)

In [24]:
# Scale every column of the imputed table, target column included, so the
# values compared afterwards are in scaled units rather than original ones.
tabla_normalizada = normalizar_datos(tabla_imputada)

In [26]:
# Repeat the experiment on the scaled table (target = column 0).
X_train, X_test, y_train, y_test = splitter(tabla_normalizada, 0)
y_test, y_predict = DTR(X_train, X_test, y_train, y_test)
# Preview the first rows of true values next to predictions.
pd.DataFrame({'Test': y_test, 'Predict': y_predict}, columns=['Test', 'Predict']).head()

R2 score: 0.999804160537


Unnamed: 0,Test,Predict
0,-0.2469244,-0.2468445
1,-0.2345704,-0.2232004
2,-0.2423358,-0.2417016
3,-0.2466663,-0.2468445
4,-2.884117e-17,-2.884117e-17
