In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Display settings for this notebook: up to 100 columns, floats with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)

In [2]:
# Importamos utilidades y modelos de sklearn
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
from sklearn.impute import SimpleImputer 
# Imputer configured to treat NaN as missing and fill with the 'mean' strategy.
# NOTE(review): not used by any cell visible here — confirm it is needed.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# Importamos datos de entrenamiento
Con este set entreno el modelo

In [3]:
# Load the labelled training set, pinning the dtype of every column.
train = pd.read_csv(
    './train.csv',
    dtype=dict(
        id=np.int32,
        keyword=str,
        location=str,
        text=str,
        target=np.int32,
    ),
)

In [4]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


# Importamos los datos de test

Este set de datos no tiene la columna 'target', que es justamente lo que quiero predecir: si es o no real.

In [5]:
# Load the unlabelled test set. test.csv has no 'target' column (its preview
# shows only id, keyword, location and text), so no dtype is declared for it.
test = pd.read_csv(
    './test.csv',
    dtype={
        "id": np.int32,
        "keyword": str,
        "location": str,
        "text": str,
    },
)

In [6]:
# Preview the first five test rows.
test.head(n=5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# Preprocesamiento de los datos

In [7]:
# y is the label column; X is every remaining column of the training frame.
y = train['target']
X = train.drop(columns=['target'])

In [8]:
# Hold out 25% of the labelled rows for validation; the seed is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)
print(f"Train shapes: X={X_train.shape} y={y_train.shape}")
print(f"Test  shapes: X={X_test.shape}  y={y_test.shape}")

Train shapes: X=(5709, 4) y=(5709,)
Test  shapes: X=(1904, 4)  y=(1904,)


# Métrica de evaluación

In [29]:
def RMSLE(actual, pred):
    """Root mean squared logarithmic error between two sequences of values.

    Computes sqrt(mean((log(1 + actual) - log(1 + pred)) ** 2)).

    Args:
        actual: array-like of true values (list, ndarray or pandas Series).
        pred: array-like of predicted values, same length as ``actual``.

    Returns:
        The error as a float.
    """
    # Convert to plain arrays so values are compared by position. With two
    # pandas Series, arithmetic would align on index labels and yield NaN for
    # labels that do not match (e.g. after a shuffled split).
    actual_arr = np.asarray(actual, dtype=float)
    pred_arr = np.asarray(pred, dtype=float)
    # log1p(x) is log(1 + x), computed accurately for small x.
    diferencia = np.log1p(actual_arr) - np.log1p(pred_arr)
    return np.sqrt(np.mean(diferencia ** 2))

# Exportar prediccion

In [30]:
def EXPORTAR(modelo, nombre, datos=None):
    """Predict with a fitted model and write the predictions to a CSV file.

    The file has two columns, 'id' and 'target', with a header row and no
    index column.

    Args:
        modelo: fitted estimator exposing ``predict``.
        nombre: path of the CSV file to write.
        datos: DataFrame to predict on; it must contain an 'id' column.
            Defaults to the notebook-level ``test`` frame.
    """
    if datos is None:
        datos = test
    # If the estimator recorded the column names it was fitted on, pass it
    # exactly those columns; otherwise pass the frame unchanged.
    columnas = getattr(modelo, 'feature_names_in_', None)
    entrada = datos if columnas is None else datos[list(columnas)]
    pred = modelo.predict(entrada)
    # Build a new frame with the format (id, target), where target is the
    # predicted value.
    res = datos[['id']].copy()
    res['target'] = pred
    # Write the prediction as CSV, with the header row and without the index.
    res.to_csv(nombre, index=False, header=True, encoding='utf-8')
    

# Errores

In [31]:
def ERRORES(modelo, columnas=None):
    """Print the RMSLE of a fitted model on the training and hold-out splits.

    Both scores are computed on the same set of feature columns, so they can
    be compared with each other.

    Args:
        modelo: fitted estimator exposing ``predict``.
        columnas: optional list of column names to feed the model. When
            omitted, the names the estimator recorded at fit time
            (``feature_names_in_``) are used if present; otherwise the full
            ``X_train`` / ``X_test`` frames are passed unchanged.
    """
    if columnas is None:
        columnas = getattr(modelo, 'feature_names_in_', None)
    if columnas is None:
        entrada_train, entrada_test = X_train, X_test
    else:
        columnas = list(columnas)
        entrada_train, entrada_test = X_train[columnas], X_test[columnas]

    # Error against the hold-out split
    rmsle = RMSLE(y_test, modelo.predict(entrada_test))
    # Error against the training split
    rmsle_train = RMSLE(y_train, modelo.predict(entrada_train))

    # Report both errors
    print(f"RMSLE Error (train): {rmsle_train:.5f}")
    print(f"RMSLE Error (Test): {rmsle:.5f}")

<p style="color:red; font-size:20px">Modelo 1 : Constante </p>

In [36]:
# Baseline 1: a regressor configured to always predict the constant 0,
# fitted on the 'id' column of the training split.
dummy_constant = DummyRegressor(constant=0, strategy='constant')
dummy_constant.fit(X_train[['id']], y_train)

# Report train / hold-out errors
ERRORES(dummy_constant)

RMSLE Error (train): 0.45482
RMSLE Error (Test): 0.45294


In [37]:
# Export the prediction as CSV. A forward slash is used (as in './train.csv')
# so the path also resolves to the current directory outside Windows.
EXPORTAR(dummy_constant, './Dummy_constant_pred.csv')

<p style="color:red; font-size:20px">Modelo 2 : Promedio </p>

In [38]:
# Baseline 2: a regressor using the 'mean' strategy
dummy_mean = DummyRegressor(strategy='mean')

# Fit on the same single 'id' column as the constant baseline, rather than on
# the raw text columns, so both baselines are trained on identical inputs.
dummy_mean.fit(X_train[['id']], y_train)

# Report train / hold-out errors
ERRORES(dummy_mean)

RMSLE Error (train): 0.34835
RMSLE Error (Test): 0.34844


In [39]:
# Export the prediction as CSV. A forward slash is used (as in './train.csv')
# so the path also resolves to the current directory outside Windows.
EXPORTAR(dummy_mean, './Dummy_promedio_pred.csv')