In [1]:
#IMPORTS

%run ../datuslib.ipynb
%run Functions.ipynb
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll import scope
from sklearn import datasets
from sklearn import pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler,scale, normalize
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time

# graficos incrustados
%matplotlib inline

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
def plotCM(cm,classes=None,title='Matriz de Confusion'):
    """Render a confusion matrix with ``sns.heatmap`` and label it via pyplot.

    Parameters
    ----------
    cm : array-like
        Matrix passed straight to ``sns.heatmap``. The colour scale is pinned
        to [0, 1], so this presumably expects a normalised matrix -- TODO confirm.
    classes : sequence, optional
        Tick labels used on both axes. When given, every cell is also
        annotated (annotation font size 50); when omitted, a plain heatmap
        with default ticks and no annotations is drawn.
    title : str
        Title placed above the plot.
    """
    # Options shared by both variants: a fixed colour range.
    heatmap_options = {'vmin': 0., 'vmax': 1.}
    if classes is not None:
        # Labelled variant: class names on the ticks plus per-cell annotations.
        heatmap_options.update(
            xticklabels=classes,
            yticklabels=classes,
            annot=True,
            annot_kws={'size': 50},
        )
    sns.heatmap(cm, **heatmap_options)

    plt.title(title)
    plt.ylabel('Etiqueta Verdadera')
    plt.xlabel('Etiqueta Predecida')
    
def checkDummyModel(xTrain,yTrain,xTest,yTest):
    """Fit a ``most_frequent`` DummyClassifier baseline and report it on the test split.

    Prints the value returned by the classifier's ``score`` method and the
    ``f1_score`` (default averaging) of its predictions on ``xTest``.

    Parameters
    ----------
    xTrain, yTrain
        Data used to fit the baseline.
    xTest, yTest
        Data the baseline is scored on.

    Returns
    -------
    The baseline's predictions for ``xTest``.
    """
    baseline = DummyClassifier(strategy='most_frequent')
    baseline.fit(xTrain, yTrain)

    # Compute every metric first so nothing is printed if one of them fails.
    test_score = baseline.score(xTest, yTest)
    baseline_labels = baseline.predict(xTest)
    baseline_f1 = f1_score(yTest, baseline_labels)

    for label, value in (('Score:', test_score), ('f1Score:', baseline_f1)):
        print(label, value)
    return baseline_labels

def runRF(rfModel,x_train,y_train,x_test,y_test,to_predict,perfect_sub):
    """Fit an ensemble model, print micro-F1 on every split and return artefacts.

    The model is fitted on the training split and then scored (micro-averaged
    F1 on rounded predictions) against the training split, the test split and
    the submission set, together with a most-frequent-label baseline for the
    test split and for the submission set.

    Parameters
    ----------
    rfModel
        Unfitted estimator exposing ``fit``, ``predict``,
        ``feature_importances_`` and ``estimators_`` (presumably a
        scikit-learn random forest -- TODO confirm).
    x_train : DataFrame
        Training features; its ``columns`` provide the feature names.
    y_train
        Training labels.
    x_test, y_test
        Held-out features and labels.
    to_predict
        Features of the rows to be submitted.
    perfect_sub
        Object whose ``target`` attribute holds the reference labels for
        ``to_predict``.

    Returns
    -------
    list
        ``[submission predictions as an int Series, feature names,
        feature importances, one sample estimator, micro-F1 on the submission set]``.
    """
    # Train the model on the training split.
    rfModel.fit(x_train, y_train)

    # Numeric importance of every feature, alongside the feature names.
    importances = list(rfModel.feature_importances_)
    feature_list = list(x_train.columns)

    # Each split is predicted exactly once (the original predicted twice and
    # discarded the first result). Predictions are rounded before scoring.
    prediction_train_round = np.around(rfModel.predict(x_train))
    print ("F1 Score train: ", f1_score(y_train, prediction_train_round,average='micro'))

    prediction_test_round = np.around(rfModel.predict(x_test))
    print ("F1 Score test: ", f1_score(y_test, prediction_test_round,average='micro'))

    # Baseline: always answer with the most frequent test label.
    dummyPrediction = getDummyLabelToTest(y_test)
    print ("F1 Score dummy test: ", f1_score(y_test, dummyPrediction,average='micro'))

    # Submission predictions are additionally cast to int.
    prediction_to_predict_round = np.around(rfModel.predict(to_predict)).astype(int)
    f1real = f1_score(perfect_sub.target, prediction_to_predict_round,average='micro')
    print ("F1 Score to_predict: ",f1real)

    print ("F1 Score dummy perfect target: ",f1_score(perfect_sub.target, getDummyLabelToTest(perfect_sub.target),average='micro'))

    # Sample estimator for later export: index 5 as before, clamped to the last
    # one so ensembles with fewer than six estimators no longer raise IndexError.
    estimators = rfModel.estimators_
    sample_estimator = estimators[min(5, len(estimators) - 1)]

    return [pd.Series(prediction_to_predict_round),feature_list,importances,sample_estimator,f1real]

def graficarImportancia(feature_list,importances,title='Importancia de los Featuares'):
    """Plot feature importances as a vertical bar chart on a new 20x10 figure.

    Parameters
    ----------
    feature_list : sequence
        Labels for the x ticks, one per bar.
    importances : sequence
        Bar heights; its length determines the number of bars.
    title : str
        Plot title.
    """
    # Apply the plotting style before creating the figure.
    plt.style.use('fivethirtyeight')
    plt.figure(figsize=(20,10))

    # One x position per importance value.
    bar_positions = [position for position in range(len(importances))]

    # Bars, with the feature names written vertically under them.
    plt.bar(bar_positions, importances, orientation = 'vertical')
    plt.xticks(bar_positions, feature_list, rotation='vertical')

    # Axis labels and title.
    plt.ylabel('Importancia')
    plt.xlabel('Features')
    plt.title(title)
    
    
def tomarXfeaturesImportantes(cantidadFeateares,feature_list,importances):
    """Return the ``cantidadFeateares`` features with the highest importance.

    Parameters
    ----------
    cantidadFeateares : int
        Number of rows to keep.
    feature_list : sequence
        Feature names, aligned with ``importances``.
    importances : sequence
        Importance value of each feature.

    Returns
    -------
    DataFrame
        Columns ``Features`` and ``Importancia`` (in that order), holding the
        rows selected by ``nlargest`` on ``Importancia``.
    """
    # Importances become the first column, the feature names the second.
    ranking = pd.DataFrame(importances).assign(feature=feature_list)
    ranking.columns = ['Importancia', 'Features']

    # Put the name first, then keep only the top rows.
    ordered = ranking.loc[:, ['Features', 'Importancia']]
    return ordered.nlargest(cantidadFeateares, 'Importancia')

def exportarArbolEjemplo(estimador,lista_features,nombreSalida='tree.png'):
    """Export one estimator to a DOT file and render that file as a PNG.

    The intermediate DOT file is always written to ``tree.dot`` in the
    current working directory.

    Parameters
    ----------
    estimador
        Estimator handed to ``export_graphviz`` (presumably a fitted decision
        tree -- TODO confirm).
    lista_features : sequence
        Feature names passed as ``feature_names``.
    nombreSalida : str
        Path of the PNG file to write.
    """
    dot_path = 'tree.dot'
    export_graphviz(estimador, out_file = dot_path, feature_names = lista_features, rounded = True, precision = 1)

    # The DOT file must contain exactly one graph; unpacking fails otherwise.
    [tree_graph] = pydot.graph_from_dot_file(dot_path)

    # Save the graph to the PNG file.
    tree_graph.write_png(nombreSalida)
    
def getDummyLabelToTest(serieToTest):
    """Build a constant prediction made of the most frequent label.

    Parameters
    ----------
    serieToTest : array-like
        Labels to take the most frequent value from.

    Returns
    -------
    Series
        The most frequent value of ``serieToTest`` repeated
        ``len(serieToTest)`` times (every entry carries index label 0).
    """
    label_counts = pd.Series(serieToTest).value_counts()
    most_common_label = label_counts.idxmax()
    n_rows = len(serieToTest)

    single_label = pd.Series(most_common_label)
    return single_label.repeat(n_rows)

In [3]:
# Reference labels for the submission rows; later cells read its `target`
# column to score the predictions made on `to_predict`.
perfect_sub = pd.read_csv("../Data/perfect_submission.csv")

# CARGO LOS FEATURES

In [4]:
# Features of the rows to be submitted. The "* 1" is kept from the original;
# presumably it turns boolean columns into 0/1 integers -- TODO confirm.
to_predict = pd.read_csv('to_predict.csv') * 1
print (to_predict.shape)
to_predict.head()

(3263, 204)


Unnamed: 0,len,word_count,unique_word_count,url_count,mean_word_length,hashtag_count,mention_count,key_bit0,key_bit1,key_bit2,...,Tiene_deaths,Tiene_climate,Tiene_plague,Tiene_disappearance,Tiene_missing,Tiene_floods,Tiene_delug,contains_keyword,Tiene_key_impor,Tiene_key_no_impor
0,0.121429,6,6,0,4.833333,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.228571,9,9,0,6.222222,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.342857,19,19,0,4.105263,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.142857,4,4,0,9.25,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.160714,8,8,0,4.75,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Training features. The "* 1" is kept from the original; presumably it
# turns boolean columns into 0/1 integers -- TODO confirm.
x_train = pd.read_csv('x_train.csv') * 1
print (x_train.shape)
x_train.head()

(5709, 204)


Unnamed: 0,len,word_count,unique_word_count,url_count,mean_word_length,hashtag_count,mention_count,key_bit0,key_bit1,key_bit2,...,Tiene_deaths,Tiene_climate,Tiene_plague,Tiene_disappearance,Tiene_missing,Tiene_floods,Tiene_delug,contains_keyword,Tiene_key_impor,Tiene_key_no_impor
0,0.485714,22,21,0,5.227273,0,2,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.457143,17,17,1,6.588235,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0.489286,29,29,0,3.758621,0,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
3,0.189286,12,11,0,3.5,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
4,0.189286,6,6,0,8.0,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
# Held-out test features. The "* 1" is kept from the original; presumably it
# turns boolean columns into 0/1 integers -- TODO confirm.
x_test = pd.read_csv('x_test.csv') * 1
print (x_test.shape)
x_test.head()

(1904, 204)


Unnamed: 0,len,word_count,unique_word_count,url_count,mean_word_length,hashtag_count,mention_count,key_bit0,key_bit1,key_bit2,...,Tiene_deaths,Tiene_climate,Tiene_plague,Tiene_disappearance,Tiene_missing,Tiene_floods,Tiene_delug,contains_keyword,Tiene_key_impor,Tiene_key_no_impor
0,0.235714,11,11,0,5.090909,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,0.425,21,20,0,4.714286,1,4,0,1,0,...,0,0,0,0,0,0,1,1,0,0
2,0.446429,15,15,1,7.4,0,2,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.407143,21,20,0,4.47619,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
4,0.375,17,17,0,5.235294,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
# Training labels: only the `target` column of the CSV is kept.
y_train = pd.read_csv('y_train.csv').target
print (y_train.shape)
y_train.head()

(5709,)


0    0
1    0
2    0
3    1
4    1
Name: target, dtype: int64

In [8]:
# Held-out test labels: only the `target` column of the CSV is kept.
y_test = pd.read_csv('y_test.csv').target
print (y_test.shape)
y_test.head()

(1904,)


0    1
1    0
2    1
3    0
4    0
Name: target, dtype: int64

In [9]:
# Identifiers of the rows to be submitted: only the `id` column is kept.
ids = pd.read_csv('ids.csv').id
print (ids.shape)
ids.head()

(3263,)


0     0
1     2
2     3
3     9
4    11
Name: id, dtype: int64

# COMIENZO CON EL ALGORITMO

In [10]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes model fitted on the training features.
model = GaussianNB()
model.fit(x_train,y_train)

# Score on the training split. This line used to be printed under the label
# "Test", which made it indistinguishable from the real test score below.
predicted_train = model.predict(x_train)
print ("F1_score Train:", f1_score(y_train,predicted_train,average='micro'))

# Score on the held-out test split.
predicted_test = model.predict(x_test)
print ("F1_score Test:", f1_score(y_test,predicted_test,average='micro'))

# `predicted` keeps its name for the submission predictions because the
# submission cell further down reads it.
predicted = model.predict(to_predict)
print ("F1_score Submit:", f1_score(perfect_sub.target,predicted,average='micro'))

F1_score Test: 0.721492380451918
F1_score Test: 0.7069327731092437
F1_score Submit: 0.6950658902850138


# SUBMIT DE MODELO

In [11]:
# Submit the baseline model.
# NOTE(review): submission_output is defined outside this section; presumably
# it pairs `ids` with `predicted` and writes them to "NB.csv" -- TODO confirm.
result = submission_output(ids, predicted,"NB.csv")
result.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,0
3,9,1
4,11,1
