In [4]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import model_selection

from sklearn.datasets import fetch_20newsgroups
from keras.layers import  Dropout, Dense
from keras.models import Sequential

##Importamos las matrices a utilizar

In [5]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from google.colab import files

# Authenticate and create the PyDrive client
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [6]:
id='15Q0YL820piC66xf3q-8C-El4THe39PFW'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('train.csv')

train = pd.read_csv('train.csv')

id='1jt6Uc69MmmbVY1DJdr38rrnDU--X6sEB'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('test.csv')

test = pd.read_csv('test.csv')

In [7]:
id= '1GhLSiSasF4IIRG98Ulx9xG2V2-CAzHJr'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('ft_features_numericos_test.csv')

id= '1kgVMHuLTXNNzjdMCX96OBoVfP0Ou4tec'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('ft_features_numericos_train.csv')

id= '1fqX2OqscRVJfiNDiNHtNnA12C9NgBiIQ'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('matriz_test_ft_solo_text.csv')

id= '1oBA_jERGip2orWLrFWm5DF3jrq7w5dAL'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('matriz_train_ft_solo_text.csv')

id= '1krnFYY5tqXKY3Sbqn_VGZiM8S2OUBmJz'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('matriz_test_ft_text_keyword.csv')

id= '1vFYmS-7cXFH--0rKV-cAi1E35KD7qQbh'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('matriz_train_ft_text_keyword.csv')

In [8]:
matriz_test_ft_solo_text = pd.read_csv('matriz_test_ft_solo_text.csv')
matriz_train_ft_solo_text = pd.read_csv('matriz_train_ft_solo_text.csv')

matriz_test_ft_text_keyword = pd.read_csv('matriz_test_ft_text_keyword.csv')
matriz_train_ft_text_keyword = pd.read_csv('matriz_train_ft_text_keyword.csv')

ft_features_numericos_test = pd.read_csv('ft_features_numericos_test.csv')
ft_features_numericos_train = pd.read_csv('ft_features_numericos_train.csv')

In [9]:
def cross_val(model, x_train, y_train):
  score_cross_val = model_selection.cross_val_score(model, x_train, y_train, cv=5)
  print(score_cross_val.mean())

##SPLIT DE LAS MATRICES

In [10]:
y = train.target

In [11]:
x_train_text, x_validation_text, y_train_text, y_validation_text = train_test_split(matriz_train_ft_solo_text, y, test_size=0.3, stratify=y)
x_train_text_keyword, x_validation_text_keyword, y_train_text_keyword, y_validation_text_keyword = train_test_split(matriz_train_ft_text_keyword, y, test_size=0.3, stratify=y)
x_train_numericos, x_validation_numericos, y_train_numericos, y_validation_numericos = train_test_split(ft_features_numericos_train, y, test_size=0.3, stratify=y)

##KNN

Funcion que aplica KNN con GridSearch y CV para encontrar los mejores hiperparametros para cada matriz probada.

In [None]:
def knn_con_gridsearch(x_train, y_train, x_validation, y_validation):

  knn = KNeighborsClassifier()
  params_knn = {'n_neighbors': np.arange(1, 25), 'weights' : ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan', 'minkowski']}
  knn_gs = GridSearchCV(knn, params_knn, cv=5)
  knn_gs.fit(x_train, y_train)
  knn_best = knn_gs.best_estimator_
  
  print(knn_gs.best_params_)
  print('knn score: {}'.format(knn_best.score(x_validation, y_validation)))
  
  return knn_best

Corremos el algoritmo con cada matriz y comparamos los resultados.

In [None]:
knn_text = knn_con_gridsearch(x_train_text, y_train_text, x_validation_text, y_validation_text)

{'metric': 'manhattan', 'n_neighbors': 21, 'weights': 'distance'}
knn score: 0.7473730297723292


In [None]:
cross_val(knn_text, x_train_text, y_train_text)

0.7310946101877052


In [None]:
knn_text_keyword = knn_con_gridsearch(x_train_text_keyword, y_train_text_keyword, x_validation_text_keyword, y_validation_text_keyword)

{'metric': 'manhattan', 'n_neighbors': 22, 'weights': 'distance'}
knn score: 0.7342381786339754


In [None]:
cross_val(knn_text_keyword, x_train_text_keyword, y_train_text_keyword)

0.7350354534964635


In [None]:
knn_numericos = knn_con_gridsearch(x_train_numericos,y_train_numericos,  x_validation_numericos, y_validation_numericos)

{'metric': 'manhattan', 'n_neighbors': 24, 'weights': 'distance'}
knn score: 0.6799474605954466


In [None]:
cross_val(knn_numericos, x_train_text, y_train_text)

0.7280925578486555


Realizamos la prediccion con el modelo que haya dado mayor score

In [None]:
prediction_knn = knn_text.predict(matriz_test_ft_solo_text)
submission_knn = pd.DataFrame(data={'id':test['id'], 'target': prediction_knn})
submission_knn.to_csv('ft_knn.csv', index=False)
files.download('ft_knn.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#RED NEURONAL PROFUNDA

In [None]:
def DNN_model(shape, nClasses, dropout=0.5):
    model = Sequential()
    node = 512
    nLayers = 4
    model.add(Dense(node,input_dim=shape,activation='relu'))
    model.add(Dropout(dropout))
    for i in range(0,nLayers):
        model.add(Dense(node,input_dim=node,activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(nClasses, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

Corremos el algoritmo con cada matriz y comparamos los resultados.

In [None]:
model_DNN_text = DNN_model(matriz_train_ft_solo_text.shape[1], 2)
model_DNN_text.fit(matriz_train_ft_solo_text, train.target, epochs=20, batch_size=128, verbose=2)

Epoch 1/20
60/60 - 1s - loss: 0.6371 - accuracy: 0.6344
Epoch 2/20
60/60 - 1s - loss: 0.5367 - accuracy: 0.7496
Epoch 3/20
60/60 - 1s - loss: 0.4984 - accuracy: 0.7655
Epoch 4/20
60/60 - 1s - loss: 0.4681 - accuracy: 0.7877
Epoch 5/20
60/60 - 1s - loss: 0.4684 - accuracy: 0.7883
Epoch 6/20
60/60 - 1s - loss: 0.4537 - accuracy: 0.7988
Epoch 7/20
60/60 - 1s - loss: 0.4531 - accuracy: 0.8017
Epoch 8/20
60/60 - 1s - loss: 0.4418 - accuracy: 0.8031
Epoch 9/20
60/60 - 1s - loss: 0.4355 - accuracy: 0.8084
Epoch 10/20
60/60 - 1s - loss: 0.4381 - accuracy: 0.8081
Epoch 11/20
60/60 - 1s - loss: 0.4288 - accuracy: 0.8118
Epoch 12/20
60/60 - 1s - loss: 0.4278 - accuracy: 0.8130
Epoch 13/20
60/60 - 1s - loss: 0.4243 - accuracy: 0.8168
Epoch 14/20
60/60 - 1s - loss: 0.4305 - accuracy: 0.8110
Epoch 15/20
60/60 - 1s - loss: 0.4113 - accuracy: 0.8210
Epoch 16/20
60/60 - 1s - loss: 0.4171 - accuracy: 0.8173
Epoch 17/20
60/60 - 1s - loss: 0.4068 - accuracy: 0.8223
Epoch 18/20
60/60 - 1s - loss: 0.4051 - 

<tensorflow.python.keras.callbacks.History at 0x7f816c358550>

In [None]:
model_DNN_text_keyword = DNN_model(matriz_train_ft_text_keyword.shape[1], 2)
model_DNN_text_keyword.fit(matriz_train_ft_text_keyword, train.target, epochs=20, batch_size=128, verbose=2)

Epoch 1/20
60/60 - 1s - loss: 0.6497 - accuracy: 0.6115
Epoch 2/20
60/60 - 1s - loss: 0.5670 - accuracy: 0.7205
Epoch 3/20
60/60 - 1s - loss: 0.5097 - accuracy: 0.7674
Epoch 4/20
60/60 - 1s - loss: 0.4808 - accuracy: 0.7825
Epoch 5/20
60/60 - 1s - loss: 0.4698 - accuracy: 0.7927
Epoch 6/20
60/60 - 1s - loss: 0.4644 - accuracy: 0.7908
Epoch 7/20
60/60 - 1s - loss: 0.4475 - accuracy: 0.8024
Epoch 8/20
60/60 - 1s - loss: 0.4457 - accuracy: 0.8034
Epoch 9/20
60/60 - 1s - loss: 0.4391 - accuracy: 0.8087
Epoch 10/20
60/60 - 1s - loss: 0.4418 - accuracy: 0.8070
Epoch 11/20
60/60 - 1s - loss: 0.4414 - accuracy: 0.8089
Epoch 12/20
60/60 - 1s - loss: 0.4261 - accuracy: 0.8152
Epoch 13/20
60/60 - 1s - loss: 0.4187 - accuracy: 0.8181
Epoch 14/20
60/60 - 1s - loss: 0.4145 - accuracy: 0.8187
Epoch 15/20
60/60 - 1s - loss: 0.4155 - accuracy: 0.8186
Epoch 16/20
60/60 - 1s - loss: 0.4167 - accuracy: 0.8186
Epoch 17/20
60/60 - 1s - loss: 0.4112 - accuracy: 0.8252
Epoch 18/20
60/60 - 1s - loss: 0.4110 - 

<tensorflow.python.keras.callbacks.History at 0x7f8169c34ac8>

In [None]:
model_DNN_text_keyword_numericos = DNN_model(ft_features_numericos_train.shape[1], 2)
model_DNN_text_keyword_numericos.fit(ft_features_numericos_train, train.target, epochs=20, batch_size=128, verbose=2)

Epoch 1/20
60/60 - 1s - loss: 0.9534 - accuracy: 0.5196
Epoch 2/20
60/60 - 1s - loss: 0.6989 - accuracy: 0.5509
Epoch 3/20
60/60 - 1s - loss: 0.6892 - accuracy: 0.5644
Epoch 4/20
60/60 - 1s - loss: 0.6738 - accuracy: 0.5837
Epoch 5/20
60/60 - 1s - loss: 0.6583 - accuracy: 0.6049
Epoch 6/20
60/60 - 1s - loss: 0.6564 - accuracy: 0.6099
Epoch 7/20
60/60 - 1s - loss: 0.6460 - accuracy: 0.6263
Epoch 8/20
60/60 - 1s - loss: 0.6501 - accuracy: 0.6242
Epoch 9/20
60/60 - 1s - loss: 0.6391 - accuracy: 0.6485
Epoch 10/20
60/60 - 1s - loss: 0.6373 - accuracy: 0.6520
Epoch 11/20
60/60 - 1s - loss: 0.6291 - accuracy: 0.6523
Epoch 12/20
60/60 - 1s - loss: 0.6313 - accuracy: 0.6523
Epoch 13/20
60/60 - 1s - loss: 0.6194 - accuracy: 0.6685
Epoch 14/20
60/60 - 1s - loss: 0.6179 - accuracy: 0.6737
Epoch 15/20
60/60 - 1s - loss: 0.6179 - accuracy: 0.6786
Epoch 16/20
60/60 - 1s - loss: 0.6090 - accuracy: 0.6820
Epoch 17/20
60/60 - 1s - loss: 0.6098 - accuracy: 0.6803
Epoch 18/20
60/60 - 1s - loss: 0.6032 - 

<tensorflow.python.keras.callbacks.History at 0x7f81692a0ba8>

In [None]:
prediction_dnn = model_DNN_text_keyword.predict_classes(matriz_test_ft_text_keyword)
submission_dnn = pd.DataFrame(data={'id':test['id'], 'target': prediction_dnn})
submission_dnn.to_csv('ft_dnn.csv', index=False)
files.download('ft_dnn.csv')

#RANDOM FOREST

Funcion que aplica RANDOM FOREST con GridSearch y CV para encontrar los mejores hiperparametros para cada matriz probada.

In [None]:
"""#Numero de arboles a usar
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
#Cantidad de features a considerar en cada split
max_features = ['auto', 'sqrt']
#Profundidad de cada arbol
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
#Cantidad minima de muestras requeridas para realizar un split
min_samples_split = [2, 5, 10]
#Cantidad minima de muestras por cada hoja del arbol
min_samples_leaf = [1, 2, 4]
#Método de seleccion de muestras para el entrenamiento de cada árbol
bootstrap = [True, False]

params_rf = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}"""

In [None]:
def RandomForest_con_gridsearch(x_train, y_train, x_validation, y_validation):

  rf = RandomForestClassifier()
  params_rf = {'n_estimators': [50, 100, 200]}
  rf_gs = GridSearchCV(rf, params_rf, cv=5)
  rf_gs.fit(x_train, y_train)
  rf_best = rf_gs.best_estimator_

  print(rf_gs.best_params_)
  print('rf: {}'.format(rf_best.score(x_validation, y_validation)))

  return rf_best

Corremos el algoritmo con cada matriz y comparamos los resultados.

In [None]:
rf_text = RandomForest_con_gridsearch(x_train_text, y_train_text, x_validation_text, y_validation_text)

{'n_estimators': 200}
rf: 0.7705779334500875


In [None]:
cross_val(rf_text, x_train_text, y_train_text)

0.7607457125492165


In [None]:
rf_text_keyword = RandomForest_con_gridsearch(x_train_text_keyword, y_train_text_keyword, x_validation_text_keyword, y_validation_text_keyword)

{'n_estimators': 200}
rf: 0.7653239929947461


In [None]:
cross_val(rf_text_keyword, x_train_text_keyword, y_train_text_keyword)

0.7586782231852653


In [None]:
rf_numericos = RandomForest_con_gridsearch(x_train_numericos,y_train_numericos,  x_validation_numericos, y_validation_numericos)

{'n_estimators': 200}
rf: 0.7539404553415061


In [None]:
cross_val(rf_numericos, x_train_numericos, y_train_numericos)

0.7682447656545904


In [None]:
prediction_rf_numericos = rf_numericos.predict(ft_features_numericos_test)
submission_rf = pd.DataFrame(data={'id':test['id'], 'target': prediction_rf_numericos})
submission_rf.to_csv('ft_rf.csv', index=False)
files.download('ft_rf.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#NuSVC

In [None]:
def nusvc(x_train, y_train, x_validation, y_validation):

  nusvc = NuSVC(probability=True)
  grid_nusvc = GridSearchCV(nusvc, param_grid={'nu': [0.4, 0.5]}, cv=5)
  grid_nusvc.fit(x_train, y_train)
  grid_best = grid_nusvc.best_estimator_
  
  print(grid_nusvc.best_params_)
  print('nusvc score: {}'.format(grid_best.score(x_validation, y_validation)))

  return grid_best

In [None]:
nusvc_text = nusvc(x_train_text, y_train_text, x_validation_text, y_validation_text)

{'nu': 0.5}
nusvc score: 0.7985989492119089


In [None]:
nusvc_text_keyword = nusvc(x_train_text_keyword, y_train_text_keyword, x_validation_text_keyword, y_validation_text_keyword)

{'nu': 0.5}
nusvc score: 0.8042907180385289


In [None]:
nusvc_numericos = nusvc(x_train_numericos, y_train_numericos, x_validation_numericos, y_validation_numericos)

{'nu': 0.5}
nusvc score: 0.7237302977232924


In [None]:
prediction_nuscv = nusvc_text_keyword.predict(matriz_test_ft_text_keyword)
submission_nuscv = pd.DataFrame(data={'id':test['id'], 'target': prediction_nuscv})
submission_nuscv.to_csv('ft_nuscv.csv', index=False)
files.download('ft_nuscv.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#XGBoost

In [None]:
def xgboost(x_train, y_train, x_validation, y_validation):

  xgb = XGBClassifier()
  params_xgb = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.05, 0.1], 'gamma': [0, 1, 5]}
  xgb_gs = GridSearchCV(xgb, params_xgb, cv=5)
  xgb_gs.fit(x_train, y_train)
  xgb_best = xgb_gs.best_estimator_

  print(xgb_gs.best_params_)
  print('xgb: {}'.format(xgb_best.score(x_validation, y_validation)))

  return xgb_best


In [None]:
xgboost_text = xgboost(x_train_text, y_train_text, x_validation_text, y_validation_text)

{'gamma': 1, 'learning_rate': 0.1, 'n_estimators': 200}
xgb: 0.7911558669001751


In [None]:
cross_val(xgboost_text, x_train_text, y_train_text)

0.7883280923816822


In [None]:
xgboost_text_keyword = xgboost(x_train_text_keyword, y_train_text_keyword, x_validation_text_keyword, y_validation_text_keyword)

{'gamma': 5, 'learning_rate': 0.1, 'n_estimators': 200}
xgb: 0.7964098073555166


In [None]:
cross_val(xgboost_text_keyword, x_train_text_keyword, y_train_text_keyword)

0.7892647693540857


In [None]:
xgboost_numericos = xgboost(x_train_numericos,y_train_numericos,  x_validation_numericos, y_validation_numericos)

{'gamma': 1, 'learning_rate': 0.1, 'n_estimators': 200}
xgb: 0.7837127845884413


In [None]:
cross_val(xgboost_numericos, x_train_numericos,y_train_numericos)

0.7962046701723788


In [None]:
y_pred = xgboost_numericos.predict(ft_features_numericos_test)
predictions = [round(value) for value in y_pred]
submission_xgboost = pd.DataFrame(data={'id':test['id'], 'target': predictions})
submission_xgboost.to_csv('ft_xgboost.csv', index=False)

#SVC

In [12]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def svc(x_train, y_train, x_test, y_test):
  classifier = SVC()

  parameters = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.0001], 'kernel' : ['linear',
                                                                                         'sigmoid', 'poly', 'rbf']}
  grid_s_p = RandomizedSearchCV(classifier, parameters, refit = True) 

  model = grid_s_p.fit(x_train, y_train)

  model.best_params_

  svc_best = grid_s_p.best_estimator_

  print(model.best_params_)

  print('svc: {}'.format(svc_best.score(x_test, y_test)))
  preds = model.predict(x_train)
  print("Accuracy score: ", accuracy_score(y_train, preds))
  print("Precision score: ", precision_score(y_train, preds))
  print("Recall score: ", recall_score(y_train, preds))
  print("f1 score: ", f1_score(y_train, preds))
  
  return model

In [None]:
svc_text = svc(x_train_text, y_train_text, x_validation_text, y_validation_text)

{'kernel': 'linear', 'gamma': 0.0001, 'C': 1000}
svc: 0.7859019264448336
Accuracy score:  0.8269844248451867
Precision score:  0.8330087633885103
Recall score:  0.7471615720524017
f1 score:  0.7877532228360958


In [None]:
cross_val(svc_text, x_train_text, y_train_text)

0.7911425274599442


In [13]:
svc_text_keyword = svc(x_train_text_keyword, y_train_text_keyword, x_validation_text_keyword, y_validation_text_keyword)

{'kernel': 'rbf', 'gamma': 1, 'C': 10}
svc: 0.8073555166374781
Accuracy score:  0.8187277162694689
Precision score:  0.8384458077709611
Recall score:  0.7161572052401747
f1 score:  0.7724917569477155


In [14]:
cross_val(svc_text_keyword, x_train_text_keyword, y_train_text_keyword)

0.7922736921843758


In [None]:
#Tardo 5 horas
# svc_numericos = svc(x_train_numericos,y_train_numericos, x_validation_numericos, y_validation_numericos)

In [None]:
# cross_val(svc_numericos, x_train_numericos,y_train_numericos)

In [15]:
preds = svc_text_keyword.predict(matriz_test_ft_text_keyword)
submission = pd.DataFrame(data={'id':test['id'], 'target': preds})
submission.to_csv('fastext_svc.csv', index=False)
files.download('fastext_svc.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>