In [1]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import model_selection

from sklearn.datasets import fetch_20newsgroups
from keras.layers import  Dropout, Dense
from keras.models import Sequential

##Importamos las matrices a utilizar

In [2]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [3]:
id='1Pk5MK9Hs_kMUT9NotGnOKE0NPra-39YU'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('train.csv')

train = pd.read_csv('train.csv')

id='1GsTM9oLtIV8-Da_fDOFWsQYMpgQ8GOYJ'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('test.csv')

test = pd.read_csv('test.csv')

In [4]:
id= '1XRy8Za6o-AmIgqMtORWpNvAb_AjTRYIG'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('ft_features_numericos_test.csv')

id= '1exec3rpRy8Y2tWC8OmK4OKQ901v5-Ckv'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('ft_features_numericos_train.csv')

id= '1TA7fS4iqd9is7hjRS-BW8DmTyxy5gGKP'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('matriz_test_ft_solo_text.csv')

id= '1oYeJf5ffKhy-HnGpgVFO9uSoeUib-NBk'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('matriz_train_ft_solo_text.csv')

id= '1c3gKcyPMYG3I8Yt7YfN8nsYxkWOb_4fe'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('matriz_test_ft_text_keyword.csv')

id= '1qxjmYbeE69mIbZ_daSMMbOpNdWrLsh-I'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('matriz_train_ft_text_keyword.csv')

In [5]:
matriz_test_ft_solo_text = pd.read_csv('matriz_test_ft_solo_text.csv')
matriz_train_ft_solo_text = pd.read_csv('matriz_train_ft_solo_text.csv')

matriz_test_ft_text_keyword = pd.read_csv('matriz_test_ft_text_keyword.csv')
matriz_train_ft_text_keyword = pd.read_csv('matriz_train_ft_text_keyword.csv')

ft_features_numericos_test = pd.read_csv('ft_features_numericos_test.csv')
ft_features_numericos_train = pd.read_csv('ft_features_numericos_train.csv')

In [6]:
def cross_val(model, x_train, y_train):
  score_cross_val = model_selection.cross_val_score(model, x_train, y_train, cv=5)
  print(score_cross_val.mean())

##SPLIT DE LAS MATRICES

In [7]:
y = train.target

In [8]:
x_train_text, x_validation_text, y_train_text, y_validation_text = train_test_split(matriz_train_ft_solo_text, y, test_size=0.3, stratify=y)
x_train_text_keyword, x_validation_text_keyword, y_train_text_keyword, y_validation_text_keyword = train_test_split(matriz_train_ft_text_keyword, y, test_size=0.3, stratify=y)
x_train_numericos, x_validation_numericos, y_train_numericos, y_validation_numericos = train_test_split(ft_features_numericos_train, y, test_size=0.3, stratify=y)

##KNN

Funcion que aplica KNN con GridSearch y CV para encontrar los mejores hiperparametros para cada matriz probada.

In [30]:
def knn_con_gridsearch(x_train, y_train, x_validation, y_validation):

  knn = KNeighborsClassifier()
  params_knn = {'n_neighbors': np.arange(1, 25), 'weights' : ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan', 'minkowski']}
  knn_gs = GridSearchCV(knn, params_knn, cv=5)
  knn_gs.fit(x_train, y_train)
  knn_best = knn_gs.best_estimator_
  
  print(knn_gs.best_params_)
  print('knn score: {}'.format(knn_best.score(x_validation, y_validation)))
  
  return knn_best

Corremos el algoritmo con cada matriz y comparamos los resultados.

In [31]:
knn_text = knn_con_gridsearch(x_train_text, y_train_text, x_validation_text, y_validation_text)

{'metric': 'euclidean', 'n_neighbors': 2, 'weights': 'uniform'}
knn score: 0.553415061295972


In [None]:
#CON GRIDSEARCH: params_knn = {'n_neighbors': np.arange(1, 25)}
#{'n_neighbors': 20}
#knn score: 0.5652364273204904

In [32]:
cross_val(knn_text, x_train_text, y_train_text)

0.5635192770129218


In [33]:
knn_text_keyword = knn_con_gridsearch(x_train_text_keyword, y_train_text_keyword, x_validation_text_keyword, y_validation_text_keyword)

{'metric': 'manhattan', 'n_neighbors': 23, 'weights': 'distance'}
knn score: 0.5564798598949212


In [None]:
#CON GRIDSEARCH: params_knn = {'n_neighbors': np.arange(1, 25)}
#{'n_neighbors': 10}
#knn score: 0.6112084063047285

In [34]:
cross_val(knn_text_keyword, x_train_text_keyword, y_train_text_keyword)

0.5637063657743837


In [35]:
knn_numericos = knn_con_gridsearch(x_train_numericos,y_train_numericos,  x_validation_numericos, y_validation_numericos)

{'metric': 'manhattan', 'n_neighbors': 19, 'weights': 'distance'}
knn score: 0.6624343257443083


In [None]:
#CON GRIDSEARCH: params_knn = {'n_neighbors': np.arange(1, 25)}
#{'n_neighbors': 24}
#knn score: 0.658493870402802

In [36]:
cross_val(knn_numericos, x_train_text, y_train_text)

0.542128619119344


Realizamos la prediccion con el modelo que haya dado mayor score

In [38]:
prediction_knn = knn_numericos.predict(ft_features_numericos_test)
submission_knn = pd.DataFrame(data={'id':test['id'], 'target': prediction_knn})
submission_knn.to_csv('ft_knn.csv', index=False)

#RED NEURONAL PROFUNDA

In [39]:
def DNN_model(shape, nClasses, dropout=0.5):
    model = Sequential()
    node = 512
    nLayers = 4
    model.add(Dense(node,input_dim=shape,activation='relu'))
    model.add(Dropout(dropout))
    for i in range(0,nLayers):
        model.add(Dense(node,input_dim=node,activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(nClasses, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

Corremos el algoritmo con cada matriz y comparamos los resultados.

In [40]:
model_DNN_text = DNN_model(matriz_train_ft_solo_text.shape[1], 2)
model_DNN_text.fit(matriz_train_ft_solo_text, train.target, epochs=20, batch_size=128, verbose=2)

Epoch 1/20
60/60 - 2s - loss: 0.6859 - accuracy: 0.5651
Epoch 2/20
60/60 - 2s - loss: 0.6853 - accuracy: 0.5663
Epoch 3/20
60/60 - 2s - loss: 0.6842 - accuracy: 0.5702
Epoch 4/20
60/60 - 2s - loss: 0.6823 - accuracy: 0.5703
Epoch 5/20
60/60 - 2s - loss: 0.6827 - accuracy: 0.5702
Epoch 6/20
60/60 - 2s - loss: 0.6821 - accuracy: 0.5705
Epoch 7/20
60/60 - 2s - loss: 0.6806 - accuracy: 0.5703
Epoch 8/20
60/60 - 2s - loss: 0.6812 - accuracy: 0.5703
Epoch 9/20
60/60 - 2s - loss: 0.6779 - accuracy: 0.5694
Epoch 10/20
60/60 - 2s - loss: 0.6793 - accuracy: 0.5703
Epoch 11/20
60/60 - 2s - loss: 0.6770 - accuracy: 0.5706
Epoch 12/20
60/60 - 2s - loss: 0.6773 - accuracy: 0.5717
Epoch 13/20
60/60 - 2s - loss: 0.6785 - accuracy: 0.5715
Epoch 14/20
60/60 - 2s - loss: 0.6751 - accuracy: 0.5730
Epoch 15/20
60/60 - 2s - loss: 0.6715 - accuracy: 0.5723
Epoch 16/20
60/60 - 2s - loss: 0.6726 - accuracy: 0.5826
Epoch 17/20
60/60 - 2s - loss: 0.6706 - accuracy: 0.5793
Epoch 18/20
60/60 - 2s - loss: 0.6686 - 

<tensorflow.python.keras.callbacks.History at 0x7fbb8b1b2f28>

In [43]:
model_DNN_text_keyword = DNN_model(matriz_train_ft_text_keyword.shape[1], 2)
model_DNN_text_keyword.fit(matriz_train_ft_text_keyword, train.target, epochs=20, batch_size=128, verbose=2)

Epoch 1/20
60/60 - 2s - loss: 0.6859 - accuracy: 0.5669
Epoch 2/20
60/60 - 2s - loss: 0.6842 - accuracy: 0.5698
Epoch 3/20
60/60 - 2s - loss: 0.6832 - accuracy: 0.5685
Epoch 4/20
60/60 - 2s - loss: 0.6832 - accuracy: 0.5698
Epoch 5/20
60/60 - 2s - loss: 0.6804 - accuracy: 0.5699
Epoch 6/20
60/60 - 2s - loss: 0.6821 - accuracy: 0.5692
Epoch 7/20
60/60 - 2s - loss: 0.6813 - accuracy: 0.5703
Epoch 8/20
60/60 - 2s - loss: 0.6793 - accuracy: 0.5703
Epoch 9/20
60/60 - 2s - loss: 0.6784 - accuracy: 0.5703
Epoch 10/20
60/60 - 2s - loss: 0.6780 - accuracy: 0.5698
Epoch 11/20
60/60 - 2s - loss: 0.6766 - accuracy: 0.5702
Epoch 12/20
60/60 - 2s - loss: 0.6754 - accuracy: 0.5706
Epoch 13/20
60/60 - 2s - loss: 0.6752 - accuracy: 0.5694
Epoch 14/20
60/60 - 2s - loss: 0.6720 - accuracy: 0.5728
Epoch 15/20
60/60 - 2s - loss: 0.6711 - accuracy: 0.5777
Epoch 16/20
60/60 - 2s - loss: 0.6699 - accuracy: 0.5782
Epoch 17/20
60/60 - 2s - loss: 0.6677 - accuracy: 0.5873
Epoch 18/20
60/60 - 2s - loss: 0.6678 - 

<tensorflow.python.keras.callbacks.History at 0x7fbb881e3ef0>

In [41]:
model_DNN_text_keyword_numericos = DNN_model(ft_features_numericos_train.shape[1], 2)
model_DNN_text_keyword_numericos.fit(ft_features_numericos_train, train.target, epochs=20, batch_size=128, verbose=2)

Epoch 1/20
60/60 - 2s - loss: 0.9137 - accuracy: 0.5161
Epoch 2/20
60/60 - 2s - loss: 0.6998 - accuracy: 0.5475
Epoch 3/20
60/60 - 2s - loss: 0.6838 - accuracy: 0.5678
Epoch 4/20
60/60 - 2s - loss: 0.6694 - accuracy: 0.5812
Epoch 5/20
60/60 - 2s - loss: 0.6632 - accuracy: 0.5918
Epoch 6/20
60/60 - 2s - loss: 0.6599 - accuracy: 0.6008
Epoch 7/20
60/60 - 2s - loss: 0.6579 - accuracy: 0.6116
Epoch 8/20
60/60 - 2s - loss: 0.6518 - accuracy: 0.6247
Epoch 9/20
60/60 - 2s - loss: 0.6431 - accuracy: 0.6330
Epoch 10/20
60/60 - 2s - loss: 0.6431 - accuracy: 0.6319
Epoch 11/20
60/60 - 2s - loss: 0.6452 - accuracy: 0.6397
Epoch 12/20
60/60 - 2s - loss: 0.6405 - accuracy: 0.6413
Epoch 13/20
60/60 - 2s - loss: 0.6440 - accuracy: 0.6351
Epoch 14/20
60/60 - 2s - loss: 0.6379 - accuracy: 0.6459
Epoch 15/20
60/60 - 2s - loss: 0.6354 - accuracy: 0.6469
Epoch 16/20
60/60 - 2s - loss: 0.6389 - accuracy: 0.6432
Epoch 17/20
60/60 - 2s - loss: 0.6383 - accuracy: 0.6460
Epoch 18/20
60/60 - 2s - loss: 0.6378 - 

<tensorflow.python.keras.callbacks.History at 0x7fbb8b1fcda0>

In [45]:
prediction_dnn = model_DNN_text_keyword.predict_classes(matriz_test_ft_text_keyword)
submission_dnn = pd.DataFrame(data={'id':test['id'], 'target': prediction_dnn})
submission_dnn.to_csv('ft_dnn.csv', index=False)

#RANDOM FOREST

Funcion que aplica RANDOM FOREST con GridSearch y CV para encontrar los mejores hiperparametros para cada matriz probada.

In [None]:
"""#Numero de arboles a usar
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
#Cantidad de features a considerar en cada split
max_features = ['auto', 'sqrt']
#Profundidad de cada arbol
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
#Cantidad minima de muestras requeridas para realizar un split
min_samples_split = [2, 5, 10]
#Cantidad minima de muestras por cada hoja del arbol
min_samples_leaf = [1, 2, 4]
#Método de seleccion de muestras para el entrenamiento de cada árbol
bootstrap = [True, False]

params_rf = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}"""

In [46]:
def RandomForest_con_gridsearch(x_train, y_train, x_validation, y_validation):

  rf = RandomForestClassifier()
  params_rf = {'n_estimators': [50, 100, 200]}
  rf_gs = GridSearchCV(rf, params_rf, cv=5)
  rf_gs.fit(x_train, y_train)
  rf_best = rf_gs.best_estimator_

  print(rf_gs.best_params_)
  print('rf: {}'.format(rf_best.score(x_validation, y_validation)))

  return rf_best

Corremos el algoritmo con cada matriz y comparamos los resultados.

In [47]:
rf_text = RandomForest_con_gridsearch(x_train_text, y_train_text, x_validation_text, y_validation_text)

{'n_estimators': 200}
rf: 0.5748686514886164


In [None]:
#GRIDSEARCH CON: params_rf = {'n_estimators': [50, 100, 200]}
#{'n_estimators': 200}
#rf: 0.5595446584938704

In [48]:
cross_val(rf_text, x_train_text, y_train_text)

0.5699019633749967


In [49]:
rf_text_keyword = RandomForest_con_gridsearch(x_train_text_keyword, y_train_text_keyword, x_validation_text_keyword, y_validation_text_keyword)

{'n_estimators': 200}
rf: 0.5718038528896673


In [50]:
cross_val(rf_text_keyword, x_train_text_keyword, y_train_text_keyword)

0.5775967373974932


In [51]:
rf_numericos = RandomForest_con_gridsearch(x_train_numericos,y_train_numericos,  x_validation_numericos, y_validation_numericos)

{'n_estimators': 200}
rf: 0.648861646234676


In [52]:
cross_val(rf_numericos, x_train_numericos, y_train_numericos)

0.6293859718662193


In [54]:
prediction_rf_numericos = rf_numericos.predict(ft_features_numericos_test)
submission_rf = pd.DataFrame(data={'id':test['id'], 'target': prediction_rf_numericos})
submission_rf.to_csv('ft_rf.csv', index=False)

#NuSVC

In [55]:
def nusvc(x_train, y_train):

  nusvc = NuSVC(probability=True)
  grid_nusvc = GridSearchCV(nusvc, param_grid={'nu': [0.4, 0.5]}, scoring=['f1_macro'], refit=False)
  grid_nusvc.fit(x_train, y_train)

  print('nusvc: {}'.format(grid_nusvc.cv_results_['mean_test_f1_macro'][0]))

  return grid_nusvc

In [56]:
nusvc_text = nusvc(x_train_text, y_train_text)

nusvc: 0.5157041731984793


In [57]:
nusvc_text_keyword = nusvc(x_train_text_keyword, y_train_text_keyword)

nusvc: 0.5332814299320351


In [58]:
nusvc_numericos = nusvc(x_train_numericos,y_train_numericos)

nusvc: 0.5640194893028262


In [62]:
# prediction_nuscv = nusvc.predict(ft_features_numericos_test)
# submission_nuscv = pd.DataFrame(data={'id':test['id'], 'target': prediction_nuscv})
# submission_nuscv.to_csv('ft_nuscv.csv', index=False)

#XGBoost

In [9]:
def xgboost(x_train, y_train, x_validation, y_validation):

  xgb = XGBClassifier()
  params_xgb = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.05, 0.1], 'gamma': [0, 1, 5]}
  xgb_gs = GridSearchCV(xgb, params_xgb, cv=5)
  xgb_gs.fit(x_train, y_train)
  xgb_best = xgb_gs.best_estimator_

  print(xgb_gs.best_params_)
  print('xgb: {}'.format(xgb_best.score(x_validation, y_validation)))

  return xgb_best


In [10]:
xgboost_text = xgboost(x_train_text, y_train_text, x_validation_text, y_validation_text)

{'gamma': 0, 'learning_rate': 0.05, 'n_estimators': 100}
xgb: 0.5761821366024519


In [11]:
cross_val(xgboost_text, x_train_text, y_train_text)

0.5841610513613261


In [12]:
xgboost_text_keyword = xgboost(x_train_text_keyword, y_train_text_keyword, x_validation_text_keyword, y_validation_text_keyword)

{'gamma': 1, 'learning_rate': 0.05, 'n_estimators': 100}
xgb: 0.612521891418564


In [13]:
cross_val(xgboost_text_keyword, x_train_text_keyword, y_train_text_keyword)

0.6089349857745597


In [14]:
xgboost_numericos = xgboost(x_train_numericos,y_train_numericos,  x_validation_numericos, y_validation_numericos)

{'gamma': 1, 'learning_rate': 0.05, 'n_estimators': 200}
xgb: 0.7149737302977233


In [15]:
cross_val(xgboost_numericos, x_train_numericos,y_train_numericos)

0.6772394718530068


In [18]:
y_pred = xgboost_numericos.predict(ft_features_numericos_test)
predictions = [round(value) for value in y_pred]
submission_xgboost = pd.DataFrame(data={'id':test['id'], 'target': predictions})
submission_xgboost.to_csv('ft_xgboost.csv', index=False)