In [1]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import model_selection

from sklearn.datasets import fetch_20newsgroups
from keras.layers import  Dropout, Dense
from keras.models import Sequential

## Importamos las matrices a utilizar

In [2]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from google.colab import files

# Authenticate the Colab user and create the PyDrive client (`drive`) that the
# download cells below use to fetch files by Drive id.
auth.authenticate_user()
gauth = GoogleAuth()
# Application-default credentials; presumably the ones established by
# authenticate_user() above -- TODO confirm.
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [3]:
# Download the competition train/test CSVs from Google Drive and load them.
# The Drive ids are iterated as `file_id` so the builtin `id` is not shadowed,
# and each CSV is parsed exactly once.
for file_id, filename in [
    ('1Pk5MK9Hs_kMUT9NotGnOKE0NPra-39YU', 'train.csv'),
    ('1GsTM9oLtIV8-Da_fDOFWsQYMpgQ8GOYJ', 'test.csv'),
]:
    downloaded = drive.CreateFile({'id': file_id})
    downloaded.GetContentFile(filename)

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
# Download the pre-computed feature matrices (numeric features and text-only
# matrices, train and test) from Google Drive into the working directory.
# One loop replaces four copy-pasted stanzas and avoids shadowing the
# builtin `id`.
for file_id, filename in [
    ('14H80mFIAgSeMgh4WyHD91a6IgkJmHpy-', 'use_features_numericos_test.csv'),
    ('18tM8KBYMVj8VrpGEBNGT9m5QOhaMiZp7', 'use_features_numericos_train.csv'),
    ('1yFTqxomrfb9xZmOck780mcr7Fb9Y9R5S', 'matriz_test_use_solo_text.csv'),
    ('1knojPsOX9jd1LewsNBPd9EYbqBPwVyFr', 'matriz_train_use_solo_text.csv'),
]:
    downloaded = drive.CreateFile({'id': file_id})
    downloaded.GetContentFile(filename)

In [5]:
# Load the downloaded feature matrices, grouped as train/test pairs.

# Text-only matrices.
matriz_train_use_solo_text = pd.read_csv('matriz_train_use_solo_text.csv')
matriz_test_use_solo_text = pd.read_csv('matriz_test_use_solo_text.csv')

# Numeric-feature matrices.
use_features_numericos_train = pd.read_csv('use_features_numericos_train.csv')
use_features_numericos_test = pd.read_csv('use_features_numericos_test.csv')

## Split del set de datos


In [6]:
# Hold out 20% of the labelled rows of each feature matrix. Both matrices are
# split against the same target column with the same seed.
x_train_text, x_test_text, y_train_text, y_test_text = train_test_split(
    matriz_train_use_solo_text,
    train.target,
    test_size=0.2,
    random_state=1,
)
x_train_numeric, x_test_numeric, y_train_numeric, y_test_numeric = train_test_split(
    use_features_numericos_train,
    train.target,
    test_size=0.2,
    random_state=1,
)


In [7]:
def cross_val(model, x_train, y_train, cv=5):
  """Cross-validate `model` and report its mean score.

  Args:
    model: estimator passed to `model_selection.cross_val_score`.
    x_train: feature matrix used for cross-validation.
    y_train: target values aligned with `x_train`.
    cv: cross-validation setting forwarded to `cross_val_score`
      (defaults to 5, the value that used to be hard-coded).

  Returns:
    The mean of the per-fold scores. It is still printed as before, and is
    now also returned so callers can compare models programmatically.
  """
  score_cross_val = model_selection.cross_val_score(model, x_train, y_train, cv=cv)
  mean_score = score_cross_val.mean()
  print(mean_score)
  return mean_score

## KNN

Función que aplica KNN con GridSearch y CV para encontrar los mejores hiperparámetros para cada matriz probada.

In [8]:
def knn_con_gridsearch(x_train, y_train, x_test, y_test):
  """Tune a KNN classifier with a 5-fold grid search and score it on a hold-out set.

  Args:
    x_train: feature matrix the grid search is fitted on.
    y_train: targets aligned with `x_train`.
    x_test: hold-out feature matrix used only for the printed score.
    y_test: targets aligned with `x_test`.

  Returns:
    The best estimator found by the grid search.
  """
  knn = KNeighborsClassifier()
  # 'minkowski' is not searched: with KNeighborsClassifier's default p=2 it is
  # the same distance as 'euclidean', so it only repeated a third of the grid.
  params_knn = {
      'n_neighbors': np.arange(1, 25),
      'weights': ['uniform', 'distance'],
      'metric': ['euclidean', 'manhattan'],
  }
  knn_gs = GridSearchCV(knn, params_knn, cv=5)
  knn_gs.fit(x_train, y_train)
  knn_best = knn_gs.best_estimator_

  print(knn_gs.best_params_)
  print('knn score: {}'.format(knn_best.score(x_test, y_test)))

  return knn_best

Corremos el algoritmo con cada matriz y comparamos los resultados.

In [9]:
# Grid-search KNN on the text-feature split; prints the best params and the hold-out score.
knn_text = knn_con_gridsearch(x_train_text, y_train_text, x_test_text, y_test_text)

{'metric': 'euclidean', 'n_neighbors': 6, 'weights': 'uniform'}
knn score: 0.8030203545633617


In [10]:
# Cross-validate the tuned text KNN on the text training split (prints the mean score).
cross_val(knn_text, x_train_text, y_train_text)

0.8151067323481117


In [11]:
# Grid-search KNN on the numeric-feature split; prints the best params and the hold-out score.
knn_numeric = knn_con_gridsearch(x_train_numeric, y_train_numeric, x_test_numeric, y_test_numeric)

{'metric': 'manhattan', 'n_neighbors': 6, 'weights': 'uniform'}
knn score: 0.7419566644780039


In [12]:
# Cross-validate the tuned numeric KNN on the numeric training split (prints the mean score).
cross_val(knn_numeric, x_train_numeric, y_train_numeric)

0.7446633825944171


Realizamos la predicción.

In [None]:
# Predict the competition test set with the KNN tuned on the text features.
# `knn` only exists as a local inside knn_con_gridsearch, so the estimator it
# returned (`knn_text`) is used; it matches the text test matrix passed here.
prediction_knn = knn_text.predict(matriz_test_use_solo_text)
submission_knn = pd.DataFrame(data={'id':test['id'], 'target': prediction_knn})
submission_knn.to_csv('hub_knn.csv', index=False)

# RED NEURONAL PROFUNDA

In [8]:
def DNN_model(shape, nClasses, dropout=0.5, node=512, nLayers=4):
    """Build and compile a fully connected softmax classifier.

    The network is an input Dense layer, `nLayers` hidden Dense layers and a
    softmax output layer; every ReLU Dense layer is followed by Dropout.

    Args:
        shape: number of input features (passed as `input_dim` of the first layer).
        nClasses: number of output classes (units of the softmax layer).
        dropout: dropout rate applied after every ReLU layer.
        node: units per ReLU layer (previously hard-coded to 512).
        nLayers: number of hidden layers after the input layer
            (previously hard-coded to 4).

    Returns:
        The compiled Sequential model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    model = Sequential()
    model.add(Dense(node, input_dim=shape, activation='relu'))
    model.add(Dropout(dropout))
    for _ in range(nLayers):
        # Hidden layers take their input size from the previous layer, so no
        # input_dim is passed here.
        model.add(Dense(node, activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(nClasses, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

Corremos el algoritmo 

In [9]:
# Build the network for the text feature matrix (2 output classes) and train
# it on the full text training matrix; the trailing `fit` call is the cell's
# displayed value.
model_DNN_text = DNN_model(
    matriz_train_use_solo_text.shape[1],
    2,
)
model_DNN_text.fit(
    matriz_train_use_solo_text,
    train.target,
    epochs=20,
    batch_size=128,
    verbose=2,
)

Epoch 1/20
60/60 - 2s - loss: 0.5327 - accuracy: 0.7284
Epoch 2/20
60/60 - 2s - loss: 0.4072 - accuracy: 0.8285
Epoch 3/20
60/60 - 2s - loss: 0.3715 - accuracy: 0.8497
Epoch 4/20
60/60 - 2s - loss: 0.3438 - accuracy: 0.8596
Epoch 5/20
60/60 - 2s - loss: 0.3170 - accuracy: 0.8746
Epoch 6/20
60/60 - 2s - loss: 0.2736 - accuracy: 0.8903
Epoch 7/20
60/60 - 2s - loss: 0.2416 - accuracy: 0.9035
Epoch 8/20
60/60 - 2s - loss: 0.2128 - accuracy: 0.9207
Epoch 9/20
60/60 - 2s - loss: 0.1844 - accuracy: 0.9306
Epoch 10/20
60/60 - 2s - loss: 0.1603 - accuracy: 0.9384
Epoch 11/20
60/60 - 2s - loss: 0.1508 - accuracy: 0.9429
Epoch 12/20
60/60 - 2s - loss: 0.1265 - accuracy: 0.9535
Epoch 13/20
60/60 - 2s - loss: 0.1242 - accuracy: 0.9534
Epoch 14/20
60/60 - 2s - loss: 0.1150 - accuracy: 0.9576
Epoch 15/20
60/60 - 2s - loss: 0.1104 - accuracy: 0.9594
Epoch 16/20
60/60 - 2s - loss: 0.0968 - accuracy: 0.9637
Epoch 17/20
60/60 - 2s - loss: 0.0977 - accuracy: 0.9665
Epoch 18/20
60/60 - 2s - loss: 0.0944 - 

<tensorflow.python.keras.callbacks.History at 0x7f3766e437f0>

In [10]:
# Build the network for the numeric feature matrix (2 output classes) and
# train it on the full numeric training matrix; the trailing `fit` call is
# the cell's displayed value.
model_DNN_numeric = DNN_model(
    use_features_numericos_train.shape[1],
    2,
)
model_DNN_numeric.fit(
    use_features_numericos_train,
    train.target,
    epochs=20,
    batch_size=128,
    verbose=2,
)

Epoch 1/20
60/60 - 3s - loss: 0.9020 - accuracy: 0.5236
Epoch 2/20
60/60 - 2s - loss: 0.6948 - accuracy: 0.5523
Epoch 3/20
60/60 - 2s - loss: 0.6753 - accuracy: 0.5836
Epoch 4/20
60/60 - 2s - loss: 0.6481 - accuracy: 0.6137
Epoch 5/20
60/60 - 2s - loss: 0.5951 - accuracy: 0.6921
Epoch 6/20
60/60 - 2s - loss: 0.5329 - accuracy: 0.7553
Epoch 7/20
60/60 - 2s - loss: 0.4928 - accuracy: 0.7809
Epoch 8/20
60/60 - 2s - loss: 0.4582 - accuracy: 0.8027
Epoch 9/20
60/60 - 2s - loss: 0.4392 - accuracy: 0.8127
Epoch 10/20
60/60 - 2s - loss: 0.4302 - accuracy: 0.8183
Epoch 11/20
60/60 - 2s - loss: 0.4241 - accuracy: 0.8239
Epoch 12/20
60/60 - 2s - loss: 0.4228 - accuracy: 0.8256
Epoch 13/20
60/60 - 2s - loss: 0.4171 - accuracy: 0.8233
Epoch 14/20
60/60 - 2s - loss: 0.4137 - accuracy: 0.8296
Epoch 15/20
60/60 - 2s - loss: 0.4091 - accuracy: 0.8290
Epoch 16/20
60/60 - 2s - loss: 0.4023 - accuracy: 0.8342
Epoch 17/20
60/60 - 2s - loss: 0.4039 - accuracy: 0.8319
Epoch 18/20
60/60 - 2s - loss: 0.3976 - 

<tensorflow.python.keras.callbacks.History at 0x7f37646b0ac8>

In [None]:
# Predict the competition test set with the text DNN and write the submission.
# `Sequential.predict_classes` was removed in TensorFlow 2.6; for a softmax
# output the class is the argmax over the predicted probabilities.
prediction_dnn = np.argmax(model_DNN_text.predict(matriz_test_use_solo_text), axis=-1)
submission_dnn = pd.DataFrame(data={'id':test['id'], 'target': prediction_dnn})
submission_dnn.to_csv('hub_dnn_text.csv', index=False)

In [13]:
# Predict the competition test set with the numeric DNN and write the submission.
# `Sequential.predict_classes` was removed in TensorFlow 2.6; for a softmax
# output the class is the argmax over the predicted probabilities.
prediction_dnn = np.argmax(model_DNN_numeric.predict(use_features_numericos_test), axis=-1)
submission_dnn = pd.DataFrame(data={'id':test['id'], 'target': prediction_dnn})
submission_dnn.to_csv('hub_dnn_numeric.csv', index=False)

In [14]:
# Download both DNN submission CSVs through Colab's `files.download`.
# Both files must already have been written by the two submission cells above.
files.download('hub_dnn_text.csv')
files.download('hub_dnn_numeric.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# RANDOM FOREST

Función que aplica RANDOM FOREST con GridSearch y CV para encontrar los mejores hiperparámetros para cada matriz probada.

In [None]:
"""#Numero de arboles a usar
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
#Cantidad de features a considerar en cada split
max_features = ['auto', 'sqrt']
#Profundidad de cada arbol
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
#Cantidad minima de muestras requeridas para realizar un split
min_samples_split = [2, 5, 10]
#Cantidad minima de muestras por cada hoja del arbol
min_samples_leaf = [1, 2, 4]
#Método de seleccion de muestras para el entrenamiento de cada árbol
bootstrap = [True, False]

params_rf = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}"""

In [16]:
def RandomForest_con_gridsearch(x_train, y_train, x_validation, y_validation, random_state=None):
  """Tune a random forest with a 5-fold grid search and score it on a validation set.

  Only `n_estimators` (50, 100, 200) is searched.

  Args:
    x_train: feature matrix the grid search is fitted on.
    y_train: targets aligned with `x_train`.
    x_validation: hold-out feature matrix used only for the printed score.
    y_validation: targets aligned with `x_validation`.
    random_state: optional seed forwarded to RandomForestClassifier so runs
      can be reproduced. The default (None) keeps the previous unseeded
      behaviour.

  Returns:
    The best estimator found by the grid search.
  """
  rf = RandomForestClassifier(random_state=random_state)
  params_rf = {'n_estimators': [50, 100, 200]}
  rf_gs = GridSearchCV(rf, params_rf, cv=5)
  rf_gs.fit(x_train, y_train)
  rf_best = rf_gs.best_estimator_

  print(rf_gs.best_params_)
  print('rf: {}'.format(rf_best.score(x_validation, y_validation)))

  return rf_best

Corremos el algoritmo 


In [17]:
# Grid-search the random forest on the text-feature split; prints the best params and the hold-out score.
rf_text = RandomForest_con_gridsearch(x_train_text, y_train_text, x_test_text, y_test_text)

{'n_estimators': 200}
rf: 0.8233749179251477


In [18]:
# Cross-validate the tuned text random forest on the text training split (prints the mean score).
cross_val(rf_text, x_train_text, y_train_text)

0.8152709359605911


In [19]:
# Grid-search the random forest on the numeric-feature split; prints the best params and the hold-out score.
rf_numeric = RandomForest_con_gridsearch(x_train_numeric, y_train_numeric, x_test_numeric, y_test_numeric)

{'n_estimators': 200}
rf: 0.8253447143795141


In [20]:
# Cross-validate the tuned numeric random forest on the numeric training split (prints the mean score).
cross_val(rf_numeric, x_train_numeric, y_train_numeric)

0.8177339901477833


In [None]:
# Predict the competition test set with the random forest tuned on the numeric
# features. `rf` only exists as a local inside RandomForest_con_gridsearch, so
# the estimator it returned (`rf_numeric`) is used; it matches the numeric
# test matrix passed here.
prediction_rf = rf_numeric.predict(use_features_numericos_test)
submission_rf = pd.DataFrame(data={'id':test['id'], 'target': prediction_rf})
submission_rf.to_csv('hub_rf.csv', index=False)
files.download('hub_rf.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# NuSVC

In [8]:
def nusvc(x_train, y_train, x_validation, y_validation):
  """Tune a probability-enabled NuSVC over `nu` and score it on a validation set.

  Runs a 5-fold grid search over nu in (0.4, 0.5), prints the best
  parameters and the validation score, and returns the best estimator.
  """
  search = GridSearchCV(
      NuSVC(probability=True),
      param_grid={'nu': [0.4, 0.5]},
      cv=5,
  )
  search.fit(x_train, y_train)
  best_model = search.best_estimator_

  print(search.best_params_)
  validation_score = best_model.score(x_validation, y_validation)
  print('nusvc score: {}'.format(validation_score))

  return best_model

In [9]:
# Grid-search NuSVC on the text-feature split; prints the best params and the hold-out score.
nusvc_text = nusvc(x_train_text, y_train_text, x_test_text, y_test_text)

{'nu': 0.4}
nusvc score: 0.8397898883782009


In [23]:
# Grid-search NuSVC on the numeric-feature split; prints the best params and the hold-out score.
nusvc_numericos = nusvc(x_train_numeric, y_train_numeric, x_test_numeric, y_test_numeric)

{'nu': 0.4}
nusvc score: 0.8240315167432699


In [11]:
# Predict the competition test set with the text NuSVC and write the submission file.
prediction_nuscv = nusvc_text.predict(matriz_test_use_solo_text)
submission_nuscv = pd.DataFrame(
    data={
        'id': test['id'],
        'target': prediction_nuscv,
    }
)
submission_nuscv.to_csv('hub_nuscv_text.csv', index=False)

# XGBoost

In [24]:
def xgboost(x_train, y_train, x_validation, y_validation):
  """Tune an XGBClassifier with a 5-fold grid search and score it on a validation set.

  Searches n_estimators, learning_rate and gamma, prints the best parameters
  and the validation score, and returns the best estimator.
  """
  search_space = {
      'n_estimators': [50, 100, 200],
      'learning_rate': [0.01, 0.05, 0.1],
      'gamma': [0, 1, 5],
  }
  search = GridSearchCV(XGBClassifier(), search_space, cv=5)
  search.fit(x_train, y_train)
  best_model = search.best_estimator_

  print(search.best_params_)
  validation_score = best_model.score(x_validation, y_validation)
  print('xgb: {}'.format(validation_score))

  return best_model


In [25]:
# Grid-search XGBoost on the text-feature split; prints the best params and the hold-out score.
xgboost_text = xgboost(x_train_text, y_train_text, x_test_text, y_test_text)

{'gamma': 5, 'learning_rate': 0.05, 'n_estimators': 200}
xgb: 0.8240315167432699


In [26]:
# Cross-validate the tuned text XGBoost model on the text training split (prints the mean score).
cross_val(xgboost_text, x_train_text, y_train_text)

0.8182266009852217


In [27]:
# Grid-search XGBoost on the numeric-feature split; prints the best params and the hold-out score.
xgboost_numeric = xgboost(x_train_numeric, y_train_numeric, x_test_numeric, y_test_numeric)

{'gamma': 1, 'learning_rate': 0.1, 'n_estimators': 200}
xgb: 0.8240315167432699


In [28]:
# Cross-validate the tuned numeric XGBoost model on the numeric training split (prints the mean score).
cross_val(xgboost_numeric, x_train_numeric, y_train_numeric)

0.8205254515599343


In [None]:
# Predict the competition test set with the XGBoost model tuned on the numeric
# features. `xgboost` is the helper function (it has no `.predict`), and
# `x_test_numeric` is the validation split rather than the rows behind
# `test['id']`, so the fitted estimator and the numeric test matrix are used.
y_pred = xgboost_numeric.predict(use_features_numericos_test)
predictions = [round(value) for value in y_pred]
submission_xgboost = pd.DataFrame(data={'id':test['id'], 'target': predictions})
submission_xgboost.to_csv('hub_xg.csv', index=False)
# Download the file that was just written; 'hub_xg_numeirc.csv' was never created.
files.download('hub_xg.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# SVC

In [15]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Randomized hyper-parameter search for an SVC on the text-feature training split.
classifier_text = SVC()

parameters = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.0001],
    'kernel': ['linear', 'sigmoid', 'poly', 'rbf'],
}
# random_state fixes which parameter combinations are sampled, so the search
# gives the same candidates after a kernel restart. refit=True refits the
# best candidate so `model` can predict directly.
grid_s_p = RandomizedSearchCV(classifier_text, parameters, refit=True, random_state=1)

model = grid_s_p.fit(x_train_text, y_train_text)

model.best_params_

{'C': 1, 'gamma': 1, 'kernel': 'linear'}

In [16]:
# Evaluate the tuned SVC on the hold-out split of the text features.
preds = model.predict(x_test_text)
for metric_label, metric_fn in (
    ("Accuracy score: ", accuracy_score),
    ("Precision score: ", precision_score),
    ("Recall score: ", recall_score),
    ("f1 score: ", f1_score),
):
    print(metric_label, metric_fn(y_test_text, preds))

Accuracy score:  0.8365068942875903
Precision score:  0.85
Recall score:  0.7425897035881436
f1 score:  0.7926727726894255


In [None]:
# Ran for two hours without finishing, so this search is left disabled.
# classifier_numeric = SVC()

# parameters = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.0001], 'kernel' : ['linear',
#                                                                                          'sigmoid', 'poly', 'rbf']}
# grid_s_p = RandomizedSearchCV(classifier_numeric, parameters, refit = True) 

# model = grid_s_p.fit(x_train_numeric, y_train_numeric)

# model.best_params_

In [None]:
# Disabled: hold-out metrics for an SVC fitted on the numeric features.
# NOTE(review): if re-enabled as-is, `model` would be whatever was fitted last -- confirm before use.
# preds = model.predict(x_test_numeric)
# print("Accuracy score: ", accuracy_score(y_test_numeric, preds))
# print("Precision score: ", precision_score(y_test_numeric, preds))
# print("Recall score: ", recall_score(y_test_numeric, preds))
# print("f1 score: ", f1_score(y_test_numeric, preds))

In [18]:
# Predict the competition test set with the tuned SVC and write the submission file.
preds = model.predict(matriz_test_use_solo_text)
submission_svc = pd.DataFrame(
    data={
        'id': test['id'],
        'target': preds,
    }
)
submission_svc.to_csv('hub_svc_text.csv', index=False)