In [1]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import model_selection

from sklearn.datasets import fetch_20newsgroups
from keras.layers import  Dropout, Dense
from keras.models import Sequential

In [2]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then build a PyDrive client that reuses the
# application-default Google credentials. `drive` is used by the download
# cells below to fetch files by Drive file id.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [3]:
# Fetch the Drive file with this id and save it locally as train.csv.
# The variable is called `file_id` so the builtin `id()` is not shadowed.
file_id = '1Pk5MK9Hs_kMUT9NotGnOKE0NPra-39YU'
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('train.csv')

In [4]:
# Load the downloaded train.csv; later cells read its `target` column as labels.
train = pd.read_csv('train.csv')

In [5]:
# Fetch the Drive file with this id and save it locally as test.csv.
# The variable is called `file_id` so the builtin `id()` is not shadowed.
file_id = '1GsTM9oLtIV8-Da_fDOFWsQYMpgQ8GOYJ'
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('test.csv')

In [6]:
# Load the downloaded test.csv; later cells read its `id` column for the submissions.
test = pd.read_csv('test.csv')

In [7]:
# Fetch the Drive file with this id and save it locally as test_features.csv.
# The variable is called `file_id` so the builtin `id()` is not shadowed.
file_id = '1TmL2AY_yymiV8zXVvmbqXsIZoexR9sLs'
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('test_features.csv')

In [8]:
# Load the downloaded test_features.csv (feature table used for test predictions).
features_test = pd.read_csv('test_features.csv')

In [10]:
# Fetch the Drive file with this id and save it locally as train_features.csv.
# The variable is called `file_id` so the builtin `id()` is not shadowed.
file_id = '1slJiG-M_oawvkzaqWzZkSmVxxZor6mKQ'
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('train_features.csv')

In [11]:
# Load the downloaded train_features.csv (feature table used to fit the models).
features_train = pd.read_csv('train_features.csv')

In [12]:
# Columns removed from BOTH feature tables before modelling. Keeping a single
# list guarantees that train and test end up with the same feature columns
# (the original duplicated the list by hand in each call).
_DROP_COLUMNS = [
    'id', 'keyword', 'location', 'text',
    'longitud tweets', 'cant_palabras', 'cant_url', 'cant_hashtag',
    'cant_mencion', 'cant_signos_pregunta', 'cant_signos_exclamacion',
    'subjetividad', 'cant_oraciones', 'cant_minusculas', 'cant_consonant',
    'palabras_unicas',
]

# Reassign instead of mutating in place. The training table additionally
# carries the label column `target`, which must not be used as a feature.
features_train = features_train.drop(columns=_DROP_COLUMNS + ['target'])
features_test = features_test.drop(columns=_DROP_COLUMNS)

In [14]:
# Labels come from train.csv while the features come from train_features.csv.
# NOTE(review): this assumes both files list the tweets in the same row
# order - confirm against how train_features.csv was generated.
y = train.target

# 70/30 stratified hold-out split. The seed is fixed so the split, and every
# validation score computed from it, is reproducible across kernel restarts.
x_train, x_validation, y_train, y_validation = train_test_split(
    features_train, y, test_size=0.3, stratify=y, random_state=42
)

## KNN

Buscamos los mejores hiperparámetros.

In [15]:
# Hyperparameter search: 5-fold cross-validated grid search over the number
# of neighbours, the vote weighting and the distance metric.
knn = KNeighborsClassifier()
params_knn = {
    'n_neighbors': np.arange(1, 25),
    'weights': ['uniform', 'distance'],
    # 'minkowski' is intentionally not listed: with KNeighborsClassifier's
    # default p=2 it is the same distance as 'euclidean', so it only added
    # duplicate fits to the grid.
    'metric': ['euclidean', 'manhattan'],
}
knn_gs = GridSearchCV(knn, params_knn, cv=5)
knn_gs.fit(x_train, y_train)
knn_best = knn_gs.best_estimator_
print(knn_gs.best_params_)

# Accuracy of the selected model on the held-out validation split.
print('knn score: {}'.format(knn_best.score(x_validation, y_validation)))

{'metric': 'manhattan', 'n_neighbors': 22, 'weights': 'distance'}
knn score: 0.6681260945709282


In [16]:
#params_knn = {'n_neighbors': np.arange(1, 25)}
#{'n_neighbors': 22}
#knn score: 0.6663747810858144

Realizamos la predicción con el modelo.

In [None]:
# Predict the test set with the tuned KNN model and write the submission
# file: one row per test id with its predicted target.
prediction_knn = knn_best.predict(features_test)
submission_knn = pd.DataFrame({'id': test['id'], 'target': prediction_knn})
submission_knn.to_csv(path_or_buf='features_nuevos_knn.csv', index=False)

# RED NEURONAL PROFUNDA

In [None]:
def DNN_model(shape, nClasses, dropout=0.5, node=512, nLayers=4):
    """Build and compile a fully connected feed-forward classifier.

    The network is an input Dense layer followed by ``nLayers`` hidden Dense
    layers. Every one of these layers has ``node`` ReLU units and is followed
    by a Dropout layer. The output is a softmax layer with ``nClasses`` units.

    Args:
        shape: Number of input features (passed as ``input_dim`` of the
            first layer).
        nClasses: Number of output classes (units of the softmax layer).
        dropout: Dropout rate applied after every Dense layer except the
            output layer.
        node: Units in each Dense layer before the output. Defaults to 512,
            the value that was previously hard-coded.
        nLayers: Number of hidden Dense+Dropout pairs added after the input
            layer. Defaults to 4, the value that was previously hard-coded.

    Returns:
        The compiled Sequential model (Adam optimizer,
        sparse categorical cross-entropy loss, accuracy metric). The sparse
        loss means labels are expected as integer class ids, not one-hot.
    """
    model = Sequential()
    model.add(Dense(node, input_dim=shape, activation='relu'))
    model.add(Dropout(dropout))
    for _ in range(nLayers):
        # Hidden layers take their input size from the previous layer, so
        # no input_dim is passed here.
        model.add(Dense(node, activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(nClasses, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# Build a 2-class network sized to the number of feature columns, then fit
# it on the whole training table (no rows are held out in this cell).
n_features = features_train.shape[1]
model_DNN = DNN_model(n_features, 2)
model_DNN.fit(
    features_train,
    train.target,
    epochs=20,
    batch_size=128,
    verbose=2,
)

Epoch 1/20
 - 2s - loss: 1.0030 - accuracy: 0.5210
Epoch 2/20
 - 2s - loss: 0.7023 - accuracy: 0.5395
Epoch 3/20
 - 2s - loss: 0.6857 - accuracy: 0.5636
Epoch 4/20
 - 2s - loss: 0.6768 - accuracy: 0.5815
Epoch 5/20
 - 2s - loss: 0.6683 - accuracy: 0.5953
Epoch 6/20
 - 2s - loss: 0.6684 - accuracy: 0.5861
Epoch 7/20
 - 2s - loss: 0.6611 - accuracy: 0.6054
Epoch 8/20
 - 2s - loss: 0.6550 - accuracy: 0.6134
Epoch 9/20
 - 2s - loss: 0.6525 - accuracy: 0.6162
Epoch 10/20
 - 2s - loss: 0.6481 - accuracy: 0.6271
Epoch 11/20
 - 2s - loss: 0.6471 - accuracy: 0.6289
Epoch 12/20
 - 2s - loss: 0.6489 - accuracy: 0.6310
Epoch 13/20
 - 2s - loss: 0.6444 - accuracy: 0.6426
Epoch 14/20
 - 2s - loss: 0.6412 - accuracy: 0.6461
Epoch 15/20
 - 2s - loss: 0.6438 - accuracy: 0.6428
Epoch 16/20
 - 2s - loss: 0.6360 - accuracy: 0.6451
Epoch 17/20
 - 2s - loss: 0.6382 - accuracy: 0.6417
Epoch 18/20
 - 2s - loss: 0.6371 - accuracy: 0.6447
Epoch 19/20
 - 2s - loss: 0.6383 - accuracy: 0.6467
Epoch 20/20
 - 2s - l

<keras.callbacks.callbacks.History at 0x7fcf44d0cc50>

In [None]:
# Predicted class = index of the largest softmax probability per row.
# `Sequential.predict_classes` did exactly this but was removed in later
# Keras/TensorFlow releases, so take the argmax of `predict` explicitly.
prediction_dnn = np.argmax(model_DNN.predict(features_test), axis=-1)

In [None]:
# Write the neural-network submission: one row per test id with its
# predicted target.
submission_dnn = pd.DataFrame({'id': test['id'], 'target': prediction_dnn})
submission_dnn.to_csv(path_or_buf='features_nuevos_dnn.csv', index=False)

# RANDOM FOREST

GRID SEARCH

In [None]:
# Random forest tuned with a 5-fold cross-validated grid search. The seed is
# fixed so the selected hyperparameters and scores are reproducible.
rf = RandomForestClassifier(random_state=42)

# Only the number of trees is searched.
params_rf = {'n_estimators': [50, 100, 200]}

# A wider grid was drafted but is disabled; kept here for reference:
#   number of trees to use
#     n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
#   number of features considered at each split
#     max_features = ['auto', 'sqrt']
#   depth of each tree
#     max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
#     max_depth.append(None)
#   minimum number of samples required to perform a split
#     min_samples_split = [2, 5, 10]
#   minimum number of samples in each leaf
#     min_samples_leaf = [1, 2, 4]
#   how samples are selected to train each tree
#     bootstrap = [True, False]
#
#   params_rf = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}

rf_gs = GridSearchCV(rf, params_rf, cv=5)
rf_gs.fit(x_train, y_train)

In [None]:
# Report the winning grid point, then keep the estimator fitted with those
# parameters for scoring and prediction.
print(rf_gs.best_params_)
rf_best = rf_gs.best_estimator_

{'n_estimators': 200}


In [None]:
# Accuracy of the tuned random forest on the held-out validation split.
rf_validation_score = rf_best.score(x_validation, y_validation)
print('rf: {}'.format(rf_validation_score))

rf: 0.6672504378283712


In [None]:
# Predict test-set labels with the tuned random forest.
prediction_rf = rf_best.predict(features_test)

In [None]:
# Write the random-forest submission: one row per test id with its
# predicted target.
submission_rf = pd.DataFrame({'id': test['id'], 'target': prediction_rf})
submission_rf.to_csv(path_or_buf='doc2vec_rf.csv', index=False)

# NuSVC

Tardó muy poco en entrenar, pero la predicción es mala. Muy buena predicción solo con TF-IDF, sin más features.

In [None]:
# Fit a NuSVC (with probability estimates enabled) on the training split.
nuscv = NuSVC(probability=True)
nuscv.fit(x_train, y_train)

# Validation accuracy. The label was previously 'rf' (copied from the random
# forest cell), which mislabelled this score in the output.
print('nusvc: {}'.format(nuscv.score(x_validation, y_validation)))

rf: 0.5954465849387041


In [None]:
# Predict the test set with the fitted NuSVC and write the submission file:
# one row per test id with its predicted target.
prediction_nuscv = nuscv.predict(features_test)

submission_nuscv = pd.DataFrame({'id': test['id'], 'target': prediction_nuscv})
submission_nuscv.to_csv(path_or_buf='doc2vec_nuscv.csv', index=False)

# XGBoost

In [None]:
# Fit an XGBoost classifier with its default hyperparameters on the training
# split. `fit` is the last expression, so the cell displays the fitted
# estimator's parameters.
model = XGBClassifier()
model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# Score the XGBoost model on the held-out validation split: round each
# prediction to an integer, then compare with the true labels.
y_pred = model.predict(x_validation)
predictions = list(map(round, y_pred))
accuracy = accuracy_score(y_validation, predictions)
print("Accuracy: %.5f%%" % (accuracy * 100.0))

Accuracy: 67.60070%


In [None]:
# Predict test-set labels with the fitted XGBoost model. This step was
# missing: `prediction_xgboost` was used below without ever being defined,
# so the cell raised NameError on a fresh kernel.
prediction_xgboost = model.predict(features_test)

# Write the submission file: one row per test id with its predicted target.
submission_xgboost = pd.DataFrame(data={'id':test['id'], 'target': prediction_xgboost})
submission_xgboost.to_csv('tf_idf_xgboost.csv', index=False)