In [1]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import model_selection

from sklearn.datasets import fetch_20newsgroups
from keras.layers import  Dropout, Dense
from keras.models import Sequential

In [2]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [3]:
id='1Pk5MK9Hs_kMUT9NotGnOKE0NPra-39YU'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('train.csv')

In [None]:
train = pd.read_csv('train.csv')

In [5]:
id='1GsTM9oLtIV8-Da_fDOFWsQYMpgQ8GOYJ'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('test.csv')

In [6]:
test = pd.read_csv('test.csv')

In [7]:
id='1vtpcdE13KC-nCfeWP_4yVX205cMCF9U8'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('features_test_ft.csv')

In [8]:
features_test = pd.read_csv('features_test_ft.csv')

In [9]:
features_test.shape

(3263, 307)

In [10]:
id='10W1E8suoMsnUQDgR1zH2vKJliuwvLJ5Z'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('features_train_ft.csv')

In [11]:
features_train = pd.read_csv('features_train_ft.csv')

In [12]:
features_train.shape

(7613, 307)

In [13]:
y = train.target
x_train, x_validation, y_train, y_validation = train_test_split(features_train, y, test_size=0.3, stratify=y)

#KNN

Buscamos los mejores hiperparametros

In [58]:
knn = KNeighborsClassifier(10)
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

In [60]:
#BUSQUEDA DE HIPERPARAMETROS

params_knn = {'n_neighbors': np.arange(1, 25)}
knn_gs = GridSearchCV(knn, params_knn, cv=5)
knn_gs.fit(x_train, y_train)

knn_best = knn_gs.best_estimator_
print(knn_gs.best_params_)

print('knn score: {}'.format(knn_best.score(x_validation, y_validation)))

{'n_neighbors': 24}
knn score: 0.6541155866900175


In [61]:
prediction_knn = knn.predict(features_test)
submission_knn = pd.DataFrame(data={'id':test['id'], 'target': prediction_knn})
submission_knn.to_csv('ft_knn.csv', index=False)

#NAIVE BAYES CLASSIFICATION

No se puede prodar con esta codificacion, porque algunos de los valores de la matriz son negativos

In [None]:
from sklearn.model_selection import GridSearchCV

parameters = {'alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001)} 
model = MultinomialNB()
nb = GridSearchCV(model, parameters, n_jobs=-1, verbose=2)

nb.fit(x_train, y_train)
nb_best = nb.best_estimator_
print(nb.best_params_)

print('naive bayes score: {}'.format(nb_best.score(x_validation, y_validation)))

In [None]:
prediction_nb = nb_best.predict(features_test)

In [None]:
submission_nb = pd.DataFrame(data={'id':test['id'], 'target': prediction_nb})
submission_nb.to_csv('ft_nb.csv', index=False)

#RED NEURONAL PROFUNDA

In [18]:
def DNN_model(shape, nClasses, dropout=0.5):
    model = Sequential()
    node = 512
    nLayers = 4
    model.add(Dense(node,input_dim=shape,activation='relu'))
    model.add(Dropout(dropout))
    for i in range(0,nLayers):
        model.add(Dense(node,input_dim=node,activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(nClasses, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [19]:
model_DNN = DNN_model(features_train.shape[1], 2)
model_DNN.fit(features_train, train.target, epochs=10, batch_size=128, verbose=2)

Epoch 1/10
60/60 - 2s - loss: 0.8833 - accuracy: 0.5296
Epoch 2/10
60/60 - 2s - loss: 0.6991 - accuracy: 0.5516
Epoch 3/10
60/60 - 2s - loss: 0.6790 - accuracy: 0.5667
Epoch 4/10
60/60 - 2s - loss: 0.6706 - accuracy: 0.5865
Epoch 5/10
60/60 - 2s - loss: 0.6619 - accuracy: 0.5985
Epoch 6/10
60/60 - 2s - loss: 0.6532 - accuracy: 0.6159
Epoch 7/10
60/60 - 2s - loss: 0.6515 - accuracy: 0.6196
Epoch 8/10
60/60 - 2s - loss: 0.6498 - accuracy: 0.6195
Epoch 9/10
60/60 - 2s - loss: 0.6447 - accuracy: 0.6329
Epoch 10/10
60/60 - 2s - loss: 0.6427 - accuracy: 0.6375


<tensorflow.python.keras.callbacks.History at 0x7f98ee265eb8>

In [20]:
prediction_dnn = model_DNN.predict_classes(features_test)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [36]:
submission_dnn = pd.DataFrame(data={'id':test['id'], 'target': prediction_dnn})
submission_dnn.to_csv('ft_dnn.csv', index=False)

#RANDOM FOREST

GRID SEARCH

In [22]:
rf = RandomForestClassifier()
params_rf = {'n_estimators': [50, 100, 200]}
rf_gs = GridSearchCV(rf, params_rf, cv=5)
rf_gs.fit(x_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [23]:
rf_best = rf_gs.best_estimator_
print(rf_gs.best_params_)

{'n_estimators': 200}


In [24]:
print('rf: {}'.format(rf_best.score(x_validation, y_validation)))

rf: 0.6295971978984238


In [25]:
prediction_rf = rf_best.predict(features_test)

In [35]:
submission_rf = pd.DataFrame(data={'id':test['id'], 'target': prediction_rf})
submission_rf.to_csv('ft_rf.csv', index=False)

BAGGING

In [27]:
seed = 7
num_trees = 100

BAGGING CLASSIFIER

In [28]:
kfold = model_selection.KFold(n_splits=10)
cart = DecisionTreeClassifier()
model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, features_train, train.target, cv=kfold)
print(results.mean())

0.6195939863627429


In [31]:
model.fit(x_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort='deprecated',
                                                        random_state=None,


In [32]:
prediction_bagg = model.predict(features_test)

In [34]:
submission_bagg = pd.DataFrame(data={'id':test['id'], 'target': prediction_bagg})
submission_bagg.to_csv('ft_bagg.csv', index=False)

BAGGING CON RANDOM FOREST

In [38]:
max_features = 3
kfold = model_selection.KFold(n_splits=10)
model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features)
results = model_selection.cross_val_score(model, features_train, train.target, cv=kfold)
print(results.mean())

0.5793932213795221


In [41]:
model.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=3,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [42]:
prediction_rf_bagg = model.predict(features_test)

In [43]:
submission_rf_bagg = pd.DataFrame(data={'id':test['id'], 'target': prediction_rf_bagg})
submission_rf_bagg.to_csv('ft_rf_bagg.csv', index=False)

#NuSCV

In [57]:
nuscv = NuSVC(probability=True)
nuscv.fit(x_train, y_train)

print('rf: {}'.format(nuscv.score(x_validation, y_validation)))

prediction_nuscv = nuscv.predict(features_test)

submission_nuscv = pd.DataFrame(data={'id':test['id'], 'target': prediction_nuscv})
submission_nuscv.to_csv('ft_nuscv.csv', index=False)

rf: 0.5595446584938704


#XGBoost

In [53]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [51]:
model = XGBClassifier()
model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [54]:
kfold = KFold(n_splits = 8)
results = cross_val_score(model, x_validation, y_validation, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 63.53% (3.73%)


In [56]:
y_pred = model.predict(features_test)
predictions = [round(value) for value in y_pred]
submission = pd.DataFrame(data={'id':test['id'], 'target': predictions})
submission.to_csv('ft_xgboost.csv', index=False)