# <span style="color:#9c8f8f"> 75.06/95.58 Organización de Datos</span>
# <span style="color:#9c8f8f"> Análisis exploratorio: Real or Not? NLP with Disaster Tweets</span>

# <center>ALGORITMOS</center>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV,KFold
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn import metrics
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

import datetime

In [3]:
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
import lightgbm as lgb
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVR
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Funciones auxiliares

## Guardar prediccion

In [4]:
def save_prediction_accuracy(modelo,x_train,y_train,x_test,y_test):
    with open("prediction_history.csv", "a") as myfile:
        predicted = modelo.predict(x_train)
        train_prediction = str(metrics.accuracy_score(y_train,predicted))
        print('Score para x_train: '+ train_prediction)
        predicted = modelo.predict(x_test)
        test_prediction = str(metrics.accuracy_score(y_test,predicted))
        print('Score para x_test: '+ test_prediction)
        params = str(modelo)
        print('Hiperparametros: '+ str(modelo))
        myfile.write(params+','+test_prediction+","+train_prediction+","+str(datetime.datetime.now())+"\n")

In [5]:
def save_prediction(model, test, name):
    df_prediccion = test[['id']].copy()
    predicted = model.predict(test.drop(columns='id'))
    df_prediccion['target'] = predicted
    df_prediccion.to_csv('predictions/'+name+str(datetime.datetime.now())+'.csv', index=None)
    return

## Fit and predict

In [6]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    #errors = abs(predictions - test_labels)
    #mape = 100 * np.mean(errors / test_labels)
    #accuracy = 100 - mape
    accuracy = round(accuracy_score(test_labels,predictions)*100)
    print('Model Performance')
    #print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    return accuracy

In [7]:
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    """pretty print for confusion matrixes"""
    columnwidth = max([len(x) for x in labels] + [5])  # 5 is value length
    empty_cell = " " * columnwidth
    # Print header
    print("    " + empty_cell, end=" ")
    for label in labels:
        print("%{0}s".format(columnwidth) % label, end=" ")
    print()
    # Print rows
    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % label1, end=" ")
        for j in range(len(labels)):
            cell = "%{0}.1f".format(columnwidth) % cm[i, j]
            if hide_zeroes:
                cell = cell if float(cm[i, j]) != 0 else empty_cell
            if hide_diagonal:
                cell = cell if i != j else empty_cell
            if hide_threshold:
                cell = cell if cm[i, j] > hide_threshold else empty_cell
            print(cell, end=" ")
        print()

In [8]:
def fit_and_predict(model,X_train,y_train,X_test,y_test):    
    clf = model
    clf.fit(X_train, y_train)
    predictions=clf.predict(X_test)
    confusion_matrix(y_test,predictions)
    conf = metrics.confusion_matrix(y_test, predictions)
    conf = conf.astype('float') / conf.sum(axis=1)[:, np.newaxis]
    print_cm(conf, ['true','false'])
    print('-'*50)
    print(classification_report(y_test,predictions))
    print('-'*50)
    print("{}" .format(model))
    print('-'*50)
    print('Accuracy of classifier on training set:{}%'.format(round(clf.score(X_train, y_train)*100)))
    print('-'*50)
    print('Accuracy of classifier on test set:{}%' .format(round(accuracy_score(y_test,predictions)*100)))
    print('*'*50)

In [9]:
def fit_predict_models_sin_tune(X_test,X_train,y_test, y_train, models):
    
    for model in models:
        fit_and_predict(model,X_train, y_train,X_test,y_test)

## Limpiar features de df

In [10]:
dtype_train = {"id": np.int32, "keyword": "category", "target" : int}
train = pd.read_csv("original_data/train.csv", dtype = dtype_train, encoding='UTF_8')

target_train = train[["id","target"]]
    
def get_clean_values(df):
    
    if("target" in list(df.columns)):
        del df["target"]
    df = df.merge(target_train, how="inner", on="id")   
    
    X = df.copy().fillna(0)
    y = df["target"].copy()
    
    if("target" in list(X.columns)):
        del X['target']
    if("id" in list(X.columns)):
        del X['id']
    if("Unnamed: 0" in list(X.columns)):
        del X["Unnamed: 0"]
    
    return X,y

## Grid search con parametros tuenados

In [11]:
def grid_search_models(models, parameters, classiffiers, best_models_params, X_train, y_train):
    
    for key in classiffiers:
        
        clf = modelos[key]
        params = parameters[key]
        
        best_clf = RandomizedSearchCV(estimator = clf, param_distributions = params, n_iter = 10, cv = 3, verbose=2, random_state=42, n_jobs = -1)
        best_clf.fit(X_train, y_train)
        
        print("Model: ", key)
        evaluate(best_clf, X_test, y_test)
        print()
        
        best_models_params[key] = best_clf.best_params_

## Hyperparametros de modelos

In [12]:
models_sin_tune = [LogisticRegression(C=1.0, max_iter=1000),SVC(),MultinomialNB(),DecisionTreeClassifier(),
        KNeighborsClassifier(n_neighbors=5),RandomForestClassifier(),GaussianNB(),SGDClassifier(loss="hinge", penalty="l2", max_iter=500)]

In [13]:
params_XGB = {
    "criterion": ["entropy","gini"],
    'n_estimators': np.arange(10,2000,200),
    'max_features': ['auto', 'sqrt','log2'],
    'max_depth': np.linspace(1, 500, 10, endpoint=True),
    'min_samples_split': np.linspace(0.0, 1.0, 10, endpoint=True),
    'min_samples_leaf': np.arange(0.0, 1,0.05),
    'objective': ['binary:logistic'],
    'learning_rate':np.arange(0.1,0.5,0.1),
    'gamma':np.arange(0,0.5,0.1),
    'subsample':np.arange(0.6,1,0.1),'colsample_bytree':np.arange(0.6,0.91,0.05),
    'colsample_bylevel':np.arange(0.6,0.91,0.05),
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100],
    'tree_method':['auto', 'exact', 'approx', 'hist', 'gpu_hist']
}


params_LGBM ={
    'objective': ['binary'],
    'num_leaves': np.arange(25,70,4),
    'learning_rate':[0.005,0.01,0.05,0.1,0.3],
    'n_estimators': np.arange(25,200,15),
    'max_depth': np.arange(5,13,1),
    'min_split_gain': [0.001,0.01,0.1,0.2],
    'bagging_fraction': np.arange(0.8,1.01,0.1),
    'feature_fraction': np.arange(0.1,0.91,0.2)
}


params_RF = {
    "criterion": ["entropy","gini"],
    'n_estimators': np.arange(10,2000,200),
    'max_features': ['auto', 'sqrt','log2'],
    'max_depth': np.linspace(1, 500, 10, endpoint=True),
    'min_samples_split': np.linspace(0.0, 1.0, 10, endpoint=True),
    'min_samples_leaf': np.arange(1, 10,1),
}

params_DT = {
    "criterion": ["entropy","gini"],
    'max_features': ['auto', 'sqrt','log2'],
    'max_depth': np.linspace(1, 500, 10, endpoint=True),
    'min_samples_split': np.linspace(0.0, 1.0, 10, endpoint=True),
    'min_samples_leaf': np.arange(0.0, 1,0.05),
}

params_ET = {
    "criterion": ["entropy","gini"],
    'n_estimators': np.arange(10,2000,200),
    'max_features': ['auto', 'sqrt','log2'],
    'max_depth': np.linspace(1, 500, 10, endpoint=True),
    'min_samples_split': np.linspace(0.0, 1.0, 10, endpoint=True),
    'min_samples_leaf': np.arange(0.0, 1,0.05),
}

params_MLPC = {'hidden_layer_sizes':[(n,n,n) for n in range(1,30)],
                              'activation':['identity', 'logistic', 'tanh', 'relu'],
                              'alpha':[1e-06,1e-05,1e-04,1e-03,1e-02,1e-01,1],
                              'beta_1':np.arange(0.0,1,0.01),
                              'beta_2':np.arange(0.0,1,0.01),
                              'early_stopping':[True, False],
                              'epsilon':[1e-07,1e-08,1e-06, 1e-09, 1e-010, 1e-11],
                              'learning_rate':['constant', 'adaptive'],
                              'solver':['adam', 'lbfgs', 'sgd'],
                              'validation_fraction':np.arange(0.15,0.5,0.01),
                               'max_iter':[200,300,400],
                             }

params_GBC = {
    'max_features':['auto', 'sqrt', 'log2'],
                             'max_leaf_nodes': [None,1,2,3,4,5,6,8],
                             'min_weight_fraction_leaf': np.linspace(0.0, 0.5, 1, endpoint=True),
                             'learning_rate': np.arange(0.1, 0.5, 0.05),
                             'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
                             'min_samples_leaf': np.arange(0.1, 0.5, 5),  
                             'max_features' : list(range(1,100)),
                             'max_depth': [n for n in range(0,50,5)],
                             'n_estimators': [n for n in range(0,1000,10)],
                             'subsample': np.arange(0.3, 1,0.1),
                             'loss': ['deviance'],
                             'warm_start': [True, False],
                             'presort': ['auto'],

}

params_LR = {
    'penalty':['l1', 'l2', 'elasticnet'],
    'C': np.arange(0.5,0.8,20),
    'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'multi_class': ['auto', 'ovr', 'multinomial'],
}

In [14]:
modelos = {
    "XGB": XGBClassifier(),
    "LBGM": LGBMClassifier(), 
    "RF": RandomForestClassifier(),
    "ET": ExtraTreesClassifier(), 
    "DT": DecisionTreeClassifier(),
    "MLPC": MLPClassifier(),
    "GBC": GradientBoostingClassifier(),
    "LR": LogisticRegression()
}

parametros = {
    "XGB": params_XGB,
    "LBGM": params_LGBM,
    "RF": params_RF,
    "ET": params_ET,
    "DT": params_DT,
    "MLPC": params_MLPC,
    "GBC": params_GBC,
    "LR": params_LR
}


# Modelado para distintos features

## features de bow limpieza profunda de texto

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [16]:
# CARGO FEATURES

df = pd.read_csv("features/feature_selection/features_cleaned_text_train.csv")

X,y = get_clean_values(df)

corpus = list(X['text'].fillna(''))
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(corpus).todense()


#Split the data into train and test datasets for model training and testing
X_train, X_test, y_train, y_test =train_test_split(bow,y, test_size=0.2,random_state=2020)

In [29]:
# PUEBO ALGORITMOS SIN TUNEAR

fit_predict_models_sin_tune(X_test, X_train, y_test, y_train, models_sin_tune)

           true false 
     true   0.9   0.1 
    false   0.3   0.7 
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.79      0.91      0.85       849
           1       0.86      0.69      0.77       674

    accuracy                           0.82      1523
   macro avg       0.83      0.80      0.81      1523
weighted avg       0.82      0.82      0.81      1523

--------------------------------------------------
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
--------------------------------------------------
Accuracy of classifier on training set:97.0%
--------------------------------------------------
Accuracy of classifier on te

In [None]:
# PRUEBO CIERTOS ALGORITMOS TUNEADOS

classiffiers = [ "ET", "DT", "RF", "LBGM","MLPC", "GBC", 'LR']
best_params_Text = {}
grid_search_models(modelos, parametros, classiffiers, best_params_Text, X_train, y_train)

## features seleccionados con Decision Tree

In [16]:
# CARGO FEATURES

df = pd.read_csv("features/feature_selection/features_DecisionTreeClassiffier_BOW_SinTag_train.csv")
X,y = get_clean_values(df)

#Split the data into train and test datasets for model training and testing
X_train, X_test, y_train, y_test =train_test_split(X,y,test_size=0.2,random_state=2020)

In [17]:
# PUEBO ALGORITMOS SIN TUNEAR

fit_predict_models_sin_tune(X_test, X_train, y_test, y_train, models_sin_tune)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


           true false 
     true   0.8   0.2 
    false   0.4   0.6 
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.73      0.83      0.78       849
           1       0.75      0.62      0.68       674

    accuracy                           0.74      1523
   macro avg       0.74      0.73      0.73      1523
weighted avg       0.74      0.74      0.74      1523

--------------------------------------------------
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
--------------------------------------------------
Accuracy of classifier on training set:76.0%
--------------------------------------------------
Accuracy of classifier on te

In [30]:
# PRUEBO CIERTOS ALGORITMOS TUNEADOS

classiffiers = [ "ET", "DT", "RF", "LBGM","MLPC", "GBC", 'LR']
best_params_DT = {}
grid_search_models(modelos, parametros, classiffiers, best_params_DT, X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   15.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Model:  ET
Model Performance
Accuracy = 56.00%.

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:    0.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Model:  DT
Model Performance
Accuracy = 62.00%.

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:   17.5s remaining:    5.3s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   18.8s finished


Model:  RF
Model Performance
Accuracy = 70.00%.

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    7.0s finished


Model:  LBGM
Model Performance
Accuracy = 76.00%.

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   51.0s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Model:  MLPC
Model Performance
Accuracy = 74.00%.

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    8.8s finished


Model:  GBC
Model Performance
Accuracy = 73.00%.

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:   12.0s remaining:    3.6s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   14.0s finished


Model:  LR
Model Performance
Accuracy = 74.00%.



## features seleccionados con Random Forest

In [16]:
# CARGO FEATURES

df = pd.read_csv("features/feature_selection/features_RandoForest_train.csv")
X,y = get_clean_values(df)

#Split the data into train and test datasets for model training and testing
X_train, X_test, y_train, y_test =train_test_split(X, y,test_size=0.2,random_state=2020)


In [32]:
# PUEBO ALGORITMOS SIN TUNEAR

fit_predict_models_sin_tune(X_test, X_train, y_test, y_train, models_sin_tune)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


           true false 
     true   0.8   0.2 
    false   0.4   0.6 
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.72      0.84      0.78       849
           1       0.75      0.58      0.65       674

    accuracy                           0.73      1523
   macro avg       0.73      0.71      0.71      1523
weighted avg       0.73      0.73      0.72      1523

--------------------------------------------------
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
--------------------------------------------------
Accuracy of classifier on training set:77.0%
--------------------------------------------------
Accuracy of classifier on te

In [17]:
# PRUEBO CIERTOS ALGORITMOS TUNEADOS

classiffiers = [ "ET", "DT", "RF", "LBGM","MLPC", "GBC", 'LR']
best_params_RF = {}
grid_search_models(modelos, parametros, classiffiers, best_params_RF, X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   50.5s finished


Model:  ET
Model Performance
Accuracy = 65.00%.

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:    0.8s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    1.0s finished


Model:  DT
Model Performance
Accuracy = 56.00%.

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   23.5s finished


Model:  RF
Model Performance
Accuracy = 70.00%.

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    6.4s finished


Model:  LBGM
Model Performance
Accuracy = 76.00%.

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   52.0s finished


Model:  MLPC
Model Performance
Accuracy = 75.00%.

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   17.3s finished


Model:  GBC
Model Performance
Accuracy = 70.00%.

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:   11.6s remaining:    3.5s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   13.6s finished


Model:  LR
Model Performance
Accuracy = 73.00%.



## features seleccionados con Extra Trees

In [None]:
# CARGO FEATURES

df = pd.read_csv("features/feature_selection/features_ExtraTreesClassifier_BOW_SinTag_train.csv")
X,y = get_clean_values(df)

#Split the data into train and test datasets for model training and testing
X_train, X_test, y_train, y_test =train_test_split(X, y, test_size=0.2,random_state=2020)


In [None]:
# PUEBO ALGORITMOS SIN TUNEAR

fit_predict_models_sin_tune(X_test, X_train, y_test, y_train, models_sin_tune)

In [None]:
# PRUEBO CIERTOS ALGORITMOS TUNEADOS

classiffiers = [ "ET", "DT", "RF", "LBGM","MLPC", "GBC", 'LR']
best_params_ET = {}
grid_search_models(modelos, parameteros, classiffiers, best_params_ET, X_train, y_train)