### Import and Setup Packages

In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import time

from sklearn import linear_model, naive_bayes, metrics, svm, ensemble
from sklearn.metrics import accuracy_score, confusion_matrix, cohen_kappa_score, precision_score, matthews_corrcoef, roc_auc_score, balanced_accuracy_score, recall_score, f1_score, make_scorer
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate

from xgboost import XGBClassifier

from tqdm import tqdm

# ---- Call tqdm to see progress bar with pandas
tqdm().pandas()

import warnings
warnings.filterwarnings('ignore')

0it [00:00, ?it/s]


In [2]:
# List of paramters for the notebook, choose option like and model to run
save_results           = True
lang                   = False
sample                 = False
multinomial_naive_bayes= True
logistic_regression    = True
svm_model              = True
k_nn_model             = True
sgd                    = True
random_forest          = True
gradient_boosting      = True
xgboost_classifier     = True
shallow_network        = True
deep_nn                = True
rnn                    = True
lstm                   = True
cnn                    = True
gru                    = True
cnn_lstm               = True
cnn_gru                = True
bidirectional_rnn      = True
bidirectional_lstm     = True
bidirectional_gru      = True
rcnn                   = True
pre_trained            = True

### Import Data

In [3]:
tweet_x_y = {}
news_x_y = {}

with open('/kaggle/input/sarcasm-preprocessed/tweet_x_values.pkl', 'rb') as f:
    tweet_x_y.update({"xtrain_count" : pkl.load(f)})
    tweet_x_y.update({"xtrain_tfidf" : pkl.load(f)})
    tweet_x_y.update({"xtrain_tfidf_ngram" : pkl.load(f)})
    tweet_x_y.update({"xtrain_tfidf_ngram_chars" : pkl.load(f)})
    tweet_x_y.update({"train_seq_x" : pkl.load(f)})
    tweet_x_y.update({"xvalid_count" : pkl.load(f)})
    tweet_x_y.update({"xvalid_tfidf" : pkl.load(f)})
    tweet_x_y.update({"xvalid_tfidf_ngram" : pkl.load(f)})
    tweet_x_y.update({"xvalid_tfidf_ngram_chars" : pkl.load(f)})
    tweet_x_y.update({"valid_seq_x" : pkl.load(f)})
    
with open('/kaggle/input/sarcasm-preprocessed/tweet_y_values.pkl', 'rb') as f:
    tweet_x_y.update({"train_y_sw" : pkl.load(f)})
    tweet_x_y.update({"train_y" : pkl.load(f)})
    tweet_x_y.update({"valid_y_sw" : pkl.load(f)})
    tweet_x_y.update({"valid_y" : pkl.load(f)})
    
with open('/kaggle/input/sarcasm-preprocessed/news_x_values.pkl', 'rb') as f:
    news_x_y.update({"xtrain_count" : pkl.load(f)})
    news_x_y.update({"xtrain_tfidf" : pkl.load(f)})
    news_x_y.update({"xtrain_tfidf_ngram" : pkl.load(f)})
    news_x_y.update({"xtrain_tfidf_ngram_chars" : pkl.load(f)})
    news_x_y.update({"train_seq_x" : pkl.load(f)})
    news_x_y.update({"xvalid_count" : pkl.load(f)})
    news_x_y.update({"xvalid_tfidf" : pkl.load(f)})
    news_x_y.update({"xvalid_tfidf_ngram" : pkl.load(f)})
    news_x_y.update({"xvalid_tfidf_ngram_chars" : pkl.load(f)})
    news_x_y.update({"valid_seq_x" : pkl.load(f)})
    
with open('/kaggle/input/sarcasm-preprocessed/news_y_values.pkl', 'rb') as f:
    news_x_y.update({"train_y_sw" : pkl.load(f)})
    news_x_y.update({"train_y" : pkl.load(f)})
    news_x_y.update({"valid_y_sw" : pkl.load(f)})
    news_x_y.update({"valid_y" : pkl.load(f)})

### Functions and Data Structures 

In [4]:
tweet_results = pd.DataFrame()
news_results = pd.DataFrame()

In [5]:
def report(clf, x, y, name='classifier', cv=5, dict_scoring=None, fit_params=None):
    #print(dict_scoring)
    if dict_scoring!=None:
        score = dict_scoring.copy()
        for i in score.keys():
            score[i] = make_scorer(score[i])
    
    #if clf==XGBClassifier():
    scores = cross_validate(clf, x, y, scoring=score,
                         cv=cv, return_train_score=False, n_jobs=-1,  fit_params=fit_params)
    
    index = []
    value = []
    index.append("Model")
    value.append(name)
    for i in scores:
        if i == "estimator":
            continue
        for j in enumerate(scores[i]):
            index.append(i+"_cv"+str(j[0]+1))
            value.append(j[1])
        #if any(x in i for x in scoring.keys()):
        
        index.append(i+"_mean")
        value.append(np.mean(scores[i]))
        index.append(i+"_std")
        value.append(np.std(scores[i]))
        
    return pd.DataFrame(data=value, index=index).T

In [6]:
def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0]
def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1]
def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0]
def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 1]

score_metrics = {'acc': accuracy_score,
               'balanced_accuracy': balanced_accuracy_score,
               'prec': precision_score,
               'recall': recall_score,
               'f1-score': f1_score,
               'tp': tp, 'tn': tn,
               'fp': fp, 'fn': fn,
               'cohens_kappa':cohen_kappa_score,
               'matthews_corrcoef':matthews_corrcoef,
               "roc_auc":roc_auc_score}

### Machine Learning Models

#### Naive Bayes

In [7]:
def NB(dict,df_results):
    if multinomial_naive_bayes:
        df_results = df_results.append(report(naive_bayes.MultinomialNB(), dict['xtrain_count'],dict['train_y_sw'], name='NB_Count_Vectors', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(naive_bayes.MultinomialNB(), dict['xtrain_tfidf'],dict['train_y'], name='NB_WordLevel_TF-IDF', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(naive_bayes.MultinomialNB(), dict['xtrain_tfidf_ngram'],dict['train_y'], name='NB_N-Gram_TF-IDF', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(naive_bayes.MultinomialNB(), dict['xtrain_tfidf_ngram_chars'],dict['train_y'], name='NB_CharLevel_TF-IDF', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(naive_bayes.MultinomialNB(), dict['train_seq_x'],dict['train_y'], name='NB_Words', cv=5, dict_scoring=score_metrics))
        return df_results

In [8]:
start = time.process_time()
 
tweet_results = NB(tweet_x_y, tweet_results) 

end = time.process_time()

print(f"Elapsed time : {(end - start):.03f} secs.")


Elapsed time : 0.298 secs.


In [9]:
start = time.process_time()

news_results = NB(news_x_y, news_results)

end = time.process_time()

print(f"Elapsed time : {(end - start):.03f} secs.")

Elapsed time : 0.301 secs.


#### Logistic Regression

In [10]:
def LR(dict,df_results):
    if logistic_regression:
        df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), dict['xtrain_count'],dict['train_y_sw'], name='LR_Count_Vectors', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), dict['xtrain_tfidf'],dict['train_y'], name='LR_WordLevel_TF-IDF', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), dict['xtrain_tfidf_ngram'],dict['train_y'], name='LR_N-Gram_TF-IDF', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), dict['xtrain_tfidf_ngram_chars'],dict['train_y'], name='LR_CharLevel_TF-IDF', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), dict['train_seq_x'],dict['train_y'], name='LR_Words', cv=5, dict_scoring=score_metrics))
        return df_results

In [11]:
start = time.process_time()

tweet_results = LR(tweet_x_y, tweet_results)

end = time.process_time()

print(f"Elapsed time : {(end - start):.03f} secs.")

Elapsed time : 0.179 secs.


In [12]:
start = time.process_time()

news_results = LR(news_x_y, news_results)

end = time.process_time()

print(f"Elapsed time : {(end - start):.03f} secs.")

Elapsed time : 0.268 secs.


#### Support Vector Machine

In [13]:
def support_vector(dict,df_results):
    if svm_model:
        df_results = df_results.append(report(svm.SVC(), dict['xtrain_count'],dict['train_y_sw'], name='SVM_Count_Vectors', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(svm.SVC(), dict['xtrain_tfidf'],dict['train_y'], name='SVM_WordLevel_TF-IDF', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(svm.SVC(), dict['xtrain_tfidf_ngram'],dict['train_y'], name='SVM_N-Gram_TF-IDF', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(svm.SVC(), dict['xtrain_tfidf_ngram_chars'],dict['train_y'], name='SVM_CharLevel_TF-IDF', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(svm.SVC(), dict['train_seq_x'],dict['train_y'], name='SVM_Words', cv=5, dict_scoring=score_metrics))
        return df_results

In [14]:
start = time.process_time()

tweet_results = support_vector(tweet_x_y, tweet_results)

end = time.process_time()

print(f"Elapsed time : {(end - start):.03f} secs.")

Elapsed time : 0.187 secs.


In [15]:
start = time.process_time()

news_results = support_vector(news_x_y, news_results)

end = time.process_time()

print(f"Elapsed time : {(end - start):.03f} secs.")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Inc

Elapsed time : 0.998 secs.


#### Kth Nearest Neighbor

In [16]:
def knn(dict,df_results):
    if k_nn_model:
        df_results = df_results.append(report(KNeighborsClassifier(n_neighbors=20, weights='distance', n_jobs=-1), dict['xtrain_count'],dict['train_y_sw'], name='kNN_Count_Vectors', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(KNeighborsClassifier(n_neighbors=20, weights='distance', n_jobs=-1), dict['xtrain_tfidf'],dict['train_y'], name='kNN_WordLevel_TF-IDF', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(KNeighborsClassifier(n_neighbors=20, weights='distance', n_jobs=-1), dict['xtrain_tfidf_ngram'],dict['train_y'], name='kNN_N-Gram_TF-IDF', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(KNeighborsClassifier(n_neighbors=20, weights='distance', n_jobs=-1), dict['xtrain_tfidf_ngram_chars'],dict['train_y'], name='kNN_CharLevel_TF-IDF', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(KNeighborsClassifier(n_neighbors=20, weights='distance', n_jobs=-1), dict['train_seq_x'],dict['train_y'], name='kNN_Words', cv=5, dict_scoring=score_metrics))
        return df_results

In [17]:
start = time.process_time()

tweet_results = knn(tweet_x_y, tweet_results)

end = time.process_time()

print(f"Elapsed time : {(end - start):.03f} secs.")

Elapsed time : 0.186 secs.


In [18]:
start = time.process_time()

news_results = knn(news_x_y, news_results)

end = time.process_time()

print(f"Elapsed time : {(end - start):.03f} secs.")

Elapsed time : 0.303 secs.


#### Random Forest

In [19]:
def random_forest(dict,df_results):
    if random_forest:
        df_results = df_results.append(report(ensemble.RandomForestClassifier(bootstrap=True,min_impurity_decrease=1e-7,n_jobs=-1, random_state=42), dict['xtrain_count'],dict['train_y_sw'], name='RF_Count_Vectors', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(ensemble.RandomForestClassifier(bootstrap=True,min_impurity_decrease=1e-7,n_jobs=-1, random_state=42), dict['xtrain_tfidf'],dict['train_y'], name='RF_WordLevel_TF-IDF', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(ensemble.RandomForestClassifier(bootstrap=True,min_impurity_decrease=1e-7,n_jobs=-1, random_state=42), dict['xtrain_tfidf_ngram'],dict['train_y'], name='RF_N-Gram_TF-IDF', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(ensemble.RandomForestClassifier(bootstrap=True,min_impurity_decrease=1e-7,n_jobs=-1, random_state=42), dict['xtrain_tfidf_ngram_chars'],dict['train_y'], name='RF_CharLevel_TF-IDF', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(ensemble.RandomForestClassifier(bootstrap=True,min_impurity_decrease=1e-7,n_jobs=-1, random_state=42), dict['train_seq_x'],dict['train_y'], name='RF_Words', cv=5, dict_scoring=score_metrics))
        return df_results

In [20]:
start = time.process_time()

tweet_results = random_forest(tweet_x_y, tweet_results)

end = time.process_time()

print(f"Elapsed time : {(end - start):.03f} secs.")

Elapsed time : 0.202 secs.


In [21]:
start = time.process_time()

news_results = random_forest(news_x_y, news_results)

end = time.process_time()

print(f"Elapsed time : {(end - start):.03f} secs.")

Elapsed time : 0.385 secs.


#### Stochastic Gradient Descent

In [22]:
def sgd(dict,df_results):
    if sgd:
        df_results = df_results.append(report(SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3,   n_iter_no_change=10, early_stopping=True, n_jobs=-1 ), dict['xtrain_count'],dict['train_y_sw'], name='SGD_Count_Vectors', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3,   n_iter_no_change=10, early_stopping=True, n_jobs=-1 ), dict['xtrain_tfidf'],dict['train_y'], name='SGD_WordLevel_TF-IDF', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3,   n_iter_no_change=10, early_stopping=True, n_jobs=-1 ), dict['xtrain_tfidf_ngram'],dict['train_y'], name='SGD_N-Gram_Vectors', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3,   n_iter_no_change=10, early_stopping=True, n_jobs=-1 ), dict['xtrain_tfidf_ngram_chars'],dict['train_y'], name='SGD_CharLevel_Vectors', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3,   n_iter_no_change=10, early_stopping=True, n_jobs=-1 ), dict['train_seq_x'],dict['train_y'], name='SGD_Words', cv=5, dict_scoring=score_metrics))
        return df_results

In [23]:
start = time.process_time()

tweet_results = sgd(tweet_x_y, tweet_results)

end = time.process_time()

print(f"Elapsed time : {(end - start):.03f} secs.")

Elapsed time : 0.198 secs.


In [24]:
start = time.process_time()

news_results = sgd(news_x_y, news_results)

end = time.process_time()

print(f"Elapsed time : {(end - start):.03f} secs.")

Elapsed time : 0.268 secs.


#### Gradient Boosting

In [25]:
def gradient_boosting(dict,df_results):
    if gradient_boosting:
        df_results = df_results.append(report(ensemble.GradientBoostingClassifier(n_estimators=1000,
                                                   validation_fraction=0.2,
                                                   n_iter_no_change=10, tol=0.01,
                                                   random_state=0, verbose=0 ), dict['xtrain_count'],dict['train_y_sw'], name='GB_Count_Vectors', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(ensemble.GradientBoostingClassifier(n_estimators=1000,
                                                   validation_fraction=0.2,
                                                   n_iter_no_change=10, tol=0.01,
                                                   random_state=0, verbose=0 ), dict['xtrain_tfidf'],dict['train_y'], name='GB_WordLevel_TF-IDF', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(ensemble.GradientBoostingClassifier(n_estimators=1000,
                                                   validation_fraction=0.2,
                                                   n_iter_no_change=10, tol=0.01,
                                                   random_state=0, verbose=0 ), dict['xtrain_tfidf_ngram'],dict['train_y'], name='GB_N-Gram_TF-IDF', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(ensemble.GradientBoostingClassifier(n_estimators=1000,
                                                   validation_fraction=0.2,
                                                   n_iter_no_change=10, tol=0.01,
                                                   random_state=0, verbose=0 ), dict['xtrain_tfidf_ngram_chars'],dict['train_y'], name='GB_CharLevel_TF-IDF', cv=5, dict_scoring=score_metrics))
        df_results = df_results.append(report(ensemble.GradientBoostingClassifier(n_estimators=1000,
                                                   validation_fraction=0.2,
                                                   n_iter_no_change=10, tol=0.01,
                                                   random_state=0, verbose=0 ), dict['train_seq_x'],dict['train_y'], name='GB_Words', cv=5, dict_scoring=score_metrics))
        
        return df_results

In [26]:
start = time.process_time()

tweet_results = gradient_boosting(tweet_x_y, tweet_results)

end = time.process_time()

print(f"Elapsed time : {(end - start):.03f} secs.")

Elapsed time : 0.181 secs.


In [27]:
start = time.process_time()

news_results = gradient_boosting(news_x_y, news_results)

end = time.process_time()

print(f"Elapsed time : {(end - start):.03f} secs.")

Elapsed time : 0.299 secs.


#### XG Boosting

In [28]:
def xg_boosting(dict,df_results):
    if xgboost_classifier:
        fit_params1={'early_stopping_rounds':10,\
                             'eval_set':[(dict['xvalid_count'],dict['valid_y_sw'])]}
        df_results = df_results.append(report(XGBClassifier(n_estimators=1000, subsample=0.8), dict['xtrain_count'],dict['train_y_sw'], name='XGB_Count_Vectors', cv=5, fit_params=fit_params1, dict_scoring=score_metrics))

        fit_params2={'early_stopping_rounds':10,\
                             'eval_set':[(dict['xvalid_tfidf'],dict['valid_y'])]}
        df_results = df_results.append(report(XGBClassifier(n_estimators=1000, subsample=0.8), dict['xtrain_tfidf'],dict['train_y'], name='XGB_WordLevel_TF-IDF', cv=5, fit_params=fit_params2, dict_scoring=score_metrics))

        fit_params3={'early_stopping_rounds':10,\
                             'eval_set':[(dict['xvalid_tfidf_ngram'],dict['valid_y'])]}
        df_results = df_results.append(report(XGBClassifier(n_estimators=1000, subsample=0.8), dict['xtrain_tfidf_ngram'],dict['train_y'], name='XGB_N-Gram_TF-IDF', cv=5, fit_params=fit_params3, dict_scoring=score_metrics))

        fit_params4={'early_stopping_rounds':10,\
                             'eval_set':[(dict['xvalid_tfidf_ngram_chars'],dict['valid_y'])]}
        df_results = df_results.append(report(XGBClassifier(n_estimators=1000, subsample=0.8), dict['xtrain_tfidf_ngram_chars'],dict['train_y'], name='XGB_CharLevel_TF-IDF', cv=5, fit_params=fit_params4, dict_scoring=score_metrics))

        fit_params5={'early_stopping_rounds':10,\
                             'eval_set':[(dict['valid_seq_x'],dict['valid_y'])]}
        df_results = df_results.append(report(XGBClassifier(n_estimators=1000, subsample=0.8), dict['train_seq_x'],dict['train_y'], name='XGB_Words', cv=5, fit_params=fit_params5, dict_scoring=score_metrics))
        
        return df_results

In [29]:
start = time.process_time()

tweet_results = xg_boosting(tweet_x_y, tweet_results)

end = time.process_time()

print(f"Elapsed time : {(end - start):.03f} secs.")

[0]	validation_0-logloss:0.62492
[0]	validation_0-logloss:0.62612
[0]	validation_0-logloss:0.62809
[0]	validation_0-logloss:0.62318
[1]	validation_0-logloss:0.58909
[1]	validation_0-logloss:0.59302
[1]	validation_0-logloss:0.59075
[1]	validation_0-logloss:0.59512
[2]	validation_0-logloss:0.57301
[2]	validation_0-logloss:0.57625
[2]	validation_0-logloss:0.57857
[2]	validation_0-logloss:0.57518
[3]	validation_0-logloss:0.56504
[3]	validation_0-logloss:0.55915
[3]	validation_0-logloss:0.56955
[3]	validation_0-logloss:0.56447
[4]	validation_0-logloss:0.56027
[4]	validation_0-logloss:0.55339
[4]	validation_0-logloss:0.56587
[4]	validation_0-logloss:0.56028
[5]	validation_0-logloss:0.55827
[5]	validation_0-logloss:0.55177
[5]	validation_0-logloss:0.56372
[5]	validation_0-logloss:0.55828
[6]	validation_0-logloss:0.55774
[6]	validation_0-logloss:0.55224
[6]	validation_0-logloss:0.56344
[6]	validation_0-logloss:0.55722
[7]	validation_0-logloss:0.55561
[7]	validation_0-logloss:0.55122
[7]	valida

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[5]	validation_0-logloss:0.55794
[6]	validation_0-logloss:0.55866
[7]	validation_0-logloss:0.55776
[8]	validation_0-logloss:0.55772
[9]	validation_0-logloss:0.55831
[10]	validation_0-logloss:0.55665
[11]	validation_0-logloss:0.55701
[12]	validation_0-logloss:0.55650
[13]	validation_0-logloss:0.55588
[14]	validation_0-logloss:0.55569
[15]	validation_0-logloss:0.55644
[16]	validation_0-logloss:0.56002
[17]	validation_0-logloss:0.56187
[18]	validation_0-logloss:0.56219


  _warn_prf(average, modifier, msg_start, len(result))


[19]	validation_0-logloss:0.56264
[20]	validation_0-logloss:0.56214
[21]	validation_0-logloss:0.56316
[22]	validation_0-logloss:0.56123
[23]	validation_0-logloss:0.56092
[0]	validation_0-logloss:0.62544
[1]	validation_0-logloss:0.59187
[0]	validation_0-logloss:0.62546
[2]	validation_0-logloss:0.57801
[1]	validation_0-logloss:0.59139
[3]	validation_0-logloss:0.56956
[2]	validation_0-logloss:0.57550
[4]	validation_0-logloss:0.56362
[3]	validation_0-logloss:0.56763
[5]	validation_0-logloss:0.56224
[4]	validation_0-logloss:0.56301
[6]	validation_0-logloss:0.56089
[5]	validation_0-logloss:0.56316
[7]	validation_0-logloss:0.55834
[6]	validation_0-logloss:0.56344
[8]	validation_0-logloss:0.56028
[7]	validation_0-logloss:0.56274
[9]	validation_0-logloss:0.55933
[8]	validation_0-logloss:0.56155
[10]	validation_0-logloss:0.56218
[9]	validation_0-logloss:0.56395
[11]	validation_0-logloss:0.56471
[10]	validation_0-logloss:0.56502
[12]	validation_0-logloss:0.56408
[11]	validation_0-logloss:0.56504


In [30]:
start = time.process_time()

news_results = xg_boosting(news_x_y, news_results)

end = time.process_time()

print(f"Elapsed time : {(end - start):.03f} secs.")

[0]	validation_0-logloss:0.67369
[0]	validation_0-logloss:0.67375
[0]	validation_0-logloss:0.67395
[0]	validation_0-logloss:0.67358
[1]	validation_0-logloss:0.66215
[1]	validation_0-logloss:0.66266
[1]	validation_0-logloss:0.66305
[1]	validation_0-logloss:0.66223
[2]	validation_0-logloss:0.65437
[2]	validation_0-logloss:0.65506
[2]	validation_0-logloss:0.65637
[2]	validation_0-logloss:0.65548
[3]	validation_0-logloss:0.64992
[3]	validation_0-logloss:0.64946
[3]	validation_0-logloss:0.65073
[3]	validation_0-logloss:0.64978
[4]	validation_0-logloss:0.64564
[4]	validation_0-logloss:0.64514
[4]	validation_0-logloss:0.64666
[4]	validation_0-logloss:0.64544
[5]	validation_0-logloss:0.64176
[5]	validation_0-logloss:0.64105
[5]	validation_0-logloss:0.64092
[5]	validation_0-logloss:0.64192
[6]	validation_0-logloss:0.63780
[6]	validation_0-logloss:0.63786
[6]	validation_0-logloss:0.63841
[6]	validation_0-logloss:0.63803
[7]	validation_0-logloss:0.63404
[7]	validation_0-logloss:0.63522
[7]	valida

### Results

In [31]:
tweet_results[["Model","test_acc_mean", "test_prec_mean", "test_recall_mean", "test_f1-score_mean", "test_cohens_kappa_mean"]].sort_values(by=["test_prec_mean"], ascending=False)

Unnamed: 0,Model,test_acc_mean,test_prec_mean,test_recall_mean,test_f1-score_mean,test_cohens_kappa_mean
0,GB_Words,0.908796,1.0,0.635439,0.77689,0.723151
0,RF_Words,0.897624,0.96541,0.612428,0.749193,0.688965
0,XGB_Words,0.901946,0.916043,0.670003,0.773752,0.713067
0,SVM_Words,0.764235,0.643585,0.138275,0.226845,0.150658
0,kNN_Words,0.772534,0.626285,0.227672,0.333105,0.230673
0,LR_N-Gram_TF-IDF,0.750541,0.6,0.005766,0.011408,0.007169
0,XGB_Count_Vectors,0.750899,0.536667,0.028819,0.054178,0.030015
0,XGB_WordLevel_TF-IDF,0.745489,0.448776,0.041789,0.075853,0.03169
0,LR_CharLevel_TF-IDF,0.749822,0.426667,0.008675,0.016863,0.00856
0,LR_Count_Vectors,0.737924,0.422885,0.129674,0.198199,0.091653


In [32]:
news_results[["Model","test_acc_mean", "test_prec_mean", "test_recall_mean", "test_f1-score_mean", "test_cohens_kappa_mean"]].sort_values(by=["test_prec_mean"], ascending=False)

Unnamed: 0,Model,test_acc_mean,test_prec_mean,test_recall_mean,test_f1-score_mean,test_cohens_kappa_mean
0,GB_N-Gram_TF-IDF,0.544049,0.983478,0.043642,0.083549,0.044908
0,kNN_Count_Vectors,0.539419,0.976062,0.034015,0.065736,0.034777
0,kNN_WordLevel_TF-IDF,0.529679,0.88807,0.014945,0.02932,0.013543
0,GB_WordLevel_TF-IDF,0.641188,0.869847,0.290454,0.435353,0.258507
0,GB_Count_Vectors,0.641712,0.868558,0.292565,0.437423,0.259699
0,RF_Count_Vectors,0.760515,0.827326,0.628678,0.714347,0.51471
0,XGB_N-Gram_TF-IDF,0.574885,0.811082,0.140643,0.239563,0.114891
0,kNN_CharLevel_TF-IDF,0.735313,0.81041,0.582005,0.675889,0.462578
0,SVM_Count_Vectors,0.796287,0.810096,0.747778,0.777616,0.590243
0,SVM_WordLevel_TF-IDF,0.795807,0.808926,0.748418,0.777405,0.589328


In [33]:
if save_results:
    tweet_results.sort_values(by=["test_prec_mean", "test_recall_mean"], ascending=False).to_csv("tweet_results_ML.csv", index=False)
    news_results.sort_values(by=["test_prec_mean", "test_recall_mean"], ascending=False).to_csv("news_results_ML.csv", index=False)