<a href="https://colab.research.google.com/github/alexlimatds/fact_extraction/blob/main/AILA2020/FACTS_AILA_TF_IDF_approach_1_cross_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Facts extraction with AILA data and TF-IDF features

This notebook experiments TF-IDF features in order to find the best hyperparameters.

The computation of the TF-IDF weights is based on sentences instead on the traditional document-based approach:

- Sentences are used to train the TF-IDF model.
- TF-IDF vectors are computed for sentences and in order to do this, a sentence is fed into the TF-IDF model.

Data used in this notebook:

- for cross-validation: the train dataset from AILA 2020. This can be obtained at https://github.com/Law-AI/semantic-segmentation;

### Loading dataset

In [None]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
g_drive_dir = "/content/gdrive/MyDrive/"

Mounted at /content/gdrive


In [None]:
!rm -r data
!mkdir data
!mkdir data/train
!tar -xf {g_drive_dir}fact_extraction_AILA/train.tar.xz -C data/train

train_dir = 'data/train/'

rm: cannot remove 'data': No such file or directory


In [None]:
import pandas as pd
from os import listdir

def read_docs(dir_name):
  """
  Read the docs in a directory.
  Params:
    dir_name : the directory that contains the documents.
  Returns:
    A dictionary whose keys are the names of the read files and the values are 
    pandas dataframes. Each dataframe has the columns sentence and label.
  """
  docs = {} # key: file name, value: dataframe with sentences and labels
  for f in listdir(dir_name):
    df = pd.read_csv(
        dir_name + f, 
        sep='\t', 
        names=['sentence', 'label'])
    docs[f] = df
  return docs

docs_train = read_docs(train_dir)

print(f'TRAIN: {len(docs_train)} documents read.')

TRAIN: 50 documents read.


### Counting sentences by label

In [None]:
def sentences_to_list(docs):
  """
  Returns, as lists, the sentences and the labels in a set of documents.
  Params:
    docs : a dictionary as returned by the read_docs function.
  Returns:
    - A list containing all sentences in the docs param.
    - A list containing all labels in the docs param. The indexes of this list are 
    respective to indexes in returned sentence list.

  """
  sentences_list = []
  targets_list = []
  for df in docs.values():
    sentences_list.extend(df['sentence'].tolist())
    targets_list.extend(df['label'].tolist())
  return sentences_list, targets_list

def target_stats(set_name, targets):
  stats = {}
  for t in targets:
    stats[t] = stats.get(t, 0) + 1
  print(f'Statistics of the {set_name} set:')
  print(f'   Total number of sentences: {len(targets)}')
  for t, n in stats.items():
    print(f'   Number of {t} labels: {n}')

sentences_train, train_targets = sentences_to_list(docs_train)
target_stats('TRAIN', train_targets)

Statistics of the TRAIN set:
   Total number of sentences: 9380
   Number of Facts labels: 2219
   Number of Other labels: 7161


### Spliting sentences into folds

In [None]:
# Reading the file containing the sets of trains documents and test documents by fold
train_files_by_fold = []  # Each index in the list represents a fold and stores a list of file names
test_files_by_fold = []   # Each index in the list represents a fold and stores a list of file names

df_folds = pd.read_csv(
  g_drive_dir + 'fact_extraction_AILA/train_docs_by_fold.csv', 
  sep=';', 
  names=['train', 'test'])
for line in df_folds['train'].tolist():
  train_files_by_fold.append(line.split(','))
for line in df_folds['test'].tolist():
  test_files_by_fold.append(line.split(','))

for i in range(len(test_files_by_fold)):
  print(f'Fold {i}: \n\tTrain files: {train_files_by_fold[i]} \n\tTest files: {test_files_by_fold[i]}')

Fold 0: 
	Train files: ['d_44.txt', 'd_39.txt', 'd_12.txt', 'd_2.txt', 'd_7.txt', 'd_33.txt', 'd_16.txt', 'd_8.txt', 'd_42.txt', 'd_34.txt', 'd_40.txt', 'd_24.txt', 'd_36.txt', 'd_11.txt', 'd_13.txt', 'd_19.txt', 'd_18.txt', 'd_4.txt', 'd_1.txt', 'd_21.txt', 'd_15.txt', 'd_23.txt', 'd_32.txt', 'd_9.txt', 'd_5.txt', 'd_3.txt', 'd_26.txt', 'd_20.txt', 'd_30.txt', 'd_41.txt', 'd_46.txt', 'd_43.txt', 'd_50.txt', 'd_27.txt', 'd_25.txt', 'd_35.txt', 'd_45.txt', 'd_17.txt', 'd_48.txt', 'd_6.txt'] 
	Test files: ['d_22.txt', 'd_31.txt', 'd_49.txt', 'd_14.txt', 'd_29.txt', 'd_47.txt', 'd_10.txt', 'd_38.txt', 'd_28.txt', 'd_37.txt']
Fold 1: 
	Train files: ['d_22.txt', 'd_31.txt', 'd_49.txt', 'd_14.txt', 'd_29.txt', 'd_47.txt', 'd_10.txt', 'd_38.txt', 'd_28.txt', 'd_37.txt', 'd_40.txt', 'd_24.txt', 'd_36.txt', 'd_11.txt', 'd_13.txt', 'd_19.txt', 'd_18.txt', 'd_4.txt', 'd_1.txt', 'd_21.txt', 'd_15.txt', 'd_23.txt', 'd_32.txt', 'd_9.txt', 'd_5.txt', 'd_3.txt', 'd_26.txt', 'd_20.txt', 'd_30.txt', 'd_

In [None]:
from sklearn.model_selection import KFold

def docs_to_sentences(file_names, docs_dic):
  """
  Extracts the sentences and the labels from a subset of documents.
  Params:
    file_names    : List with the names of the documents in the desired subset.
    docs_dic      : Dictionary of documents as returned by the read_docs function.
  Returns:
    - A list of sentences (strings).
    - A list of labels (strings). The indexes of this list are 
    respective to the indexes in the returned sentence list.
  """
  sentences_ = []
  targets_ = []
  for fname in file_names:
    sentences_.extend(docs_dic[fname]['sentence'].tolist())
    targets_.extend(docs_dic[fname]['label'].tolist())
  
  return sentences_, targets_


### Evaluation functions

In [None]:
import sklearn
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support
from IPython.display import display, HTML
import numpy as np

def metrics_report(title, metrics):
  report_df = pd.DataFrame(columns=['Precision', 'P std', 'Recall', 'R std', 'F1', 'F1 std'])
  for (model, p, p_std, r, r_std, f1, f1_std) in metrics:
    report_df.loc[model] = [f'{p:.4f}', f'{p_std:.4f}', f'{r:.4f}', f'{r_std:.4f}', f'{f1:.4f}', f'{f1_std:.4f}']
    display(HTML(f'<br><span style="font-weight: bold">{title}: cross-validation averages</span>'))
    display(report_df)

def update_report(display_id, report_df, metrics):
  model, p, p_std, r, r_std, f1, f1_std = metrics
  report_df.loc[model] = [f'{p:.4f}', f'{p_std:.4f}', f'{r:.4f}', f'{r_std:.4f}', f'{f1:.4f}', f'{f1_std:.4f}']
  update_display(report_df, display_id=display_id)

test_metrics = {}

def cross_validation(model_tuples, tfidf_builder, set_description, verbose_vocab=False):
  """
  Params:
    model_tuples  : A list of tuples. For each tuple the first element is a function 
                    returning a unfited machine learning model and the second one 
                    is a flag to use numpy vectors or not.
    tfidf_builder : A function returning a unfited TF-IDF model.
    set_description : Text description of the feature set.
    verbose_vocab   : If the size of the vocabulary must be printed or not.
  """
  train_metrics_cross = {}
  test_metrics_cross = {}
  tfidf_model = tfidf_builder()
  #for i_fold, (train_sentences, train_targets, test_sentences, test_targets) in enumerate(sentence_folds):
  for i_fold in range(len(train_files_by_fold)):
    print(f'Starting fold {i_fold}')
    train_sentences, train_targets = docs_to_sentences(train_files_by_fold[i_fold], docs_train)
    test_sentences, test_targets = docs_to_sentences(test_files_by_fold[i_fold], docs_train)
    tfidf_model.fit(train_sentences)
    if verbose_vocab:
      print(f'   Learned {len(tfidf_model.vocabulary_)} terms.')
    for (model_builder, to_dense) in model_tuples:
      model = model_builder()
      model_name = model.__class__.__name__
      print(f'   Processing model: {model_name}')
      if to_dense:
        train_features = tfidf_model.transform(train_sentences).toarray()
        test_features = tfidf_model.transform(test_sentences).toarray()
      else:
        train_features = tfidf_model.transform(train_sentences)
        test_features = tfidf_model.transform(test_sentences)
      model.fit(train_features, train_targets)
      # test metrics
      predictions = model.predict(test_features)
      p_test, r_test, f1_test, _ = precision_recall_fscore_support(
          test_targets, 
          predictions, 
          average='binary', 
          pos_label='Facts', 
          zero_division=0)
      model_metrics = test_metrics_cross.get(model_name, [])
      model_metrics.append([p_test, r_test, f1_test])
      test_metrics_cross[model_name] = model_metrics
      # train metrics
      predictions = model.predict(train_features)
      p_train, r_train, f1_train, _ = precision_recall_fscore_support(
          train_targets, 
          predictions, 
          average='binary', 
          pos_label='Facts', 
          zero_division=0)
      model_metrics = train_metrics_cross.get(model_name, [])
      model_metrics.append([p_train, r_train, f1_train])
      train_metrics_cross[model_name] = model_metrics

  # averaging and reporting the metrics achieved in each fold
  # train metrics
  report_df_train = pd.DataFrame(columns=['Precision', 'P std', 'Recall', 'R std', 'F1', 'F1 std'])
  for model_name, metrics in train_metrics_cross.items():
    model_metrics = np.array(metrics)
    mean = np.mean(model_metrics, axis=0)
    std = np.std(model_metrics, axis=0)
    report_df_train.loc[model_name] = [
        f'{mean[0]:.4f}', f'{std[0]:.4f}',  # precision
        f'{mean[1]:.4f}', f'{std[1]:.4f}',  # recall
        f'{mean[2]:.4f}', f'{std[2]:.4f}']  # f1
  display(HTML(f'<br><span style="font-weight: bold">TRAIN: cross-validation averages</span>'))
  display(report_df_train)
  # test metrics
  report_df_test = pd.DataFrame(columns=['Precision', 'P std', 'Recall', 'R std', 'F1', 'F1 std'])
  for model_name, metrics in test_metrics_cross.items():
    model_metrics = np.array(metrics)
    mean = np.mean(model_metrics, axis=0)
    std = np.std(model_metrics, axis=0)
    report_df_test.loc[model_name] = [
        f'{mean[0]:.4f}', f'{std[0]:.4f}',  # precision
        f'{mean[1]:.4f}', f'{std[1]:.4f}',  # recall
        f'{mean[2]:.4f}', f'{std[2]:.4f}']  # f1
    # metrics for the summary
    summary_model_metrics = test_metrics.get(model_name, [])
    summary_model_metrics.append((set_description, mean, std))
    test_metrics[model_name] = summary_model_metrics
  display(HTML(f'<br><span style="font-weight: bold">TEST: cross-validation averages</span>'))
  display(report_df_test)


### Pre-processing function

In [None]:
import re

def preprocess(str):
  pstr = str
  pstr = re.sub(r'[/(){}\[\]\|@,;]', ' ', pstr) # replaces symbols with spaces
  pstr = re.sub(r'[^0-9a-z #+_]', '', pstr)     # removes bad symbols
  pstr = re.sub(r'\d+', '', pstr)               # removes numbers
  return pstr

### Models

#### MLP

In [None]:
from sklearn.neural_network import MLPClassifier

def mlp():
  # Default MLP from scikit-learn
  return MLPClassifier(early_stopping=True, random_state=1)

#### Linear SVM

In [None]:
from sklearn.svm import LinearSVC

def linear_svm():
  return LinearSVC(random_state=1)

#### RBF SVM

In [None]:
from sklearn.svm import SVC

def rbf_svm():
  return SVC(kernel='rbf', random_state=1)

#### Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

def logistic_regression():
  return LogisticRegression(solver='sag', max_iter=200, random_state=1)

#### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

def knn():
  return KNeighborsClassifier(5)

#### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

def decision_tree():
  return DecisionTreeClassifier(random_state=1)

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

def random_forest():
  return RandomForestClassifier(random_state=1)

#### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

def adaboost():
  return AdaBoostClassifier(random_state=1)

#### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

def naive_bayes():
  return GaussianNB()

#### XGBoost

In [None]:
from xgboost.sklearn import XGBClassifier

def xgboost():
  return XGBClassifier(objective="binary:logistic", tree_method='hist')

### Set 1

- N-grams: 1 to 3
- Stop words removal: No
- Vocabulary's size: no limits

Notes:
- It doesn't apply Naive Bayes and XGBoost models because there's no enough RAM to run them.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_set1():
  return TfidfVectorizer(
      preprocessor=preprocess, 
      ngram_range=(1, 3), 
      max_features=None)


In [None]:
%%time

cross_validation(
    [(mlp, False), (linear_svm, False), (rbf_svm, False), (logistic_regression, False), 
    (knn, False), (decision_tree, False), (random_forest, False), (adaboost, False)], 
    get_tf_idf_set1, 
    'SET 1', 
    verbose_vocab=True)

Starting fold 0
   Learned 202869 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
Starting fold 1
   Learned 202985 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
Starting fold 2
   Learned 183416 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing 

Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.9835,0.0034,0.9174,0.008,0.9493,0.0042
LinearSVC,0.9982,0.0008,0.9716,0.0027,0.9847,0.001
SVC,0.9993,0.0005,0.9446,0.0045,0.9711,0.0025
LogisticRegression,0.9838,0.001,0.3364,0.0683,0.4973,0.0768
KNeighborsClassifier,0.6038,0.0536,0.2148,0.0353,0.3141,0.0383
DecisionTreeClassifier,0.995,0.0008,0.9761,0.0018,0.9854,0.0012
RandomForestClassifier,0.9983,0.0009,0.9727,0.002,0.9853,0.0013
AdaBoostClassifier,0.6891,0.0132,0.3424,0.0324,0.4563,0.0269


Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.6197,0.0863,0.333,0.0584,0.4262,0.0433
LinearSVC,0.6059,0.0828,0.3584,0.0324,0.4456,0.0214
SVC,0.7144,0.0531,0.1222,0.0178,0.2078,0.0254
LogisticRegression,0.7119,0.0945,0.1334,0.0153,0.2238,0.0234
KNeighborsClassifier,0.2403,0.0793,0.0833,0.023,0.1181,0.023
DecisionTreeClassifier,0.4096,0.0894,0.3761,0.042,0.3827,0.0271
RandomForestClassifier,0.6865,0.0984,0.1012,0.0112,0.1752,0.0144
AdaBoostClassifier,0.5188,0.0983,0.2235,0.0383,0.3108,0.0521


CPU times: user 26min, sys: 6min 3s, total: 32min 3s
Wall time: 25min 7s


### Set 2

- N-grams: 1 to 3
- Stop words removal: No
- Maximum vocabulary's size: 20,000


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_set2():
  return TfidfVectorizer(
      preprocessor=preprocess, 
      ngram_range=(1, 3), 
      max_features=20000)


In [None]:
%%time

cross_validation(
    [(mlp, False), (linear_svm, False), (rbf_svm, False), (logistic_regression, False), 
    (knn, False), (decision_tree, False), (random_forest, False), (adaboost, False), 
    (xgboost, False), (naive_bayes, True)], 
    get_tf_idf_set2, 
    'SET 2', 
    verbose_vocab=True)

Starting fold 0
   Learned 20000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 1
   Learned 20000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 2
   Learned 20000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing

Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.9374,0.0147,0.8181,0.0543,0.8729,0.0353
LinearSVC,0.9887,0.0036,0.9418,0.0033,0.9647,0.0023
SVC,0.9953,0.0019,0.8948,0.0125,0.9423,0.0063
LogisticRegression,0.9422,0.0109,0.4885,0.0444,0.6419,0.0364
KNeighborsClassifier,0.6426,0.0673,0.253,0.0398,0.359,0.0363
DecisionTreeClassifier,0.9933,0.0017,0.9708,0.0024,0.9819,0.0014
RandomForestClassifier,0.9971,0.0014,0.967,0.0025,0.9818,0.0015
AdaBoostClassifier,0.6924,0.0121,0.3543,0.0244,0.4684,0.0233
XGBClassifier,0.9159,0.0069,0.2457,0.0212,0.3869,0.0261
GaussianNB,0.7011,0.0107,1.0,0.0,0.8243,0.0074


Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.6031,0.1044,0.3622,0.0255,0.4463,0.0153
LinearSVC,0.5665,0.0865,0.3816,0.0371,0.4506,0.0251
SVC,0.6589,0.0751,0.212,0.033,0.3174,0.0344
LogisticRegression,0.6777,0.0733,0.2065,0.0281,0.3143,0.0315
KNeighborsClassifier,0.2397,0.0934,0.0768,0.0142,0.1109,0.0107
DecisionTreeClassifier,0.4135,0.0909,0.3518,0.0335,0.3725,0.0285
RandomForestClassifier,0.6238,0.0838,0.1329,0.0144,0.2185,0.0221
AdaBoostClassifier,0.5295,0.1182,0.2415,0.0329,0.3305,0.0537
XGBClassifier,0.6769,0.0923,0.1159,0.0072,0.1976,0.013
GaussianNB,0.3932,0.0888,0.4333,0.0762,0.4027,0.0509


CPU times: user 7min 25s, sys: 1min 59s, total: 9min 25s
Wall time: 7min 27s


### Set 3

- N-grams: 1 to 3
- Stop words removal: No
- Maximum vocabulary's size: 2,000

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_set3():
  return TfidfVectorizer(
      preprocessor=preprocess, 
      ngram_range=(1, 3), 
      max_features=2000)


In [None]:
%%time

cross_validation(
    [(mlp, False), (linear_svm, False), (rbf_svm, False), (logistic_regression, False), 
    (knn, False), (decision_tree, False), (random_forest, False), (adaboost, False), 
    (xgboost, False), (naive_bayes, True)], 
    get_tf_idf_set3, 
    'SET 3', 
    verbose_vocab=True)

Starting fold 0
   Learned 2000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 1
   Learned 2000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 2
   Learned 2000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing mo

Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.7981,0.0103,0.5319,0.0476,0.6373,0.0346
LinearSVC,0.8534,0.0078,0.677,0.024,0.7547,0.0139
SVC,0.9762,0.0049,0.7887,0.021,0.8723,0.011
LogisticRegression,0.8347,0.012,0.4648,0.0345,0.5962,0.0277
KNeighborsClassifier,0.6625,0.0414,0.35,0.0348,0.4554,0.0227
DecisionTreeClassifier,0.9932,0.0012,0.954,0.0033,0.9732,0.0016
RandomForestClassifier,0.9969,0.0008,0.9502,0.0032,0.973,0.0015
AdaBoostClassifier,0.6896,0.0106,0.3522,0.0264,0.4658,0.0247
XGBClassifier,0.9114,0.016,0.2474,0.0219,0.3885,0.0262
GaussianNB,0.433,0.0292,0.9484,0.0081,0.5938,0.026


Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.6121,0.0893,0.3522,0.0343,0.4416,0.0221
LinearSVC,0.5359,0.0808,0.412,0.0317,0.4617,0.0322
SVC,0.6318,0.0894,0.2956,0.0356,0.3979,0.0272
LogisticRegression,0.6402,0.0729,0.3053,0.0327,0.4095,0.0225
KNeighborsClassifier,0.3035,0.0947,0.139,0.0303,0.1814,0.0301
DecisionTreeClassifier,0.4024,0.1031,0.3657,0.0271,0.3735,0.0363
RandomForestClassifier,0.6096,0.0824,0.2055,0.0181,0.3049,0.0139
AdaBoostClassifier,0.5143,0.1164,0.257,0.0379,0.3405,0.0553
XGBClassifier,0.6682,0.0837,0.1215,0.0079,0.2052,0.0121
GaussianNB,0.3507,0.0829,0.7989,0.0667,0.4805,0.0764


CPU times: user 3min 29s, sys: 15.1 s, total: 3min 44s
Wall time: 3min 29s


### Set 4

- N-grams: 1 to 2
- Stop words removal: No
- Vocabulary's size: No limits

Notes:
- It doesn't apply Naive Bayes and XGBoost models because there's no enough RAM to run them.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_set4():
  return TfidfVectorizer(
      preprocessor=preprocess, 
      ngram_range=(1, 2), 
      max_features=None)


In [None]:
%%time

cross_validation(
    [(mlp, False), (linear_svm, False), (rbf_svm, False), (logistic_regression, False), 
    (knn, False), (decision_tree, False), (random_forest, False), (adaboost, False)], 
    get_tf_idf_set4, 
    'SET 4', 
    verbose_vocab=True)

Starting fold 0
   Learned 77988 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
Starting fold 1
   Learned 77844 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
Starting fold 2
   Learned 70986 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing mod

Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.9775,0.0029,0.9075,0.0155,0.9411,0.0084
LinearSVC,0.9981,0.0005,0.9695,0.0026,0.9836,0.0011
SVC,0.9988,0.0004,0.9276,0.0043,0.9618,0.0025
LogisticRegression,0.9701,0.0033,0.4324,0.0582,0.5958,0.0558
KNeighborsClassifier,0.6264,0.0341,0.2182,0.0274,0.3218,0.0269
DecisionTreeClassifier,0.995,0.0008,0.9761,0.0018,0.9854,0.0012
RandomForestClassifier,0.998,0.0007,0.9731,0.0021,0.9854,0.0012
AdaBoostClassifier,0.6867,0.0034,0.3505,0.0309,0.4633,0.0273


Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.6042,0.0949,0.3447,0.0494,0.4324,0.0346
LinearSVC,0.5888,0.0788,0.3692,0.0322,0.449,0.0164
SVC,0.7025,0.0676,0.1722,0.0207,0.2751,0.0255
LogisticRegression,0.7021,0.0832,0.1733,0.0172,0.2767,0.0225
KNeighborsClassifier,0.2699,0.0881,0.0876,0.0254,0.1258,0.0317
DecisionTreeClassifier,0.4097,0.0955,0.3715,0.0332,0.3829,0.0402
RandomForestClassifier,0.6655,0.0798,0.0986,0.0105,0.1711,0.016
AdaBoostClassifier,0.5148,0.1088,0.2282,0.0148,0.314,0.0324


CPU times: user 13min 26s, sys: 5min 46s, total: 19min 12s
Wall time: 13min 3s


### Set 5

- N-grams: 1 to 2
- Stop words removal: No
- Maximum vocabulary's size: 20,000

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_set5():
  return TfidfVectorizer(
      preprocessor=preprocess, 
      ngram_range=(1, 2), 
      max_features=20000)


In [None]:
%%time

cross_validation(
    [(mlp, False), (linear_svm, False), (rbf_svm, False), (logistic_regression, False), 
    (knn, False), (decision_tree, False), (random_forest, False), (adaboost, False), 
    (xgboost, False), (naive_bayes, True)], 
    get_tf_idf_set5, 
    'SET 5', 
    verbose_vocab=True)

Starting fold 0
   Learned 20000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 1
   Learned 20000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 2
   Learned 20000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing

Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.9455,0.0098,0.8461,0.0357,0.8927,0.0213
LinearSVC,0.9916,0.0028,0.9489,0.0025,0.9698,0.002
SVC,0.9959,0.0016,0.9027,0.0112,0.9469,0.0055
LogisticRegression,0.9418,0.0106,0.4911,0.0453,0.644,0.0375
KNeighborsClassifier,0.6633,0.0497,0.238,0.0335,0.3476,0.0323
DecisionTreeClassifier,0.9937,0.0018,0.9726,0.0024,0.983,0.0017
RandomForestClassifier,0.9974,0.0016,0.9689,0.0024,0.9829,0.0018
AdaBoostClassifier,0.6928,0.0131,0.3487,0.0243,0.4634,0.023
XGBClassifier,0.9099,0.0143,0.2363,0.0241,0.3743,0.0294
GaussianNB,0.7298,0.0109,1.0,0.0,0.8437,0.0072


Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.5966,0.1083,0.3704,0.0458,0.449,0.0296
LinearSVC,0.5691,0.0795,0.3841,0.0362,0.4532,0.0189
SVC,0.6713,0.0771,0.2137,0.0276,0.3215,0.029
LogisticRegression,0.6846,0.0809,0.209,0.0239,0.3181,0.0266
KNeighborsClassifier,0.241,0.0784,0.0788,0.0219,0.114,0.0241
DecisionTreeClassifier,0.4109,0.0929,0.3546,0.0337,0.3727,0.0316
RandomForestClassifier,0.6392,0.118,0.1209,0.0102,0.202,0.0158
AdaBoostClassifier,0.5402,0.1197,0.2245,0.0416,0.3153,0.0576
XGBClassifier,0.6577,0.1064,0.1131,0.0151,0.1926,0.0251
GaussianNB,0.3887,0.0824,0.4086,0.0648,0.3888,0.0395


CPU times: user 7min 3s, sys: 2min 9s, total: 9min 13s
Wall time: 7min 4s


### Set 6

- N-grams: 1 to 2
- Stop words removal: No
- Maximum vocabulary's size: 2,000

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_set6():
  return TfidfVectorizer(
      preprocessor=preprocess, 
      ngram_range=(1, 2), 
      max_features=2000)


In [None]:
%%time

cross_validation(
    [(mlp, False), (linear_svm, False), (rbf_svm, False), (logistic_regression, False), 
    (knn, False), (decision_tree, False), (random_forest, False), (adaboost, False), 
    (xgboost, False), (naive_bayes, True)], 
    get_tf_idf_set6, 
    'SET 6', 
    verbose_vocab=True)

Starting fold 0
   Learned 2000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 1
   Learned 2000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 2
   Learned 2000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing mo

Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.8087,0.0109,0.5209,0.0873,0.6291,0.0683
LinearSVC,0.8633,0.0052,0.6887,0.0194,0.766,0.0118
SVC,0.978,0.0061,0.7933,0.0194,0.8759,0.0095
LogisticRegression,0.8398,0.0108,0.47,0.0356,0.6017,0.0288
KNeighborsClassifier,0.6929,0.0207,0.3353,0.0335,0.4508,0.0321
DecisionTreeClassifier,0.9928,0.0014,0.9559,0.0029,0.974,0.0016
RandomForestClassifier,0.9967,0.0005,0.9521,0.0027,0.9739,0.0016
AdaBoostClassifier,0.6939,0.0095,0.361,0.0246,0.4746,0.0229
XGBClassifier,0.9099,0.0098,0.2471,0.0221,0.3881,0.0274
GaussianNB,0.42,0.0277,0.9631,0.0057,0.5843,0.0263


Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.6313,0.0949,0.3314,0.0503,0.4266,0.0329
LinearSVC,0.5389,0.0812,0.4032,0.0443,0.4566,0.0364
SVC,0.6382,0.0904,0.2891,0.0303,0.3935,0.0223
LogisticRegression,0.6441,0.0784,0.3027,0.0293,0.4082,0.021
KNeighborsClassifier,0.3108,0.0868,0.1353,0.0266,0.1821,0.0267
DecisionTreeClassifier,0.398,0.0943,0.3639,0.0438,0.3719,0.0422
RandomForestClassifier,0.5927,0.0832,0.1997,0.0149,0.2966,0.0147
AdaBoostClassifier,0.5159,0.1055,0.2496,0.0405,0.3352,0.0567
XGBClassifier,0.6685,0.0871,0.1183,0.0089,0.2007,0.0152
GaussianNB,0.3439,0.079,0.8114,0.0648,0.4761,0.0727


CPU times: user 3min 16s, sys: 15.7 s, total: 3min 31s
Wall time: 3min 17s


### Set 7

- N-grams: 1
- Stop words removal: No
- Vocabulary's size: No limits

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_set7():
  return TfidfVectorizer(
      preprocessor=preprocess, 
      ngram_range=(1, 1), 
      max_features=None)


In [None]:
%%time

cross_validation(
    [(mlp, False), (linear_svm, False), (rbf_svm, False), (logistic_regression, False), 
    (knn, False), (decision_tree, False), (random_forest, False), (adaboost, False)], 
    get_tf_idf_set7, 
    'SET 7', 
    verbose_vocab=True)

Starting fold 0
   Learned 10394 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
Starting fold 1
   Learned 10208 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
Starting fold 2
   Learned 9612 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing mode

Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.9089,0.0215,0.7717,0.0533,0.8342,0.0392
LinearSVC,0.963,0.0044,0.8768,0.0055,0.9179,0.003
SVC,0.9891,0.0026,0.8708,0.0124,0.9261,0.0059
LogisticRegression,0.8998,0.0096,0.4957,0.0378,0.6381,0.0294
KNeighborsClassifier,0.6523,0.042,0.2834,0.0382,0.392,0.0328
DecisionTreeClassifier,0.9945,0.0006,0.9761,0.0018,0.9852,0.0011
RandomForestClassifier,0.9982,0.0005,0.9724,0.0022,0.9851,0.0012
AdaBoostClassifier,0.6988,0.0099,0.3521,0.0226,0.4679,0.0211


Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.5774,0.0716,0.3424,0.0216,0.4274,0.0206
LinearSVC,0.5463,0.09,0.3859,0.0337,0.4459,0.0183
SVC,0.6475,0.0793,0.2317,0.0262,0.338,0.0228
LogisticRegression,0.6534,0.0832,0.2384,0.0179,0.3469,0.0154
KNeighborsClassifier,0.2551,0.0663,0.1033,0.0246,0.1417,0.0229
DecisionTreeClassifier,0.4053,0.0816,0.3396,0.0564,0.3635,0.0422
RandomForestClassifier,0.646,0.0613,0.1208,0.011,0.2028,0.014
AdaBoostClassifier,0.5166,0.0959,0.2131,0.0336,0.3003,0.0471


CPU times: user 3min 54s, sys: 1min 36s, total: 5min 31s
Wall time: 4min 1s


### Set 8

- N-grams: 1
- Stop words removal: No
- Maximum vocabulary's size: 20,000

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_set8():
  return TfidfVectorizer(
      preprocessor=preprocess, 
      ngram_range=(1, 1), 
      max_features=20000)


In [None]:
%%time

cross_validation(
    [(mlp, False), (linear_svm, False), (rbf_svm, False), (logistic_regression, False), 
    (knn, False), (decision_tree, False), (random_forest, False), (adaboost, False), 
    (xgboost, False), (naive_bayes, True)], 
    get_tf_idf_set8, 
    'SET 8', 
    verbose_vocab=True)

Starting fold 0
   Learned 10394 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 1
   Learned 10208 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 2
   Learned 9612 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing 

Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.9089,0.0215,0.7717,0.0533,0.8342,0.0392
LinearSVC,0.963,0.0044,0.8768,0.0055,0.9179,0.003
SVC,0.9891,0.0026,0.8708,0.0124,0.9261,0.0059
LogisticRegression,0.8998,0.0096,0.4957,0.0378,0.6381,0.0294
KNeighborsClassifier,0.6523,0.042,0.2834,0.0382,0.392,0.0328
DecisionTreeClassifier,0.9945,0.0006,0.9761,0.0018,0.9852,0.0011
RandomForestClassifier,0.9982,0.0005,0.9724,0.0022,0.9851,0.0012
AdaBoostClassifier,0.6988,0.0099,0.3521,0.0226,0.4679,0.0211
XGBClassifier,0.906,0.0128,0.2353,0.0248,0.3728,0.0308
GaussianNB,0.5458,0.0153,1.0,0.0,0.706,0.0128


Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.5774,0.0716,0.3424,0.0216,0.4274,0.0206
LinearSVC,0.5463,0.09,0.3859,0.0337,0.4459,0.0183
SVC,0.6475,0.0793,0.2317,0.0262,0.338,0.0228
LogisticRegression,0.6534,0.0832,0.2384,0.0179,0.3469,0.0154
KNeighborsClassifier,0.2551,0.0663,0.1033,0.0246,0.1417,0.0229
DecisionTreeClassifier,0.4053,0.0816,0.3396,0.0564,0.3635,0.0422
RandomForestClassifier,0.646,0.0613,0.1208,0.011,0.2028,0.014
AdaBoostClassifier,0.5166,0.0959,0.2131,0.0336,0.3003,0.0471
XGBClassifier,0.6925,0.0945,0.1098,0.0076,0.189,0.0119
GaussianNB,0.3108,0.0782,0.5217,0.039,0.3813,0.0518


CPU times: user 4min 14s, sys: 1min 11s, total: 5min 25s
Wall time: 4min 13s


### Set 9

- N-grams: 1
- Stop words removal: No
- Maximum vocabulary's size: 2,000

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_set9():
  return TfidfVectorizer(
      preprocessor=preprocess, 
      ngram_range=(1, 1), 
      max_features=2000)


In [None]:
%%time

cross_validation(
    [(mlp, False), (linear_svm, False), (rbf_svm, False), (logistic_regression, False), 
    (knn, False), (decision_tree, False), (random_forest, False), (adaboost, False), 
    (xgboost, False), (naive_bayes, True)], 
    get_tf_idf_set9, 
    'SET 9', 
    verbose_vocab=True)

Starting fold 0
   Learned 2000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 1
   Learned 2000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 2
   Learned 2000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing mo

Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.8225,0.0078,0.6384,0.0376,0.7182,0.0245
LinearSVC,0.8717,0.004,0.7055,0.0204,0.7796,0.0112
SVC,0.9797,0.0043,0.8086,0.0209,0.8858,0.0111
LogisticRegression,0.8512,0.0124,0.4908,0.0318,0.6217,0.0229
KNeighborsClassifier,0.6865,0.0365,0.3341,0.0316,0.4474,0.0237
DecisionTreeClassifier,0.9922,0.001,0.9628,0.0024,0.9773,0.0015
RandomForestClassifier,0.997,0.0007,0.9581,0.0024,0.9772,0.0015
AdaBoostClassifier,0.6981,0.014,0.3522,0.0233,0.4678,0.0215
XGBClassifier,0.9036,0.02,0.24,0.025,0.3785,0.0307
GaussianNB,0.3827,0.0237,0.9914,0.0019,0.5518,0.0245


Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.5906,0.0851,0.38,0.0666,0.4534,0.0388
LinearSVC,0.5293,0.0852,0.3978,0.0439,0.447,0.0195
SVC,0.6301,0.0746,0.2878,0.049,0.3885,0.034
LogisticRegression,0.6323,0.0795,0.2926,0.0318,0.3959,0.023
KNeighborsClassifier,0.2885,0.0872,0.1067,0.019,0.1503,0.019
DecisionTreeClassifier,0.4077,0.0951,0.3534,0.0293,0.3714,0.0342
RandomForestClassifier,0.6446,0.1077,0.1862,0.0118,0.2863,0.0059
AdaBoostClassifier,0.5338,0.1129,0.2293,0.0384,0.3202,0.0569
XGBClassifier,0.687,0.079,0.1165,0.0113,0.1984,0.0154
GaussianNB,0.3035,0.0776,0.8031,0.0472,0.4336,0.075


CPU times: user 2min 39s, sys: 14.7 s, total: 2min 54s
Wall time: 2min 39s


### Set 10

- N-grams: 1 to 3
- Stop words removal: Yes
- Vocabulary's size: No limits

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_set10():
  return TfidfVectorizer(
      preprocessor=preprocess, 
      ngram_range=(1, 3), 
      max_features=None, 
      stop_words='english')


In [None]:
%%time

cross_validation(
    [(mlp, False), (linear_svm, False), (rbf_svm, False), (logistic_regression, False), 
    (knn, False), (decision_tree, False), (random_forest, False), (adaboost, False)], 
    get_tf_idf_set10, 
    'SET 10', 
    verbose_vocab=True)

Starting fold 0
   Learned 140738 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
Starting fold 1
   Learned 141041 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
Starting fold 2
   Learned 128106 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing 

Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.9812,0.0058,0.9065,0.0068,0.9423,0.0024
LinearSVC,0.9955,0.0008,0.9662,0.002,0.9806,0.001
SVC,0.9971,0.0013,0.9356,0.0036,0.9653,0.0017
LogisticRegression,0.9744,0.0043,0.2495,0.0562,0.394,0.0722
KNeighborsClassifier,0.5706,0.0261,0.2262,0.0401,0.322,0.042
DecisionTreeClassifier,0.9942,0.0011,0.9696,0.0017,0.9817,0.0013
RandomForestClassifier,0.997,0.0006,0.9667,0.0022,0.9816,0.0014
AdaBoostClassifier,0.6874,0.0138,0.2808,0.016,0.3986,0.0182


Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.5436,0.0989,0.3095,0.0402,0.3859,0.0213
LinearSVC,0.548,0.0834,0.3309,0.0291,0.4088,0.0266
SVC,0.6351,0.1014,0.1088,0.0091,0.1856,0.0168
LogisticRegression,0.6329,0.0984,0.1171,0.0142,0.1973,0.0239
KNeighborsClassifier,0.2529,0.0568,0.1027,0.0264,0.1391,0.0233
DecisionTreeClassifier,0.3469,0.0911,0.3887,0.0583,0.3604,0.0601
RandomForestClassifier,0.4863,0.1065,0.2173,0.0177,0.2948,0.0098
AdaBoostClassifier,0.5,0.1022,0.154,0.045,0.2337,0.0607


CPU times: user 22min 1s, sys: 7min 2s, total: 29min 4s
Wall time: 21min 49s


### Set 11

- N-grams: 1 to 3
- Stop words removal: Yes
- Maximum vocabulary's size: 20,000

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_set11():
  return TfidfVectorizer(
      preprocessor=preprocess, 
      ngram_range=(1, 3), 
      max_features=20000, 
      stop_words='english')


In [None]:
%%time

cross_validation(
    [(mlp, False), (linear_svm, False), (rbf_svm, False), (logistic_regression, False), 
    (knn, False), (decision_tree, False), (random_forest, False), (adaboost, False), 
    (xgboost, False), (naive_bayes, True)], 
    get_tf_idf_set11, 
    'SET 11', 
    verbose_vocab=True)

Starting fold 0
   Learned 20000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 1
   Learned 20000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 2
   Learned 20000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing

Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.9427,0.0086,0.8129,0.0189,0.8729,0.0135
LinearSVC,0.984,0.003,0.9284,0.0024,0.9554,0.0015
SVC,0.9943,0.0024,0.8613,0.0144,0.923,0.0074
LogisticRegression,0.9461,0.0119,0.4135,0.0427,0.5739,0.0404
KNeighborsClassifier,0.6376,0.0306,0.2849,0.0376,0.3913,0.0316
DecisionTreeClassifier,0.9933,0.0015,0.9648,0.002,0.9789,0.0016
RandomForestClassifier,0.996,0.0012,0.9621,0.0022,0.9788,0.0016
AdaBoostClassifier,0.6907,0.0157,0.2957,0.0197,0.4138,0.0217
XGBClassifier,0.9363,0.02,0.1655,0.0238,0.2804,0.034
GaussianNB,0.6732,0.0129,1.0,0.0,0.8046,0.0092


Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.5656,0.0933,0.2976,0.0339,0.3854,0.0283
LinearSVC,0.506,0.0855,0.3358,0.0243,0.3988,0.021
SVC,0.6173,0.1,0.1563,0.0133,0.2483,0.0205
LogisticRegression,0.6317,0.0823,0.1524,0.0074,0.2451,0.0142
KNeighborsClassifier,0.2779,0.0823,0.1135,0.0251,0.1533,0.0189
DecisionTreeClassifier,0.3943,0.1035,0.3293,0.0314,0.3517,0.0436
RandomForestClassifier,0.5149,0.1137,0.1985,0.0266,0.2805,0.0183
AdaBoostClassifier,0.4814,0.1105,0.1666,0.0404,0.2454,0.0549
XGBClassifier,0.6453,0.1131,0.067,0.0147,0.1209,0.0244
GaussianNB,0.3423,0.0835,0.5199,0.0776,0.4013,0.0507


CPU times: user 5min 51s, sys: 2min 13s, total: 8min 4s
Wall time: 5min 57s


### Set 12

- N-grams: 1 to 3
- Stop words removal: Yes
- Maximum vocabulary's size: 2,000

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_set12():
  return TfidfVectorizer(
      preprocessor=preprocess, 
      ngram_range=(1, 3), 
      max_features=2000, 
      stop_words='english')


In [None]:
%%time

cross_validation(
    [(mlp, False), (linear_svm, False), (rbf_svm, False), (logistic_regression, False), 
    (knn, False), (decision_tree, False), (random_forest, False), (adaboost, False), 
    (xgboost, False), (naive_bayes, True)], 
    get_tf_idf_set12, 
    'SET 12', 
    verbose_vocab=True)

Starting fold 0
   Learned 2000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 1
   Learned 2000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 2
   Learned 2000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing mo

Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.8211,0.0141,0.5738,0.0833,0.6724,0.0628
LinearSVC,0.8562,0.0073,0.6676,0.0229,0.75,0.015
SVC,0.9776,0.0069,0.7446,0.0281,0.8449,0.016
LogisticRegression,0.8539,0.0128,0.4197,0.0331,0.5618,0.0283
KNeighborsClassifier,0.6875,0.0164,0.3907,0.011,0.4982,0.0113
DecisionTreeClassifier,0.987,0.0014,0.9404,0.0028,0.9631,0.0013
RandomForestClassifier,0.9931,0.0021,0.9344,0.0032,0.9629,0.0013
AdaBoostClassifier,0.695,0.0181,0.2948,0.0216,0.4136,0.0225
XGBClassifier,0.9306,0.0225,0.1654,0.0259,0.2798,0.0372
GaussianNB,0.384,0.0238,0.9933,0.0018,0.5534,0.0246


Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.5604,0.1066,0.3104,0.0556,0.3893,0.0318
LinearSVC,0.4923,0.1,0.3552,0.0122,0.4084,0.0374
SVC,0.587,0.1058,0.1955,0.0221,0.2902,0.0259
LogisticRegression,0.5982,0.0864,0.209,0.0094,0.3078,0.0086
KNeighborsClassifier,0.2835,0.085,0.1416,0.0091,0.1842,0.0134
DecisionTreeClassifier,0.3807,0.0973,0.3338,0.0291,0.3476,0.0369
RandomForestClassifier,0.4843,0.1088,0.2433,0.0309,0.3155,0.0163
AdaBoostClassifier,0.4799,0.1136,0.1615,0.0309,0.2389,0.0428
XGBClassifier,0.6674,0.0969,0.0678,0.0136,0.1224,0.022
GaussianNB,0.3049,0.0814,0.8103,0.0484,0.4353,0.0788


CPU times: user 2min 21s, sys: 14.9 s, total: 2min 35s
Wall time: 2min 21s


### Set 13

- N-grams: 1 to 2
- Stop words removal: Yes
- Vocabulary's size: No limits

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_set13():
  return TfidfVectorizer(
      preprocessor=preprocess, 
      ngram_range=(1, 2), 
      max_features=None, 
      stop_words='english')


In [None]:
%%time

cross_validation(
    [(mlp, False), (linear_svm, False), (rbf_svm, False), (logistic_regression, False), 
    (knn, False), (decision_tree, False), (random_forest, False), (adaboost, False)], 
    get_tf_idf_set13, 
    'SET 13', 
    verbose_vocab=True)

Starting fold 0
   Learned 70020 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
Starting fold 1
   Learned 70165 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
Starting fold 2
   Learned 63648 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing mod

Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.974,0.0015,0.9106,0.0149,0.9411,0.0086
LinearSVC,0.995,0.0007,0.9645,0.0019,0.9795,0.001
SVC,0.9968,0.0011,0.9223,0.0068,0.9581,0.0033
LogisticRegression,0.9642,0.0056,0.3256,0.051,0.4845,0.0572
KNeighborsClassifier,0.6089,0.0348,0.2322,0.0337,0.3352,0.0382
DecisionTreeClassifier,0.9942,0.0011,0.9696,0.0017,0.9817,0.0013
RandomForestClassifier,0.9968,0.0008,0.967,0.0022,0.9816,0.0014
AdaBoostClassifier,0.6887,0.0171,0.2821,0.0185,0.3998,0.0188


Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.53,0.0899,0.3303,0.0294,0.405,0.043
LinearSVC,0.5438,0.0885,0.3201,0.0217,0.3994,0.0259
SVC,0.6159,0.1196,0.1227,0.0131,0.2042,0.0241
LogisticRegression,0.6391,0.0936,0.1278,0.009,0.2126,0.0164
KNeighborsClassifier,0.2605,0.1118,0.088,0.0255,0.1251,0.0298
DecisionTreeClassifier,0.3549,0.0839,0.35,0.0345,0.3474,0.0455
RandomForestClassifier,0.5067,0.0785,0.1971,0.0267,0.2797,0.0211
AdaBoostClassifier,0.4719,0.1153,0.1496,0.0372,0.2254,0.0541


CPU times: user 14min 21s, sys: 8min 48s, total: 23min 9s
Wall time: 15min 11s


### Set 14

- N-grams: 1 to 2
- Stop words removal: Yes
- Maximum vocabulary's size: 20,000

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_set14():
  return TfidfVectorizer(
      preprocessor=preprocess, 
      ngram_range=(1, 2), 
      max_features=20000, 
      stop_words='english')


In [None]:
%%time

cross_validation(
    [(mlp, False), (linear_svm, False), (rbf_svm, False), (logistic_regression, False), 
    (knn, False), (decision_tree, False), (random_forest, False), (adaboost, False), 
    (xgboost, False), (naive_bayes, True)], 
    get_tf_idf_set14, 
    'SET 14', 
    verbose_vocab=True)

Starting fold 0
   Learned 20000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 1
   Learned 20000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 2
   Learned 20000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing

Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.9564,0.0046,0.8876,0.0375,0.9203,0.0208
LinearSVC,0.9864,0.0022,0.9352,0.0012,0.9601,0.0013
SVC,0.9942,0.0018,0.8726,0.0141,0.9294,0.0073
LogisticRegression,0.9422,0.0088,0.4065,0.045,0.5662,0.043
KNeighborsClassifier,0.6239,0.0469,0.2889,0.0553,0.3902,0.0461
DecisionTreeClassifier,0.9935,0.0015,0.9655,0.0021,0.9793,0.0016
RandomForestClassifier,0.9962,0.001,0.9628,0.0024,0.9792,0.0016
AdaBoostClassifier,0.6996,0.0202,0.2766,0.019,0.3961,0.0205
XGBClassifier,0.9407,0.0212,0.1664,0.0239,0.2818,0.0339
GaussianNB,0.7033,0.014,1.0,0.0,0.8257,0.0097


Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.5173,0.1047,0.3391,0.0473,0.3992,0.0221
LinearSVC,0.4956,0.087,0.3313,0.0222,0.393,0.0287
SVC,0.6075,0.0981,0.1563,0.0141,0.2474,0.0207
LogisticRegression,0.6295,0.0753,0.1515,0.0066,0.2436,0.0108
KNeighborsClassifier,0.2729,0.0668,0.1199,0.0363,0.1579,0.021
DecisionTreeClassifier,0.381,0.0939,0.3456,0.0362,0.3556,0.0449
RandomForestClassifier,0.4896,0.1051,0.1881,0.0333,0.2648,0.0281
AdaBoostClassifier,0.4846,0.0985,0.1451,0.0325,0.2218,0.0464
XGBClassifier,0.642,0.1046,0.066,0.0151,0.1191,0.025
GaussianNB,0.3386,0.0809,0.5021,0.0662,0.3936,0.0454


CPU times: user 6min 22s, sys: 2min 58s, total: 9min 20s
Wall time: 6min 28s


### Set 15

- N-grams: 1 to 2
- Stop words removal: Yes
- Maximum vocabulary's size: 2,000

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_set15():
  return TfidfVectorizer(
      preprocessor=preprocess, 
      ngram_range=(1, 2), 
      max_features=2000, 
      stop_words='english')


In [None]:
%%time

cross_validation(
    [(mlp, False), (linear_svm, False), (rbf_svm, False), (logistic_regression, False), 
    (knn, False), (decision_tree, False), (random_forest, False), (adaboost, False), 
    (xgboost, False), (naive_bayes, True)], 
    get_tf_idf_set15, 
    'SET 15', 
    verbose_vocab=True)

Starting fold 0
   Learned 2000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 1
   Learned 2000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 2
   Learned 2000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing mo

Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.8301,0.0116,0.5665,0.1339,0.6653,0.0976
LinearSVC,0.8595,0.0075,0.6727,0.0237,0.7545,0.0162
SVC,0.9782,0.0067,0.7515,0.0292,0.8496,0.0168
LogisticRegression,0.8544,0.0132,0.4205,0.0336,0.5626,0.0289
KNeighborsClassifier,0.6684,0.0469,0.3967,0.0286,0.4959,0.0181
DecisionTreeClassifier,0.9877,0.0014,0.9413,0.0022,0.9639,0.0011
RandomForestClassifier,0.9934,0.002,0.9358,0.0028,0.9637,0.0011
AdaBoostClassifier,0.7,0.0223,0.2896,0.0226,0.409,0.0225
XGBClassifier,0.939,0.0162,0.1658,0.0227,0.2811,0.0328
GaussianNB,0.3874,0.0242,0.9941,0.0013,0.557,0.025


Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.5643,0.1348,0.3084,0.0777,0.3776,0.034
LinearSVC,0.4821,0.0992,0.3573,0.0209,0.4046,0.0315
SVC,0.581,0.0888,0.2015,0.0314,0.2949,0.0304
LogisticRegression,0.5971,0.0936,0.213,0.0086,0.3118,0.0058
KNeighborsClassifier,0.2826,0.0949,0.1544,0.0314,0.194,0.0347
DecisionTreeClassifier,0.3742,0.097,0.3508,0.0408,0.3521,0.0389
RandomForestClassifier,0.4685,0.0994,0.2506,0.0333,0.319,0.0194
AdaBoostClassifier,0.4852,0.1258,0.156,0.0297,0.2328,0.0422
XGBClassifier,0.6457,0.1047,0.07,0.0134,0.1256,0.0212
GaussianNB,0.3058,0.0816,0.8102,0.0431,0.4363,0.079


CPU times: user 2min 16s, sys: 15.5 s, total: 2min 31s
Wall time: 2min 16s


### Set 16

- N-grams: 1
- Stop words removal: Yes
- Vocabulary's size: No limits

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_set16():
  return TfidfVectorizer(
      preprocessor=preprocess, 
      ngram_range=(1, 1), 
      max_features=None, 
      stop_words='english')


In [None]:
%%time

cross_validation(
    [(mlp, False), (linear_svm, False), (rbf_svm, False), (logistic_regression, False), 
    (knn, False), (decision_tree, False), (random_forest, False), (adaboost, False)], 
    get_tf_idf_set16, 
    'SET 16', 
    verbose_vocab=True)

Starting fold 0
   Learned 10108 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
Starting fold 1
   Learned 9927 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
Starting fold 2
   Learned 9334 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model

Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.9224,0.015,0.8078,0.0456,0.8609,0.0323
LinearSVC,0.9567,0.0035,0.8675,0.0066,0.9099,0.0043
SVC,0.9912,0.0032,0.8509,0.0149,0.9156,0.0076
LogisticRegression,0.9037,0.009,0.4208,0.0382,0.573,0.0354
KNeighborsClassifier,0.6742,0.0254,0.2918,0.0285,0.4062,0.0269
DecisionTreeClassifier,0.9936,0.0011,0.9692,0.0017,0.9812,0.0013
RandomForestClassifier,0.997,0.0008,0.9657,0.0021,0.9811,0.0013
AdaBoostClassifier,0.6876,0.0114,0.2823,0.0241,0.3997,0.025


Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.5289,0.0993,0.3096,0.0247,0.3845,0.0182
LinearSVC,0.5023,0.1071,0.3371,0.0204,0.3974,0.0298
SVC,0.6021,0.1052,0.1582,0.0084,0.2486,0.0066
LogisticRegression,0.6322,0.0929,0.1557,0.0111,0.2494,0.0194
KNeighborsClassifier,0.2724,0.0855,0.1089,0.0326,0.1484,0.0267
DecisionTreeClassifier,0.3644,0.0912,0.3096,0.0229,0.3283,0.0335
RandomForestClassifier,0.5108,0.0855,0.1892,0.0358,0.2693,0.0292
AdaBoostClassifier,0.4979,0.1119,0.1675,0.049,0.2482,0.0625


CPU times: user 3min 31s, sys: 1min 20s, total: 4min 51s
Wall time: 3min 30s


### Set 17

- N-grams: 1
- Stop words removal: Yes
- Maximum vocabulary's size: 20,000

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_set17():
  return TfidfVectorizer(
      preprocessor=preprocess, 
      ngram_range=(1, 1), 
      max_features=20000, 
      stop_words='english')


In [None]:
%%time

cross_validation(
    [(mlp, False), (linear_svm, False), (rbf_svm, False), (logistic_regression, False), 
    (knn, False), (decision_tree, False), (random_forest, False), (adaboost, False), 
    (xgboost, False), (naive_bayes, True)], 
    get_tf_idf_set17, 
    'SET 17', 
    verbose_vocab=True)

Starting fold 0
   Learned 10108 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 1
   Learned 9927 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 2
   Learned 9334 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing m

Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.9224,0.015,0.8078,0.0456,0.8609,0.0323
LinearSVC,0.9567,0.0035,0.8675,0.0066,0.9099,0.0043
SVC,0.9912,0.0032,0.8509,0.0149,0.9156,0.0076
LogisticRegression,0.9037,0.009,0.4208,0.0382,0.573,0.0354
KNeighborsClassifier,0.6742,0.0254,0.2918,0.0285,0.4062,0.0269
DecisionTreeClassifier,0.9936,0.0011,0.9692,0.0017,0.9812,0.0013
RandomForestClassifier,0.997,0.0008,0.9657,0.0021,0.9811,0.0013
AdaBoostClassifier,0.6876,0.0114,0.2823,0.0241,0.3997,0.025
XGBClassifier,0.9366,0.0133,0.1626,0.0217,0.2764,0.0315
GaussianNB,0.542,0.0162,1.0,0.0,0.7028,0.0137


Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.5289,0.0993,0.3096,0.0247,0.3845,0.0182
LinearSVC,0.5023,0.1071,0.3371,0.0204,0.3974,0.0298
SVC,0.6021,0.1052,0.1582,0.0084,0.2486,0.0066
LogisticRegression,0.6322,0.0929,0.1557,0.0111,0.2494,0.0194
KNeighborsClassifier,0.2724,0.0855,0.1089,0.0326,0.1484,0.0267
DecisionTreeClassifier,0.3644,0.0912,0.3096,0.0229,0.3283,0.0335
RandomForestClassifier,0.5108,0.0855,0.1892,0.0358,0.2693,0.0292
AdaBoostClassifier,0.4979,0.1119,0.1675,0.049,0.2482,0.0625
XGBClassifier,0.6363,0.1025,0.0606,0.0104,0.1104,0.0179
GaussianNB,0.3107,0.0791,0.5266,0.0357,0.3825,0.0528


CPU times: user 3min 57s, sys: 1min 24s, total: 5min 21s
Wall time: 3min 59s


### Set 18

- N-grams: 1
- Stop words removal: Yes
- Maximum vocabulary's size: 2,000

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_set18():
  return TfidfVectorizer(
      preprocessor=preprocess, 
      ngram_range=(1, 1), 
      max_features=2000, 
      stop_words='english')


In [None]:
%%time

cross_validation(
    [(mlp, False), (linear_svm, False), (rbf_svm, False), (logistic_regression, False), 
    (knn, False), (decision_tree, False), (random_forest, False), (adaboost, False), 
    (xgboost, False), (naive_bayes, True)], 
    get_tf_idf_set18, 
    'SET 18', 
    verbose_vocab=True)

Starting fold 0
   Learned 2000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 1
   Learned 2000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
   Processing model: XGBClassifier
   Processing model: GaussianNB
Starting fold 2
   Learned 2000 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing mo

Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.826,0.0149,0.5723,0.1014,0.6711,0.0719
LinearSVC,0.8613,0.0058,0.6845,0.0265,0.7626,0.0177
SVC,0.9809,0.006,0.7683,0.0284,0.8613,0.016
LogisticRegression,0.8541,0.0114,0.4246,0.0327,0.5663,0.0282
KNeighborsClassifier,0.6853,0.0243,0.3583,0.0182,0.4702,0.0166
DecisionTreeClassifier,0.9884,0.0019,0.9477,0.0034,0.9676,0.0017
RandomForestClassifier,0.9931,0.0017,0.9431,0.0037,0.9675,0.0017
AdaBoostClassifier,0.6946,0.014,0.2921,0.0275,0.4105,0.0273
XGBClassifier,0.9382,0.0156,0.162,0.0239,0.2755,0.0348
GaussianNB,0.3805,0.0235,0.9952,0.0023,0.5501,0.0244


Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.5732,0.1126,0.3044,0.0593,0.3867,0.0361
LinearSVC,0.4841,0.1,0.3532,0.0335,0.4019,0.0348
SVC,0.5802,0.097,0.2004,0.0269,0.2938,0.0266
LogisticRegression,0.6065,0.0873,0.2104,0.0116,0.3107,0.0158
KNeighborsClassifier,0.2836,0.0683,0.1352,0.0258,0.1773,0.0142
DecisionTreeClassifier,0.374,0.0962,0.3365,0.0359,0.3446,0.0324
RandomForestClassifier,0.4939,0.1117,0.2521,0.0386,0.324,0.0194
AdaBoostClassifier,0.4689,0.1266,0.1656,0.0433,0.2414,0.057
XGBClassifier,0.6612,0.0864,0.0638,0.0132,0.1157,0.0215
GaussianNB,0.2995,0.081,0.8065,0.0347,0.4296,0.0798


CPU times: user 2min 7s, sys: 14.8 s, total: 2min 22s
Wall time: 2min 7s


### Set 19

- N-grams: 1 to 3
- Stop words removal: No
- Vocabulary's size: No limits
- Maximum DF: 0.85

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_set19():
  return TfidfVectorizer(
      preprocessor=preprocess, 
      ngram_range=(1, 3), 
      max_features=None, 
      max_df=0.85)


In [None]:
%%time

cross_validation(
    [(mlp, False), (linear_svm, False), (rbf_svm, False), (logistic_regression, False), 
    (knn, False), (decision_tree, False), (random_forest, False), (adaboost, False)], 
    get_tf_idf_set19, 
    'SET 19', 
    verbose_vocab=True)

Starting fold 0
   Learned 202869 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
Starting fold 1
   Learned 202985 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
Starting fold 2
   Learned 183416 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing 

Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.9835,0.0034,0.9174,0.008,0.9493,0.0042
LinearSVC,0.9982,0.0008,0.9716,0.0027,0.9847,0.001
SVC,0.9993,0.0005,0.9446,0.0045,0.9711,0.0025
LogisticRegression,0.9838,0.001,0.3364,0.0683,0.4973,0.0768
KNeighborsClassifier,0.6038,0.0536,0.2148,0.0353,0.3141,0.0383
DecisionTreeClassifier,0.995,0.0008,0.9761,0.0018,0.9854,0.0012
RandomForestClassifier,0.9983,0.0009,0.9727,0.002,0.9853,0.0013
AdaBoostClassifier,0.6891,0.0132,0.3424,0.0324,0.4563,0.0269


Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.6197,0.0863,0.333,0.0584,0.4262,0.0433
LinearSVC,0.6059,0.0828,0.3584,0.0324,0.4456,0.0214
SVC,0.7144,0.0531,0.1222,0.0178,0.2078,0.0254
LogisticRegression,0.7119,0.0945,0.1334,0.0153,0.2238,0.0234
KNeighborsClassifier,0.2403,0.0793,0.0833,0.023,0.1181,0.023
DecisionTreeClassifier,0.4096,0.0894,0.3761,0.042,0.3827,0.0271
RandomForestClassifier,0.6865,0.0984,0.1012,0.0112,0.1752,0.0144
AdaBoostClassifier,0.5188,0.0983,0.2235,0.0383,0.3108,0.0521


CPU times: user 29min 14s, sys: 6min 49s, total: 36min 3s
Wall time: 29min 17s


### Set 20

- N-grams: 1 to 2
- Stop words removal: No
- Vocabulary's size: No limits
- Maximum DF: 0.85

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_set20():
  return TfidfVectorizer(
      preprocessor=preprocess, 
      ngram_range=(1, 2), 
      max_features=None, 
      max_df=0.85)


In [None]:
%%time

cross_validation(
    [(mlp, False), (linear_svm, False), (rbf_svm, False), (logistic_regression, False), 
    (knn, False), (decision_tree, False), (random_forest, False), (adaboost, False)], 
    get_tf_idf_set20, 
    'SET 20', 
    verbose_vocab=True)

Starting fold 0
   Learned 77988 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
Starting fold 1
   Learned 77844 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
Starting fold 2
   Learned 70986 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing mod

Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.9775,0.0029,0.9075,0.0155,0.9411,0.0084
LinearSVC,0.9981,0.0005,0.9695,0.0026,0.9836,0.0011
SVC,0.9988,0.0004,0.9276,0.0043,0.9618,0.0025
LogisticRegression,0.9701,0.0033,0.4324,0.0582,0.5958,0.0558
KNeighborsClassifier,0.6264,0.0341,0.2182,0.0274,0.3218,0.0269
DecisionTreeClassifier,0.995,0.0008,0.9761,0.0018,0.9854,0.0012
RandomForestClassifier,0.998,0.0007,0.9731,0.0021,0.9854,0.0012
AdaBoostClassifier,0.6867,0.0034,0.3505,0.0309,0.4633,0.0273


Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.6042,0.0949,0.3447,0.0494,0.4324,0.0346
LinearSVC,0.5888,0.0788,0.3692,0.0322,0.449,0.0164
SVC,0.7025,0.0676,0.1722,0.0207,0.2751,0.0255
LogisticRegression,0.7021,0.0832,0.1733,0.0172,0.2767,0.0225
KNeighborsClassifier,0.2699,0.0881,0.0876,0.0254,0.1258,0.0317
DecisionTreeClassifier,0.4097,0.0955,0.3715,0.0332,0.3829,0.0402
RandomForestClassifier,0.6655,0.0798,0.0986,0.0105,0.1711,0.016
AdaBoostClassifier,0.5148,0.1088,0.2282,0.0148,0.314,0.0324


CPU times: user 13min 37s, sys: 7min 7s, total: 20min 45s
Wall time: 14min 35s


### Set 21

- N-grams: 1
- Stop words removal: No
- Vocabulary's size: No limits
- Maximum DF: 0.85

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_set21():
  return TfidfVectorizer(
      preprocessor=preprocess, 
      ngram_range=(1, 1), 
      max_features=None, 
      max_df=0.85)


In [None]:
%%time

cross_validation(
    [(mlp, False), (linear_svm, False), (rbf_svm, False), (logistic_regression, False), 
    (knn, False), (decision_tree, False), (random_forest, False), (adaboost, False)], 
    get_tf_idf_set21, 
    'SET 21', 
    verbose_vocab=True)

Starting fold 0
   Learned 10394 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
Starting fold 1
   Learned 10208 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing model: AdaBoostClassifier
Starting fold 2
   Learned 9612 terms.
   Processing model: MLPClassifier
   Processing model: LinearSVC
   Processing model: SVC
   Processing model: LogisticRegression
   Processing model: KNeighborsClassifier
   Processing model: DecisionTreeClassifier
   Processing model: RandomForestClassifier
   Processing mode

Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.9089,0.0215,0.7717,0.0533,0.8342,0.0392
LinearSVC,0.963,0.0044,0.8768,0.0055,0.9179,0.003
SVC,0.9891,0.0026,0.8708,0.0124,0.9261,0.0059
LogisticRegression,0.8998,0.0096,0.4957,0.0378,0.6381,0.0294
KNeighborsClassifier,0.6523,0.042,0.2834,0.0382,0.392,0.0328
DecisionTreeClassifier,0.9945,0.0006,0.9761,0.0018,0.9852,0.0011
RandomForestClassifier,0.9982,0.0005,0.9724,0.0022,0.9851,0.0012
AdaBoostClassifier,0.6988,0.0099,0.3521,0.0226,0.4679,0.0211


Unnamed: 0,Precision,P std,Recall,R std,F1,F1 std
MLPClassifier,0.5774,0.0716,0.3424,0.0216,0.4274,0.0206
LinearSVC,0.5463,0.09,0.3859,0.0337,0.4459,0.0183
SVC,0.6475,0.0793,0.2317,0.0262,0.338,0.0228
LogisticRegression,0.6534,0.0832,0.2384,0.0179,0.3469,0.0154
KNeighborsClassifier,0.2551,0.0663,0.1033,0.0246,0.1417,0.0229
DecisionTreeClassifier,0.4053,0.0816,0.3396,0.0564,0.3635,0.0422
RandomForestClassifier,0.646,0.0613,0.1208,0.011,0.2028,0.014
AdaBoostClassifier,0.5166,0.0959,0.2131,0.0336,0.3003,0.0471


CPU times: user 3min 46s, sys: 1min 25s, total: 5min 11s
Wall time: 3min 51s


### Summary

In [None]:
from IPython.display import display, update_display

pd.set_option("display.max_rows", None)
metrics_df = pd.DataFrame(columns=['Model', 'TF-IDF set', 'Precision', 'P STD', 'Recall', 'R STD', 'F1', 'F1 STD'])
i = 0
for model_name, metrics in test_metrics.items():
  for m in metrics:
    metrics_df.loc[i] = [model_name, m[0], f'{m[1][0]:.4f}', f'{m[2][0]:.4f}', f'{m[1][1]:.4f}', f'{m[2][1]:.4f}', f'{m[1][2]:.4f}', f'{m[2][2]:.4f}']
    i += 1
metrics_display = display(metrics_df, display_id='metrics_table')

Unnamed: 0,Model,TF-IDF set,Precision,P STD,Recall,R STD,F1,F1 STD
0,MLPClassifier,SET 1,0.6197,0.0863,0.333,0.0584,0.4262,0.0433
1,MLPClassifier,SET 2,0.6031,0.1044,0.3622,0.0255,0.4463,0.0153
2,MLPClassifier,SET 3,0.6121,0.0893,0.3522,0.0343,0.4416,0.0221
3,MLPClassifier,SET 4,0.6042,0.0949,0.3447,0.0494,0.4324,0.0346
4,MLPClassifier,SET 5,0.5966,0.1083,0.3704,0.0458,0.449,0.0296
5,MLPClassifier,SET 6,0.6313,0.0949,0.3314,0.0503,0.4266,0.0329
6,MLPClassifier,SET 7,0.5774,0.0716,0.3424,0.0216,0.4274,0.0206
7,MLPClassifier,SET 8,0.5774,0.0716,0.3424,0.0216,0.4274,0.0206
8,MLPClassifier,SET 9,0.5906,0.0851,0.38,0.0666,0.4534,0.0388
9,MLPClassifier,SET 10,0.5436,0.0989,0.3095,0.0402,0.3859,0.0213


###Reference paper:

> Paheli Bhattacharya, Shounak Paul, Kripabandhu Ghosh, Saptarshi Ghosh, and Adam Wyner. 2019. **Identification of Rhetorical Roles of Sentences in Indian Legal Judgments**. In Proc. International Conference on Legal Knowledge and Information Systems (JURIX).

