In [1]:
import import_ipynb
import spacy
#from codalab_shared_task import add_column_names, sort_dataset, load_data, undersample_dataset
import pandas as pd
from spacy.lemmatizer import Lemmatizer


from spacy.lookups import Lookups
lookups = Lookups()
lemmatizer = Lemmatizer(lookups)

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from gensim.models import Word2Vec
import stop_words

import nltk
import stanfordnlp

import warnings
warnings.filterwarnings('ignore')


languages = ['basque','bulgarian','danish','dutch','estonian',
             'german','hungarian','italian','irish','portuguese',
             'russian','serbian','slovene']

folder = 'data/train'



# Load stopwords for all languages

In [2]:
stopwords = {}
folder_st = 'data/stopwords/'

for lang in languages:
    if lang in ['basque', 'estonian','irish','slovene','serbian']:
        filename = folder_st+lang+'.txt'
        #print(filename)
        file = open(filename,'r')
        st = file.read()
        if lang=='serbian':
            st_list = st.split('\n')
        else:
            st_list = st.replace('[','').replace(']','').replace('"','').split(',')
    else:

        st_list = stop_words.get_stop_words(lang)
              
    print(lang, len(st_list))
    stopwords[lang] = st_list

basque 98
bulgarian 259
danish 94
dutch 101
estonian 35
german 232
hungarian 199
italian 308
irish 109
portuguese 203
russian 421
serbian 389
slovene 446


# Load training data for all languages

In [3]:
def load_data(file_path):
    loaded_data = pd.read_csv(file_path, sep='\t', header=None)
    add_column_names(loaded_data)

    return loaded_data

def add_column_names(df):
    column_names = ['word', 'pos', 'def1', 'def2', 'relation']
    df.columns = column_names

def undersample_dataset(imbalanced_set):
    none = imbalanced_set[has_label(imbalanced_set, 'none') == True]
    second_biggest = imbalanced_set.groupby('relation').count().word.sort_values(ascending=False)[1]
    result = imbalanced_set.drop(none.index[second_biggest:])

    return result.sample(frac=1, random_state=7)

def sort_dataset(all_data, dataset_lang):
    lang_data = []
    for key in all_data.keys():
        if dataset_lang in key:
            lang_data.append(all_data[key])
    sorted_sets = list(filter(lambda elem: filter_small_length(elem, 100), sort(lang_data)))
    return sorted_sets


def balance_dataset(sorted_sets, balancing):
    if balancing == 'undersampling':
        #print(len(sorted_sets))
        result = undersample_dataset(sorted_sets)

    else:
        smallest = sorted_sets[0]
        bigger = sorted_sets[1]

        smallest_by_label = categorize_by_label(smallest)
        bigger_by_label = categorize_by_label(bigger)

        result = combine_labels(upsample_from_bigger_set(smallest_by_label, bigger_by_label))

    return result


# REMOVE STOPWORDS AND PUNCTUATION

def load_and_preprocess(dataset_lang):
    data = load_data(folder+'/'+dataset_lang+'.tsv')
    balanced = undersample_dataset(data)  
    clean = clean_punctuation(balanced)
    clean = clean_stopwords(clean, dataset_lang)
    
    return clean

def has_label(df, label):
    return df['relation'] == label

def remove_punctuation(definition):
    cleandef = ''
    for word in nltk.tokenize.word_tokenize(definition):
        if word.isalpha():
            cleandef+=' '+word.lower()
            #print(word)
    if cleandef == '':
        return definition.lower()
    return cleandef

def clean_punctuation(dataset):
    for index, row in dataset.iterrows():
        dataset.at[index,'def1_clean']= remove_punctuation(row['def1'])
        dataset.at[index,'def2_clean'] = remove_punctuation(row['def2'])        
    return dataset
    
    
def remove_stopwords(definition, lang):
    cleandef = ''
    for word in definition.split():
        if word not in stopwords[lang]:
            cleandef+=' '+word
    if cleandef == '':
        return definition
    return cleandef
        

def clean_stopwords(dataset, lang):
    for index, row in dataset.iterrows():
        dataset.at[index,'def1_stop'] = remove_stopwords(row['def1_clean'], lang)
        dataset.at[index,'def2_stop'] = remove_stopwords(row['def2_clean'], lang) 
    return dataset



In [4]:
balanced_data = {}

for lang in languages:
    balanced_data[lang] = load_and_preprocess(lang)
    print(lang, len(balanced_data[lang]))

basque 1094
bulgarian 1623
danish 2520
dutch 93
estonian 2045
german 388
hungarian 1281
italian 781
irish 1547
portuguese 413
russian 709
serbian 1156
slovene 1551


# Create word embeddings for all languages

In [5]:
embeddings = {}

def make_word_embedding(dataset):
    all_definitions = []
    for index, row in dataset.iterrows(): 
        all_definitions.append(nltk.tokenize.word_tokenize(row['def1_stop']))
        all_definitions.append(nltk.tokenize.word_tokenize(row['def2_stop']))

    model = Word2Vec(all_definitions,
                     min_count=1,
                     size=200,
                     workers=2,
                     window=5,
                     iter=30) 
    return model


for lang in balanced_data:
    embeddings[lang] = make_word_embedding(balanced_data[lang])
    print('finished',lang)
        


finished  basque
finished  bulgarian
finished  danish
finished  dutch
finished  estonian
finished  german
finished  hungarian
finished  italian
finished  irish
finished  portuguese
finished  russian
finished  serbian
finished  slovene


# Initialize NLP pipelines for all languages

In [6]:
nlp_pipelines = {}

lang_codes = {'basque':'eu','bulgarian':'bg','danish':'da','dutch':'nl','estonian':'et',
             'german':'de','hungarian':'hu','italian':'it','irish':'ga','portuguese':'pt',
             'russian':'ru','serbian':'sr','slovene':'sl'}

for lang in languages:
    #stanfordnlp.download(lang_codes[lang])
    nlp_pipelines[lang] = stanfordnlp.Pipeline(lang = lang_codes[lang])
    #for definition in balanced_data[lang]['def1_clean'][:10]:
    #    doc = nlp_pipeline(definition)

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/lenka/stanfordnlp_resources/eu_bdt_models/eu_bdt_tokenizer.pt', 'lang': 'eu', 'shorthand': 'eu_bdt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/Users/lenka/stanfordnlp_resources/eu_bdt_models/eu_bdt_tagger.pt', 'pretrain_path': '/Users/lenka/stanfordnlp_resources/eu_bdt_models/eu_bdt.pretrain.pt', 'lang': 'eu', 'shorthand': 'eu_bdt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/Users/lenka/stanfordnlp_resources/eu_bdt_models/eu_bdt_lemmatizer.pt', 'lang': 'eu', 'shorthand': 'eu_bdt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/Users/lenka/stanfordnlp_resources/eu_bdt_models/eu_bdt_parser.pt', 'pretrain_path': '/Users/lenka/stanfordnlp_resources/eu_bdt_models/eu_

---
Loading: lemma
With settings: 
{'model_path': '/Users/lenka/stanfordnlp_resources/it_isdt_models/it_isdt_lemmatizer.pt', 'lang': 'it', 'shorthand': 'it_isdt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/Users/lenka/stanfordnlp_resources/it_isdt_models/it_isdt_parser.pt', 'pretrain_path': '/Users/lenka/stanfordnlp_resources/it_isdt_models/it_isdt.pretrain.pt', 'lang': 'it', 'shorthand': 'it_isdt', 'mode': 'predict'}
Done loading processors!
---
finished italian
Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/lenka/stanfordnlp_resources/ga_idt_models/ga_idt_tokenizer.pt', 'lang': 'ga', 'shorthand': 'ga_idt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/Users/lenka/stanfordnlp_resources/ga_idt_models/ga_idt_tagger.pt', 'pretrain_path':

# Extract features

In [102]:
def first_word_same(row):
    return row['def1_clean'].split(' ')[0].lower() == row['def2_clean'].split(' ')[0].lower()


def difference_in_length(row):
    return abs(len(row['def1_clean'].split(' ')) - len(row['def2_clean'].split(' ')[0]))


def jaccard_sim(row):
    return get_jaccard_sim(row['def1_clean'], row['def2_clean'])

def cos_gensim(row, lang):
    return get_cos_gensim(row['def1_stop'], row['def2_stop'], lang)[0,1]


#import scipy.spatial as spatial
import numpy as np

def get_cos_gensim(def1, def2, lang):
    avg1=[]
    for word in nltk.tokenize.word_tokenize(def1):
        avg1.append(embeddings[lang][word])
        
    avg2=[]
    for word in nltk.tokenize.word_tokenize(def2):
        avg2.append(embeddings[lang][word])
        
    v1 = np.array(avg1).reshape(-1,1)
    v2 = np.array(avg2).reshape(-1,1)

    return cosine_similarity(v1, v2)


def wmd(row, lang):
    return get_wmd(lang, row['def1_stop'], row['def2_stop'])

def get_wmd(lang, def1, def2):
    return embeddings[lang].wmdistance(nltk.tokenize.word_tokenize(def1), nltk.tokenize.word_tokenize(def2))

def cosine(row):
    cos = get_cosine_sim(row['def1_stop'], row['def2_stop'])
    return cos[0, 1]

def get_cosine_sim(*strs):
    vectors = [t for t in get_vectors(*strs)]
    return cosine_similarity(vectors)



def get_jaccard_sim(str1, str2):
    a = set(str1.split())
    b = set(str2.split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def get_vectors(*strs):
    text = [t for t in strs]
    vectorizer = CountVectorizer(text)
    vectorizer.fit(text)
    return vectorizer.transform(text).toarray()

# TODO LEMMA
def root_same(row, lang):
    return root_word_same(row['def1_clean'], row['def2_clean'], lang)

def root_word_same(def1, def2, lang):
    root1 = ''
    root2 = ''
    
    doc1 = nlp_pipelines[lang](def1)
    doc2 = nlp_pipelines[lang](def2)

    for token in doc1.sentences[0].tokens:
        if token.words[0].dependency_relation == 'root':
            root1 = token.words[0].lemma
            break
        
    for token in doc2.sentences[0].tokens:
        if token.words[0].dependency_relation == 'root':
            root2 = token.words[0].lemma
            break
            
    #print(root1, root2, root1==root2)
    return root1==root2
    

In [103]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf(col1, col2):
    tfidf_holder = pd.DataFrame()
    tfidf_holder['col1'] = col1
    tfidf_holder['col2'] = col2

    values = join_definitions(col1, col2)
    tfidf_holder['tfidf_1'], tfidf_holder['tfidf_2'] = tfidf_vectors(values)

    return tfidf_holder.apply(lambda row: cosine_similarity([row['tfidf_1'], row['tfidf_2']])[0, 1], axis=1)


def convert_to_text(token_array):
    seperator = ' '
    return seperator.join(token_array)


def join_definitions(col1, col2):
    joined_definitions = pd.concat([col1, col2])
    return joined_definitions.apply(lambda tokens: ' '.join(tokens)).values.T


def tfidf_vectors(values):
    tfidf_matrix = TfidfVectorizer().fit_transform(values)

    split_index = int(tfidf_matrix.get_shape()[0] / 2)
    tfidf_array = tfidf_matrix.todense()

    df_result1 = [row.tolist()[0] for row in tfidf_array[0:split_index]]
    df_result2 = [row.tolist()[0] for row in tfidf_array[split_index:]]

    return df_result1, df_result2




In [104]:
def extract_features(data, feats_to_scale, lang):
    #def sentence2vec(row):
    #    return row['processed_1'].similarity(row['processed_2'])

    feat = pd.DataFrame()
    #print(data)
    #feat['similarities'] = data.apply(lambda row: sentence2vec(row), axis=1)
    feat['first_word_same'] = data.apply(lambda row: first_word_same(row), axis=1)
    feat['len_diff'] = data.apply(lambda row: difference_in_length(row), axis=1)
    feat['jaccard'] = data.apply(lambda row: jaccard_sim(row), axis=1)
    feat['cos'] = data.apply(lambda row: cosine(row), axis=1)
    feat['wmd'] = data.apply(lambda row: wmd(row, lang), axis=1)
    #feat['jaccard_gensim'] = data.apply(lambda row: jaccard_gensim(row, lang), axis=1)
    feat['cos_gensim'] = data.apply(lambda row: cos_gensim(row, lang), axis=1)
    
    feat['root_same'] = data.apply(lambda row: root_same(row, lang), axis=1)
    #feat['diff_pos_count'] = data.apply(lambda row: diff_pos_count(row), axis = 1)
    #feat['tfidf_similarity'] = tfidf(data['def1_stop'], data['def2_stop'])

    for c_name in feats_to_scale:
        feat[c_name] = preprocessing.scale(feat[c_name])

    return feat

In [105]:
features = {}
labels = {}

for lang in languages:
    #balanced_data[lang] = load_and_preprocess(lang)
    #print(lang, len(balanced_data[lang]))
    features[lang] = extract_features(balanced_data[lang],[], lang)
    labels[lang] = balanced_data[lang]['relation']
    print('extracted features for', lang)


extracted features for basque
extracted features for bulgarian
extracted features for danish
extracted features for dutch
extracted features for estonian
extracted features for german
extracted features for hungarian
extracted features for italian
extracted features for irish
extracted features for portuguese
extracted features for russian
extracted features for serbian
extracted features for slovene


# Train models

In [110]:
def train(data, with_testset=False):
    #train_and_test_classifiers(data['nltk']['trainset'], data['nltk']['testset'])
    trained_models = train_models_sklearn(data['pd']['x_trainset'],
                                          data['pd']['y_trainset'])
    cross_val_models(trained_models, data['pd']['x_trainset'],
                     data['pd']['y_trainset'])

    if with_testset:
        compare_on_testset(trained_models, data['pd']['x_testset'],
                           data['pd']['y_testset'])

        

def cross_val_models(models, x_train, y_train):
    for estimator in models:
        run_cv_with_dataset(estimator, x_train, y_train)

    
def run_cv_with_dataset(model, trainset, y_train):
    scores = cross_val_score(model, trainset, y_train, cv=5)
    print('Cross validation scores for model' + model.__class__.__name__ + '\n')
    print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2) + '\n')

        
def train_models_sklearn(x_train, y_train):
    lr = {'estimator': LogisticRegression(solver='lbfgs', multi_class='auto', max_iter = 500), 'parameters': {}}
    svm_model = {
        'estimator': SVC(),
        'parameters': {
            'C': [3, 5, 10],
            'kernel': ['rbf', 'linear', 'poly', 'sigmoid']
        }
    }
    rf = {
        'estimator': RandomForestClassifier(),
        'parameters': {
            'bootstrap': [True],
            'max_depth': [2, 3, 5, 7, 10],
            'max_features': [2, 3],
            'min_samples_leaf': [3, 4, 5],
            'min_samples_split': [2, 5, 8, 10, 12],
            'n_estimators': [50, 100, 200]
        }
    }
    dt = {'estimator': DecisionTreeClassifier(), 'parameters': {}}

    models = {'unscaled': [lr, rf]}

    tuned_models = tune_hyperparams(models, x_train, y_train)

    return tuned_models


def tune_hyperparams(estimators, x_train, y_train):
    result = []
    for estimator in estimators['unscaled']:
        params = estimator['parameters']

        scores = ['precision', 'recall', 'f1']

        for score in scores:
            print("# Tuning hyper-parameters for %s" % score)
            print()

            grid_search = GridSearchCV(estimator=estimator['estimator'], param_grid=params,
                                       scoring='%s_weighted' % score, cv=5,
                                       n_jobs=-1, verbose=1)

            print("Performing grid search...")
            print("parameters:")
            pprint(params)
            grid_search.fit(x_train, y_train)
            print()

            means = grid_search.cv_results_['mean_test_score']
            stds = grid_search.cv_results_['std_test_score']
            print('Precision: \n')
            #for mean, std, parameters in zip(means, stds, grid_search.cv_results_['params']):
            #    print("%0.3f (+/-%0.03f) for %r"
            #                      % (mean, std * 2, parameters) + '\n')

            print("Best score: %0.3f" % grid_search.best_score_ + '\n')
            print("Best parameters set:\n")
            best_parameters = grid_search.best_estimator_.get_params()
            for param_name in sorted(params.keys()):
                print("\t%s: %r" % (param_name, best_parameters[param_name]) + '\n')

            result.append(grid_search.best_estimator_)

    return result



# Run models

In [111]:
###### from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from pprint import pprint

def split_data(featuresets):
    f = int(len(featuresets) / 5)
    return featuresets[f:], featuresets[:f]



for lang in features:
    
    print('*************************')
    print(lang)
    print('*************************')
    #train_set, test_set = split_data(features[lang])
    
    data = {'pd':{}}
    
    data['pd']['x_trainset'] = features[lang]
    data['pd']['y_trainset'] = labels[lang]
    
    train(data)
    
    #print(len(features[lang]))
    
    

*************************
basque
*************************
# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.458

Best parameters set:

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.522

Best parameters set:

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished



Precision: 

Best score: 0.420

Best parameters set:

# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   29.4s
[Parallel(n_jobs=-1)]: Done 808 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1158 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1608 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 2158 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.476

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 2

	min_samples_leaf: 5

	min_samples_split: 10

	n_estimators: 50

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   31.0s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 2133 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.528

Best parameters set:

	bootstrap: True

	max_depth: 5

	max_features: 2

	min_samples_leaf: 3

	min_samples_split: 2

	n_estimators: 50

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   32.6s
[Parallel(n_jobs=-1)]: Done 817 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1167 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1617 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 2167 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.6min finished



Precision: 

Best score: 0.454

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 2

	n_estimators: 100

Cross validation scores for modelLogisticRegression

Accuracy: 0.5220 (+/- 0.0261)

Cross validation scores for modelLogisticRegression

Accuracy: 0.5220 (+/- 0.0261)

Cross validation scores for modelLogisticRegression

Accuracy: 0.5220 (+/- 0.0261)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.5201 (+/- 0.0223)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.5201 (+/- 0.0261)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.5155 (+/- 0.0356)

*************************
bulgarian
*************************
# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.7s finished



Precision: 

Best score: 0.316

Best parameters set:

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.7s finished



Precision: 

Best score: 0.475

Best parameters set:

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.8s finished



Precision: 

Best score: 0.374

Best parameters set:

# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 303 tasks      | elapsed:   32.3s
[Parallel(n_jobs=-1)]: Done 553 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 903 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1353 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1903 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.8min finished



Precision: 

Best score: 0.418

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 8

	n_estimators: 200

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   30.8s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1262 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1712 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.9min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.487

Best parameters set:

	bootstrap: True

	max_depth: 5

	max_features: 2

	min_samples_leaf: 3

	min_samples_split: 5

	n_estimators: 100

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 354 tasks      | elapsed:   33.6s
[Parallel(n_jobs=-1)]: Done 604 tasks      | elapsed:   58.2s
[Parallel(n_jobs=-1)]: Done 954 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1404 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1954 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.8min finished



Precision: 

Best score: 0.401

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 2

	n_estimators: 50

Cross validation scores for modelLogisticRegression

Accuracy: 0.4751 (+/- 0.0234)

Cross validation scores for modelLogisticRegression

Accuracy: 0.4751 (+/- 0.0234)

Cross validation scores for modelLogisticRegression

Accuracy: 0.4751 (+/- 0.0234)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.4683 (+/- 0.0393)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.4837 (+/- 0.0343)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.4701 (+/- 0.0205)

*************************
danish
*************************
# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.1s finished



Precision: 

Best score: 0.506

Best parameters set:

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.1s finished



Precision: 

Best score: 0.627

Best parameters set:

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.1s finished



Precision: 

Best score: 0.558

Best parameters set:

# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   36.0s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1257 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 1707 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  4.4min finished



Precision: 

Best score: 0.608

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 4

	min_samples_split: 5

	n_estimators: 100

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done 688 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1038 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1488 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 2038 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  4.5min finished



Precision: 

Best score: 0.650

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 4

	min_samples_split: 5

	n_estimators: 100

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 281 tasks      | elapsed:   31.1s
[Parallel(n_jobs=-1)]: Done 531 tasks      | elapsed:   59.8s
[Parallel(n_jobs=-1)]: Done 881 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1331 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 1881 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  4.8min finished



Precision: 

Best score: 0.582

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 5

	min_samples_split: 10

	n_estimators: 100

Cross validation scores for modelLogisticRegression

Accuracy: 0.6273 (+/- 0.0373)

Cross validation scores for modelLogisticRegression

Accuracy: 0.6273 (+/- 0.0373)

Cross validation scores for modelLogisticRegression

Accuracy: 0.6273 (+/- 0.0373)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.6468 (+/- 0.0185)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.6420 (+/- 0.0281)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.6444 (+/- 0.0255)

*************************
dutch
*************************
# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.747

Best parameters set:

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.742

Best parameters set:

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.736

Best parameters set:

# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   25.2s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:   58.4s
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  2.6min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.858

Best parameters set:

	bootstrap: True

	max_depth: 2

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 2

	n_estimators: 50

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   25.0s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:   58.3s
[Parallel(n_jobs=-1)]: Done 1413 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1863 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  2.8min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.860

Best parameters set:

	bootstrap: True

	max_depth: 2

	max_features: 2

	min_samples_leaf: 5

	min_samples_split: 2

	n_estimators: 50

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:   57.2s
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  2.5min finished



Precision: 

Best score: 0.854

Best parameters set:

	bootstrap: True

	max_depth: 2

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 8

	n_estimators: 100

Cross validation scores for modelLogisticRegression

Accuracy: 0.7429 (+/- 0.1518)

Cross validation scores for modelLogisticRegression

Accuracy: 0.7429 (+/- 0.1518)

Cross validation scores for modelLogisticRegression

Accuracy: 0.7429 (+/- 0.1518)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.8508 (+/- 0.2708)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.8111 (+/- 0.2948)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.8508 (+/- 0.2708)

*************************
estonian
*************************
# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.9s finished



Precision: 

Best score: 0.623

Best parameters set:

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.9s finished



Precision: 

Best score: 0.680

Best parameters set:

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.9s finished



Precision: 

Best score: 0.646

Best parameters set:

# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   31.3s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1301 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1751 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.7min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.675

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 10

	n_estimators: 50

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   33.3s
[Parallel(n_jobs=-1)]: Done 848 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1198 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1648 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 2198 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.7min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.685

Best parameters set:

	bootstrap: True

	max_depth: 5

	max_features: 2

	min_samples_leaf: 3

	min_samples_split: 5

	n_estimators: 100

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   33.3s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.7min finished



Precision: 

Best score: 0.650

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 2

	min_samples_leaf: 3

	min_samples_split: 2

	n_estimators: 50

Cross validation scores for modelLogisticRegression

Accuracy: 0.6797 (+/- 0.0224)

Cross validation scores for modelLogisticRegression

Accuracy: 0.6797 (+/- 0.0224)

Cross validation scores for modelLogisticRegression

Accuracy: 0.6797 (+/- 0.0224)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.6743 (+/- 0.0230)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.6782 (+/- 0.0210)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.6807 (+/- 0.0265)

*************************
german
*************************
# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.483

Best parameters set:

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.482

Best parameters set:

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.453

Best parameters set:

# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   26.8s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  2.9min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.628

Best parameters set:

	bootstrap: True

	max_depth: 3

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 12

	n_estimators: 50

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   25.7s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  2.8min finished



Precision: 

Best score: 0.580

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 10

	n_estimators: 200

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   29.7s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1298 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1748 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.2min finished



Precision: 

Best score: 0.568

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 2

	n_estimators: 50

Cross validation scores for modelLogisticRegression

Accuracy: 0.4822 (+/- 0.1274)

Cross validation scores for modelLogisticRegression

Accuracy: 0.4822 (+/- 0.1274)

Cross validation scores for modelLogisticRegression

Accuracy: 0.4822 (+/- 0.1274)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.5411 (+/- 0.1120)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.5594 (+/- 0.0877)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.5672 (+/- 0.0786)

*************************
hungarian
*************************
# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished



Precision: 

Best score: 0.537

Best parameters set:

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.8s finished



Precision: 

Best score: 0.597

Best parameters set:

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.9s finished



Precision: 

Best score: 0.542

Best parameters set:

# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   41.7s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.6min finished



Precision: 

Best score: 0.649

Best parameters set:

	bootstrap: True

	max_depth: 3

	max_features: 3

	min_samples_leaf: 5

	min_samples_split: 2

	n_estimators: 200

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   32.0s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1291 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1741 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.8min finished



Precision: 

Best score: 0.614

Best parameters set:

	bootstrap: True

	max_depth: 5

	max_features: 3

	min_samples_leaf: 4

	min_samples_split: 2

	n_estimators: 200

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   22.2s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   45.9s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.8min finished



Precision: 

Best score: 0.571

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 4

	min_samples_split: 5

	n_estimators: 50

Cross validation scores for modelLogisticRegression

Accuracy: 0.5971 (+/- 0.0527)

Cross validation scores for modelLogisticRegression

Accuracy: 0.5971 (+/- 0.0527)

Cross validation scores for modelLogisticRegression

Accuracy: 0.5971 (+/- 0.0527)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.5988 (+/- 0.0219)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.6096 (+/- 0.0486)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.5986 (+/- 0.0499)

*************************
italian
*************************
# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.464

Best parameters set:

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.557

Best parameters set:

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.470

Best parameters set:

# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   31.3s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2083 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.4min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.484

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 8

	n_estimators: 50

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   29.5s
[Parallel(n_jobs=-1)]: Done 791 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1141 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1591 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 2141 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.9min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.560

Best parameters set:

	bootstrap: True

	max_depth: 3

	max_features: 2

	min_samples_leaf: 4

	min_samples_split: 2

	n_estimators: 50

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   28.3s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  2.9min finished



Precision: 

Best score: 0.478

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 8

	n_estimators: 200

Cross validation scores for modelLogisticRegression

Accuracy: 0.5569 (+/- 0.0191)

Cross validation scores for modelLogisticRegression

Accuracy: 0.5569 (+/- 0.0191)

Cross validation scores for modelLogisticRegression

Accuracy: 0.5569 (+/- 0.0191)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.5455 (+/- 0.0146)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.5556 (+/- 0.0386)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.5494 (+/- 0.0334)

*************************
irish
*************************
# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.8s finished



Precision: 

Best score: 0.644

Best parameters set:

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.0s finished



Precision: 

Best score: 0.745

Best parameters set:

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.8s finished



Precision: 

Best score: 0.688

Best parameters set:

# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   30.1s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.3min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.713

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 2

	min_samples_leaf: 4

	min_samples_split: 12

	n_estimators: 50

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   30.0s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.3min finished



Precision: 

Best score: 0.771

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 8

	n_estimators: 100

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   38.5s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  4.9min finished



Precision: 

Best score: 0.725

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 4

	min_samples_split: 2

	n_estimators: 100

Cross validation scores for modelLogisticRegression

Accuracy: 0.7441 (+/- 0.0211)

Cross validation scores for modelLogisticRegression

Accuracy: 0.7441 (+/- 0.0211)

Cross validation scores for modelLogisticRegression

Accuracy: 0.7441 (+/- 0.0211)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.7609 (+/- 0.0315)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.7667 (+/- 0.0267)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.7635 (+/- 0.0250)

*************************
portuguese
*************************
# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished



Precision: 

Best score: 0.713

Best parameters set:

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.7s finished



Precision: 

Best score: 0.823

Best parameters set:

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.7s finished



Precision: 

Best score: 0.764

Best parameters set:

# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   24.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   56.2s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  6.2min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.765

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 2

	n_estimators: 50

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   36.4s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 1341.2min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed: 1341.9min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.838

Best parameters set:

	bootstrap: True

	max_depth: 7

	max_features: 2

	min_samples_leaf: 3

	min_samples_split: 2

	n_estimators: 50

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   32.0s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.0min finished



Precision: 

Best score: 0.793

Best parameters set:

	bootstrap: True

	max_depth: 7

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 5

	n_estimators: 50

Cross validation scores for modelLogisticRegression

Accuracy: 0.8239 (+/- 0.0535)

Cross validation scores for modelLogisticRegression

Accuracy: 0.8239 (+/- 0.0535)

Cross validation scores for modelLogisticRegression

Accuracy: 0.8239 (+/- 0.0535)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.8335 (+/- 0.0487)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.8239 (+/- 0.0477)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.8305 (+/- 0.0238)

*************************
russian
*************************
# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.473

Best parameters set:

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.612

Best parameters set:

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.527

Best parameters set:

# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1406 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1856 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.3min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.566

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 2

	n_estimators: 100

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   31.9s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.3min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.616

Best parameters set:

	bootstrap: True

	max_depth: 2

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 12

	n_estimators: 50

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   29.7s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.2min finished



Precision: 

Best score: 0.541

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 2

	n_estimators: 50

Cross validation scores for modelLogisticRegression

Accuracy: 0.6122 (+/- 0.0523)

Cross validation scores for modelLogisticRegression

Accuracy: 0.6122 (+/- 0.0523)

Cross validation scores for modelLogisticRegression

Accuracy: 0.6122 (+/- 0.0523)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.5980 (+/- 0.0260)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.6079 (+/- 0.0560)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.6051 (+/- 0.0414)

*************************
serbian
*************************
# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished



Precision: 

Best score: 0.349

Best parameters set:

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished



Precision: 

Best score: 0.482

Best parameters set:

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished



Precision: 

Best score: 0.400

Best parameters set:

# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   31.9s
[Parallel(n_jobs=-1)]: Done 872 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1222 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1672 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 2222 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.6min finished



Precision: 

Best score: 0.498

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 2

	n_estimators: 200

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.4min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.492

Best parameters set:

	bootstrap: True

	max_depth: 2

	max_features: 2

	min_samples_leaf: 3

	min_samples_split: 10

	n_estimators: 50

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 370 tasks      | elapsed:   35.8s
[Parallel(n_jobs=-1)]: Done 620 tasks      | elapsed:   57.8s
[Parallel(n_jobs=-1)]: Done 970 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1420 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 1970 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.5min finished



Precision: 

Best score: 0.430

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 5

	n_estimators: 100

Cross validation scores for modelLogisticRegression

Accuracy: 0.4826 (+/- 0.0273)

Cross validation scores for modelLogisticRegression

Accuracy: 0.4818 (+/- 0.0258)

Cross validation scores for modelLogisticRegression

Accuracy: 0.4826 (+/- 0.0273)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.4879 (+/- 0.0166)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.4835 (+/- 0.0258)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.4870 (+/- 0.0287)

*************************
slovene
*************************
# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.8s finished



Precision: 

Best score: 0.468

Best parameters set:

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.9s finished



Precision: 

Best score: 0.521

Best parameters set:

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.9s finished



Precision: 

Best score: 0.478

Best parameters set:

# Tuning hyper-parameters for precision

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 234 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done 484 tasks      | elapsed:   44.0s
[Parallel(n_jobs=-1)]: Done 834 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1284 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1834 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.7min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.598

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 2

	n_estimators: 100

# Tuning hyper-parameters for recall

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 2225 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.



Precision: 

Best score: 0.603

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 2

	n_estimators: 100

# Tuning hyper-parameters for f1

Performing grid search...
parameters:
{'bootstrap': [True],
 'max_depth': [2, 3, 5, 7, 10],
 'max_features': [2, 3],
 'min_samples_leaf': [3, 4, 5],
 'min_samples_split': [2, 5, 8, 10, 12],
 'n_estimators': [50, 100, 200]}
Fitting 5 folds for each of 450 candidates, totalling 2250 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   32.8s
[Parallel(n_jobs=-1)]: Done 750 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1100 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1550 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 2100 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 2250 out of 2250 | elapsed:  3.7min finished



Precision: 

Best score: 0.568

Best parameters set:

	bootstrap: True

	max_depth: 10

	max_features: 3

	min_samples_leaf: 3

	min_samples_split: 8

	n_estimators: 100

Cross validation scores for modelLogisticRegression

Accuracy: 0.5204 (+/- 0.0666)

Cross validation scores for modelLogisticRegression

Accuracy: 0.5204 (+/- 0.0666)

Cross validation scores for modelLogisticRegression

Accuracy: 0.5204 (+/- 0.0666)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.5990 (+/- 0.0317)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.6010 (+/- 0.0395)

Cross validation scores for modelRandomForestClassifier

Accuracy: 0.5925 (+/- 0.0345)

