## Preprocessing

At this stage, first of all, wrapper functions of multiple different stemming algorithms were written for the data we have.

Before a preprocess operation, separators, operators, punctuations and non-printable characters were removed. Then on the basis of being optional, a normalization can be performed with or without a stop count, or with a different stemming type option.

As a matter of fact, at the last stage, we will compare the result with different parameters in this way.

In [11]:
import re
from trstop import trstop
import string
from typing import List
from jpype import JClass, JString, getDefaultJVMPath, shutdownJVM, startJVM, java
from examples import DATA_PATH, ZEMBEREK_PATH
from pathlib import Path

startJVM(getDefaultJVMPath(), '-ea',
         '-Djava.class.path=%s' % (ZEMBEREK_PATH))

TurkishMorphology: JClass = JClass('zemberek.morphology.TurkishMorphology')
TurkishSentenceNormalizer: JClass = JClass(
    'zemberek.normalization.TurkishSentenceNormalizer'
)


Paths: JClass = JClass('java.nio.file.Paths')


def stem(text: str) -> str:
    morphology = TurkishMorphology.createWithDefaults()

    analysis: java.util.ArrayList = (
        morphology.analyzeAndDisambiguate(text).bestAnalysis()
    )

    pos: List[str] = []
    for i, analysis in enumerate(analysis, start=1):
        pos.append(
            f'{str(analysis.getLemmas()[0])}'
        )
    return ' '.join(pos)


def normalize(text: str) -> str:

    normalizer = TurkishSentenceNormalizer(
        TurkishMorphology.createWithDefaults(),
        Paths.get(str(DATA_PATH.joinpath('normalization'))),
        Paths.get(str(DATA_PATH.joinpath('lm', 'lm.2gram.slm'))),
    )

    return normalizer.normalize(JString(text))


def fps(text: str, n) -> str:
    return ' '.join([w[: n] for w in text.split()])


def preprocess(x, stemming=None):
    x = x.strip()
    x = normalize(x)
    x = remove_punctuation(x)
    x = tokenize(x)
    x = remove_stopwords(x)
    if stemming == 'zemberek':
        x = tokenize(stem(' '.join(x)))
    elif stemming == 'fps5':
        x = tokenize(fps(' '.join(x), 5))
    elif stemming == 'fps7':
        x = tokenize(fps(' '.join(x), 7))

    return x


def remove_punctuation(x):
    return ''.join([w for w in x if w not in string.punctuation])


def tokenize(x):
    return re.split(r'\W+', x)


def remove_stopwords(x):
    return [w for w in x if not trstop.is_stop_word(w)]


## Example for Preprocessing

In [3]:
file_name = 'TTC-3600/TTC-3600_Orj/ekonomi/c (1).txt'

with open(file_name) as file:
    text = file.read()


text

'\ufeff \n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t HGS İhlalli Geçiş Bilgileri Sorgulama Hizmeti ile ihlalli geçişlerin olup olmadığının sorgulanacağının belirtildiği açıklamada, "İhlalli geçişleri var ise son 5 tanesini ve ihlalli geçiş ücretleriyle ihlal ceza tutarlarını görebilmektedir.   Bu hizmetle HGS kullanıcılarının geçiş tarihinden itibaren 15 günlük yasal süresi içinde varsa ihlallerini görerek yeterli bakiyeyi yatırması ve geçişlerin cezaya girmesinin önlenmesi amaçlanmaktadır" ifadesi kullanıldı. \t\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t'

In [4]:
preprocess(text)

['',
 'hgs',
 'ihlalli',
 'ihlali',
 'geçişlerin',
 'sorgulanacağının',
 'belirtildiği',
 'ihlali',
 'geçişleri',
 '5',
 'tanesini',
 'ihlali',
 'ücretleriyle',
 'tutarlarını',
 'görebilmektedir',
 'hizmetle',
 'hgs',
 'kullanıcılarının',
 '15',
 'ihlallerini',
 'bakiyeyi',
 'yatırması',
 'geçişlerin',
 'cezaya',
 'girmesinin',
 'önlenmesi',
 'amaçlanmaktadır',
 '']

In [5]:
preprocess(text, stemming='zemberek')

['hgs',
 'ihlal',
 'ihlal',
 'geç',
 'sorgula',
 'belir',
 'ihlal',
 'geç',
 '5',
 'tane',
 'ihlal',
 'ücret',
 'tutar',
 'gör',
 'hizmet',
 'hgs',
 'kullan',
 '15',
 'ihlal',
 'bakiye',
 'yatır',
 'geç',
 'ceza',
 'gir',
 'önle',
 'amaçla']

In [6]:
preprocess(text, stemming='fps5')

['hgs',
 'ihlal',
 'ihlal',
 'geçiş',
 'sorgu',
 'belir',
 'ihlal',
 'geçiş',
 '5',
 'tanes',
 'ihlal',
 'ücret',
 'tutar',
 'göreb',
 'hizme',
 'hgs',
 'kulla',
 '15',
 'ihlal',
 'bakiy',
 'yatır',
 'geçiş',
 'cezay',
 'girme',
 'önlen',
 'amaçl']

In [7]:
preprocess(text, stemming='fps7')

['hgs',
 'ihlalli',
 'ihlali',
 'geçişle',
 'sorgula',
 'belirti',
 'ihlali',
 'geçişle',
 '5',
 'tanesin',
 'ihlali',
 'ücretle',
 'tutarla',
 'görebil',
 'hizmetl',
 'hgs',
 'kullanı',
 '15',
 'ihlalle',
 'bakiyey',
 'yatırma',
 'geçişle',
 'cezaya',
 'girmesi',
 'önlenme',
 'amaçlan']

In [1]:
from sklearn.feature_selection import SelectKBest, chi2

from time import time

from collections import Counter

import pandas as pd

from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import KFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.exceptions import ConvergenceWarning

import os
import numpy as np
import pickle
from test import *



def save_dataset(preprocess, save=None):
    dataset = 'TTC-3600/TTC-3600_Orj'

    X, y = [],  []
    for root, directories, files in os.walk(dataset):
        for directory in directories:
            for parent, _, files in os.walk(dataset + '/' + directory):
                y += [directories.index(directory)] * len(files)
                i = 0
                for file in files:
                    with open(parent + '/' + file) as f:
                        X.append(preprocess(f.read()))
                    print('{}/{}'.format(len(X), 3600))

    y = np.array(y)

    if save:
        with open(save, 'wb') as file:
            pickle.dump((X, y), file)

    return X, y


def load_dataset(ds):
    objects = []
    with (open('processed/' + ds, 'rb')) as openfile:
        while True:
            try:
                objects.append(pickle.load(openfile))
            except EOFError:
                break
    return objects[0]

In [2]:
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [3]:
class Classifier:
    __RANDOM_STATE = 42

    def __init__(self, dataset, method, max_features=8000, n_fea='all', vector='tfidf'):
        self.dataset = dataset
        self.vector = vector
        self.X, self.y = load_dataset(dataset)
        self.X = np.array([strip_numbers(x) for x in self.X])
        self.max_features = max_features
        self.n_fea = n_fea
        self.method = method

        if method == 'NB':
            self.model = MultinomialNB()
        elif method == 'RF':
            self.model = RandomForestClassifier(
                max_depth=128, random_state=self.__RANDOM_STATE)
        elif method == 'SVM LINEAR':
            self.model = LinearSVC(max_iter=1000)
        elif method == 'SVM RBF':
            self.model = SVC(kernel='rbf', gamma=1, cache_size=7000)
        elif method == 'KNN':
            self.model = KNeighborsClassifier(n_neighbors=5)
        elif method == 'CART':
            self.model = DecisionTreeClassifier()
        elif method == 'ROCCHIO':
            self.model = NearestCentroid()
        elif method == 'LR':
            self.model = LogisticRegression(C=1.0)


        if vector == 'tfidf':
            self.vectorizer = TfidfVectorizer(max_features=self.max_features)
        elif vector == 'bagofwords':
            self.vectorizer = CountVectorizer(max_features=self.max_features)

    def fit(self):
        cv = KFold(n_splits=10, shuffle=True, random_state=0)
        score, train_time, test_time = [], [], []
        for train_index, test_index in cv.split(self.X):
            X_train, X_test, y_train, y_test = self.X[train_index], self.X[
                test_index], self.y[train_index], self.y[test_index]

            X_train = self.vectorizer.fit_transform(X_train).toarray()
            X_test = self.vectorizer.transform(X_test).toarray()

            X_train, X_test = self.select_features(
                X_train, y_train, X_test, k=self.n_fea)

            results = self.benchmark(X_train, y_train, X_test, y_test)
            score.append(results[0])
            train_time.append(results[1])
            test_time.append(results[2])

        self.print_benchmark(np.mean(score), np.mean(
            train_time), np.mean(test_time))

    def cfmatrix(self, y_test, y_pred):
        # plot the confusion matrix
        mat = confusion_matrix(y_test, y_pred)
        sns.heatmap(mat.T, square=True, annot=True, fmt='d',
                    xticklabels=target_names, yticklabels=target_names)
        plt.xlabel('true labels')
        plt.ylabel('predicted label')
        plt.show()

    def select_features(self, X_train, y_train, X_test, k):
        if k == 'all':
            return X_train, X_test

        selector = SelectKBest(chi2, k=k)
        selector.fit(X_train, y_train)
        X_train = selector.transform(X_train)
        X_test = selector.transform(X_test)
        return X_train, X_test

    def benchmark(self, X_train, y_train, X_test, y_test):
        '''
        benchmark based on f1 score
        '''
        t0 = time()
        self.model.fit(X_train, y_train)
        train_time = time() - t0

        t0 = time()
        y_pred = self.model.predict(X_test)
        test_time = time() - t0

        score = metrics.f1_score(y_test, y_pred, average='micro')
        return score, train_time, test_time

    def print_benchmark(self, score, train_time, test_time):
        print('\nmethod: ', self.method)
        print('dataset: ' + self.dataset)
        print('vector: ' + self.vector)
        print('features:\t{0}'.format(self.n_fea))
        print('train time: {0:0.4f}s'.format(train_time))
        print('test time:  {0:0.4f}s'.format(test_time))
        print('f1-score:   {0:0.4f}'.format(score))

In [4]:
def score(method):
    # preprocessing features
    datasets = ['originalds', 'zembds', 'f5ds', 'f7ds', 'originalds_stopword',
                'zembds_stopword', 'f5ds_stopword', 'f7ds_stopword']

    # post processing features
    vectors = ['tfidf', 'bagofwords']
    n_features = [500, 1000, 2000, 5000, 'all']

    
    for dataset in datasets:
        for vector in vectors:
            for n_fea in n_features:
                cf = Classifier(dataset=dataset, method=method,
                                vector=vector, n_fea=n_fea)
                cf.fit()

In [8]:
score('NB')


method:  NB
dataset: originalds
vector: tfidf
features:	500
train time: 0.0076s
test time:  0.0008s
f1-score:   0.8903

method:  NB
dataset: originalds
vector: tfidf
features:	1000
train time: 0.0253s
test time:  0.0021s
f1-score:   0.9089

method:  NB
dataset: originalds
vector: tfidf
features:	2000
train time: 0.0376s
test time:  0.0024s
f1-score:   0.9208

method:  NB
dataset: originalds
vector: tfidf
features:	5000
train time: 0.0791s
test time:  0.0054s
f1-score:   0.9261

method:  NB
dataset: originalds
vector: tfidf
features:	all
train time: 0.1686s
test time:  0.0078s
f1-score:   0.9256

method:  NB
dataset: originalds
vector: bagofwords
features:	500
train time: 0.0137s
test time:  0.0011s
f1-score:   0.8914

method:  NB
dataset: originalds
vector: bagofwords
features:	1000
train time: 0.0289s
test time:  0.0026s
f1-score:   0.9083

method:  NB
dataset: originalds
vector: bagofwords
features:	2000
train time: 0.0605s
test time:  0.0045s
f1-score:   0.9178

method:  NB
dataset


method:  NB
dataset: f5ds_stopword
vector: bagofwords
features:	2000
train time: 0.0614s
test time:  0.0043s
f1-score:   0.9081

method:  NB
dataset: f5ds_stopword
vector: bagofwords
features:	5000
train time: 0.1814s
test time:  0.0155s
f1-score:   0.9194

method:  NB
dataset: f5ds_stopword
vector: bagofwords
features:	all
train time: 0.8055s
test time:  0.0110s
f1-score:   0.9228

method:  NB
dataset: f7ds_stopword
vector: tfidf
features:	500
train time: 0.0194s
test time:  0.0028s
f1-score:   0.8581

method:  NB
dataset: f7ds_stopword
vector: tfidf
features:	1000
train time: 0.0220s
test time:  0.0024s
f1-score:   0.8850

method:  NB
dataset: f7ds_stopword
vector: tfidf
features:	2000
train time: 0.0636s
test time:  0.0047s
f1-score:   0.9047

method:  NB
dataset: f7ds_stopword
vector: tfidf
features:	5000
train time: 0.1389s
test time:  0.0115s
f1-score:   0.9189

method:  NB
dataset: f7ds_stopword
vector: tfidf
features:	all
train time: 0.1527s
test time:  0.0084s
f1-score:   0.9

In [13]:
score('RF')


method:  RF
dataset: originalds
vector: tfidf
features:	500
train time: 1.6664s
test time:  0.0307s
f1-score:   0.8783

method:  RF
dataset: originalds
vector: tfidf
features:	1000
train time: 1.9665s
test time:  0.0270s
f1-score:   0.8825

method:  RF
dataset: originalds
vector: tfidf
features:	2000
train time: 2.7796s
test time:  0.0282s
f1-score:   0.8875

method:  RF
dataset: originalds
vector: tfidf
features:	5000
train time: 4.0341s
test time:  0.0304s
f1-score:   0.8958

method:  RF
dataset: originalds
vector: tfidf
features:	all
train time: 8.0976s
test time:  0.0506s
f1-score:   0.8903

method:  RF
dataset: originalds
vector: bagofwords
features:	500
train time: 1.2587s
test time:  0.0220s
f1-score:   0.8658

method:  RF
dataset: originalds
vector: bagofwords
features:	1000
train time: 1.7219s
test time:  0.0244s
f1-score:   0.8769

method:  RF
dataset: originalds
vector: bagofwords
features:	2000
train time: 2.4830s
test time:  0.0242s
f1-score:   0.8856

method:  RF
dataset


method:  RF
dataset: f5ds_stopword
vector: bagofwords
features:	2000
train time: 2.6286s
test time:  0.0329s
f1-score:   0.8606

method:  RF
dataset: f5ds_stopword
vector: bagofwords
features:	5000
train time: 3.8209s
test time:  0.0301s
f1-score:   0.8731

method:  RF
dataset: f5ds_stopword
vector: bagofwords
features:	all
train time: 10.6496s
test time:  0.0488s
f1-score:   0.8769

method:  RF
dataset: f7ds_stopword
vector: tfidf
features:	500
train time: 1.3216s
test time:  0.0207s
f1-score:   0.8344

method:  RF
dataset: f7ds_stopword
vector: tfidf
features:	1000
train time: 2.0248s
test time:  0.0253s
f1-score:   0.8517

method:  RF
dataset: f7ds_stopword
vector: tfidf
features:	2000
train time: 2.8132s
test time:  0.0297s
f1-score:   0.8581

method:  RF
dataset: f7ds_stopword
vector: tfidf
features:	5000
train time: 4.2138s
test time:  0.0368s
f1-score:   0.8656

method:  RF
dataset: f7ds_stopword
vector: tfidf
features:	all
train time: 13.5708s
test time:  0.0641s
f1-score:   0

In [27]:
score('SVM LINEAR')


method:  SVM LINEAR
dataset: originalds
vector: tfidf
features:	500
train time: 0.0528s
test time:  0.0009s
f1-score:   0.8969

method:  SVM LINEAR
dataset: originalds
vector: tfidf
features:	1000
train time: 0.0897s
test time:  0.0015s
f1-score:   0.9211

method:  SVM LINEAR
dataset: originalds
vector: tfidf
features:	2000
train time: 0.1311s
test time:  0.0024s
f1-score:   0.9300

method:  SVM LINEAR
dataset: originalds
vector: tfidf
features:	5000
train time: 0.5094s
test time:  0.0075s
f1-score:   0.9356

method:  SVM LINEAR
dataset: originalds
vector: tfidf
features:	all
train time: 0.5851s
test time:  0.0120s
f1-score:   0.9389





method:  SVM LINEAR
dataset: originalds
vector: bagofwords
features:	500
train time: 0.3707s
test time:  0.0020s
f1-score:   0.8731





method:  SVM LINEAR
dataset: originalds
vector: bagofwords
features:	1000
train time: 0.4107s
test time:  0.0033s
f1-score:   0.8772





method:  SVM LINEAR
dataset: originalds
vector: bagofwords
features:	2000
train time: 0.8623s
test time:  0.0112s
f1-score:   0.8806





method:  SVM LINEAR
dataset: originalds
vector: bagofwords
features:	5000
train time: 1.4281s
test time:  0.0129s
f1-score:   0.8942





method:  SVM LINEAR
dataset: originalds
vector: bagofwords
features:	all
train time: 3.0536s
test time:  0.0197s
f1-score:   0.9050

method:  SVM LINEAR
dataset: zembds
vector: tfidf
features:	500
train time: 0.0827s
test time:  0.0034s
f1-score:   0.9236

method:  SVM LINEAR
dataset: zembds
vector: tfidf
features:	1000
train time: 0.0946s
test time:  0.0015s
f1-score:   0.9361

method:  SVM LINEAR
dataset: zembds
vector: tfidf
features:	2000
train time: 0.1847s
test time:  0.0029s
f1-score:   0.9442

method:  SVM LINEAR
dataset: zembds
vector: tfidf
features:	5000
train time: 0.6140s
test time:  0.0080s
f1-score:   0.9492

method:  SVM LINEAR
dataset: zembds
vector: tfidf
features:	all
train time: 0.5000s
test time:  0.0092s
f1-score:   0.9508





method:  SVM LINEAR
dataset: zembds
vector: bagofwords
features:	500
train time: 0.4220s
test time:  0.0029s
f1-score:   0.8844





method:  SVM LINEAR
dataset: zembds
vector: bagofwords
features:	1000
train time: 0.4837s
test time:  0.0036s
f1-score:   0.9011





method:  SVM LINEAR
dataset: zembds
vector: bagofwords
features:	2000
train time: 0.8685s
test time:  0.0063s
f1-score:   0.9094





method:  SVM LINEAR
dataset: zembds
vector: bagofwords
features:	5000
train time: 1.6461s
test time:  0.0156s
f1-score:   0.9192





method:  SVM LINEAR
dataset: zembds
vector: bagofwords
features:	all
train time: 2.4897s
test time:  0.0225s
f1-score:   0.9208

method:  SVM LINEAR
dataset: f5ds
vector: tfidf
features:	500
train time: 0.1163s
test time:  0.0028s
f1-score:   0.9222

method:  SVM LINEAR
dataset: f5ds
vector: tfidf
features:	1000
train time: 0.1055s
test time:  0.0032s
f1-score:   0.9361

method:  SVM LINEAR
dataset: f5ds
vector: tfidf
features:	2000
train time: 0.1711s
test time:  0.0031s
f1-score:   0.9433

method:  SVM LINEAR
dataset: f5ds
vector: tfidf
features:	5000
train time: 0.7552s
test time:  0.0121s
f1-score:   0.9475

method:  SVM LINEAR
dataset: f5ds
vector: tfidf
features:	all
train time: 0.5938s
test time:  0.0128s
f1-score:   0.9506





method:  SVM LINEAR
dataset: f5ds
vector: bagofwords
features:	500
train time: 0.4479s
test time:  0.0044s
f1-score:   0.8869





method:  SVM LINEAR
dataset: f5ds
vector: bagofwords
features:	1000
train time: 0.7096s
test time:  0.0081s
f1-score:   0.8975





method:  SVM LINEAR
dataset: f5ds
vector: bagofwords
features:	2000
train time: 0.9066s
test time:  0.0061s
f1-score:   0.9081





method:  SVM LINEAR
dataset: f5ds
vector: bagofwords
features:	5000
train time: 1.1229s
test time:  0.0129s
f1-score:   0.9122





method:  SVM LINEAR
dataset: f5ds
vector: bagofwords
features:	all
train time: 1.4754s
test time:  0.0157s
f1-score:   0.9186

method:  SVM LINEAR
dataset: f7ds
vector: tfidf
features:	500
train time: 0.0568s
test time:  0.0008s
f1-score:   0.9211

method:  SVM LINEAR
dataset: f7ds
vector: tfidf
features:	1000
train time: 0.1073s
test time:  0.0034s
f1-score:   0.9356

method:  SVM LINEAR
dataset: f7ds
vector: tfidf
features:	2000
train time: 0.1541s
test time:  0.0027s
f1-score:   0.9386

method:  SVM LINEAR
dataset: f7ds
vector: tfidf
features:	5000
train time: 0.3794s
test time:  0.0065s
f1-score:   0.9458

method:  SVM LINEAR
dataset: f7ds
vector: tfidf
features:	all
train time: 0.3744s
test time:  0.0071s
f1-score:   0.9483





method:  SVM LINEAR
dataset: f7ds
vector: bagofwords
features:	500
train time: 0.2374s
test time:  0.0016s
f1-score:   0.8833





method:  SVM LINEAR
dataset: f7ds
vector: bagofwords
features:	1000
train time: 0.3046s
test time:  0.0018s
f1-score:   0.8925





method:  SVM LINEAR
dataset: f7ds
vector: bagofwords
features:	2000
train time: 0.5018s
test time:  0.0036s
f1-score:   0.8994





method:  SVM LINEAR
dataset: f7ds
vector: bagofwords
features:	5000
train time: 1.0438s
test time:  0.0104s
f1-score:   0.9025





method:  SVM LINEAR
dataset: f7ds
vector: bagofwords
features:	all
train time: 1.7560s
test time:  0.0120s
f1-score:   0.9156

method:  SVM LINEAR
dataset: originalds_stopword
vector: tfidf
features:	500
train time: 0.0460s
test time:  0.0011s
f1-score:   0.7881

method:  SVM LINEAR
dataset: originalds_stopword
vector: tfidf
features:	1000
train time: 0.0642s
test time:  0.0018s
f1-score:   0.8306

method:  SVM LINEAR
dataset: originalds_stopword
vector: tfidf
features:	2000
train time: 0.1051s
test time:  0.0037s
f1-score:   0.8725

method:  SVM LINEAR
dataset: originalds_stopword
vector: tfidf
features:	5000
train time: 0.2452s
test time:  0.0055s
f1-score:   0.8944

method:  SVM LINEAR
dataset: originalds_stopword
vector: tfidf
features:	all
train time: 0.2087s
test time:  0.0076s
f1-score:   0.8961





method:  SVM LINEAR
dataset: originalds_stopword
vector: bagofwords
features:	500
train time: 0.3662s
test time:  0.0027s
f1-score:   0.7356





method:  SVM LINEAR
dataset: originalds_stopword
vector: bagofwords
features:	1000
train time: 0.3500s
test time:  0.0025s
f1-score:   0.7886





method:  SVM LINEAR
dataset: originalds_stopword
vector: bagofwords
features:	2000
train time: 0.3979s
test time:  0.0059s
f1-score:   0.8147





method:  SVM LINEAR
dataset: originalds_stopword
vector: bagofwords
features:	5000
train time: 0.7392s
test time:  0.0135s
f1-score:   0.8394





method:  SVM LINEAR
dataset: originalds_stopword
vector: bagofwords
features:	all
train time: 0.9920s
test time:  0.0129s
f1-score:   0.8444

method:  SVM LINEAR
dataset: zembds_stopword
vector: tfidf
features:	500
train time: 0.0484s
test time:  0.0010s
f1-score:   0.8858

method:  SVM LINEAR
dataset: zembds_stopword
vector: tfidf
features:	1000
train time: 0.0717s
test time:  0.0015s
f1-score:   0.9022

method:  SVM LINEAR
dataset: zembds_stopword
vector: tfidf
features:	2000
train time: 0.1095s
test time:  0.0036s
f1-score:   0.9169

method:  SVM LINEAR
dataset: zembds_stopword
vector: tfidf
features:	5000
train time: 0.2562s
test time:  0.0049s
f1-score:   0.9267

method:  SVM LINEAR
dataset: zembds_stopword
vector: tfidf
features:	all
train time: 0.2288s
test time:  0.0067s
f1-score:   0.9278





method:  SVM LINEAR
dataset: zembds_stopword
vector: bagofwords
features:	500
train time: 0.2546s
test time:  0.0015s
f1-score:   0.8508





method:  SVM LINEAR
dataset: zembds_stopword
vector: bagofwords
features:	1000
train time: 0.2828s
test time:  0.0022s
f1-score:   0.8592





method:  SVM LINEAR
dataset: zembds_stopword
vector: bagofwords
features:	2000
train time: 0.3795s
test time:  0.0044s
f1-score:   0.8692





method:  SVM LINEAR
dataset: zembds_stopword
vector: bagofwords
features:	5000
train time: 0.7582s
test time:  0.0110s
f1-score:   0.8778





method:  SVM LINEAR
dataset: zembds_stopword
vector: bagofwords
features:	all
train time: 0.9693s
test time:  0.0119s
f1-score:   0.8814

method:  SVM LINEAR
dataset: f5ds_stopword
vector: tfidf
features:	500
train time: 0.0467s
test time:  0.0014s
f1-score:   0.8772

method:  SVM LINEAR
dataset: f5ds_stopword
vector: tfidf
features:	1000
train time: 0.0679s
test time:  0.0017s
f1-score:   0.8972

method:  SVM LINEAR
dataset: f5ds_stopword
vector: tfidf
features:	2000
train time: 0.0947s
test time:  0.0022s
f1-score:   0.9147

method:  SVM LINEAR
dataset: f5ds_stopword
vector: tfidf
features:	5000
train time: 0.2143s
test time:  0.0050s
f1-score:   0.9217

method:  SVM LINEAR
dataset: f5ds_stopword
vector: tfidf
features:	all
train time: 0.1958s
test time:  0.0060s
f1-score:   0.9256





method:  SVM LINEAR
dataset: f5ds_stopword
vector: bagofwords
features:	500
train time: 0.2337s
test time:  0.0011s
f1-score:   0.8311





method:  SVM LINEAR
dataset: f5ds_stopword
vector: bagofwords
features:	1000
train time: 0.2572s
test time:  0.0019s
f1-score:   0.8556





method:  SVM LINEAR
dataset: f5ds_stopword
vector: bagofwords
features:	2000
train time: 0.3489s
test time:  0.0037s
f1-score:   0.8603





method:  SVM LINEAR
dataset: f5ds_stopword
vector: bagofwords
features:	5000
train time: 0.6409s
test time:  0.0095s
f1-score:   0.8736





method:  SVM LINEAR
dataset: f5ds_stopword
vector: bagofwords
features:	all
train time: 0.8561s
test time:  0.0100s
f1-score:   0.8817

method:  SVM LINEAR
dataset: f7ds_stopword
vector: tfidf
features:	500
train time: 0.0424s
test time:  0.0007s
f1-score:   0.8611

method:  SVM LINEAR
dataset: f7ds_stopword
vector: tfidf
features:	1000
train time: 0.0574s
test time:  0.0012s
f1-score:   0.8903

method:  SVM LINEAR
dataset: f7ds_stopword
vector: tfidf
features:	2000
train time: 0.0898s
test time:  0.0021s
f1-score:   0.9042

method:  SVM LINEAR
dataset: f7ds_stopword
vector: tfidf
features:	5000
train time: 0.2051s
test time:  0.0042s
f1-score:   0.9206

method:  SVM LINEAR
dataset: f7ds_stopword
vector: tfidf
features:	all
train time: 0.1870s
test time:  0.0058s
f1-score:   0.9231





method:  SVM LINEAR
dataset: f7ds_stopword
vector: bagofwords
features:	500
train time: 0.2315s
test time:  0.0012s
f1-score:   0.8344





method:  SVM LINEAR
dataset: f7ds_stopword
vector: bagofwords
features:	1000
train time: 0.2375s
test time:  0.0019s
f1-score:   0.8475





method:  SVM LINEAR
dataset: f7ds_stopword
vector: bagofwords
features:	2000
train time: 0.3019s
test time:  0.0037s
f1-score:   0.8608





method:  SVM LINEAR
dataset: f7ds_stopword
vector: bagofwords
features:	5000
train time: 0.6027s
test time:  0.0096s
f1-score:   0.8778





method:  SVM LINEAR
dataset: f7ds_stopword
vector: bagofwords
features:	all
train time: 0.8418s
test time:  0.0098s
f1-score:   0.8900




In [None]:
score('SVM RBF')

In [29]:
score('KNN')


method:  KNN
dataset: originalds
vector: tfidf
features:	500
train time: 0.1858s
test time:  0.7457s
f1-score:   0.7861

method:  KNN
dataset: originalds
vector: tfidf
features:	1000
train time: 0.3561s
test time:  1.9400s
f1-score:   0.7394

method:  KNN
dataset: originalds
vector: tfidf
features:	2000
train time: 0.6450s
test time:  4.1620s
f1-score:   0.6092

method:  KNN
dataset: originalds
vector: tfidf
features:	5000
train time: 1.5859s
test time:  9.5485s
f1-score:   0.7356

method:  KNN
dataset: originalds
vector: tfidf
features:	all
train time: 2.3214s
test time:  13.6347s
f1-score:   0.8767

method:  KNN
dataset: originalds
vector: bagofwords
features:	500
train time: 0.1347s
test time:  0.7299s
f1-score:   0.6311

method:  KNN
dataset: originalds
vector: bagofwords
features:	1000
train time: 0.2697s
test time:  1.5336s
f1-score:   0.5994

method:  KNN
dataset: originalds
vector: bagofwords
features:	2000
train time: 0.5449s
test time:  3.0730s
f1-score:   0.5744

method:  K


method:  KNN
dataset: f5ds_stopword
vector: bagofwords
features:	2000
train time: 0.5904s
test time:  3.6153s
f1-score:   0.6717

method:  KNN
dataset: f5ds_stopword
vector: bagofwords
features:	5000
train time: 1.5651s
test time:  9.2849s
f1-score:   0.5064

method:  KNN
dataset: f5ds_stopword
vector: bagofwords
features:	all
train time: 2.8716s
test time:  16.1892s
f1-score:   0.4522

method:  KNN
dataset: f7ds_stopword
vector: tfidf
features:	500
train time: 0.1818s
test time:  0.6725s
f1-score:   0.7906

method:  KNN
dataset: f7ds_stopword
vector: tfidf
features:	1000
train time: 0.3099s
test time:  1.4094s
f1-score:   0.7739

method:  KNN
dataset: f7ds_stopword
vector: tfidf
features:	2000
train time: 0.5776s
test time:  3.2061s
f1-score:   0.5958

method:  KNN
dataset: f7ds_stopword
vector: tfidf
features:	5000
train time: 1.5464s
test time:  9.0631s
f1-score:   0.5303

method:  KNN
dataset: f7ds_stopword
vector: tfidf
features:	all
train time: 2.4104s
test time:  14.5951s
f1-sc

In [30]:
score('CART')


method:  CART
dataset: originalds
vector: tfidf
features:	500
train time: 0.2279s
test time:  0.0007s
f1-score:   0.7850

method:  CART
dataset: originalds
vector: tfidf
features:	1000
train time: 0.4033s
test time:  0.0012s
f1-score:   0.7592

method:  CART
dataset: originalds
vector: tfidf
features:	2000
train time: 0.7516s
test time:  0.0022s
f1-score:   0.7472

method:  CART
dataset: originalds
vector: tfidf
features:	5000
train time: 1.8819s
test time:  0.0049s
f1-score:   0.7244

method:  CART
dataset: originalds
vector: tfidf
features:	all
train time: 6.8439s
test time:  0.0082s
f1-score:   0.7339

method:  CART
dataset: originalds
vector: bagofwords
features:	500
train time: 0.2072s
test time:  0.0008s
f1-score:   0.7503

method:  CART
dataset: originalds
vector: bagofwords
features:	1000
train time: 0.4080s
test time:  0.0012s
f1-score:   0.7503

method:  CART
dataset: originalds
vector: bagofwords
features:	2000
train time: 0.7328s
test time:  0.0022s
f1-score:   0.7592

met


method:  CART
dataset: f5ds_stopword
vector: bagofwords
features:	1000
train time: 0.4478s
test time:  0.0012s
f1-score:   0.7489

method:  CART
dataset: f5ds_stopword
vector: bagofwords
features:	2000
train time: 0.8287s
test time:  0.0024s
f1-score:   0.7417

method:  CART
dataset: f5ds_stopword
vector: bagofwords
features:	5000
train time: 1.8903s
test time:  0.0060s
f1-score:   0.7386

method:  CART
dataset: f5ds_stopword
vector: bagofwords
features:	all
train time: 7.7957s
test time:  0.0091s
f1-score:   0.7397

method:  CART
dataset: f7ds_stopword
vector: tfidf
features:	500
train time: 0.2729s
test time:  0.0008s
f1-score:   0.7558

method:  CART
dataset: f7ds_stopword
vector: tfidf
features:	1000
train time: 0.5624s
test time:  0.0012s
f1-score:   0.7567

method:  CART
dataset: f7ds_stopword
vector: tfidf
features:	2000
train time: 1.0832s
test time:  0.0023s
f1-score:   0.7572

method:  CART
dataset: f7ds_stopword
vector: tfidf
features:	5000
train time: 2.6918s
test time:  0

In [5]:
score('ROCCHIO')


method:  ROCCHIO
dataset: originalds
vector: tfidf
features:	500
train time: 0.0120s
test time:  0.0043s
f1-score:   0.8342

method:  ROCCHIO
dataset: originalds
vector: tfidf
features:	1000
train time: 0.0215s
test time:  0.0029s
f1-score:   0.8608

method:  ROCCHIO
dataset: originalds
vector: tfidf
features:	2000
train time: 0.0411s
test time:  0.0046s
f1-score:   0.8781

method:  ROCCHIO
dataset: originalds
vector: tfidf
features:	5000
train time: 0.0909s
test time:  0.0078s
f1-score:   0.8931

method:  ROCCHIO
dataset: originalds
vector: tfidf
features:	all
train time: 0.0951s
test time:  0.0106s
f1-score:   0.8958

method:  ROCCHIO
dataset: originalds
vector: bagofwords
features:	500
train time: 0.0081s
test time:  0.0014s
f1-score:   0.4389

method:  ROCCHIO
dataset: originalds
vector: bagofwords
features:	1000
train time: 0.0171s
test time:  0.0026s
f1-score:   0.4467

method:  ROCCHIO
dataset: originalds
vector: bagofwords
features:	2000
train time: 0.0312s
test time:  0.0047s


method:  ROCCHIO
dataset: f5ds_stopword
vector: bagofwords
features:	500
train time: 0.0078s
test time:  0.0014s
f1-score:   0.6439

method:  ROCCHIO
dataset: f5ds_stopword
vector: bagofwords
features:	1000
train time: 0.0161s
test time:  0.0029s
f1-score:   0.6608

method:  ROCCHIO
dataset: f5ds_stopword
vector: bagofwords
features:	2000
train time: 0.0304s
test time:  0.0045s
f1-score:   0.6739

method:  ROCCHIO
dataset: f5ds_stopword
vector: bagofwords
features:	5000
train time: 0.0900s
test time:  0.0102s
f1-score:   0.6853

method:  ROCCHIO
dataset: f5ds_stopword
vector: bagofwords
features:	all
train time: 0.1053s
test time:  0.0145s
f1-score:   0.6883

method:  ROCCHIO
dataset: f7ds_stopword
vector: tfidf
features:	500
train time: 0.0110s
test time:  0.0015s
f1-score:   0.7808

method:  ROCCHIO
dataset: f7ds_stopword
vector: tfidf
features:	1000
train time: 0.0158s
test time:  0.0020s
f1-score:   0.8158

method:  ROCCHIO
dataset: f7ds_stopword
vector: tfidf
features:	2000
train

In [6]:
score('LR')


method:  LR
dataset: originalds
vector: tfidf
features:	500
train time: 0.8509s
test time:  0.0012s
f1-score:   0.8861

method:  LR
dataset: originalds
vector: tfidf
features:	1000
train time: 1.8525s
test time:  0.0014s
f1-score:   0.9081

method:  LR
dataset: originalds
vector: tfidf
features:	2000
train time: 3.4325s
test time:  0.0023s
f1-score:   0.9219

method:  LR
dataset: originalds
vector: tfidf
features:	5000
train time: 7.5231s
test time:  0.0048s
f1-score:   0.9278

method:  LR
dataset: originalds
vector: tfidf
features:	all
train time: 12.9132s
test time:  0.0067s
f1-score:   0.9300


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


method:  LR
dataset: originalds
vector: bagofwords
features:	500
train time: 1.0437s
test time:  0.0013s
f1-score:   0.8839


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


method:  LR
dataset: originalds
vector: bagofwords
features:	1000
train time: 2.2148s
test time:  0.0023s
f1-score:   0.8950


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


method:  LR
dataset: originalds
vector: bagofwords
features:	2000
train time: 5.2576s
test time:  0.0048s
f1-score:   0.9003


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


method:  LR
dataset: originalds
vector: bagofwords
features:	5000
train time: 11.5690s
test time:  0.0149s
f1-score:   0.9094


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


method:  LR
dataset: originalds
vector: bagofwords
features:	all
train time: 18.9473s
test time:  0.0105s
f1-score:   0.9111

method:  LR
dataset: zembds
vector: tfidf
features:	500
train time: 0.6681s
test time:  0.0008s
f1-score:   0.9158

method:  LR
dataset: zembds
vector: tfidf
features:	1000
train time: 1.5946s
test time:  0.0013s
f1-score:   0.9297

method:  LR
dataset: zembds
vector: tfidf
features:	2000
train time: 3.4029s
test time:  0.0023s
f1-score:   0.9367

method:  LR
dataset: zembds
vector: tfidf
features:	5000
train time: 7.6869s
test time:  0.0048s
f1-score:   0.9428

method:  LR
dataset: zembds
vector: tfidf
features:	all
train time: 12.2547s
test time:  0.0066s
f1-score:   0.9433


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


method:  LR
dataset: zembds
vector: bagofwords
features:	500
train time: 1.0412s
test time:  0.0012s
f1-score:   0.9011


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


method:  LR
dataset: zembds
vector: bagofwords
features:	1000
train time: 2.2046s
test time:  0.0023s
f1-score:   0.9147


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


method:  LR
dataset: zembds
vector: bagofwords
features:	2000
train time: 5.6579s
test time:  0.0045s
f1-score:   0.9189


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


method:  LR
dataset: zembds
vector: bagofwords
features:	5000
train time: 11.3998s
test time:  0.0129s
f1-score:   0.9247


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


method:  LR
dataset: zembds
vector: bagofwords
features:	all
train time: 18.9231s
test time:  0.0113s
f1-score:   0.9275

method:  LR
dataset: f5ds
vector: tfidf
features:	500
train time: 0.6969s
test time:  0.0008s
f1-score:   0.9142

method:  LR
dataset: f5ds
vector: tfidf
features:	1000
train time: 1.6312s
test time:  0.0013s
f1-score:   0.9253

method:  LR
dataset: f5ds
vector: tfidf
features:	2000
train time: 3.9706s
test time:  0.0022s
f1-score:   0.9342

method:  LR
dataset: f5ds
vector: tfidf
features:	5000
train time: 7.6193s
test time:  0.0049s
f1-score:   0.9392

method:  LR
dataset: f5ds
vector: tfidf
features:	all
train time: 12.7226s
test time:  0.0068s
f1-score:   0.9417


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


method:  LR
dataset: f5ds
vector: bagofwords
features:	500
train time: 1.0425s
test time:  0.0013s
f1-score:   0.8994


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


method:  LR
dataset: f5ds
vector: bagofwords
features:	1000
train time: 2.2217s
test time:  0.0023s
f1-score:   0.9119


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


method:  LR
dataset: f5ds
vector: bagofwords
features:	2000
train time: 5.5105s
test time:  0.0044s
f1-score:   0.9203


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


method:  LR
dataset: f5ds
vector: bagofwords
features:	5000
train time: 11.5353s
test time:  0.0132s
f1-score:   0.9244


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


method:  LR
dataset: f5ds
vector: bagofwords
features:	all
train time: 18.9632s
test time:  0.0117s
f1-score:   0.9269

method:  LR
dataset: f7ds
vector: tfidf
features:	500
train time: 0.6677s
test time:  0.0008s
f1-score:   0.9086

method:  LR
dataset: f7ds
vector: tfidf
features:	1000
train time: 1.6069s
test time:  0.0013s
f1-score:   0.9231

method:  LR
dataset: f7ds
vector: tfidf
features:	2000
train time: 3.4939s
test time:  0.0024s
f1-score:   0.9328

method:  LR
dataset: f7ds
vector: tfidf
features:	5000
train time: 7.3835s
test time:  0.0050s
f1-score:   0.9367

method:  LR
dataset: f7ds
vector: tfidf
features:	all
train time: 12.6742s
test time:  0.0067s
f1-score:   0.9389


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


method:  LR
dataset: f7ds
vector: bagofwords
features:	500
train time: 1.0230s
test time:  0.0012s
f1-score:   0.8964


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


method:  LR
dataset: f7ds
vector: bagofwords
features:	1000
train time: 2.2516s
test time:  0.0025s
f1-score:   0.9097


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


method:  LR
dataset: f7ds
vector: bagofwords
features:	2000
train time: 5.2677s
test time:  0.0048s
f1-score:   0.9178


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


method:  LR
dataset: f7ds
vector: bagofwords
features:	5000
train time: 11.0154s
test time:  0.0134s
f1-score:   0.9225


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


method:  LR
dataset: f7ds
vector: bagofwords
features:	all
train time: 22.1739s
test time:  0.0142s
f1-score:   0.9247

method:  LR
dataset: originalds_stopword
vector: tfidf
features:	500
train time: 0.6867s
test time:  0.0008s
f1-score:   0.7786

method:  LR
dataset: originalds_stopword
vector: tfidf
features:	1000
train time: 1.6767s
test time:  0.0013s
f1-score:   0.8186

method:  LR
dataset: originalds_stopword
vector: tfidf
features:	2000
train time: 3.9188s
test time:  0.0023s
f1-score:   0.8636

method:  LR
dataset: originalds_stopword
vector: tfidf
features:	5000
train time: 8.4440s
test time:  0.0063s
f1-score:   0.8933

method:  LR
dataset: originalds_stopword
vector: tfidf
features:	all
train time: 14.8743s
test time:  0.0079s
f1-score:   0.9006

method:  LR
dataset: originalds_stopword
vector: bagofwords
features:	500
train time: 1.3019s
test time:  0.0070s
f1-score:   0.7361

method:  LR
dataset: originalds_stopword
vector: bagofwords
features:	1000
train time: 2.6036s
t

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


method:  LR
dataset: f5ds_stopword
vector: bagofwords
features:	500
train time: 1.1727s
test time:  0.0013s
f1-score:   0.8461

method:  LR
dataset: f5ds_stopword
vector: bagofwords
features:	1000
train time: 2.7841s
test time:  0.0042s
f1-score:   0.8731

method:  LR
dataset: f5ds_stopword
vector: bagofwords
features:	2000
train time: 4.9442s
test time:  0.0068s
f1-score:   0.8858

method:  LR
dataset: f5ds_stopword
vector: bagofwords
features:	5000
train time: 7.9352s
test time:  0.0160s
f1-score:   0.8942

method:  LR
dataset: f5ds_stopword
vector: bagofwords
features:	all
train time: 13.4200s
test time:  0.0123s
f1-score:   0.8972

method:  LR
dataset: f7ds_stopword
vector: tfidf
features:	500
train time: 0.9632s
test time:  0.0010s
f1-score:   0.8511

method:  LR
dataset: f7ds_stopword
vector: tfidf
features:	1000
train time: 2.2759s
test time:  0.0036s
f1-score:   0.8797

method:  LR
dataset: f7ds_stopword
vector: tfidf
features:	2000
train time: 5.3516s
test time:  0.0027s
f1-s