In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline

In [None]:
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='all')
news.keys()

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [3]:
print(news.data[0])

From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!




In [4]:
news.target[0]

10

In [5]:
news.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## 뉴스 분류기 모델 만들기

* 데이터 파악
* 전처리(Preprocessing)

    * 필요없는 단어 제거 (Data Cleansing)
    * CountVectorizer & Tf-idfVectorizer

---
    
* Modeling : BernoulliNB, MultinomialNB 사용
  * Cross Validation(Kfold 이용)
  
---

* Pipeline 이용

---

* Assignment Description
     * 위 신문 데이터를 바탕으로 신문 내용별 분류기를 개발하라
     * 위 데이터를 Traing / Test Dataset으로 나눠서 5-fold cross validation(5번 데이터를 training / testset으로 나눔, KV 활용)
     * Naive Bayesian Classifier와 Count Vector를 활용하여 각각 성능을 테스트하라
         * NB는 multinomial과 bernuoil 분포를 모두 사용하라
     * 가능할 경우, TF-IDF vector를 활용해 볼것 (검색어 - tf-idf scikit-learn)

# Dataset
   * 18846개의 데이터

In [6]:
news_df = pd.DataFrame({'News' : news.data, 'Target' : news.target})

In [7]:
news_df.head()

Unnamed: 0,News,Target
0,From: Mamatha Devineni Ratnam <mr47+@andrew.cm...,10
1,From: mblawson@midway.ecn.uoknor.edu (Matthew ...,3
2,From: hilmi-er@dsv.su.se (Hilmi Eren)\nSubject...,17
3,From: guyd@austin.ibm.com (Guy Dawson)\nSubjec...,3
4,From: Alexander Samuel McDiarmid <am2o+@andrew...,4


In [8]:
# Target 데이터 -> 문자 라벨링(뉴스마다 어떤 뉴스인지 보기 편하도록 만들기 위해서)
def word_labeling(lst, df):
    for idx, name in enumerate(lst):
        target_data = df['Target']
        for idx_, num_label in enumerate(target_data):
            if num_label == idx:
                df.loc[idx_, 'Target'] = name
    return df
news_df = word_labeling(news['target_names'], news_df)
news_df.head()

Unnamed: 0,News,Target
0,From: Mamatha Devineni Ratnam <mr47+@andrew.cm...,rec.sport.hockey
1,From: mblawson@midway.ecn.uoknor.edu (Matthew ...,comp.sys.ibm.pc.hardware
2,From: hilmi-er@dsv.su.se (Hilmi Eren)\nSubject...,talk.politics.mideast
3,From: guyd@austin.ibm.com (Guy Dawson)\nSubjec...,comp.sys.ibm.pc.hardware
4,From: Alexander Samuel McDiarmid <am2o+@andrew...,comp.sys.mac.hardware


* Data Cleansing
    * 이메일 제거
    * 불필요 숫자 제거
    * 문자 아닌 특수문자 제거
    * 단어 사이 공백 제거 : 띄어쓰기 별로 split해주고 join

In [9]:
def data_cleansing(df):
    delete_email = re.sub(r'\b[\w\+]+@[\w]+.[\w]+.[\w]+.[\w]+\b', ' ', df)
    delete_number = re.sub(r'\b|\d+|\b', ' ',delete_email)
    delete_non_word = re.sub(r'\b[\W]+\b', ' ', delete_number)
    cleaning_result = ' '.join(delete_non_word.split())
    return cleaning_result 

In [10]:
news_df.loc[:, 'News'] = news_df['News'].apply(data_cleansing)
news_df.head()

Unnamed: 0,News,Target
0,From Mamatha Devineni Ratnam Subject Pens fans...,rec.sport.hockey
1,From Matthew B Lawson Subject Which high perfo...,comp.sys.ibm.pc.hardware
2,From hilmi Hilmi Eren Subject Re ARMENIA SAYS ...,talk.politics.mideast
3,From Guy Dawson Subject Re IDE vs SCSI DMA and...,comp.sys.ibm.pc.hardware
4,From Alexander Samuel McDiarmid Subject driver...,comp.sys.mac.hardware


# Vectorizer
* CountVectorizer 
  * 문서 집합으로부터 단어의 수를 세어 카운트 행렬을 만듦
* TfidfVectorizer 
    * 단어를 갯수 그대로 카운트하지 않고 모든 문서에 공통적으로 들어있는 단어의 경우 문서 구별 능력이 떨어진다고 보아 가중치를 축소하는 방법
    * TF(Term Frequency) : 문서에서 해당 단어가 얼마나 나왔는지 나타내주는 빈도 수
    * DF(Document Frequency) : 해당 단어가 있는 문서의 수
    * IDF(Inverse Document Frequency) 해당 단어가 있는 문서의 수가 높아질 수록 가중치를 축소해주기 위해 역수 취해줌
        * log(N / (1 + DF))      
            * N : 전체 문서의 수
    * TF-IDF = TF * IDF
* CustomizedVectorizer - StemmedCounterVectorizer, StemmedTfidfVectorizer 

In [11]:
!pip install nltk

[33mYou are using pip version 9.0.1, however version 9.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [12]:
from nltk import stem
stmmer = stem.SnowballStemmer("english")
sentence = 'looking looks looked'
[stmmer.stem(word) for word in sentence.split()]

['look', 'look', 'look']

In [13]:
stmmer.stem("images"), stmmer.stem("imaging"), stmmer.stem("imagination")  

('imag', 'imag', 'imagin')

In [14]:
!which pip

/usr/local/bin/pip


In [15]:
from sklearn.feature_extraction.text import CountVectorizer
import  nltk
enlish_stemmer = nltk.stem.SnowballStemmer("english")
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer,self).build_analyzer()
        return lambda doc: (enlish_stemmer.stem(w) for w in analyzer(doc))

In [16]:
StemmedCountVectorizer(min_df=1, stop_words="english").fit([sentence]).vocabulary_

{'look': 0}

In [17]:
CountVectorizer(min_df=1, stop_words="english").fit([sentence]).vocabulary_

{'looked': 0, 'looking': 1, 'looks': 2}

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

enlish_stemmer = nltk.stem.SnowballStemmer("english")
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer,self).build_analyzer()
        return lambda doc: (enlish_stemmer.stem(w) for w in analyzer(doc))

# Modeling
* Pipeline
* Gridsearch
* Cross Validation

In [30]:
from nltk import ngrams
sentence = 'this is a foo bar sentences and i want to ngramize it'
n = 6
sixgrams = ngrams(sentence.split(), 3)
for grams in sixgrams:
  print (grams)

('this', 'is', 'a')
('is', 'a', 'foo')
('a', 'foo', 'bar')
('foo', 'bar', 'sentences')
('bar', 'sentences', 'and')
('sentences', 'and', 'i')
('and', 'i', 'want')
('i', 'want', 'to')
('want', 'to', 'ngramize')
('to', 'ngramize', 'it')


In [31]:
from sklearn.base import TransformerMixin, BaseEstimator
class DenseTransformer(BaseEstimator, TransformerMixin):

    def transform(self, X, y=None, **fit_params):
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self

In [32]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB,GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

vectorizer = [CountVectorizer(), TfidfVectorizer(), StemmedCountVectorizer(), StemmedTfidfVectorizer()]
# algorithms = [BernoulliNB(), MultinomialNB(), GaussianNB(), LogisticRegression()]
algorithms = [MultinomialNB(), LogisticRegression()]

pipelines  = [] 


import itertools
for case in list(itertools.product(vectorizer, algorithms)):
    if isinstance(case[1], GaussianNB):
        case = list(case)
        case.insert(1,  DenseTransformer())
    pipelines.append(make_pipeline(*case))
pipelines

[Pipeline(memory=None,
      steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 1), preprocessor=None, stop_words=None,
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
 Pipeline(memory=None,
      steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 1), preprocessor=None, stop_words=None,
   ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False))]),
 Pipeline(memory=None,
      steps=[('tfi

In [33]:
# Vectorizer Common params
ngrams_params = [(1,1),(1,3)]
stopword_params = ["english"]
lowercase_params = [True, False]
max_df_params = np.linspace(0.4, 0.6, num=6)
min_df_params = np.linspace(0.0, 0.0, num=1)

attributes = {"ngram_range":ngrams_params, "max_df":max_df_params,"min_df":min_df_params,
              "lowercase":lowercase_params,"stop_words":stopword_params}
vectorizer_names = ["countvectorizer","tfidfvectorizer","stemmedcountvectorizer","stemmedtfidfvectorizer"]
vectorizer_params_dict = {}

for vect_name in vectorizer_names:
    vectorizer_params_dict[vect_name] = {}
    for key, value in attributes.items():
        param_name = vect_name + "__" + key
        vectorizer_params_dict[vect_name][param_name] =  value

In [36]:
# Algorithms parameters
algorithm_names = ["bernoullinb","multinomialnb","gaussiannb","logisticregression"]
algorithm_names = ["multinomialnb", "logisticregression"]

algorithm_params_dict = {}


#'bernoullinb', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))])
alpha_params = np.linspace(1.0, 1.0, num=1)
for i in range(1):
    algorithm_params_dict[algorithm_names[i]] = {
    algorithm_names[i]+ "__alpha" : alpha_params    
    }
# algorithm_params_dict[algorithm_names[2]] = {}


# LogisticRegression    
# multi_class : str, {‘ovr’, ‘multinomial’}, default: ‘ovr’
# C : float, default: 1.0
# solver : {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’},
# n_jobs : int, default: 1
# penalty : str, ‘l1’ or ‘l2’, default: ‘l2’

# multi_class_params = ["ovr", "multinomial"]
c_params = [0.1,  5.0, 7.0, 10.0, 15.0, 20.0, 100.0]



algorithm_params_dict[algorithm_names[1]] = [{
    "logisticregression__multi_class" : ["multinomial"],
    "logisticregression__solver" : ["saga"],
    "logisticregression__penalty" : ["l1"],
    "logisticregression__C" : c_params
    },{
    "logisticregression__multi_class" : ["ovr"],
    "logisticregression__solver" : ['liblinear'],
    "logisticregression__penalty" : ["l2"],
    "logisticregression__C" : c_params
    }
    ]
algorithm_params_dict

{'logisticregression': [{'logisticregression__C': [0.1,
    5.0,
    7.0,
    10.0,
    15.0,
    20.0,
    100.0],
   'logisticregression__multi_class': ['multinomial'],
   'logisticregression__penalty': ['l1'],
   'logisticregression__solver': ['saga']},
  {'logisticregression__C': [0.1, 5.0, 7.0, 10.0, 15.0, 20.0, 100.0],
   'logisticregression__multi_class': ['ovr'],
   'logisticregression__penalty': ['l2'],
   'logisticregression__solver': ['liblinear']}],
 'multinomialnb': {'multinomialnb__alpha': array([ 1.])}}

In [37]:
pipeline_params= []
for case in list(itertools.product(vectorizer_names, algorithm_names)):
    vect_params = vectorizer_params_dict[case[0]].copy()
    algo_params = algorithm_params_dict[case[1]]
    
    if isinstance(algo_params, dict):
        vect_params.update(algo_params)
        pipeline_params.append(vect_params)
    else:
        temp = []
        for param in algo_params:
            vect_params.update(param)
            temp.append(vect_params)
        pipeline_params.append(temp)
pipeline_params

[{'countvectorizer__lowercase': [True, False],
  'countvectorizer__max_df': array([ 0.4 ,  0.44,  0.48,  0.52,  0.56,  0.6 ]),
  'countvectorizer__min_df': array([ 0.]),
  'countvectorizer__ngram_range': [(1, 1), (1, 3)],
  'countvectorizer__stop_words': ['english'],
  'multinomialnb__alpha': array([ 1.])},
 [{'countvectorizer__lowercase': [True, False],
   'countvectorizer__max_df': array([ 0.4 ,  0.44,  0.48,  0.52,  0.56,  0.6 ]),
   'countvectorizer__min_df': array([ 0.]),
   'countvectorizer__ngram_range': [(1, 1), (1, 3)],
   'countvectorizer__stop_words': ['english'],
   'logisticregression__C': [0.1, 5.0, 7.0, 10.0, 15.0, 20.0, 100.0],
   'logisticregression__multi_class': ['ovr'],
   'logisticregression__penalty': ['l2'],
   'logisticregression__solver': ['liblinear']},
  {'countvectorizer__lowercase': [True, False],
   'countvectorizer__max_df': array([ 0.4 ,  0.44,  0.48,  0.52,  0.56,  0.6 ]),
   'countvectorizer__min_df': array([ 0.]),
   'countvectorizer__ngram_range': [(

# Learn! Learn!

In [38]:
from sklearn.preprocessing import LabelEncoder

X_data = news_df.loc[:, 'News'].tolist()
y_data = news_df['Target'].tolist()
y = LabelEncoder().fit_transform(y_data)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

scoring = ['accuracy']
estimator_results = []
for i, (estimator, params) in enumerate(zip(pipelines,pipeline_params)):
    n_jobs = 36
#     if i+1 % 3 == 0:
#         n_jobs = 2
    gs_estimator = GridSearchCV(
            refit="accuracy", estimator=estimator,param_grid=params, scoring=scoring, cv=5, verbose=1, n_jobs=n_jobs)
    print(gs_estimator)

    gs_estimator.fit(X_data, y)
    estimator_results.append(gs_estimator)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), p..., vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=36,
       param_grid={'countvectorizer__stop_words': ['english'], 'multinomialnb__alpha': array([ 1.]), 'countvectorizer__ngram_range': [(1, 1), (1, 3)], 'countvectorizer__max_df': array([ 0.4 ,  0.44,  0.48,  0.52,  0.56,  0.6 ]), 'countvectorizer__lowercase': [True, False], 'countvectorizer__min_df': array([ 0.])},
       pre_dispatch='2*n_jobs', refit='accuracy',
       return_train_score='warn', scoring=['accuracy'], verbose=1)
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=36)]: Done 120 out of 120 | elapsed:  2.9min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=36,
       param_grid=[{'logisticregression__C': [0.1, 5.0, 7.0, 10.0, 15.0, 20.0, 100.0], 'logisticregression__solver': ['liblinear'], 'logisticregression__multi_class': ['ovr'], 'logisticregression__penalty': ['l2'], 'countvectorizer__stop_words': ['english'], 'countvectorizer__ngram_range': [(1, 1), (1, 3)]....56,  0.6 ]), 'countvectorizer__lowercase': [True, False], 'countvectorizer__min_df': array([ 0.])}],
       pre_dispatch='2*n_jobs', refit='accuracy',
   

[Parallel(n_jobs=36)]: Done 128 tasks      | elapsed: 85.8min
[Parallel(n_jobs=36)]: Done 378 tasks      | elapsed: 223.1min
[Parallel(n_jobs=36)]: Done 728 tasks      | elapsed: 448.5min
[Parallel(n_jobs=36)]: Done 1178 tasks      | elapsed: 750.6min
[Parallel(n_jobs=36)]: Done 1680 out of 1680 | elapsed: 1070.9min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_i...   vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=36,
       param_grid={'tfidfvectorizer__ngram_range': [(1, 1), (1, 3)], 'tfidfvectorizer__stop_words': ['english'], 'tfidfvectorizer__min_df': array([ 0.]), 'multinomialnb__alpha': array([ 1.]), 'tfidfvectorizer__lowercase': [True, False], 'tfidfvectorizer__max_df': array([ 0.4 ,  0.44,  0.48,  0.52,  0.56,  0.6 ])},
       pre_dispatch='2*n_jobs', refit='accuracy',
       return_train_score='warn', scoring=['accuracy'], verbose=1)
Fitting 5 folds for each of 24 c

[Parallel(n_jobs=36)]: Done 120 out of 120 | elapsed:  3.0min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_i...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=36,
       param_grid=[{'tfidfvectorizer__min_df': array([ 0.]), 'logisticregression__penalty': ['l2'], 'tfidfvectorizer__lowercase': [True, False], 'logisticregression__multi_class': ['ovr'], 'tfidfvectorizer__ngram_range': [(1, 1), (1, 3)], 'tfidfvectorizer__stop_words': ['english'], 'logisticregression__C':...lver': ['liblinear'], 'tfidfvectorizer__max_df': array([ 0.4 ,  0.44,  0.48,  0.52,  0.56,  0.6 ])}],
       pre_dispatch='2*n_jobs', refit='accuracy',
   

[Parallel(n_jobs=36)]: Done 128 tasks      | elapsed: 11.5min
[Parallel(n_jobs=36)]: Done 378 tasks      | elapsed: 51.2min
[Parallel(n_jobs=36)]: Done 728 tasks      | elapsed: 112.3min
[Parallel(n_jobs=36)]: Done 1178 tasks      | elapsed: 178.7min
[Parallel(n_jobs=36)]: Done 1680 out of 1680 | elapsed: 269.1min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('stemmedcountvectorizer', StemmedCountVectorizer(analyzer='word', binary=False, decode_error='strict',
            dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
            lowercase=True, max_df=1.0, max_features=None, min_df=1,
            ngram_range=(1, 1), preprocessor...e, vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=36,
       param_grid={'multinomialnb__alpha': array([ 1.]), 'stemmedcountvectorizer__max_df': array([ 0.4 ,  0.44,  0.48,  0.52,  0.56,  0.6 ]), 'stemmedcountvectorizer__lowercase': [True, False], 'stemmedcountvectorizer__stop_words': ['english'], 'stemmedcountvectorizer__min_df': array([ 0.]), 'stemmedcountvectorizer__ngram_range': [(1, 1), (1, 3)]},
       pre_dispatch='2*n_jobs', refit='accuracy',
       return_train_score='warn', scoring=['accuracy'], verbose=

[Parallel(n_jobs=36)]: Done 120 out of 120 | elapsed: 22.8min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('stemmedcountvectorizer', StemmedCountVectorizer(analyzer='word', binary=False, decode_error='strict',
            dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
            lowercase=True, max_df=1.0, max_features=None, min_df=1,
            ngram_range=(1, 1), preprocessor...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=36,
       param_grid=[{'stemmedcountvectorizer__max_df': array([ 0.4 ,  0.44,  0.48,  0.52,  0.56,  0.6 ]), 'stemmedcountvectorizer__lowercase': [True, False], 'stemmedcountvectorizer__stop_words': ['english'], 'logisticregression__solver': ['liblinear'], 'logisticregression__multi_class': ['ovr'], 'stemmedco...sion__C': [0.1, 5.0, 7.0, 10.0, 15.0, 20.0, 100.0], 'stemmedcountvectorizer__min_df': array([ 0.])}],
       pre_dispatch='2*n_jobs', refit='accuracy',
   

In [46]:
import pandas as pd
from pandas import DataFrame
result_df_dict = {}
result_attributes = ["vectorizer", "model", "accuracy", "recall_macro","precision_macro" , "min_df", 
                     "lowercase", "max_df", "binarize", "alpha", "ngram_range"
                     "multi_class", "penalty", "solver", "C"]

pieline_list =  list(itertools.product(vectorizer_names, algorithm_names))

for att in result_attributes:
    result_df_dict[att] = [None for i in range(16)]

result_df = DataFrame(result_df_dict)
result_df

Unnamed: 0,C,accuracy,alpha,binarize,lowercase,max_df,min_df,model,ngram_rangemulti_class,penalty,precision_macro,recall_macro,solver,vectorizer
0,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,
5,,,,,,,,,,,,,,
6,,,,,,,,,,,,,,
7,,,,,,,,,,,,,,
8,,,,,,,,,,,,,,
9,,,,,,,,,,,,,,


In [47]:
pieline_list

[('countvectorizer', 'multinomialnb'),
 ('countvectorizer', 'logisticregression'),
 ('tfidfvectorizer', 'multinomialnb'),
 ('tfidfvectorizer', 'logisticregression'),
 ('stemmedcountvectorizer', 'multinomialnb'),
 ('stemmedcountvectorizer', 'logisticregression'),
 ('stemmedtfidfvectorizer', 'multinomialnb'),
 ('stemmedtfidfvectorizer', 'logisticregression')]

In [48]:
for i, estiamtor in enumerate(estimator_results):
    best_estimator = estiamtor.best_estimator_
    best_index = estiamtor.best_index_
    result_df_dict["vectorizer"][i] = pieline_list[i][0]
    result_df_dict["model"][i] = pieline_list[i][1]
    result_df_dict["accuracy"][i] = estiamtor.best_score_
#     result_df_dict["recall_micro"][i] = estiamtor.cv_results_["mean_test_recall_micro"][best_index]
#     result_df_dict["precision_micro"][i] = estiamtor.cv_results_["mean_test_precision_micro"][best_index]
    for key, value in estiamtor.best_params_.items():
        if key.split("__")[1] in result_df_dict:
            name = key.split("__")[1]
            result_df_dict[key.split("__")[1]][i] = value
#     print(estiamtor.best_params_)
#     print(a.named_steps)

In [49]:
result_df = DataFrame(result_df_dict, columns=result_attributes)
result_df.sort_values("accuracy",ascending=False)

Unnamed: 0,vectorizer,model,accuracy,recall_macro,precision_macro,min_df,lowercase,max_df,binarize,alpha,ngram_rangemulti_class,penalty,solver,C
7,stemmedtfidfvectorizer,logisticregression,0.936432,,,0.0,False,0.52,,,,l2,liblinear,100.0
3,tfidfvectorizer,logisticregression,0.936061,,,0.0,True,0.44,,,,l2,liblinear,100.0
1,countvectorizer,logisticregression,0.921097,,,0.0,True,0.4,,,,l2,liblinear,15.0
5,stemmedcountvectorizer,logisticregression,0.920885,,,0.0,False,0.4,,,,l2,liblinear,20.0
0,countvectorizer,multinomialnb,0.908893,,,0.0,True,0.44,,1.0,,,,
4,stemmedcountvectorizer,multinomialnb,0.90624,,,0.0,False,0.44,,1.0,,,,
2,tfidfvectorizer,multinomialnb,0.903056,,,0.0,True,0.44,,1.0,,,,
6,stemmedtfidfvectorizer,multinomialnb,0.900934,,,0.0,False,0.4,,1.0,,,,
8,,,,,,,,,,,,,,
9,,,,,,,,,,,,,,
