In [1]:
import os
import pandas as pd
import numpy as np
import re
import itertools
from datetime import datetime
import traceback
import pickle
import warnings

#model evaluation
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import f_classif
from sklearn.model_selection import GridSearchCV

import random as rand

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import normalize, Normalizer, MaxAbsScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.dummy import DummyClassifier

In [2]:
import platform
import scipy
import sklearn
# Record the platform and library versions this notebook was executed with.
print(platform.platform())
print("NumPy", np.__version__)
print("SciPy", scipy.__version__)
print("Scikit-Learn", sklearn.__version__)

Linux-4.4.0-1112-aws-x86_64-with-debian-stretch-sid
NumPy 1.14.3
SciPy 1.1.0
Scikit-Learn 0.19.1


In [3]:
from sklearn.base import BaseEstimator
from scipy.sparse import issparse

class DenseTransformer(BaseEstimator):
    """Pipeline step that turns a sparse matrix into a dense array.

    Parameters
    ----------
    return_copy : bool, default True
        Only relevant for input that is already dense: when True a copy of the
        input is returned, otherwise the input object itself is returned.

    Attributes
    ----------
    is_fitted : bool
        Set to True by ``fit`` / ``fit_transform``. The transformer learns
        nothing from the data; the flag is purely informational.
    """

    def __init__(self, return_copy=True):
        self.return_copy = return_copy
        self.is_fitted = False

    def transform(self, X, y=None):
        """Return ``X`` as a dense array (``y`` is ignored)."""
        if issparse(X):
            return X.toarray()
        if self.return_copy:
            return X.copy()
        return X

    def fit(self, X, y=None):
        """Mark the transformer as fitted; nothing is learned from ``X``."""
        self.is_fitted = True
        return self

    def fit_transform(self, X, y=None):
        """Fit and transform in one call.

        Delegates to ``fit`` so ``is_fitted`` is updated here as well; a
        Pipeline calls ``fit_transform`` (not ``fit``) on intermediate steps,
        which previously left the flag at False.
        """
        return self.fit(X, y).transform(X=X, y=y)

In [4]:
from sklearn.base import BaseEstimator
from scipy.sparse import issparse

class ObfuscationTransformer(BaseEstimator):
    """Pipeline step that rewrites every document with a regex substitution.

    Parameters
    ----------
    re_from : str
        Regular expression to search for in each document.
    re_to : str
        Replacement template passed to ``re.sub``.
    return_copy : bool, default True
        Stored so that ``get_params`` / ``clone`` can read every constructor
        argument back. ``transform`` always returns a new array regardless of
        this value.
    """

    def __init__(self,re_from=r'(\b)(\w{0,2})\w+(\w{1,3})(\b)', re_to=r'\1\2XX\3\4', return_copy=True):
        self.re_from = re_from
        self.re_to = re_to
        # Previously dropped: scikit-learn expects each __init__ argument to be
        # available as an attribute of the same name.
        self.return_copy = return_copy

    def transform(self, X, y=None):
        """Return a new 1-D object array with the substitution applied to each text."""
        pattern = re.compile(self.re_from)
        # dtype=object matters for plain lists of str: without it NumPy builds a
        # fixed-width unicode array and any replacement longer than the longest
        # input string would be silently truncated on assignment.
        X = np.array(X, dtype=object)
        for i in range(len(X)):
            X[i] = pattern.sub(self.re_to, X[i])

        return X

    def fit(self, X, y=None):
        """No-op; the transformer is stateless."""
        return self

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform``; nothing is learned from the data."""
        return self.transform(X=X, y=y)

In [5]:
import warnings
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import LabelEncoder

def eval_measures(gt, pred):
    """Score predictions against the ground truth.

    Parameters
    ----------
    gt : iterable of labels
        Ground-truth labels; the distinct values found here are the label set
        used for the macro averages.
    pred : iterable of labels
        Predicted labels, aligned with ``gt``.

    Returns
    -------
    tuple
        ``(f1, precision, recall, accuracy)`` where the first three are
        macro-averaged. Warnings raised by the metric functions are suppressed.
    """
    observed_labels = list(set(gt))
    macro_scorers = (f1_score, precision_score, recall_score)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        f1, precision, recall = [
            scorer(gt, pred, labels=observed_labels, average='macro')
            for scorer in macro_scorers
        ]
        accuracy = accuracy_score(gt, pred)

    return f1, precision, recall, accuracy

In [6]:
def _build_tfidf_pipeline(analyzer, ngram_range, lowercase, leading_steps=()):
    """Assemble one TF-IDF -> dense -> MaxAbs scaling -> PCA -> logistic regression pipeline.

    ``leading_steps`` is an optional sequence of ``(name, transformer)`` pairs
    placed in front of the vectorizer. All other settings are shared by the
    three pipelines defined below.
    """
    steps = list(leading_steps)
    steps.extend([
        ('vect', TfidfVectorizer(
            analyzer=analyzer,
            min_df=0.05,
            max_df=1.0,
            ngram_range=ngram_range,
            lowercase=lowercase,
            norm='l2',
            sublinear_tf=True)),
        ('dense', DenseTransformer()),
        ('scaler', MaxAbsScaler()),
        ('transf', PCA(0.999)),
        ('clf', LogisticRegression(random_state=0, multi_class='multinomial', solver='newton-cg')),
    ])
    return Pipeline(steps)


# Character 2- to 5-grams over the text as given.
pipelineCharacter = _build_tfidf_pipeline('char', (2, 5), lowercase=False)

# Same character n-grams, computed after every word character is replaced by 'x'.
pipelineObfuscator = _build_tfidf_pipeline(
    'char', (2, 5), lowercase=False,
    leading_steps=[('obs', ObfuscationTransformer(re_from=r'\w', re_to='x'))])

# Word 1- to 3-grams, lower-cased by the vectorizer.
pipelineWord = _build_tfidf_pipeline('word', (1, 3), lowercase=True)

In [7]:
# NOTE(review): hard-coded absolute working directory; every relative path in this
# notebook (the 'input' folder here, the 'output' folder used when metrics are
# saved) is resolved against it.
os.chdir('/home/ubuntu/lab/Experimentos/experimento_3_corpus_blogs')
input_file = os.path.join('input', 'output_concat.pickle')

# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open(input_file, 'rb') as handle:
    df = pickle.load(handle)

In [8]:
# Rename the source columns to the names used in the rest of the notebook.
df = df.rename(columns={'gender_final': 'gender',
                       'age_final': 'ap_age',
                       'post_concat': 'text'})

In [9]:
# The author id is the part of 'filename' before the first '.'; the original
# column is dropped afterwards.
df['id_author'] = df['filename'].str.split('.', expand=True)[0]
del df['filename']

In [10]:
# Lower-case every text (overwrites the 'text' column).
df['text'] = df['text'].str.lower()

In [11]:
# Number of '|<->|'-separated segments in each author's concatenated text.
# Raw string: "\|" in a normal string literal is an invalid escape sequence
# (DeprecationWarning / SyntaxWarning); the regex itself is unchanged.
df['count_delimiter'] = df['text'].str.split(r"\|<->\|").apply(len)

In [12]:
# Work on a copy so the whitespace-separated word count is not added to df itself.
df_final_statistics = df.copy()
df_final_statistics['number_of_words'] = df['text'].apply(lambda x: len(x.split()))

In [13]:
df_final_statistics.head()

Unnamed: 0,text,gender,ap_age,id_author,count_delimiter,number_of_words
0,"well, everyone got up and going this morning. ...",0,2,1000331,13,1127
1,"yeah, sorry for not writing for a whole there...",0,0,1000866,771,126790
2,"cupid,please hear my cry, cupid, please let yo...",1,1,1004904,52,3536
3,and did i mention that i no longer have to dea...,0,1,1005076,85,4337
4,b-logs: the business blogs paradox urllink...,1,1,1005545,80,16310


In [14]:
# Corpus-level totals: number of text units, number of words and their ratio.
total_units = sum(df_final_statistics['count_delimiter'])
total_words = sum(df_final_statistics['number_of_words'])

print("Total de unidades: " + str(total_units))
print("Total de palavras: " + str(total_words))
print("Total de palavras/unidade: " + str(total_words/total_units))

Total de unidades: 918298
Total de palavras: 133727748
Total de palavras/unidade: 145.62565528837044


In [15]:
## select the authors with the most texts
limit_authors_aa_selection = 20
#list_authors_aa_selection = list(df_final['id_author'].value_counts()[:limit_authors_aa_selection].index)
# ids of the `limit_authors_aa_selection` rows with the highest count_delimiter
list_authors_aa_selection = list(df.sort_values(by='count_delimiter', ascending=False)[:limit_authors_aa_selection]['id_author'])

In [16]:
# Dataset split: df_aa holds the selected authors, df_ca holds everyone else.
df_aa = df[df['id_author'].isin(list_authors_aa_selection)]
df_ca = df[~df['id_author'].isin(list_authors_aa_selection)]

In [17]:
# TF-IDF -> univariate feature selection (f_classif) -> logistic regression.
pipeline_kbest = Pipeline([
    ('vect', TfidfVectorizer()), 
    ('k_best', SelectKBest(f_classif)),
    ('clf', LogisticRegression(random_state=42, n_jobs=6))
])
    
# Candidate numbers of selected features: 3000, 4000, ..., 19000.
parameters_kbest = {    
    'k_best__k': (range(3000,20000,1000))
}

# 2-fold grid search over k, scored by macro F1.
# The only visible call to fit() on this object is commented out in a later cell.
grid_search_resp = GridSearchCV(pipeline_kbest,
                               parameters_kbest,
                               cv=2,
                               scoring='f1_macro',
                               n_jobs=4,
                               verbose=10
                               )

In [18]:
#ca_list = ['gender', 'ap_age']
# Attribute(s) iterated by the (commented-out) grid-search loop in the next cell.
ca_list = ['ap_age']

In [19]:
#k_best_dict = {}

#for ca in ca_list:
   
    #warnings.filterwarnings("ignore")
#    grid_search_resp.fit(df_ca['text'], df_ca[ca])
#    print('Finalizado, tarefa: ' + str(ca))

#    k_best_dict.update({ca: grid_search_resp.best_estimator_.get_params()['k_best__k']})
#print(k_best_dict)

#### The result of this loop is:
# Hard-coded so the grid search above does not have to be re-run; maps each
# attribute to the number of features kept by SelectKBest.
k_best_dict = {'gender': 17000,
               'ap_age': 19000}

In [20]:
print(k_best_dict)

{'gender': 17000, 'ap_age': 19000}


In [21]:
# Vectorizer and classifier used to predict the author attributes (gender / ap_age).
vect = TfidfVectorizer()
clf = LogisticRegression(class_weight='balanced')

# estimated
#k_best_dict = {'gender': 1800,
#               'ap_age': 9800}

In [22]:
# The TF-IDF vocabulary is learned from df_ca only; df_aa is projected onto it.
text_vectorized_train = vect.fit_transform(df_ca['text'])
text_vectorized_test = vect.transform(df_aa['text'])

In [23]:
# NOTE(review): df_test_ca is created again from scratch at the top of the
# attribute-prediction loop cell further down, so this cell looks redundant.
df_test_ca = pd.DataFrame()
df_test_ca['id_author'] = df_aa['id_author']

In [24]:
#df_ca.head()
#df_ca['text'].iloc[0]
#df_ca.shape
df_ca['id_author'].unique()

array(['1000331', '1000866', '1004904', ..., '998237', '998966', '999503'],
      dtype=object)

In [25]:
# One row per df_aa author; the prediction columns are appended in the loop below.
# The ids go through list() so the frame gets a fresh 0..n-1 index, which the
# index-based join at the end of the loop relies on.
df_test_ca = pd.DataFrame()
df_test_ca['id_author'] = list(df_aa['id_author'])

# Header now lists the metrics in the order they are actually printed below
# (it previously announced accuracy first and f1 last).
print('ca;f1;precision_macro;recall_macro;accuracy')

for ca_var in k_best_dict:

    # Keep the k highest-scoring TF-IDF features for this attribute
    # (selection is fitted on the df_ca training matrix only).
    k_best = k_best_dict[ca_var]
    sel = SelectKBest(k = k_best)
    ft = sel.fit(text_vectorized_train, df_ca[ca_var])
    train_best = ft.transform(text_vectorized_train)

    clf.fit(train_best, df_ca[ca_var])

    test_best = ft.transform(text_vectorized_test)

    predicted = clf.predict(test_best)  # predicted class label
    predicted_prob = clf.predict_proba(test_best)  # per-class probabilities

    f1, precision, recall, accuracy = eval_measures(df_aa[ca_var], predicted)

    print(pd.DataFrame(data=[[ca_var, f1, precision, recall, accuracy]],
             columns=['ca_var', 'f1', 'precision', 'recall', 'accuracy']))

    df_test_ca[ca_var + '_predict'] = predicted

    # Probability columns are named '<ca_var>_<position>' (e.g. gender_0, gender_1)
    # and joined on the row index.
    df_temp = pd.DataFrame(predicted_prob)
    df_temp = df_temp.add_prefix(ca_var + '_')
    df_test_ca = df_test_ca.join(df_temp)

ca;accuracy;precision_macro;recall_macro;f1_macro
   ca_var        f1  precision    recall  accuracy
0  gender  0.649123   0.666667  0.661616      0.65
   ca_var       f1  precision    recall  accuracy
0  ap_age  0.76122   0.731481  0.814815      0.75


In [26]:
# Random baseline that draws labels according to the class distribution of df_ca.
# random_state is fixed so the baseline column is reproducible across runs.
clf_dummy = DummyClassifier(strategy='stratified', random_state=0) #most_frequent, #stratified
for ca_var in k_best_dict:
    clf_dummy.fit(df_ca['id_author'].values.reshape(-1,1), df_ca[ca_var].values)
    predicted = clf_dummy.predict(df_aa['id_author'].values.reshape(-1,1))
    df_test_ca[ca_var + '_dummy_stratified'] = predicted

In [27]:
# Majority-class baseline: one '<ca_var>_dummy_most_frequent' column per attribute,
# fitted on df_ca and predicted for the df_aa authors.
clf_dummy = DummyClassifier(strategy='most_frequent') #most_frequent, #stratified
for ca_var in k_best_dict:
    clf_dummy.fit(df_ca['id_author'].values.reshape(-1,1), df_ca[ca_var].values)
    predicted = clf_dummy.predict(df_aa['id_author'].values.reshape(-1,1))
    df_test_ca[ca_var + '_dummy_most_frequent'] = predicted

In [28]:
# Separate copy used for the baseline evaluation, so df_test_ca itself is not modified by it.
df_test_ca_baseline = df_test_ca.copy() 

In [29]:
# Attach the true gender / ap_age labels of each df_aa author to the predictions.
df_test_ca_baseline = df_test_ca_baseline.merge(df_aa[['id_author', 'gender', 'ap_age']],
                          left_on='id_author', right_on='id_author', how='left')

In [30]:
#baseline CA
#print('ca;f1;precision_macro;recall_macro;accuracy')

# Scores the majority-class dummy column of each attribute against the true labels.
for ca_var in k_best_dict:
    
    f1, precision, recall, accuracy = eval_measures(df_test_ca_baseline[ca_var], df_test_ca_baseline[ca_var + '_dummy_most_frequent'])

    print(pd.DataFrame(data=[[ca_var, f1, precision, recall, accuracy]], 
             columns=['ca_var', 'f1', 'precision', 'recall', 'accuracy']))

   ca_var        f1  precision  recall  accuracy
0  gender  0.310345      0.225     0.5      0.45
   ca_var        f1  precision    recall  accuracy
0  ap_age  0.060606   0.033333  0.333333       0.1


In [31]:
# Explode each df_aa author's concatenated text into one row per '|<->|'-separated
# segment. The per-author frames are collected in a list and concatenated once:
# growing a DataFrame with .append inside a loop copies it on every iteration and
# DataFrame.append no longer exists in pandas 2.
author_frames = []

for author_text, author_id in zip(df_aa['text'], df_aa['id_author']):
    text_list = author_text.split('|<->|')
    author_frames.append(pd.DataFrame({'text': text_list,
                                       'id_author': [author_id] * len(text_list)},
                                      columns=['text', 'id_author']))

# Row labels restart at 0 for every author, exactly as with the former append loop.
if author_frames:
    df_final = pd.concat(author_frames)
else:
    # No selected authors: keep the expected columns so later cells still work.
    df_final = pd.DataFrame(columns=['text', 'id_author'])

In [32]:
# Keep only the segments longer than 20 characters.
df_final = df_final[df_final['text'].apply(len)>20]

In [33]:
# .copy() makes df_final_aa an independent frame, so the dtype conversion below is
# not an assignment on a filtered slice of df_final (SettingWithCopyWarning).
df_final_aa = df_final[df_final['id_author'].isin(list_authors_aa_selection)].copy()
df_final_aa['id_author'] = df_final_aa['id_author'].astype(int)

In [34]:
# Convert the merge key to int on both frames so it matches df_final_aa['id_author'].
df_test_ca['id_author'] = df_test_ca['id_author'].astype(int)
df['id_author'] = df['id_author'].astype(int)

In [35]:
# Attach the per-author predicted / dummy columns, then the true gender and ap_age.
# NOTE(review): df_final_aa is reassigned from itself, so running this cell twice
# merges twice; re-run the cells that build df_final_aa before re-running this one.
df_final_aa = df_final_aa.merge(df_test_ca, how='left', left_on='id_author', right_on='id_author')
df_final_aa = df_final_aa.merge(df[['id_author', 'gender','ap_age']], how='left', left_on='id_author', right_on='id_author')

In [36]:
# Second-level classifier fitted inside run_simulation on the stacked features.
clf_final = LogisticRegression(random_state=42, multi_class='multinomial', solver='newton-cg')
# NOTE(review): `scoring` is not referenced by any of the cells shown in this notebook.
scoring = ['accuracy','precision_macro', 'recall_macro', 'f1_macro']

In [37]:
df_final_aa.shape

(32652, 15)

In [49]:
df_final_aa.drop_duplicates(subset=['id_author'])[['id_author', 'gender', 'ap_age']]

Unnamed: 0,id_author,gender,ap_age
0,106651,1,1
1245,1417798,0,2
2290,1470319,0,1
3405,1472995,0,1
4520,1596188,0,2
5506,1784456,0,0
7282,1975546,0,1
9523,2200026,1,2
10649,271835,1,1
11806,2866266,1,0


In [38]:
def _stack_meta_features(pipelines, frame, ca_list):
    """Build the second-level feature matrix for the rows of ``frame``.

    The class probabilities of every fitted pipeline are placed side by side,
    followed by the ``ca_list`` columns of ``frame`` (if any). A plain NumPy
    array is returned so the final classifier never sees a DataFrame whose
    column labels mix integers and strings.
    """
    blocks = [p.predict_proba(frame['text']) for p in pipelines]
    if len(ca_list) > 0:
        blocks.append(frame[list(ca_list)].values)
    return np.hstack(blocks)


def run_simulation(df_train, author, simulation, ca_list, author_list):
    """Run one stacked authorship-attribution experiment and return its scores.

    The rows of ``df_train`` are split 70/30 (random split, seed drawn from
    ``rand``). The three text pipelines are fitted on the training texts; their
    class probabilities, plus the ``ca_list`` columns, feed ``clf_final``, which
    is then scored on the held-out rows.

    Parameters
    ----------
    df_train : DataFrame
        Must contain 'text', 'id_author' and every column named in ``ca_list``.
    author : int
        Number of authors in this run; stored in the 'autors_selected' column.
    simulation : int
        Repetition number; stored in the 'simulation' column.
    ca_list : list of str
        Extra feature columns appended to the pipeline probabilities. Every
        listed column is used (the former if-chain silently ignored names it
        did not know).
    author_list : list
        Authors in this run; stored as a string for traceability.

    Returns
    -------
    DataFrame
        A single row with columns 'autors_selected', 'f1', 'precision',
        'recall', 'accuracy', 'simulation', 'ca_list', 'author_list'. If
        anything fails, the traceback is printed and the first five values
        are 0.0.

    Notes
    -----
    Refits the module-level ``pipelineCharacter``, ``pipelineObfuscator``,
    ``pipelineWord`` and ``clf_final`` objects in place.
    """
    result_columns = ['autors_selected', 'f1', 'precision', 'recall', 'accuracy',
                      'simulation', 'ca_list', 'author_list']
    ### train
    try:
        x_train, x_test, y_train, y_test = train_test_split(
            df_train[['text'] + ca_list],
            df_train['id_author'],
            test_size=0.3, random_state=rand.randint(0,900))

        pipelines = [pipelineCharacter, pipelineObfuscator, pipelineWord]
        for p in pipelines:
            p.fit(x_train['text'], y_train)

        clf_final.fit(_stack_meta_features(pipelines, x_train, ca_list), y_train)

        #test
        test_pred = clf_final.predict(_stack_meta_features(pipelines, x_test, ca_list))

        f1, precision, recall, accuracy = eval_measures(y_test, test_pred)
        # The timestamp is taken here instead of from a global `now` set by the
        # calling cell, so the function also works when called on its own.
        finished_at = datetime.now()
        print("Executado para : " + str(author) + ' autores - ' + str(finished_at.strftime("%d/%m/%Y %H:%M:%S")))

        return pd.DataFrame(data=[[author, f1, precision, recall, accuracy, simulation, str(ca_list), str(author_list)]],
                 columns=result_columns)
    except Exception:
        # print_exc() writes the traceback itself and returns None; wrapping it
        # in print() only added a stray "None" line.
        traceback.print_exc()
        # Best effort: a failed run is recorded as an all-zero row so the outer
        # loop can continue. Rows with autors_selected == 0.0 mark failures.
        return pd.DataFrame(data=[[0.0, 0.0, 0.0, 0.0, 0.0, simulation, str(ca_list), str(author_list)]],
                 columns=result_columns)

In [39]:
# Lists of df_final_aa column names that are passed as `ca_list` to run_simulation.

# no extra columns: only the text-pipeline probabilities are used
ca_list_11 = []

# individual
# true attribute labels
ca_list_02 = ['gender']
ca_list_04 = ['ap_age']

# labels predicted by the attribute classifier
ca_list_12 = ['gender_predict']
ca_list_14 = ['ap_age_predict']

# labels produced by the stratified dummy classifier
ca_list_221 = ['gender_dummy_stratified']
ca_list_241 = ['ap_age_dummy_stratified']

# labels produced by the most-frequent dummy classifier
ca_list_222 = ['gender_dummy_most_frequent']
ca_list_242 = ['ap_age_dummy_most_frequent']

# per-class probabilities produced by the attribute classifier
ca_list_32 = ['gender_0', 'gender_1']
ca_list_34 = ['ap_age_0', 'ap_age_1', 'ap_age_2']

In [40]:
#df_filter = df_final_aa[df_final_aa['id_author'].isin(author_list_filter)]
#rand.shuffle(list_authors_aa_selection)
#author_list_filter = list_authors_aa_selection[:2]

#run_simulation(df_filter, 1, 1, ca_list_36, author_list_filter)

In [101]:
# Feature sets evaluated for every author subset, in execution order.
# The dummy-classifier variants (ca_list_221/222/241/242) are left out.
simulation_ca_lists = [
    ca_list_11,  # text pipelines only
    ca_list_02,  # true gender
    ca_list_12,  # predicted gender
    ca_list_32,  # gender probabilities
    ca_list_04,  # true ap_age
    ca_list_14,  # predicted ap_age
    ca_list_34,  # ap_age probabilities
]

# Result rows are collected in a list and concatenated once per repetition;
# DataFrame.append copied the whole frame on every call and is gone in pandas 2.
metric_frames = []
df_metrics = pd.DataFrame()

for i in range(0, 20):
    # Each repetition draws a new random author order (shuffles the list in place).
    rand.shuffle(list_authors_aa_selection)

    # Timestamp of this repetition; used in the progress message and the file name.
    now = datetime.now()

    print("Executing simulation number: " + str(i) + " data: " + str(now.strftime("%d/%m/%Y %H:%M:%S")))

    # Author subsets of size 2, 4, ..., limit_authors_aa_selection - 10.
    for j in range(2, limit_authors_aa_selection -10 + 2, 2):

        author_list_filter = list_authors_aa_selection[:j]

        # df_final_aa['id_author'] was converted to int while the selection list
        # still holds the original ids, so compare like with like.
        author_ids = [int(author_id) for author_id in author_list_filter]
        df_filter = df_final_aa[df_final_aa['id_author'].isin(author_ids)]

        for ca_features in simulation_ca_lists:
            metric_frames.append(run_simulation(df_filter, j, i, ca_features, author_list_filter))

    # One cumulative spreadsheet per repetition, named after the repetition's start time.
    if metric_frames:
        df_metrics = pd.concat(metric_frames)
    df_metrics.to_excel(os.path.join('output', 'kbest_proprio_' + str(now.strftime("%d_%m_%Y__%H_%M_%S")) + ".xlsx"))

        #df_metrics = df_metrics.append(run_simulation(df_train, j, i, ca_list))
        #print(str(authors_shuffle[:j]))


Executing simulation number: 0 data: 04/04/2020 20:44:22
Executado para : 2 autores - 04_04_2020__20_44_22
Executado para : 2 autores - 04_04_2020__20_44_22
Executado para : 2 autores - 04_04_2020__20_44_22
Executado para : 2 autores - 04_04_2020__20_44_22
Executado para : 2 autores - 04_04_2020__20_44_22
Executado para : 2 autores - 04_04_2020__20_44_22
Executado para : 2 autores - 04_04_2020__20_44_22
Executado para : 2 autores - 04_04_2020__20_44_22
Executado para : 2 autores - 04_04_2020__20_44_22
Executado para : 2 autores - 04_04_2020__20_44_22
Executado para : 2 autores - 04_04_2020__20_44_22
Executado para : 4 autores - 04_04_2020__20_44_22
Executado para : 4 autores - 04_04_2020__20_44_22
Executado para : 4 autores - 04_04_2020__20_44_22
Executado para : 4 autores - 04_04_2020__20_44_22
Executado para : 4 autores - 04_04_2020__20_44_22
Executado para : 4 autores - 04_04_2020__20_44_22
Executado para : 4 autores - 04_04_2020__20_44_22
Executado para : 4 autores - 04_04_2020__20

KeyboardInterrupt: 

### Teste de McNemar

In [41]:
from statsmodels.stats.contingency_tables import mcnemar

In [42]:
def _fit_stacked_predictions(base_train, base_test, x_train, x_test, y_train, ca_columns):
    """Fit a fresh final classifier and return its predictions for the test rows.

    The features are the base-pipeline probabilities (``base_train`` /
    ``base_test``) followed by the ``ca_columns`` of ``x_train`` / ``x_test``.
    """
    final_clf = LogisticRegression(random_state=42, multi_class='multinomial', solver='newton-cg')
    train_blocks, test_blocks = [base_train], [base_test]
    if len(ca_columns) > 0:
        train_blocks.append(x_train[list(ca_columns)].values)
        test_blocks.append(x_test[list(ca_columns)].values)
    final_clf.fit(np.hstack(train_blocks), y_train)
    return final_clf.predict(np.hstack(test_blocks))


def test_mcnemar(j, ca_classf_list_1, ca_classf_list_2):
    """Compare two stacked classifiers with McNemar's test and print the outcome.

    Both classifiers share the same 70/30 split (random_state=42) of the texts
    written by the first ``j`` authors of ``list_authors_aa_selection`` and the
    same base pipelines; they differ only in the extra columns
    (``ca_classf_list_1`` vs ``ca_classf_list_2``) appended to the pipeline
    probabilities.

    Prints both accuracies, the 2x2 table (0 = correct, 1 = wrong; rows are
    classifier 1, columns are classifier 2), the test statistic, the p-value
    and the decision at alpha = 0.05. Returns None.

    Notes
    -----
    Refits the module-level ``pipelineCharacter``, ``pipelineObfuscator`` and
    ``pipelineWord`` objects in place.
    """
    author_list_filter = list_authors_aa_selection[:j]
    # df_final_aa['id_author'] is int while the selection list holds the original ids.
    author_ids = [int(author_id) for author_id in author_list_filter]
    df_filter = df_final_aa[df_final_aa['id_author'].isin(author_ids)]

    # A single split serves both classifiers: the split depends only on the
    # number of rows and random_state, so it matches the two separate splits
    # that used to be made.
    x_train, x_test, y_train, y_test = train_test_split(df_filter,
                                                        df_filter['id_author'],
                                                        test_size=0.3, random_state=42)

    # The base pipelines have no unseeded randomness and see the same training
    # data for both classifiers, so they are fitted once instead of twice.
    pipelines = [pipelineCharacter, pipelineObfuscator, pipelineWord]
    for p in pipelines:
        p.fit(x_train['text'], y_train)

    base_train = np.hstack([p.predict_proba(x_train['text']) for p in pipelines])
    base_test = np.hstack([p.predict_proba(x_test['text']) for p in pipelines])

    test_pred = _fit_stacked_predictions(base_train, base_test, x_train, x_test,
                                         y_train, ca_classf_list_1)
    test_pred2 = _fit_stacked_predictions(base_train, base_test, x_train, x_test,
                                          y_train, ca_classf_list_2)

    df_mcnemar_test = pd.DataFrame()
    df_mcnemar_test['model_pred_classif_1'] = test_pred
    df_mcnemar_test['model_pred_classif_2'] = test_pred2
    df_mcnemar_test['original_label'] = y_test.values

    # Error indicators: 0 = prediction correct, 1 = prediction wrong.
    df_mcnemar_test['classf_1'] = np.where(df_mcnemar_test['model_pred_classif_1']==df_mcnemar_test['original_label'], 0, 1)
    df_mcnemar_test['classf_2'] = np.where(df_mcnemar_test['model_pred_classif_2']==df_mcnemar_test['original_label'], 0, 1)

    print("classf_1 acc: " + str(1-sum(df_mcnemar_test['classf_1'])/len(df_mcnemar_test)))
    print("classf_2 acc: " + str(1-sum(df_mcnemar_test['classf_2'])/len(df_mcnemar_test)))

    data_crosstab = pd.crosstab(df_mcnemar_test['classf_1'],
                                df_mcnemar_test['classf_2'],
                                margins = False)
    # Force a full 2x2 table even when one classifier makes no errors (or only errors).
    data_crosstab = data_crosstab.reindex(index=[0, 1], columns=[0, 1], fill_value=0)
    print(data_crosstab)

    # Chi-square approximation only when every cell exceeds 25; otherwise the
    # exact binomial version of the test.
    all_cells_large = bool((data_crosstab.values > 25).all())

    if all_cells_large:
        result = mcnemar(data_crosstab, exact=False, correction=True)
        print("Todos os termos maiores que 25")
    else:
        print("Algum termo menor que 25")
        result = mcnemar(data_crosstab, exact=True)

    print('statistic=%.3f, p-value=%.3f' % (result.statistic, result.pvalue))
    # interpret the p-value
    alpha = 0.05
    if result.pvalue > alpha:
        print('Same proportions of errors (fail to reject H0)')
    else:
        print('Different proportions of errors (reject H0)')

In [43]:
test_mcnemar(20, ca_list_11, ca_list_14)

classf_1 acc: 0.4923438138015517
classf_2 acc: 0.5261331155573703
classf_2     0     1
classf_1            
0         4822     1
1          332  4641
Algum termo menor que 25
statistic=1.000, p-value=0.000
Different proportions of errors (reject H0)


In [44]:
test_mcnemar(20, ca_list_11, ca_list_34)

classf_1 acc: 0.4923438138015517
classf_2 acc: 0.5164352797060024
classf_2     0     1
classf_1            
0         4821     2
1          238  4735
Algum termo menor que 25
statistic=2.000, p-value=0.000
Different proportions of errors (reject H0)


In [45]:
test_mcnemar(20, ca_list_14, ca_list_34)

classf_1 acc: 0.5261331155573703
classf_2 acc: 0.5164352797060024
classf_2     0     1
classf_1            
0         4990   164
1           69  4573
Todos os termos maiores que 25
statistic=37.923, p-value=0.000
Different proportions of errors (reject H0)
