In [1]:
import os
import pandas as pd
import numpy as np
import re
import itertools
from datetime import datetime
import traceback
import pickle
import warnings

#model valuation
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import f_classif
from sklearn.model_selection import GridSearchCV

import random as rand

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import normalize, Normalizer, MaxAbsScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.dummy import DummyClassifier

In [2]:
import platform
import scipy
import sklearn
print(platform.platform())
print("NumPy", np.__version__)
print("SciPy", scipy.__version__)
print("Scikit-Learn", sklearn.__version__)

Linux-4.4.0-1112-aws-x86_64-with-debian-stretch-sid
NumPy 1.14.3
SciPy 1.1.0
Scikit-Learn 0.19.1


In [3]:
from sklearn.base import BaseEstimator
from scipy.sparse import issparse

class DenseTransformer(BaseEstimator):
    def __init__(self, return_copy=True):
        self.return_copy = return_copy
        self.is_fitted = False

    def transform(self, X, y=None):
        if issparse(X):
            return X.toarray()
        elif self.return_copy:
            return X.copy()
        else:
            return X

    def fit(self, X, y=None):
        self.is_fitted = True
        return self

    def fit_transform(self, X, y=None):
        return self.transform(X=X, y=y)

In [4]:
from sklearn.base import BaseEstimator
from scipy.sparse import issparse

class ObfuscationTransformer(BaseEstimator):
    def __init__(self,re_from=r'(\b)(\w{0,2})\w+(\w{1,3})(\b)', re_to=r'\1\2XX\3\4', return_copy=True):
        self.re_from = re_from
        self.re_to = re_to

    def transform(self, X, y=None):
        X = np.array(X).copy();
        for i in range(len(X)):
            X[i] = re.sub(self.re_from,self.re_to, X[i])
        
        return X;

    def fit(self, X, y=None):
        return self

    def fit_transform(self, X, y=None):
        return self.transform(X=X, y=y)

In [5]:
import warnings
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import LabelEncoder

def eval_measures(gt, pred):
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        f1 = f1_score(gt,
                  pred,
                  labels=list(set(gt)),
                  average='macro')
        precision = precision_score(gt,
                  pred,
                  labels=list(set(gt)),
                  average='macro')
        recall = recall_score(gt,
                  pred,
                  labels=list(set(gt)),
                  average='macro')
        accuracy = accuracy_score(gt,
                  pred)

    return f1,precision,recall,accuracy

In [6]:
pipelineCharacter = Pipeline([
    ('vect',   TfidfVectorizer(
            analyzer='char',
            min_df=0.05,
            max_df=1.0,
            ngram_range=(2,5),
            lowercase=False,
            norm='l2',
            sublinear_tf=True)),
    ('dense',  DenseTransformer()),
    ('scaler', MaxAbsScaler()),
    ('transf', PCA(0.999)),
    ('clf', LogisticRegression(random_state=0,multi_class='multinomial', solver='newton-cg')),
])

pipelineObfuscator = Pipeline([
        ('obs',ObfuscationTransformer(re_from=r'\w',re_to='x')),
        ('vect',   TfidfVectorizer(
                analyzer='char',
                min_df=0.05,
                max_df=1.0,
                ngram_range=(2,5),
                lowercase=False,
                norm='l2',
                sublinear_tf=True)),
        ('dense',  DenseTransformer()),
        ('scaler', MaxAbsScaler()),
        ('transf', PCA(0.999)),
        ('clf', LogisticRegression(random_state=0,multi_class='multinomial', solver='newton-cg')),
    ])

pipelineWord = Pipeline([
        ('vect',   TfidfVectorizer(
                analyzer='word',
                min_df=0.05,
                max_df=1.0,
                ngram_range=(1,3),
                lowercase=True,
                norm='l2',
                sublinear_tf=True)),
        ('dense',  DenseTransformer()),
        ('scaler', MaxAbsScaler()),
        ('transf', PCA(0.999)),
        ('clf', LogisticRegression(random_state=0, multi_class='multinomial', solver='newton-cg')),
    ]);

In [301]:
language = 'fr'

corpus_type = 'tweets_' + language + '.csv'

os.chdir('/home/ubuntu/lab/Experimentos/experimento_3_twisty')
df = pd.read_csv('input/' + corpus_type, sep=';', encoding='cp1252')

In [302]:
df = df.rename(columns={'author': 'id_author'})

In [303]:
df_count = df.groupby('id_author').size().reset_index(name='counts')

In [304]:
df['gender'] = np.where(df['gender']=='M', 1, 0)

In [305]:
#df['id_author'] = df['filename'].str.split('.', expand=True)[0]
#del df['filename']

In [306]:
df['text'] = df['text'].str.lower()

In [307]:
#df['count_delimiter'] = df['text'].str.split("\|<->\|").apply(len)

In [308]:
df_final_statistics = df.copy()
df_final_statistics['number_of_words'] = df['text'].apply(lambda x: len(x.split()))

In [309]:
print("Total de unidades: " + str(len(df_final_statistics)))
print("Total de palavras: " + str(sum(df_final_statistics['number_of_words'])))
print("Total de palavras/unidade: " + str(sum(df_final_statistics['number_of_words'])/len(df_final_statistics)))  

Total de unidades: 1044604
Total de palavras: 14002319
Total de palavras/unidade: 13.404427898036003


In [310]:
## selecao de autores com mais textos
limit_authors_aa_selection = 20
#list_authors_aa_selection = list(df_final['id_author'].value_counts()[:limit_authors_aa_selection].index)
list_authors_aa_selection = list(df_count.sort_values(by='counts', ascending=False)[:limit_authors_aa_selection]['id_author'])

In [311]:
#Divisao de bases
df_aa = df[df['id_author'].isin(list_authors_aa_selection)]
df_ca = df[~df['id_author'].isin(list_authors_aa_selection)]

In [312]:
df_ca_group = df_ca.groupby(['id_author'])['text'].apply('|<->|'.join).reset_index()
df_ca_group = df_ca_group.merge(df_ca[['id_author', 'gender']].drop_duplicates())

df_aa_group = df_aa.groupby(['id_author'])['text'].apply('|<->|'.join).reset_index()
df_aa_group = df_aa_group.merge(df_aa[['id_author', 'gender']].drop_duplicates())

In [313]:
df_ca = df_ca_group
df_aa = df_aa_group

In [314]:
pipeline_kbest = Pipeline([
    ('vect', TfidfVectorizer()), 
    ('k_best', SelectKBest(f_classif)),
    ('clf', LogisticRegression(random_state=42, n_jobs=6))
])
    
parameters_kbest = {    
    'k_best__k': (range(3000,20000,1000))
}

grid_search_resp = GridSearchCV(pipeline_kbest,
                               parameters_kbest,
                               cv=10,
                               scoring='f1_macro',
                               n_jobs=4,
                               verbose=10
                               )

In [315]:
ca_list = ['gender']

In [316]:
#k_best_dict = {}

#for ca in ca_list:
   
    #warnings.filterwarnings("ignore")
#    grid_search_resp.fit(df_ca['text'], df_ca_group[ca])
#    print('Finalizado, tarefa: ' + str(ca))

#    k_best_dict.update({ca: grid_search_resp.best_estimator_.get_params()['k_best__k']})
#print(k_best_dict)

#### O resultado deste for é:
#k_best_dict = { #'gender': 8000, ## ES,
                #'gender': 8000, ## DE,
                #'gender': 3000, ## IT,
                #'gender': 6000, ## NL,
                #'gender': 6000, ## PT,
                #'gender': 19000, ## FR
#}

if language=='es':
    k_best_dict = {'gender': 8000} ## ES
if language=='de':
    k_best_dict = {'gender': 8000} ## DE
if language=='it':
    k_best_dict = {'gender': 3000} ## IT
if language=='nl':
    k_best_dict = {'gender': 6000} ## NL
if language=='pt':
    k_best_dict = {'gender': 6000} ## PT
if language=='fr':
    k_best_dict = {'gender': 19000} ## FR
    


In [317]:
print(k_best_dict)

{'gender': 19000}


In [318]:
vect = TfidfVectorizer()
clf = LogisticRegression(class_weight='balanced')

# estimado
#k_best_dict = {'gender': 1800,
#               'ap_age': 9800}

In [319]:
text_vectorized_train = vect.fit_transform(df_ca['text'])
text_vectorized_test = vect.transform(df_aa['text'])

In [320]:
df_test_ca = pd.DataFrame()
df_test_ca['id_author'] = df_aa['id_author']

In [321]:
df_aa[['id_author', 'gender']]

Unnamed: 0,id_author,gender
0,6364932,1
1,8421442,0
2,27342429,1
3,35814511,0
4,55523877,1
5,76294960,0
6,113392228,0
7,172903185,0
8,296921560,0
9,338349721,1


In [300]:
#df_ca.head()
#df_ca['text'].iloc[0]
#df_ca.shape
#df_ca['id_author'].unique()

In [257]:
df_test_ca = pd.DataFrame()
df_test_ca['id_author'] = list(df_aa['id_author'])

#print('ca;accuracy;precision_macro;recall_macro;f1_macro')

for ca_var in k_best_dict:

    k_best = k_best_dict[ca_var]
    sel = SelectKBest(k = k_best)
    ft = sel.fit(text_vectorized_train, df_ca[ca_var])
    train_best = ft.transform(text_vectorized_train)

    clf.fit(train_best, df_ca[ca_var])

    test_best = ft.transform(text_vectorized_test)

    predicted = clf.predict(test_best) #com variavel categorica
    predicted_prob = clf.predict_proba(test_best) #com probabilidade de variavel
    
    f1, precision, recall, accuracy = eval_measures(df_aa[ca_var], predicted)

    print(pd.DataFrame(data=[[ca_var, f1, precision, recall, accuracy]], 
             columns=['ca_var', 'f1', 'precision', 'recall', 'accuracy']))

    df_test_ca[ca_var + '_predict'] = predicted
    
    # probs
    df_temp = pd.DataFrame(predicted_prob)
    df_temp = df_temp.add_prefix(ca_var + '_')
    df_test_ca = df_test_ca.join(df_temp)

   ca_var        f1  precision    recall  accuracy
0  gender  0.435737   0.460784  0.479167      0.55


In [258]:
clf_dummy = DummyClassifier(strategy='stratified') #most_frequent, #stratified
for ca_var in k_best_dict:
    clf_dummy.fit(df_ca['id_author'].values.reshape(-1,1), df_ca[ca_var].values)
    predicted = clf_dummy.predict(df_aa['id_author'].values.reshape(-1,1))
    df_test_ca[ca_var + '_dummy_stratified'] = predicted

In [259]:
clf_dummy = DummyClassifier(strategy='most_frequent') #most_frequent, #stratified
for ca_var in k_best_dict:
    clf_dummy.fit(df_ca['id_author'].values.reshape(-1,1), df_ca[ca_var].values)
    predicted = clf_dummy.predict(df_aa['id_author'].values.reshape(-1,1))
    df_test_ca[ca_var + '_dummy_most_frequent'] = predicted

In [260]:
df_test_ca_baseline = df_test_ca.copy() 

In [261]:
df_test_ca_baseline = df_test_ca_baseline.merge(df_aa[['id_author', 'gender']],
                          left_on='id_author', right_on='id_author', how='left')

In [262]:
#baseline CA
#print('ca;f1;precision_macro;recall_macro;accuracy')

for ca_var in k_best_dict:
    
    f1, precision, recall, accuracy = eval_measures(df_test_ca_baseline[ca_var], df_test_ca_baseline[ca_var + '_dummy_most_frequent'])

    print(pd.DataFrame(data=[[ca_var, f1, precision, recall, accuracy]], 
             columns=['ca_var', 'f1', 'precision', 'recall', 'accuracy']))

   ca_var     f1  precision  recall  accuracy
0  gender  0.375        0.3     0.5       0.6


In [263]:
#df_final = df_aa

In [264]:
df_final = pd.DataFrame()

for i in range(0, len(df_aa)):
    text_list = df_aa['text'].iloc[i].split('|<->|')
    author_list = [df_aa['id_author'].iloc[i]]*len(text_list)
    df_inter = pd.DataFrame(data=[text_list, author_list]).T
    df_inter = df_inter.rename(columns={0: 'text', 1: 'id_author'})
    
    df_final = df_final.append(df_inter)

In [265]:
df_final = df_final[df_final['text'].apply(len)>20]

In [266]:
df_final.shape

(50606, 2)

In [267]:
df_final_aa = df_final[df_final['id_author'].isin(list_authors_aa_selection)]
df_final_aa['id_author'] = df_final_aa['id_author'].astype(int)

In [268]:
df_test_ca['id_author'] = df_test_ca['id_author'].astype(int)
df['id_author'] = df['id_author'].astype(int)

In [269]:
df_final_aa = df_final_aa.merge(df_test_ca, how='left', left_on='id_author', right_on='id_author')
#df_final_aa = df_final_aa.merge(df[['id_author', 'gender']], how='left', left_on='id_author', right_on='id_author')

In [270]:
df_final_aa.head()

Unnamed: 0,text,id_author,gender_predict,gender_0,gender_1,gender_dummy_stratified,gender_dummy_most_frequent
0,"@alexvansteen bier is frisdrank, wijn is goden...",12369212,0,0.527632,0.472368,0,0
1,@janroegiers veel plezier daar. mooie stad,12369212,0,0.527632,0.472368,0,0
2,bedankt regen voor het fijne welkomstcomité hi...,12369212,0,0.527632,0.472368,0,0
3,valies uitpakken en een beetje acclimatiseren ...,12369212,0,0.527632,0.472368,0,0
4,"@pieternagels dank u, ik kijk uit naar onze we...",12369212,0,0.527632,0.472368,0,0


In [271]:
#df_final_aa['id_author'].unique()
df_final_aa.shape

(50606, 7)

In [272]:
#df_test_ca['id_author'].unique()
df_test_ca.head()

Unnamed: 0,id_author,gender_predict,gender_0,gender_1,gender_dummy_stratified,gender_dummy_most_frequent
0,12369212,0,0.527632,0.472368,0,0
1,12749932,0,0.528933,0.471067,1,0
2,14141508,0,0.534758,0.465242,1,0
3,15280197,0,0.518255,0.481745,0,0
4,19082430,0,0.520016,0.479984,1,0


In [273]:
clf_final = LogisticRegression(random_state=42, multi_class='multinomial', solver='newton-cg')
scoring = ['accuracy','precision_macro', 'recall_macro', 'f1_macro']

In [274]:
df_final_aa.drop_duplicates(subset=['id_author'])
#[['id_author', 'gender']]

Unnamed: 0,text,id_author,gender_predict,gender_0,gender_1,gender_dummy_stratified,gender_dummy_most_frequent
0,"@alexvansteen bier is frisdrank, wijn is goden...",12369212,0,0.527632,0.472368,0,0
2811,wat is een gebeurtenis die erg bepalend is gew...,12749932,0,0.528933,0.471067,1,0
5204,leest: 'elke tweede verkochte auto is duits' ...,14141508,0,0.534758,0.465242,1,0
7752,dus rt @josheymans: top van lansschot krijgt v...,15280197,0,0.518255,0.481745,0,0
10191,eerlijke bankwijzer over #triodos. veruit te v...,19082430,0,0.520016,0.479984,1,0
12768,"@levensdocument dankjewel, carolien voelt heel...",81581036,0,0.530692,0.469308,1,0
15301,was een gezellig avondje met @laisabje @larisj...,95502094,0,0.595105,0.404895,1,0
17511,@robertmazier geen kwaad woord over kliederkerk,101546714,1,0.499577,0.500423,1,0
19903,@luciahardonk @elsbethgruteke @joosthsmit de l...,103001137,0,0.538818,0.461182,0,0
22355,"@silvanvijver ik ben er weer, dus jij mag zo b...",106374560,0,0.550053,0.449947,1,0


In [275]:
def run_simulation(df_train, author, simulation, ca_list, author_list):
    ### train
    try:
        x_train, x_test, y_train, y_test = train_test_split(df_train[['text'] + ca_list], 
                                                        df_train['id_author'], 
                                                    test_size=0.3,random_state=rand.randint(0,900))

        pipelines = [pipelineCharacter, pipelineObfuscator, pipelineWord]
        for p in pipelines:
            p.fit(x_train['text'], y_train)

        predict_text_train_aa_mix = np.hstack([p.predict_proba(x_train['text']) for p in pipelines])
        df_train_mix = pd.DataFrame(data=predict_text_train_aa_mix)
        if 'gender_0' in x_train.columns:
            df_train_mix['gender_0'] = x_train['gender_0'].values
        if 'gender_1' in x_train.columns:
            df_train_mix['gender_1'] = x_train['gender_1'].values
        if 'ap_age_0' in x_train.columns:
            df_train_mix['ap_age_0'] = x_train['ap_age_0'].values
        if 'ap_age_1' in x_train.columns:
            df_train_mix['ap_age_1'] = x_train['ap_age_1'].values
        if 'ap_age_2' in x_train.columns:
            df_train_mix['ap_age_2'] = x_train['ap_age_2'].values

        if 'gender' in x_train.columns:
            df_train_mix['gender'] = x_train['gender'].values
        if 'ap_age' in x_train.columns:
            df_train_mix['ap_age'] = x_train['ap_age'].values
            
        if 'gender_dummy_most_frequent' in x_train.columns:
            df_train_mix['gender_dummy_most_frequent'] = x_train['gender_dummy_most_frequent'].values
        if 'ap_age_dummy_most_frequent' in x_train.columns:
            df_train_mix['ap_age_dummy_most_frequent'] = x_train['ap_age_dummy_most_frequent'].values

        if 'gender_dummy_stratified' in x_train.columns:
            df_train_mix['gender_dummy_stratified'] = x_train['gender_dummy_stratified'].values
        if 'ap_age_dummy_stratified' in x_train.columns:
            df_train_mix['ap_age_dummy_stratified'] = x_train['ap_age_dummy_stratified'].values
            
        if 'gender_predict' in x_train.columns:
            df_train_mix['gender_predict'] = x_train['gender_predict'].values
        if 'ap_age_predict' in x_train.columns:
            df_train_mix['ap_age_predict'] = x_train['ap_age_predict'].values

        clf_final.fit(df_train_mix, y_train)

        #test
        predict_text_test_aa_mix = np.hstack([p.predict_proba(x_test['text']) for p in pipelines])
        df_test_mix = pd.DataFrame(data=predict_text_test_aa_mix)
        if 'gender_0' in x_test.columns:
            df_test_mix['gender_0'] = x_test['gender_0'].values
        if 'gender_1' in x_test.columns:
            df_test_mix['gender_1'] = x_test['gender_1'].values
        if 'ap_age_0' in x_test.columns:
            df_test_mix['ap_age_0'] = x_test['ap_age_0'].values
        if 'ap_age_1' in x_test.columns:
            df_test_mix['ap_age_1'] = x_test['ap_age_1'].values
        if 'ap_age_2' in x_test.columns:
            df_test_mix['ap_age_2'] = x_test['ap_age_2'].values

        if 'gender' in x_test.columns:
            df_test_mix['gender'] = x_test['gender'].values
        if 'ap_age' in x_test.columns:
            df_test_mix['ap_age'] = x_test['ap_age'].values
            
        if 'gender_dummy_most_frequent' in x_test.columns:
            df_test_mix['gender_dummy_most_frequent'] = x_test['gender_dummy_most_frequent'].values
        if 'ap_age_dummy_most_frequent' in x_test.columns:
            df_test_mix['ap_age_dummy_most_frequent'] = x_test['ap_age_dummy_most_frequent'].values

        if 'gender_dummy_stratified' in x_test.columns:
            df_test_mix['gender_dummy_stratified'] = x_test['gender_dummy_stratified'].values
        if 'ap_age_dummy_stratified' in x_test.columns:
            df_test_mix['ap_age_dummy_stratified'] = x_test['ap_age_dummy_stratified'].values

        if 'gender_predict' in x_test.columns:
            df_test_mix['gender_predict'] = x_test['gender_predict'].values
        if 'ap_age_predict' in x_test.columns:
            df_test_mix['ap_age_predict'] = x_test['ap_age_predict'].values
            
        test_pred = clf_final.predict(df_test_mix)

        f1, precision, recall, accuracy = eval_measures(y_test, test_pred)
        print("Executado para : " + str(author) + ' autores - ' + str(now.strftime("%d/%m/%Y %H:%M:%S")))

        return pd.DataFrame(data=[[author, f1, precision, recall, accuracy, simulation, str(ca_list), str(author_list)]], 
                 columns=['autors_selected', 'f1', 'precision', 'recall', 'accuracy', 'simulation', 'ca_list', 'author_list'])
    except Exception as e:
        #print('erro na simulacao')
        print(traceback.print_exc())
        return pd.DataFrame(data=[[0.0, 0.0, 0.0, 0.0, 0.0, simulation, str(ca_list), str(author_list)]], 
                 columns=['autors_selected', 'f1', 'precision', 'recall', 'accuracy', 'simulation', 'ca_list', 'author_list'])

In [93]:
ca_list_11 = []

#individuais
ca_list_02 = ['gender']
ca_list_04 = ['ap_age']

ca_list_12 = ['gender_predict']
ca_list_14 = ['ap_age_predict']

ca_list_221 = ['gender_dummy_stratified']
ca_list_241 = ['ap_age_dummy_stratified']

ca_list_222 = ['gender_dummy_most_frequent']
ca_list_242 = ['ap_age_dummy_most_frequent']

ca_list_32 = ['gender_0', 'gender_1']
ca_list_34 = ['ap_age_0', 'ap_age_1', 'ap_age_2']

In [44]:
#df_filter = df_final_aa[df_final_aa['id_author'].isin(author_list_filter)]
#rand.shuffle(list_authors_aa_selection)
#author_list_filter = list_authors_aa_selection[:2]

#run_simulation(df_filter, 1, 1, ca_list_36, author_list_filter)

In [45]:
df_metrics = pd.DataFrame()

for i in range(0, 20):
    rand.shuffle(list_authors_aa_selection)

    now = datetime.now()

    print("Executing simulation number: " + str(i) + " data: " + str(now.strftime("%d/%m/%Y %H:%M:%S")))

    for j in range(2, limit_authors_aa_selection + 2, 2):
        
        author_list_filter = list_authors_aa_selection[:j]

        df_filter = df_final_aa[df_final_aa['id_author'].isin(author_list_filter)]
        
        df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_11, author_list_filter)) # baseline
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_02, author_list_filter)) # baseline
        df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_12, author_list_filter)) # baseline
        df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_221, author_list_filter)) # baseline
        df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_222, author_list_filter)) # baseline
        df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_32, author_list_filter)) # baseline

        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_04, author_list_filter)) # baseline
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_14, author_list_filter)) # baseline
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_241, author_list_filter)) # baseline
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_242, author_list_filter)) # baseline
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_34, author_list_filter)) # baseline
    df_metrics.to_excel(os.path.join('output', 'kbest_proprio_' + language + "_" + str(now.strftime("%d_%m_%Y__%H_%M_%S")) + ".xlsx"))

        #df_metrics = df_metrics.append(run_simulation(df_train, j, i, ca_list))
        #print(str(authors_shuffle[:j]))


Executing simulation number: 0 data: 03/05/2020 18:17:14
Executado para : 2 autores - 03/05/2020 18:17:14
Executado para : 2 autores - 03/05/2020 18:17:14
Executado para : 2 autores - 03/05/2020 18:17:14
Executado para : 2 autores - 03/05/2020 18:17:14
Executado para : 2 autores - 03/05/2020 18:17:14
Executado para : 4 autores - 03/05/2020 18:17:14
Executado para : 4 autores - 03/05/2020 18:17:14
Executado para : 4 autores - 03/05/2020 18:17:14
Executado para : 4 autores - 03/05/2020 18:17:14
Executado para : 4 autores - 03/05/2020 18:17:14
Executado para : 6 autores - 03/05/2020 18:17:14
Executado para : 6 autores - 03/05/2020 18:17:14
Executado para : 6 autores - 03/05/2020 18:17:14
Executado para : 6 autores - 03/05/2020 18:17:14
Executado para : 6 autores - 03/05/2020 18:17:14
Executado para : 8 autores - 03/05/2020 18:17:14
Executado para : 8 autores - 03/05/2020 18:17:14
Executado para : 8 autores - 03/05/2020 18:17:14
Executado para : 8 autores - 03/05/2020 18:17:14
Executado pa

FileNotFoundError: [Errno 2] No such file or directory: 'output/kbest_proprio_es_03_05_2020__18_17_14.xlsx'

### Teste McNemar's

In [48]:
from statsmodels.stats.contingency_tables import mcnemar

In [49]:
def test_mcnemar(j, ca_classf_list_1, ca_classf_list_2):

    clf_final_classif_1 = LogisticRegression(random_state=42, multi_class='multinomial', solver='newton-cg')
    clf_final_classif_2 = LogisticRegression(random_state=42, multi_class='multinomial', solver='newton-cg')

    author_list_filter = list_authors_aa_selection[:j]

    df_filter = df_final_aa[df_final_aa['id_author'].isin(author_list_filter)]

    x_train, x_test, y_train, y_test = train_test_split(df_filter[['text'] + ca_classf_list_1], 
                                                    df_filter['id_author'], 
                                                test_size=0.3,random_state=42)

    pipelines = [pipelineCharacter, pipelineObfuscator, pipelineWord]

    for p in pipelines:
        p.fit(x_train['text'], y_train)

    predict_text_train_aa_mix = np.hstack([p.predict_proba(x_train['text']) for p in pipelines])

    df_train_mix = pd.DataFrame(data=predict_text_train_aa_mix)

    for ca in ca_classf_list_1:
        df_train_mix[ca] = x_train[ca].values

    clf_final_classif_1.fit(df_train_mix, y_train)

    predict_text_test_aa_mix = np.hstack([p.predict_proba(x_test['text']) for p in pipelines])
    df_test_mix = pd.DataFrame(data=predict_text_test_aa_mix)

    for ca in ca_classf_list_1:
        df_test_mix[ca] = x_test[ca].values

    test_pred = clf_final_classif_1.predict(df_test_mix)

    x_train, x_test, y_train, y_test = train_test_split(df_filter[['text'] + ca_classf_list_2], 
                                                    df_filter['id_author'], 
                                                test_size=0.3,random_state=42)

    pipelines = [pipelineCharacter, pipelineObfuscator, pipelineWord]

    for p in pipelines:
        p.fit(x_train['text'], y_train)

    predict_text_train_aa_mix = np.hstack([p.predict_proba(x_train['text']) for p in pipelines])

    df_train_mix = pd.DataFrame(data=predict_text_train_aa_mix)

    for ca in ca_classf_list_2:
        df_train_mix[ca] = x_train[ca].values

    clf_final_classif_2.fit(df_train_mix, y_train)

    predict_text_test_aa_mix = np.hstack([p.predict_proba(x_test['text']) for p in pipelines])
    df_test_mix = pd.DataFrame(data=predict_text_test_aa_mix)

    for ca in ca_classf_list_2:
        df_test_mix[ca] = x_test[ca].values

    test_pred2 = clf_final_classif_2.predict(df_test_mix)

    df_mcnemar_test = pd.DataFrame()
    df_mcnemar_test['model_pred_classif_1'] = test_pred
    df_mcnemar_test['model_pred_classif_2'] = test_pred2
    df_mcnemar_test['original_label'] = y_test.values

    df_mcnemar_test['classf_1'] = np.where(df_mcnemar_test['model_pred_classif_1']==df_mcnemar_test['original_label'], 0, 1)
    df_mcnemar_test['classf_2'] = np.where(df_mcnemar_test['model_pred_classif_2']==df_mcnemar_test['original_label'], 0, 1)

    print("classf_1 acc: " + str(1-sum(df_mcnemar_test['classf_1'])/len(df_mcnemar_test)))
    print("classf_2 acc: " + str(1-sum(df_mcnemar_test['classf_2'])/len(df_mcnemar_test)))

    data_crosstab = pd.crosstab(df_mcnemar_test['classf_1'],  
                                df_mcnemar_test['classf_2'], 
                                margins = False) 
    print(data_crosstab)

    
    count_contingence_table = np.where(data_crosstab>25, 1, 0).sum()
    
    if count_contingence_table==4:
        result = mcnemar(data_crosstab, exact=False, correction=True)
        print("Todos os termos maiores que 25")
    else:
        print("Algum termo menor que 25")
        result = mcnemar(data_crosstab, exact=True)

    print('statistic=%.3f, p-value=%.3f' % (result.statistic, result.pvalue))
    # interpret the p-value
    alpha = 0.05
    if result.pvalue > alpha:
        print('Same proportions of errors (fail to reject H0)')
    else:
        print('Different proportions of errors (reject H0)')

### Espanhol

In [50]:
test_mcnemar(20, ca_list_11, ca_list_12)

classf_1 acc: 0.5756411925530529
classf_2 acc: 0.6724384845723212
classf_2     0     1
classf_1            
0         8809    34
1         1521  4998
Todos os termos maiores que 25
statistic=1420.062, p-value=0.000
Different proportions of errors (reject H0)


In [51]:
test_mcnemar(20, ca_list_11, ca_list_32)

classf_1 acc: 0.5756411925530529
classf_2 acc: 0.7474287202187215
classf_2     0     1
classf_1            
0         8837     6
1         2645  3874
Algum termo menor que 25
statistic=6.000, p-value=0.000
Different proportions of errors (reject H0)


In [52]:
test_mcnemar(20, ca_list_12, ca_list_32)

classf_1 acc: 0.6724384845723212
classf_2 acc: 0.7474287202187215
classf_2      0     1
classf_1             
0         10074   256
1          1408  3624
Todos os termos maiores que 25
statistic=796.154, p-value=0.000
Different proportions of errors (reject H0)


### Francês

In [94]:
test_mcnemar(20, ca_list_11, ca_list_12)

classf_1 acc: 0.5820118484830351
classf_2 acc: 0.6435282149482378
classf_2     0     1
classf_1            
0         9705    21
1         1049  5936
Algum termo menor que 25
statistic=21.000, p-value=0.000
Different proportions of errors (reject H0)


In [95]:
test_mcnemar(20, ca_list_11, ca_list_32)

classf_1 acc: 0.5820118484830351
classf_2 acc: 0.7244330081981928
classf_2     0     1
classf_1            
0         9715    11
1         2391  4594
Algum termo menor que 25
statistic=11.000, p-value=0.000
Different proportions of errors (reject H0)


In [96]:
test_mcnemar(20, ca_list_12, ca_list_32)

classf_1 acc: 0.6435282149482378
classf_2 acc: 0.7244330081981928
classf_2      0     1
classf_1             
0         10467   287
1          1639  4318
Todos os termos maiores que 25
statistic=947.664, p-value=0.000
Different proportions of errors (reject H0)


### Português

In [137]:
test_mcnemar(20, ca_list_11, ca_list_12)

classf_1 acc: 0.5672690763052208
classf_2 acc: 0.6373185047883843
classf_2     0     1
classf_1            
0         7328    17
1          924  4679
Algum termo menor que 25
statistic=17.000, p-value=0.000
Different proportions of errors (reject H0)


In [138]:
test_mcnemar(20, ca_list_11, ca_list_32)

classf_1 acc: 0.5672690763052208
classf_2 acc: 0.6713005869632376
classf_2     0     1
classf_1            
0         7338     7
1         1354  4249
Algum termo menor que 25
statistic=7.000, p-value=0.000
Different proportions of errors (reject H0)


In [139]:
test_mcnemar(20, ca_list_12, ca_list_32)

classf_1 acc: 0.6373185047883843
classf_2 acc: 0.6713005869632376
classf_2     0     1
classf_1            
0         7919   333
1          773  3923
Todos os termos maiores que 25
statistic=174.250, p-value=0.000
Different proportions of errors (reject H0)


### Italiano

In [188]:
test_mcnemar(20, ca_list_11, ca_list_12)

classf_1 acc: 0.4990723562152134
classf_2 acc: 0.5252690166975882
classf_2     0     1
classf_1            
0         6710    15
1          368  6382
Algum termo menor que 25
statistic=15.000, p-value=0.000
Different proportions of errors (reject H0)


In [189]:
test_mcnemar(20, ca_list_11, ca_list_32)

classf_1 acc: 0.4990723562152134
classf_2 acc: 0.5539146567717996
classf_2     0     1
classf_1            
0         6720     5
1          744  6006
Algum termo menor que 25
statistic=5.000, p-value=0.000
Different proportions of errors (reject H0)


In [190]:
test_mcnemar(20, ca_list_12, ca_list_32)

classf_1 acc: 0.5252690166975882
classf_2 acc: 0.5539146567717996
classf_2     0     1
classf_1            
0         6993    85
1          471  5926
Todos os termos maiores que 25
statistic=266.592, p-value=0.000
Different proportions of errors (reject H0)


### Alemão

In [232]:
test_mcnemar(20, ca_list_11, ca_list_12)

classf_1 acc: 0.5395596419066053
classf_2 acc: 0.6304742317928865
classf_2     0     1
classf_1            
0         8886    34
1         1537  6075
Todos os termos maiores que 25
statistic=1436.031, p-value=0.000
Different proportions of errors (reject H0)


In [233]:
test_mcnemar(20, ca_list_11, ca_list_32)

classf_1 acc: 0.5395596419066053
classf_2 acc: 0.6846721509799177
classf_2     0     1
classf_1            
0         8906    14
1         2413  5199
Algum termo menor que 25
statistic=14.000, p-value=0.000
Different proportions of errors (reject H0)


In [234]:
test_mcnemar(20, ca_list_12, ca_list_32)

classf_1 acc: 0.6304742317928865
classf_2 acc: 0.6846721509799177
classf_2     0     1
classf_1            
0         9991   432
1         1328  4781
Todos os termos maiores que 25
statistic=455.128, p-value=0.000
Different proportions of errors (reject H0)


### Holândes

In [276]:
test_mcnemar(20, ca_list_11, ca_list_12)

classf_1 acc: 0.5322091950994599
classf_2 acc: 0.5893162956132262
classf_2     0     1
classf_1            
0         8067    13
1          880  6222
Algum termo menor que 25
statistic=13.000, p-value=0.000
Different proportions of errors (reject H0)


In [277]:
test_mcnemar(20, ca_list_11, ca_list_32)

classf_1 acc: 0.5322091950994599
classf_2 acc: 0.5937952838888156
classf_2     0     1
classf_1            
0         8074     6
1          941  6161
Algum termo menor que 25
statistic=6.000, p-value=0.000
Different proportions of errors (reject H0)


In [278]:
test_mcnemar(20, ca_list_12, ca_list_32)

classf_1 acc: 0.5893162956132262
classf_2 acc: 0.5937952838888156
classf_2     0     1
classf_1            
0         8382   565
1          633  5602
Todos os termos maiores que 25
statistic=3.747, p-value=0.053
Same proportions of errors (fail to reject H0)
