In [1]:
import os
import pandas as pd
import numpy as np
import re
import itertools
from datetime import datetime
import traceback
import warnings

#model evaluation
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import f_classif
from sklearn.model_selection import GridSearchCV

import random as rand

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import normalize, Normalizer, MaxAbsScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.dummy import DummyClassifier

In [2]:
import platform
import scipy
import sklearn
# Record the execution environment so the results below can be tied to the
# exact library versions that produced them.
print(platform.platform())
for _label, _module in (("NumPy", np), ("SciPy", scipy), ("Scikit-Learn", sklearn)):
    print(_label, _module.__version__)

Linux-4.4.0-1112-aws-x86_64-with-debian-stretch-sid
NumPy 1.14.3
SciPy 1.1.0
Scikit-Learn 0.19.1


In [3]:
from sklearn.base import BaseEstimator
from scipy.sparse import issparse

class DenseTransformer(BaseEstimator):
    """Pipeline step that turns a sparse feature matrix into a dense array.

    Parameters
    ----------
    return_copy : bool, default True
        Only relevant for inputs that are not sparse: when True ``transform``
        returns ``X.copy()``, otherwise it returns ``X`` itself.

    Attributes
    ----------
    is_fitted : bool
        False after construction, True once ``fit`` (or ``fit_transform``)
        has been called.  The transformer learns nothing from the data.
    """

    def __init__(self, return_copy=True):
        self.return_copy = return_copy
        self.is_fitted = False

    def transform(self, X, y=None):
        """Return ``X`` as a dense array (sparse input) or a copy/alias of ``X``."""
        if issparse(X):
            return X.toarray()
        if self.return_copy:
            return X.copy()
        return X

    def fit(self, X, y=None):
        """Mark the transformer as fitted; nothing is learned from ``X``."""
        self.is_fitted = True
        return self

    def fit_transform(self, X, y=None):
        """Fit, then transform ``X``."""
        # Route through fit() so is_fitted is updated even when a caller uses
        # fit_transform() instead of fit() followed by transform().
        return self.fit(X, y).transform(X=X, y=y)

In [4]:
from sklearn.base import BaseEstimator
from scipy.sparse import issparse

class ObfuscationTransformer(BaseEstimator):
    """Pipeline step that rewrites every text with a regular-expression substitution.

    Parameters
    ----------
    re_from : str
        Pattern passed to ``re`` as the search expression.
    re_to : str
        Replacement string (may contain group references such as ``\\1``).
    return_copy : bool, default True
        Stored for estimator-parameter introspection only; ``transform``
        always works on a copy of its input.
    """

    def __init__(self,re_from=r'(\b)(\w{0,2})\w+(\w{1,3})(\b)', re_to=r'\1\2XX\3\4', return_copy=True):
        self.re_from = re_from
        self.re_to = re_to
        # Every constructor argument must be kept as an attribute of the same
        # name, otherwise get_params()/clone() cannot read it back.
        self.return_copy = return_copy

    def transform(self, X, y=None):
        """Return a new object array holding the substituted version of each text in ``X``."""
        # dtype=object: a plain np.array() of Python strings becomes a
        # fixed-width unicode array, and any replacement longer than the
        # longest input string would be silently truncated on assignment.
        texts = np.array(X, dtype=object)
        pattern = re.compile(self.re_from)
        for i in range(len(texts)):
            texts[i] = pattern.sub(self.re_to, texts[i])

        return texts

    def fit(self, X, y=None):
        """No-op: the substitution has no fitted state."""
        return self

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform`` because ``fit`` learns nothing."""
        return self.transform(X=X, y=y)

In [5]:
import warnings
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import LabelEncoder

def eval_measures(gt, pred):
    """Score predictions against the ground truth.

    Parameters
    ----------
    gt : iterable of labels
        Ground-truth labels; the set of labels present here defines the
        classes over which the macro averages are taken.
    pred : iterable of labels
        Predicted labels, aligned with ``gt``.

    Returns
    -------
    tuple
        ``(f1, precision, recall, accuracy)`` where the first three are
        macro-averaged.  All warnings raised while scoring are suppressed.
    """
    macro_scorers = (f1_score, precision_score, recall_score)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        f1, precision, recall = [
            scorer(gt, pred, labels=list(set(gt)), average='macro')
            for scorer in macro_scorers
        ]
        accuracy = accuracy_score(gt, pred)

    return f1,precision,recall,accuracy

In [6]:
def _text_pipeline(analyzer, ngram_range, lowercase, leading_steps=()):
    """Build one text pipeline: optional leading steps, then TF-IDF, densify,
    max-abs scaling, PCA keeping 99.9% of the variance, and a multinomial
    logistic regression."""
    shared_steps = [
        ('vect', TfidfVectorizer(
            analyzer=analyzer,
            min_df=0.05,
            max_df=1.0,
            ngram_range=ngram_range,
            lowercase=lowercase,
            norm='l2',
            sublinear_tf=True)),
        ('dense', DenseTransformer()),
        ('scaler', MaxAbsScaler()),
        ('transf', PCA(0.999)),
        ('clf', LogisticRegression(random_state=0, multi_class='multinomial', solver='newton-cg')),
    ]
    return Pipeline(list(leading_steps) + shared_steps)


# Character 2- to 5-grams on the text as given.
pipelineCharacter = _text_pipeline('char', (2, 5), False)

# Same character n-grams, but every word character is first replaced by 'x'.
pipelineObfuscator = _text_pipeline(
    'char', (2, 5), False,
    leading_steps=[('obs', ObfuscationTransformer(re_from=r'\w', re_to='x'))])

# Word 1- to 3-grams, lower-cased by the vectorizer.
pipelineWord = _text_pipeline('word', (1, 3), True)

In [7]:
# All relative paths used below ('input', 'output') are resolved against this directory.
experiment_dir = '/home/ubuntu/lab/Experimentos/experimento_3/Ajustes_emails_23_01_2020'
os.chdir(experiment_dir)

# Corpus spreadsheet; only the 'read_pandas' sheet is loaded.
input_file = os.path.join('input', 'BRmoral-510-sept2019_params_prof.xlsx')
df = pd.read_excel(input_file, sheet_name='read_pandas')

In [8]:
# Rename the 'it' column to 'itf', the name the rest of the notebook uses for this attribute.
df = df.rename(columns={'it': 'itf'})

In [9]:
# Lower-case every text in place. NOTE(review): the character-level pipelines
# are built with lowercase=False, but they only ever see this lower-cased text.
df['text'] = df['text'].str.lower()

In [10]:
# Split every document on '.' into sentence-like units and pair each unit
# with the id of the author who wrote it.  The per-document frames are
# collected in a list and concatenated once: growing a DataFrame with
# DataFrame.append inside the loop copies all rows on every iteration and
# the method no longer exists in pandas 2.x.
# NOTE(review): splitting on '.' also yields empty / whitespace-only units
# (e.g. after a trailing full stop); they are kept, as before.
sentence_frames = []

for i in range(0, len(df)):
    text_list = df['text'].iloc[i].split('.')
    author_list = [df['id_author'].iloc[i]]*len(text_list)
    # dtype=object mirrors the previous construction, whose columns were
    # object-typed (id_author is cast to int later on).
    df_inter = pd.DataFrame({'text': text_list, 'id_author': author_list},
                            columns=['text', 'id_author'], dtype=object)
    sentence_frames.append(df_inter)

# Each per-document index (0..k-1) is kept, as it was with append().
if sentence_frames:
    df_final = pd.concat(sentence_frames)
else:
    df_final = pd.DataFrame(columns=['text', 'id_author'])

In [11]:
# Copy of the unit table with the whitespace-token count of every unit.
df_final_statistics = df_final.copy()
df_final_statistics['number_of_words'] = [len(unit.split()) for unit in df_final['text']]

In [12]:
# Corpus size summary: number of units, number of words, mean words per unit.
total_units = len(df_final_statistics)
total_words = sum(df_final_statistics['number_of_words'])

print("Total de unidades: " + str(total_units))
print("Total de palavras: " + str(total_words))
print("Total de palavras/unidade: " + str(total_words/total_units))

Total de unidades: 10236
Total de palavras: 218218
Total de palavras/unidade: 21.318679171551388


In [13]:
## select the authors with the most text units
limit_authors_aa_selection = 20
units_per_author = df_final['id_author'].value_counts()  # sorted by count, largest first
list_authors_aa_selection = list(units_per_author.index[:limit_authors_aa_selection])

In [14]:
# Split the corpus: df_aa holds the selected authors, df_ca holds everyone else.
is_selected_author = df['id_author'].isin(list_authors_aa_selection)
df_aa = df[is_selected_author]
df_ca = df[~is_selected_author]

In [15]:
# Pipeline whose number of selected features (k) is tuned by grid search:
# TF-IDF -> ANOVA F-test feature selection -> logistic regression.
kbest_steps = [
    ('vect', TfidfVectorizer()),
    ('k_best', SelectKBest(f_classif)),
    ('clf', LogisticRegression(random_state=42, n_jobs=6)),
]
pipeline_kbest = Pipeline(kbest_steps)

# Candidate k values: 3000, 4000, ..., 10000.
parameters_kbest = {'k_best__k': range(3000, 11000, 1000)}

# 10-fold cross-validated search scored by macro F1.
grid_search_resp = GridSearchCV(estimator=pipeline_kbest,
                                param_grid=parameters_kbest,
                                scoring='f1_macro',
                                cv=10,
                                n_jobs=4,
                                verbose=10)

In [16]:
# Names of the author-characteristic (CA) columns used as prediction targets.
ca_list = ['gender', 'itf', 'ap_age', 'ap_school', 'ap_religion', 'ap_politics']

In [17]:
# Grid search over k for each characteristic, left commented out; its
# result is hard-coded in k_best_dict below.
#k_best_dict = {}

#for ca in ca_list:
   
    #warnings.filterwarnings("ignore")
#    grid_search_resp.fit(df_ca['text'], df_ca[ca])
#    print('Finalizado, tarefa: ' + str(ca))

#    k_best_dict.update({ca: grid_search_resp.best_estimator_.get_params()['k_best__k']})
#print(k_best_dict)

#### The result of the loop above is:
k_best_dict = {'gender': 5000, 
               'itf': 3000, 
               'ap_age': 9000, 
               'ap_school': 10000, 
               'ap_religion': 7000, 
               'ap_politics': 9000}

In [18]:
# Shared TF-IDF vectorizer and class-balanced logistic regression, reused for
# every author characteristic.
vect = TfidfVectorizer()
clf = LogisticRegression(class_weight='balanced')
# Alternative k values, kept commented out (presumably from an earlier
# search -- TODO confirm); they are not used.
#k_best_dict = {'gender': 1800,
#               'itf': 7800,
#               'ap_age': 9800, #9800
#               'ap_school': 9800, #9800
#               'ap_religion': 7800,
#               'ap_politics': 2300}

In [19]:
# Fit the TF-IDF vocabulary on the CA authors only, then project the AA
# authors' texts onto that same vocabulary.
text_vectorized_train = vect.fit_transform(df_ca['text'])
text_vectorized_test = vect.transform(df_aa['text'])

In [20]:
# NOTE(review): df_test_ca is reassigned from scratch at the top of the cell
# that trains the attribute classifiers, before anything reads it, so this
# cell looks redundant.
df_test_ca = pd.DataFrame()
df_test_ca['id_author'] = df_aa['id_author']

In [21]:
# Sanity check: (rows, columns) of the CA training partition.
df_ca.shape

(490, 13)

In [22]:
# For every author characteristic: select the k best TF-IDF features on the
# CA authors, fit the shared classifier, then store the hard prediction and
# the per-class probabilities for each AA author in df_test_ca.
df_test_ca = pd.DataFrame({'id_author': list(df_aa['id_author'])})

# NOTE(review): this header lists accuracy first, while the rows printed in
# the loop are ordered f1, precision, recall, accuracy.
print('ca;accuracy;precision_macro;recall_macro;f1_macro')

for ca_var, k_best in k_best_dict.items():

    # Feature selection is fitted on the CA (training) partition only.
    sel = SelectKBest(k = k_best)
    ft = sel.fit(text_vectorized_train, df_ca[ca_var])
    train_best = ft.transform(text_vectorized_train)
    test_best = ft.transform(text_vectorized_test)

    clf.fit(train_best, df_ca[ca_var])

    predicted = clf.predict(test_best)             # hard class label
    predicted_prob = clf.predict_proba(test_best)  # one probability column per class

    f1, precision, recall, accuracy = eval_measures(df_aa[ca_var], predicted)

    print(pd.DataFrame(data=[[ca_var, f1, precision, recall, accuracy]], 
             columns=['ca_var', 'f1', 'precision', 'recall', 'accuracy']))

    df_test_ca[ca_var + '_predict'] = predicted

    # Probability columns are named '<ca_var>_<column position>'.
    df_temp = pd.DataFrame(predicted_prob).add_prefix(ca_var + '_')
    df_test_ca = df_test_ca.join(df_temp)

ca;accuracy;precision_macro;recall_macro;f1_macro
   ca_var        f1  precision    recall  accuracy
0  gender  0.866667   0.866667  0.866667       0.9
  ca_var        f1  precision    recall  accuracy
0    itf  0.784946     0.8125  0.766667      0.85
   ca_var        f1  precision    recall  accuracy
0  ap_age  0.373333   0.520833  0.416667      0.55
      ca_var        f1  precision   recall  accuracy
0  ap_school  0.421818     0.4375  0.47619       0.6
        ca_var        f1  precision    recall  accuracy
0  ap_religion  0.542328   0.627778  0.559259      0.55
        ca_var        f1  precision  recall  accuracy
0  ap_politics  0.324675   0.309524   0.375      0.45


In [23]:
# Baseline: draw each AA author's label at random according to the class
# distribution observed on the CA authors.  The id_author feature is only a
# placeholder; a dummy classifier ignores its inputs.
# random_state is fixed so the '_dummy_stratified' columns are reproducible
# across kernel restarts (the draw was unseeded before).
clf_dummy = DummyClassifier(strategy='stratified', random_state=42) #most_frequent, #stratified
for ca_var in k_best_dict:
    clf_dummy.fit(df_ca['id_author'].values.reshape(-1,1), df_ca[ca_var].values)
    predicted = clf_dummy.predict(df_aa['id_author'].values.reshape(-1,1))
    df_test_ca[ca_var + '_dummy_stratified'] = predicted

In [24]:
# Baseline: give every AA author the most frequent class seen on the CA authors.
clf_dummy = DummyClassifier(strategy='most_frequent')

# Placeholder feature matrices; the dummy classifier ignores their values.
dummy_features_train = df_ca['id_author'].values.reshape(-1, 1)
dummy_features_test = df_aa['id_author'].values.reshape(-1, 1)

for ca_var in k_best_dict:
    predicted = clf_dummy.fit(dummy_features_train, df_ca[ca_var].values).predict(dummy_features_test)
    df_test_ca[ca_var + '_dummy_most_frequent'] = predicted

In [25]:
# Work on a copy so that adding the true labels does not alter df_test_ca.
df_test_ca_baseline = df_test_ca.copy() 

In [26]:
# Attach the true attribute labels of each AA author next to the predictions.
true_attribute_columns = ['id_author', 'gender', 'itf', 'ap_age', 'ap_school', 'ap_religion',
                          'ap_politics']
df_test_ca_baseline = df_test_ca_baseline.merge(df_aa[true_attribute_columns],
                                                on='id_author', how='left')

In [27]:
# CA baseline: score the most-frequent-class predictions against the true labels.
for ca_var in k_best_dict:

    baseline_column = ca_var + '_dummy_most_frequent'
    f1, precision, recall, accuracy = eval_measures(df_test_ca_baseline[ca_var],
                                                    df_test_ca_baseline[baseline_column])

    summary_row = [ca_var, f1, precision, recall, accuracy]
    print(pd.DataFrame(data=[summary_row], 
             columns=['ca_var', 'f1', 'precision', 'recall', 'accuracy']))

   ca_var        f1  precision  recall  accuracy
0  gender  0.428571      0.375     0.5      0.75
  ca_var        f1  precision  recall  accuracy
0    itf  0.428571      0.375     0.5      0.75
   ca_var        f1  precision    recall  accuracy
0  ap_age  0.086957       0.05  0.333333      0.15
      ca_var        f1  precision    recall  accuracy
0  ap_school  0.206897       0.15  0.333333      0.45
        ca_var        f1  precision    recall  accuracy
0  ap_religion  0.153846        0.1  0.333333       0.3
        ca_var        f1  precision    recall  accuracy
0  ap_politics  0.190476   0.133333  0.333333       0.4


In [28]:
# Text units of the selected (AA) authors.  The explicit .copy() makes
# df_final_aa an independent frame, so the column assignment below no longer
# raises SettingWithCopyWarning.
df_final_aa = df_final[df_final['id_author'].isin(list_authors_aa_selection)].copy()
# id_author is object-typed in df_final; cast it so it can be merged on.
df_final_aa['id_author'] = df_final_aa['id_author'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [29]:
# Attach to every text unit (1) the predicted / baseline attribute columns of
# its author and (2) the author's true attribute labels.
# NOTE(review): running this cell twice merges the same columns again and
# produces _x/_y suffixed duplicates; re-run the cell that builds df_final_aa first.
df_final_aa = df_final_aa.merge(df_test_ca, how='left', on='id_author')

true_attributes = df[['id_author', 'gender','itf','ap_age','ap_school','ap_religion','ap_politics']]
df_final_aa = df_final_aa.merge(true_attributes, how='left', on='id_author')

In [48]:
# One row per AA author (the first text unit of each) with the true attribute labels.
df_final_aa.drop_duplicates(subset=['id_author'])[['id_author', 'gender', 'itf', 'ap_age', 'ap_school', 'ap_religion', 'ap_politics']]

Unnamed: 0,id_author,gender,itf,ap_age,ap_school,ap_religion,ap_politics
0,1079,1,1,2,2,0,0
50,6018,1,1,0,0,1,1
122,6027,1,1,0,0,0,0
170,6033,1,1,0,0,0,1
216,41,1,1,2,2,0,1
267,6096,1,1,1,2,1,1
311,6102,1,1,1,0,1,0
364,6170,1,0,0,0,2,0
413,2226,1,1,2,2,2,0
459,6285,0,0,2,2,2,1


In [30]:
# Sanity check: (rows, columns) of the merged unit-level frame.
df_final_aa.shape

(1009, 42)

In [31]:
# Final classifier used by run_simulation: it is fitted on the text pipelines'
# stacked author probabilities plus the selected CA columns.
clf_final = LogisticRegression(random_state=42, multi_class='multinomial', solver='newton-cg')
# NOTE(review): `scoring` is not referenced anywhere else in the visible notebook.
scoring = ['accuracy','precision_macro', 'recall_macro', 'f1_macro']

In [32]:
# (attribute name, number of probability columns '<name>_0' .. '<name>_{n-1}')
_CA_CLASS_COUNTS = (('gender', 2), ('itf', 2), ('ap_age', 3), ('ap_school', 3),
                    ('ap_religion', 3), ('ap_politics', 3))

# Every CA column run_simulation can add to the stacked feature matrix, in
# the order the columns are appended: class probabilities, true labels,
# most-frequent baseline, stratified baseline, predicted labels.
_CA_FEATURE_COLUMNS = (
    [name + '_' + str(k) for name, n_classes in _CA_CLASS_COUNTS for k in range(n_classes)]
    + [name for name, _ in _CA_CLASS_COUNTS]
    + [name + '_dummy_most_frequent' for name, _ in _CA_CLASS_COUNTS]
    + [name + '_dummy_stratified' for name, _ in _CA_CLASS_COUNTS]
    + [name + '_predict' for name, _ in _CA_CLASS_COUNTS]
)


def _build_mix_features(pipelines, frame):
    """Stack each pipeline's author probabilities for frame['text'] and append
    the known CA columns that are present in ``frame``."""
    mix = pd.DataFrame(data=np.hstack([p.predict_proba(frame['text']) for p in pipelines]))
    for column in _CA_FEATURE_COLUMNS:
        if column in frame.columns:
            mix[column] = frame[column].values
    return mix


def run_simulation(df_train, author, simulation, ca_list, author_list):
    """Run one authorship-attribution experiment and return its metrics.

    The rows of ``df_train`` are split 70/30 with a randomly drawn seed.  The
    three module-level text pipelines are fitted on the training texts; their
    predicted author probabilities, together with the CA columns named in
    ``ca_list``, form the features of the module-level ``clf_final``, which
    is then scored on the held-out rows.

    Parameters
    ----------
    df_train : DataFrame
        Must contain 'text', 'id_author' and every column in ``ca_list``.
    author : value stored in the 'autors_selected' column of the result
        (callers pass the number of authors in the subset).
    simulation : value stored in the 'simulation' column of the result.
    ca_list : list of str
        CA columns to add to the stacked features.  Only names listed in
        ``_CA_FEATURE_COLUMNS`` are used; an empty list gives the text-only
        baseline.
    author_list : list
        Author ids of the subset; stored as a string in the result.

    Returns
    -------
    DataFrame
        One row with columns 'autors_selected', 'f1', 'precision', 'recall',
        'accuracy', 'simulation', 'ca_list', 'author_list'.  If anything
        fails, the traceback is printed and a row whose first five values
        are 0.0 is returned instead, so a long batch of simulations keeps
        running.
    """
    result_columns = ['autors_selected', 'f1', 'precision', 'recall', 'accuracy',
                      'simulation', 'ca_list', 'author_list']
    try:
        x_train, x_test, y_train, y_test = train_test_split(df_train[['text'] + ca_list],
                                                            df_train['id_author'],
                                                            test_size=0.3,
                                                            random_state=rand.randint(0,900))

        # The pipelines and clf_final are shared module-level objects and are
        # re-fitted on every call.
        pipelines = [pipelineCharacter, pipelineObfuscator, pipelineWord]
        for p in pipelines:
            p.fit(x_train['text'], y_train)

        ### train
        df_train_mix = _build_mix_features(pipelines, x_train)
        clf_final.fit(df_train_mix, y_train)

        #test
        df_test_mix = _build_mix_features(pipelines, x_test)
        test_pred = clf_final.predict(df_test_mix)

        f1, precision, recall, accuracy = eval_measures(y_test, test_pred)

        return pd.DataFrame(data=[[author, f1, precision, recall, accuracy, simulation, str(ca_list), str(author_list)]],
                            columns=result_columns)
    except Exception:
        # print_exc() writes the traceback itself and returns None; wrapping
        # it in print() only added a stray "None" line to the output.
        traceback.print_exc()
        return pd.DataFrame(data=[[0.0, 0.0, 0.0, 0.0, 0.0, simulation, str(ca_list), str(author_list)]],
                            columns=result_columns)

In [33]:
# Feature sets passed to run_simulation / test_mcnemar as the CA column list.
# Empty list: no CA column is added (text-only baseline).
ca_list_11 = []

# single-attribute feature sets
# True attribute labels.
ca_list_02 = ['gender']
ca_list_03 = ['itf']
ca_list_04 = ['ap_age']
ca_list_05 = ['ap_school']
ca_list_06 = ['ap_religion']
ca_list_07 = ['ap_politics']

# Hard labels predicted by the attribute classifiers ('<attribute>_predict' columns).
ca_list_12 = ['gender_predict']
ca_list_13 = ['itf_predict']
ca_list_14 = ['ap_age_predict']
ca_list_15 = ['ap_school_predict']
ca_list_16 = ['ap_religion_predict']
ca_list_17 = ['ap_politics_predict']

# Labels from the stratified dummy classifier.
ca_list_221 = ['gender_dummy_stratified']
ca_list_231 = ['itf_dummy_stratified']
ca_list_241 = ['ap_age_dummy_stratified']
ca_list_251 = ['ap_school_dummy_stratified']
ca_list_261 = ['ap_religion_dummy_stratified']
ca_list_271 = ['ap_politics_dummy_stratified']

# Labels from the most-frequent dummy classifier.
ca_list_222 = ['gender_dummy_most_frequent']
ca_list_232 = ['itf_dummy_most_frequent']
ca_list_242 = ['ap_age_dummy_most_frequent']
ca_list_252 = ['ap_school_dummy_most_frequent']
ca_list_262 = ['ap_religion_dummy_most_frequent']
ca_list_272 = ['ap_politics_dummy_most_frequent']

# Per-class probabilities from the attribute classifiers ('<attribute>_<column position>').
ca_list_32 = ['gender_0', 'gender_1']
ca_list_33 = ['itf_0', 'itf_1']
ca_list_34 = ['ap_age_0', 'ap_age_1', 'ap_age_2']
ca_list_35 = ['ap_school_0', 'ap_school_1', 'ap_school_2']
ca_list_36 = ['ap_religion_0', 'ap_religion_1', 'ap_religion_2']
ca_list_37 = ['ap_politics_0', 'ap_politics_1', 'ap_politics_2']

In [29]:
#df_filter = df_final_aa[df_final_aa['id_author'].isin(author_list_filter)]
#rand.shuffle(list_authors_aa_selection)
#author_list_filter = list_authors_aa_selection[:2]

#run_simulation(df_filter, 1, 1, ca_list_36, author_list_filter)

In [None]:
# Feature sets evaluated for every author subset, in execution order.
# The true-label (ca_list_02..07) and stratified-dummy (ca_list_221..271)
# variants are not part of this run.
simulation_feature_sets = [
    ca_list_11,                                                               # baseline
    ca_list_12, ca_list_13, ca_list_14, ca_list_15, ca_list_16, ca_list_17,   # predict
    ca_list_222, ca_list_232, ca_list_242, ca_list_252, ca_list_262, ca_list_272,  # most_frequent
    ca_list_32, ca_list_33, ca_list_34, ca_list_35, ca_list_36, ca_list_37,   # prob
]

# One-row result frames are collected in a list and concatenated once per
# repetition; DataFrame.append in the inner loop re-copied every previous
# row on each call and does not exist in pandas 2.x.
metric_frames = []
df_metrics = pd.DataFrame()

for i in range(0, 20):
    # Each repetition uses a new random ordering of the authors
    # (the shared list is shuffled in place).
    rand.shuffle(list_authors_aa_selection)

    now = datetime.now()

    print("Executing simulation number: " + str(i) + " data: " + str(now.strftime("%d/%m/%Y %H:%M:%S")))

    # Growing author subsets: the first 2, 4, ..., limit_authors_aa_selection authors.
    for j in range(2, limit_authors_aa_selection + 2, 2):

        author_list_filter = list_authors_aa_selection[:j]

        df_filter = df_final_aa[df_final_aa['id_author'].isin(author_list_filter)]

        for ca_feature_set in simulation_feature_sets:
            metric_frames.append(run_simulation(df_filter, j, i, ca_feature_set, author_list_filter))

    # Checkpoint: write all metrics gathered so far after every repetition.
    df_metrics = pd.concat(metric_frames)
    df_metrics.to_excel(os.path.join('output', 'kbest_proprio_' + str(now.strftime("%d_%m_%Y__%H_%M_%S")) + ".xlsx"))


Executing simulation number: 0 data: 29/04/2020 15:03:42
Executing simulation number: 1 data: 29/04/2020 15:06:26
Executing simulation number: 2 data: 29/04/2020 15:09:27
Executing simulation number: 3 data: 29/04/2020 15:14:44
Executing simulation number: 4 data: 29/04/2020 15:30:19


### Teste de McNemar

In [34]:
from statsmodels.stats.contingency_tables import mcnemar

In [43]:
def _stacked_test_predictions(df_filter, ca_columns):
    """Fit the three text pipelines and a fresh final classifier on a fixed
    70/30 split (random_state=42) of ``df_filter``; return the test-set
    predictions together with the true test labels."""
    clf_stack = LogisticRegression(random_state=42, multi_class='multinomial', solver='newton-cg')

    x_train, x_test, y_train, y_test = train_test_split(df_filter[['text'] + ca_columns],
                                                        df_filter['id_author'],
                                                        test_size=0.3, random_state=42)

    pipelines = [pipelineCharacter, pipelineObfuscator, pipelineWord]
    for p in pipelines:
        p.fit(x_train['text'], y_train)

    def mix_features(frame):
        # Stacked author probabilities of the pipelines plus the CA columns.
        mix = pd.DataFrame(data=np.hstack([p.predict_proba(frame['text']) for p in pipelines]))
        for ca in ca_columns:
            mix[ca] = frame[ca].values
        return mix

    clf_stack.fit(mix_features(x_train), y_train)
    return clf_stack.predict(mix_features(x_test)), y_test


def test_mcnemar(j, ca_classf_list_1, ca_classf_list_2):
    """Compare two feature configurations with McNemar's test and print the result.

    Both configurations are trained and evaluated on the same fixed split of
    the text units written by the first ``j`` authors of
    ``list_authors_aa_selection``.

    Parameters
    ----------
    j : int
        Number of authors taken from the front of ``list_authors_aa_selection``.
    ca_classf_list_1, ca_classf_list_2 : list of str
        CA columns added to the stacked features of classifier 1 and 2.

    Prints each classifier's accuracy, the 2x2 table of error indicators
    (0 = correct, 1 = wrong), the test statistic and p-value, and the
    decision at alpha = 0.05.  Returns None.
    """
    author_list_filter = list_authors_aa_selection[:j]

    df_filter = df_final_aa[df_final_aa['id_author'].isin(author_list_filter)]

    # Same random_state in both runs, so both classifiers are scored on the
    # same held-out rows and the predictions can be paired.
    test_pred, _ = _stacked_test_predictions(df_filter, ca_classf_list_1)
    test_pred2, y_test = _stacked_test_predictions(df_filter, ca_classf_list_2)

    df_mcnemar_test = pd.DataFrame()
    df_mcnemar_test['model_pred_classif_1'] = test_pred
    df_mcnemar_test['model_pred_classif_2'] = test_pred2
    df_mcnemar_test['original_label'] = y_test.values

    # Error indicators: 0 when the prediction is correct, 1 when it is wrong.
    df_mcnemar_test['classf_1'] = np.where(df_mcnemar_test['model_pred_classif_1']==df_mcnemar_test['original_label'], 0, 1)
    df_mcnemar_test['classf_2'] = np.where(df_mcnemar_test['model_pred_classif_2']==df_mcnemar_test['original_label'], 0, 1)

    print("classf_1 acc: " + str(1-sum(df_mcnemar_test['classf_1'])/len(df_mcnemar_test)))
    print("classf_2 acc: " + str(1-sum(df_mcnemar_test['classf_2'])/len(df_mcnemar_test)))

    data_crosstab = pd.crosstab(df_mcnemar_test['classf_1'],  
                                df_mcnemar_test['classf_2'], 
                                margins = False) 
    # crosstab drops a row/column when a classifier is always right or always
    # wrong; force the full 2x2 layout so McNemar's test always gets both
    # discordant cells.
    data_crosstab = data_crosstab.reindex(index=[0, 1], columns=[0, 1], fill_value=0)
    print(data_crosstab)

    # Chi-square approximation only when all four cells exceed 25,
    # otherwise the exact binomial test.
    count_contingence_table = np.where(data_crosstab>25, 1, 0).sum()

    if count_contingence_table==4:
        result = mcnemar(data_crosstab, exact=False, correction=True)
        print("Todos os termos maiores que 25")
    else:
        print("Algum termo menor que 25")
        result = mcnemar(data_crosstab, exact=True)

    print('statistic=%.3f, p-value=%.3f' % (result.statistic, result.pvalue))
    # interpret the p-value
    alpha = 0.05
    if result.pvalue > alpha:
        print('Same proportions of errors (fail to reject H0)')
    else:
        print('Different proportions of errors (reject H0)')

In [44]:
# Text-only baseline (ca_list_11 is empty) vs. text + predicted religion label.
test_mcnemar(20, ca_list_11, ca_list_16)

classf_1 acc: 0.2838283828382838
classf_2 acc: 0.4158415841584159
classf_2   0    1
classf_1         
0         85    1
1         41  176
Algum termo menor que 25
statistic=1.000, p-value=0.000
Different proportions of errors (reject H0)


In [45]:
# Text-only baseline (ca_list_11 is empty) vs. text + religion class probabilities.
test_mcnemar(20, ca_list_11, ca_list_36)

classf_1 acc: 0.2838283828382838
classf_2 acc: 0.29702970297029707
classf_2   0    1
classf_1         
0         86    0
1          4  213
Algum termo menor que 25
statistic=0.000, p-value=0.125
Same proportions of errors (fail to reject H0)


In [46]:
# Text + predicted religion label vs. text + religion class probabilities.
test_mcnemar(20, ca_list_16, ca_list_36)

classf_1 acc: 0.4158415841584159
classf_2 acc: 0.29702970297029707
classf_2   0    1
classf_1         
0         89   37
1          1  176
Algum termo menor que 25
statistic=1.000, p-value=0.000
Different proportions of errors (reject H0)
