In [1]:
import os
import pandas as pd
import numpy as np
import re
import itertools
from datetime import datetime
import traceback
import pickle
import warnings

#model valuation
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import f_classif
from sklearn.model_selection import GridSearchCV

import random as rand

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import normalize, Normalizer, MaxAbsScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.dummy import DummyClassifier

from sklearn.datasets import load_files

In [2]:
import platform
import scipy
import sklearn
print(platform.platform())
print("NumPy", np.__version__)
print("SciPy", scipy.__version__)
print("Scikit-Learn", sklearn.__version__)

Linux-4.4.0-1112-aws-x86_64-with-debian-stretch-sid
NumPy 1.14.3
SciPy 1.1.0
Scikit-Learn 0.19.1


In [3]:
from sklearn.base import BaseEstimator
from scipy.sparse import issparse

class DenseTransformer(BaseEstimator):
    def __init__(self, return_copy=True):
        self.return_copy = return_copy
        self.is_fitted = False

    def transform(self, X, y=None):
        if issparse(X):
            return X.toarray()
        elif self.return_copy:
            return X.copy()
        else:
            return X

    def fit(self, X, y=None):
        self.is_fitted = True
        return self

    def fit_transform(self, X, y=None):
        return self.transform(X=X, y=y)

In [4]:
from sklearn.base import BaseEstimator
from scipy.sparse import issparse

class ObfuscationTransformer(BaseEstimator):
    def __init__(self,re_from=r'(\b)(\w{0,2})\w+(\w{1,3})(\b)', re_to=r'\1\2XX\3\4', return_copy=True):
        self.re_from = re_from
        self.re_to = re_to

    def transform(self, X, y=None):
        X = np.array(X).copy();
        for i in range(len(X)):
            X[i] = re.sub(self.re_from,self.re_to, X[i])
        
        return X;

    def fit(self, X, y=None):
        return self

    def fit_transform(self, X, y=None):
        return self.transform(X=X, y=y)

In [5]:
import warnings
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import LabelEncoder

def eval_measures(gt, pred):
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        f1 = f1_score(gt,
                  pred,
                  labels=list(set(gt)),
                  average='macro')
        precision = precision_score(gt,
                  pred,
                  labels=list(set(gt)),
                  average='macro')
        recall = recall_score(gt,
                  pred,
                  labels=list(set(gt)),
                  average='macro')
        accuracy = accuracy_score(gt,
                  pred)

    return f1,precision,recall,accuracy

In [6]:
pipelineCharacter = Pipeline([
    ('vect',   TfidfVectorizer(
            analyzer='char',
            min_df=0.05,
            max_df=1.0,
            ngram_range=(2,5),
            lowercase=False,
            norm='l2',
            sublinear_tf=True)),
    ('dense',  DenseTransformer()),
    ('scaler', MaxAbsScaler()),
    ('transf', PCA(0.999)),
    ('clf', LogisticRegression(random_state=0,multi_class='multinomial', solver='newton-cg')),
])

pipelineObfuscator = Pipeline([
        ('obs',ObfuscationTransformer(re_from=r'\w',re_to='x')),
        ('vect',   TfidfVectorizer(
                analyzer='char',
                min_df=0.05,
                max_df=1.0,
                ngram_range=(2,5),
                lowercase=False,
                norm='l2',
                sublinear_tf=True)),
        ('dense',  DenseTransformer()),
        ('scaler', MaxAbsScaler()),
        ('transf', PCA(0.999)),
        ('clf', LogisticRegression(random_state=0,multi_class='multinomial', solver='newton-cg')),
    ])

pipelineWord = Pipeline([
        ('vect',   TfidfVectorizer(
                analyzer='word',
                min_df=0.05,
                max_df=1.0,
                ngram_range=(1,3),
                lowercase=True,
                norm='l2',
                sublinear_tf=True)),
        ('dense',  DenseTransformer()),
        ('scaler', MaxAbsScaler()),
        ('transf', PCA(0.999)),
        ('clf', LogisticRegression(random_state=0, multi_class='multinomial', solver='newton-cg')),
    ]);

In [86]:
os.chdir('/home/ubuntu/lab/Experimentos/experimento_3_corpus_b5_post')
input_summary_file = os.path.join('input', 'subjects_trim.csv')

In [87]:
### le tabel summary
df_summary = pd.read_csv(input_summary_file, sep=',')

In [88]:
### le textos
textsDir = os.path.join('input', 'pt-gender')
texts = load_files(textsDir, 
    description=None, categories=['male', 'female'], 
    load_content=True, encoding='utf-8', shuffle=True, random_state=42)

In [89]:
id_list = []

for i in range(0, len(texts.filenames)):
    id_list.append(texts.filenames[i].split('-')[3])

In [90]:
df = pd.DataFrame(data=texts.data, columns=['text'])
df['id_author'] = id_list
df['id_author'] = df['id_author'].astype(int)

In [91]:
df = df.merge(df_summary, how='left', left_on='id_author', right_on='id')

In [92]:
df = df[df['age']>0]
df = df[df['ti'].notnull()]

In [94]:
#df.describe()

Unnamed: 0,id_author,id,words,age
count,733.0,733.0,733.0,733.0
mean,659.148704,659.148704,845.085948,24.607094
std,339.125259,339.125259,577.299772,6.482937
min,17.0,17.0,10.0,18.0
25%,399.0,399.0,439.0,21.0
50%,692.0,692.0,742.0,23.0
75%,952.0,952.0,1137.0,26.0
max,2226.0,2226.0,4663.0,61.0


In [14]:
df['gender_final'] = np.where(df['gender']=='male', 1, 0)
df['age_final'] = np.where(df['age']<=20, 
                           0, 
                           np.where(df['age']>=28, 
                                    2, 1))
df['it_final'] = np.where(df['ti']=='yes', 1, 0)

In [15]:
del df['words']
del df['id']
del df['age']
del df['ti']
del df['gender']

In [16]:
df_final = pd.DataFrame()

for i in range(0, len(df)):
    text_list = df['text'].iloc[i].split('.')
    author_list = [df['id_author'].iloc[i]]*len(text_list)
    df_inter = pd.DataFrame(data=[text_list, author_list]).T
    df_inter = df_inter.rename(columns={0: 'text', 1: 'id_author'})
    
    df_final = df_final.append(df_inter)

In [17]:
df_final_statistics = df_final.copy()
df_final_statistics['number_of_words'] = df_final['text'].apply(lambda x: len(x.split()))

In [18]:
print("Total de unidades: " + str(len(df_final_statistics)))
print("Total de palavras: " + str(sum(df_final_statistics['number_of_words'])))
print("Total de palavras/unidade: " + str(sum(df_final_statistics['number_of_words'])/len(df_final_statistics)))    

Total de unidades: 128310
Total de palavras: 1951259
Total de palavras/unidade: 15.207380562699711


In [19]:
## selecao de autores com mais textos
limit_authors_aa_selection = 20
list_authors_aa_selection = list(df_final['id_author'].value_counts()[:limit_authors_aa_selection].index)

In [20]:
#Divisao de bases
df_aa = df[df['id_author'].isin(list_authors_aa_selection)]
df_ca = df[~df['id_author'].isin(list_authors_aa_selection)]

In [21]:
pipeline_kbest = Pipeline([
    ('vect', TfidfVectorizer()), 
    ('k_best', SelectKBest(f_classif)),
    ('clf', LogisticRegression(random_state=42, n_jobs=6))
])
    
parameters_kbest = {    
    'k_best__k': (range(3000,20000,1000))
}

grid_search_resp = GridSearchCV(pipeline_kbest,
                               parameters_kbest,
                               cv=10,
                               scoring='f1_macro',
                               n_jobs=6,
                               #verbose=10
                               )

In [22]:
ca_list = ['gender_final', 'it_final', 'age_final']

In [23]:
#k_best_dict = {}

#for ca in ca_list:  
#    warnings.filterwarnings("ignore")
#    grid_search_resp.fit(df_ca['text'], df_ca[ca])
#    print('Finalizado, tarefa: ' + str(ca))

#    k_best_dict.update({ca: grid_search_resp.best_estimator_.get_params()['k_best__k']})
#print(k_best_dict)

k_best_dict = {'gender_final': 13000, 
               'it_final': 6000, 
               'age_final': 16000}

In [24]:
vect = TfidfVectorizer()
clf = LogisticRegression(class_weight='balanced')

### Utilizando o kbest encontrado
#k_best_dict = {'gender_final': 1800,
#               'it_final': 7800,
#               'age_final': 9800}

In [25]:
text_vectorized_train = vect.fit_transform(df_ca['text'])
text_vectorized_test = vect.transform(df_aa['text'])

In [26]:
#df_test_ca = pd.DataFrame()
#df_test_ca['id_author'] = df_aa['id_author']

In [27]:
df_test_ca = pd.DataFrame()
df_test_ca['id_author'] = list(df_aa['id_author'])

print('ca;accuracy;precision_macro;recall_macro;f1_macro')

for ca_var in k_best_dict:

    k_best = k_best_dict[ca_var]
    sel = SelectKBest(k = k_best)
    ft = sel.fit(text_vectorized_train, df_ca[ca_var])
    train_best = ft.transform(text_vectorized_train)

    clf.fit(train_best, df_ca[ca_var])

    test_best = ft.transform(text_vectorized_test)

    predicted = clf.predict(test_best) #com variavel categorica
    predicted_prob = clf.predict_proba(test_best) #com probabilidade de variavel
    
    f1, precision, recall, accuracy = eval_measures(df_aa[ca_var], predicted)

    print(pd.DataFrame(data=[[ca_var, f1, precision, recall, accuracy]], 
             columns=['ca_var', 'f1', 'precision', 'recall', 'accuracy']))

    df_test_ca[ca_var + '_predict'] = predicted
    
    # probs
    df_temp = pd.DataFrame(predicted_prob)
    df_temp = df_temp.add_prefix(ca_var + '_')
    df_test_ca = df_test_ca.join(df_temp)

ca;accuracy;precision_macro;recall_macro;f1_macro
         ca_var        f1  precision    recall  accuracy
0  gender_final  0.607843   0.607843  0.607843       0.8
     ca_var        f1  precision    recall  accuracy
0  it_final  0.722222   0.722222  0.722222       0.9
      ca_var        f1  precision    recall  accuracy
0  age_final  0.423687   0.563492  0.449206      0.45


In [28]:
clf_dummy = DummyClassifier(strategy='stratified') #most_frequent, #stratified
for ca_var in k_best_dict:
    clf_dummy.fit(df_ca['id_author'].values.reshape(-1,1), df_ca[ca_var].values)
    predicted = clf_dummy.predict(df_aa['id_author'].values.reshape(-1,1))
    df_test_ca[ca_var + '_dummy_stratified'] = predicted

In [29]:
clf_dummy = DummyClassifier(strategy='most_frequent') #most_frequent, #stratified
for ca_var in k_best_dict:
    clf_dummy.fit(df_ca['id_author'].values.reshape(-1,1), df_ca[ca_var].values)
    predicted = clf_dummy.predict(df_aa['id_author'].values.reshape(-1,1))
    df_test_ca[ca_var + '_dummy_most_frequent'] = predicted

In [30]:
df_test_ca_baseline = df_test_ca.copy() 
#df_aa[['gender_final', 'age_final', 'it_final']]

In [31]:
df_test_ca_baseline = df_test_ca_baseline.merge(df_aa[['id_author','gender_final', 'age_final', 'it_final']],
                          left_on='id_author', right_on='id_author', how='left')

In [32]:
#baseline CA
print('ca;accuracy;precision_macro;recall_macro;f1_macro')

for ca_var in k_best_dict:
    
    f1, precision, recall, accuracy = eval_measures(df_test_ca_baseline[ca_var], df_test_ca_baseline[ca_var + '_dummy_most_frequent'])

    print(pd.DataFrame(data=[[ca_var, f1, precision, recall, accuracy]], 
             columns=['ca_var', 'f1', 'precision', 'recall', 'accuracy']))


ca;accuracy;precision_macro;recall_macro;f1_macro
         ca_var        f1  precision  recall  accuracy
0  gender_final  0.459459      0.425     0.5      0.85
     ca_var        f1  precision  recall  accuracy
0  it_final  0.473684       0.45     0.5       0.9
      ca_var       f1  precision    recall  accuracy
0  age_final  0.17284   0.116667  0.333333      0.35


In [33]:
df_final_aa = df_final[df_final['id_author'].isin(list_authors_aa_selection)]
df_final_aa['id_author'] = df_final_aa['id_author'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [34]:
df_final_aa = df_final_aa.merge(df_test_ca, how='left', left_on='id_author', right_on='id_author')
df_final_aa = df_final_aa.merge(df[['id_author', 'gender_final','it_final','age_final']], how='left', left_on='id_author', right_on='id_author')

In [35]:
df_final_aa = df_final_aa[df_final_aa['text'].apply(len)>20]

In [36]:
clf_final = LogisticRegression(random_state=42, multi_class='multinomial', solver='newton-cg')
scoring = ['accuracy','precision_macro', 'recall_macro', 'f1_macro']

In [84]:
df_final_aa.drop_duplicates(subset=['id_author'])[['id_author', 'gender_final', 'it_final', 'age_final']]

Unnamed: 0,id_author,gender_final,it_final,age_final
0,1163,0,0,1
616,1119,0,0,1
1354,430,0,0,1
1985,913,0,0,2
2587,1147,0,1,0
3272,946,0,0,0
4062,1166,0,0,2
4651,1165,0,0,1
5352,919,0,0,0
6039,584,0,0,2


In [37]:
#df_final_aa.shape

In [38]:
def run_simulation(df_train, author, simulation, ca_list, author_list):
    ### train
    try:
        x_train, x_test, y_train, y_test = train_test_split(df_train[['text'] + ca_list], 
                                                        df_train['id_author'], 
                                                    test_size=0.3,random_state=rand.randint(0,900))

        pipelines = [pipelineCharacter, pipelineObfuscator, pipelineWord]
        for p in pipelines:
            p.fit(x_train['text'], y_train)

        predict_text_train_aa_mix = np.hstack([p.predict_proba(x_train['text']) for p in pipelines])
        df_train_mix = pd.DataFrame(data=predict_text_train_aa_mix)
        if 'gender_final_0' in x_train.columns:
            df_train_mix['gender_final_0'] = x_train['gender_final_0'].values
        if 'gender_final_1' in x_train.columns:
            df_train_mix['gender_final_1'] = x_train['gender_final_1'].values
        if 'it_final_0' in x_train.columns:
            df_train_mix['it_final_0'] = x_train['it_final_0'].values
        if 'it_final_1' in x_train.columns:
            df_train_mix['it_final_1'] = x_train['it_final_1'].values
        if 'age_final_0' in x_train.columns:
            df_train_mix['age_final_0'] = x_train['age_final_0'].values
        if 'age_final_1' in x_train.columns:
            df_train_mix['age_final_1'] = x_train['age_final_1'].values
        if 'age_final_2' in x_train.columns:
            df_train_mix['age_final_2'] = x_train['age_final_2'].values

        if 'gender_final_predict' in x_train.columns:
            df_train_mix['gender_final_predict'] = x_train['gender_final_predict'].values
        if 'it_final_predict' in x_train.columns:
            df_train_mix['it_final_predict'] = x_train['it_final_predict'].values
        if 'age_final_predict' in x_train.columns:
            df_train_mix['age_final_predict'] = x_train['age_final_predict'].values
            
        if 'gender_final_dummy_stratified' in x_train.columns:
            df_train_mix['gender_final_dummy_stratified'] = x_train['gender_final_dummy_stratified'].values
        if 'it_final_dummy_stratified' in x_train.columns:
            df_train_mix['it_final_dummy_stratified'] = x_train['it_final_dummy_stratified'].values
        if 'age_final_dummy_stratified' in x_train.columns:
            df_train_mix['age_final_dummy_stratified'] = x_train['age_final_dummy_stratified'].values

        if 'gender_final_dummy_most_frequent' in x_train.columns:
            df_train_mix['gender_final_dummy_most_frequent'] = x_train['gender_final_dummy_most_frequent'].values
        if 'it_final_dummy_most_frequent' in x_train.columns:
            df_train_mix['it_final_dummy_most_frequent'] = x_train['it_final_dummy_most_frequent'].values
        if 'age_final_dummy_most_frequent' in x_train.columns:
            df_train_mix['age_final_dummy_most_frequent'] = x_train['age_final_dummy_most_frequent'].values

        if 'gender_final' in x_train.columns:
            df_train_mix['gender_final'] = x_train['gender_final'].values
        if 'it_final' in x_train.columns:
            df_train_mix['it_final'] = x_train['it_final'].values
        if 'age_final' in x_train.columns:
            df_train_mix['age_final'] = x_train['age_final'].values

        clf_final.fit(df_train_mix, y_train)

        #test
        predict_text_test_aa_mix = np.hstack([p.predict_proba(x_test['text']) for p in pipelines])
        df_test_mix = pd.DataFrame(data=predict_text_test_aa_mix)
        if 'gender_final_0' in x_test.columns:
            df_test_mix['gender_final_0'] = x_test['gender_final_0'].values
        if 'gender_final_1' in x_test.columns:
            df_test_mix['gender_final_1'] = x_test['gender_final_1'].values
        if 'it_final_0' in x_test.columns:
            df_test_mix['it_final_0'] = x_test['it_final_0'].values
        if 'it_final_1' in x_test.columns:
            df_test_mix['it_final_1'] = x_test['it_final_1'].values
        if 'age_final_0' in x_test.columns:
            df_test_mix['age_final_0'] = x_test['age_final_0'].values
        if 'age_final_1' in x_test.columns:
            df_test_mix['age_final_1'] = x_test['age_final_1'].values
        if 'age_final_2' in x_test.columns:
            df_test_mix['age_final_2'] = x_test['age_final_2'].values

        if 'gender_final_predict' in x_test.columns:
            df_test_mix['gender_final_predict'] = x_test['gender_final_predict'].values
        if 'it_final_predict' in x_test.columns:
            df_test_mix['it_final_predict'] = x_test['it_final_predict'].values
        if 'age_final_predict' in x_test.columns:
            df_test_mix['age_final_predict'] = x_test['age_final_predict'].values
            
        if 'gender_final_dummy_stratified' in x_test.columns:
            df_test_mix['gender_final_dummy_stratified'] = x_test['gender_final_dummy_stratified'].values
        if 'it_final_dummy_stratified' in x_test.columns:
            df_test_mix['it_final_dummy_stratified'] = x_test['it_final_dummy_stratified'].values
        if 'age_final_dummy_stratified' in x_test.columns:
            df_test_mix['age_final_dummy_stratified'] = x_test['age_final_dummy_stratified'].values

        if 'gender_final_dummy_most_frequent' in x_test.columns:
            df_test_mix['gender_final_dummy_most_frequent'] = x_test['gender_final_dummy_most_frequent'].values
        if 'it_final_dummy_most_frequent' in x_test.columns:
            df_test_mix['it_final_dummy_most_frequent'] = x_test['it_final_dummy_most_frequent'].values
        if 'age_final_dummy_most_frequent' in x_test.columns:
            df_test_mix['age_final_dummy_most_frequent'] = x_test['age_final_dummy_most_frequent'].values

        if 'gender_final' in x_test.columns:
            df_test_mix['gender_final'] = x_test['gender_final'].values
        if 'it_final' in x_test.columns:
            df_test_mix['it_final'] = x_test['it_final'].values
        if 'age_final' in x_test.columns:
            df_test_mix['age_final'] = x_test['age_final'].values
            
        test_pred = clf_final.predict(df_test_mix)

        f1, precision, recall, accuracy = eval_measures(y_test, test_pred)

        return pd.DataFrame(data=[[author, f1, precision, recall, accuracy, simulation, str(ca_list), str(author_list)]], 
                 columns=['autors_selected', 'f1', 'precision', 'recall', 'accuracy', 'simulation', 'ca_list', 'author_list'])
    except Exception as e:
        #print('erro na simulacao')
        print(traceback.print_exc())
        return pd.DataFrame(data=[[0.0, 0.0, 0.0, 0.0, 0.0, simulation, str(ca_list), str(author_list)]], 
                 columns=['autors_selected', 'f1', 'precision', 'recall', 'accuracy', 'simulation', 'ca_list', 'author_list'])

In [39]:
ca_list_11 = []

#individuais
ca_list_02 = ['gender_final']
ca_list_03 = ['it_final']
ca_list_04 = ['age_final']

ca_list_12 = ['gender_final_predict']
ca_list_13 = ['it_final_predict']
ca_list_14 = ['age_final_predict']

ca_list_112 = ['gender_final_predict', 'it_final_predict']
ca_list_113 = ['it_final_predict', 'age_final_predict']
ca_list_114 = ['gender_final_predict', 'age_final_predict']
ca_list_115 = ['gender_final_predict', 'it_final_predict', 'age_final_predict']

ca_list_221 = ['gender_final_dummy_stratified']
ca_list_231 = ['it_final_dummy_stratified']
ca_list_241 = ['age_final_dummy_stratified']

ca_list_222 = ['gender_final_dummy_most_frequent']
ca_list_232 = ['it_final_dummy_most_frequent']
ca_list_242 = ['age_final_dummy_most_frequent']

ca_list_1222 = ['gender_final_dummy_most_frequent', 'it_final_dummy_most_frequent']
ca_list_1232 = ['it_final_dummy_most_frequent', 'age_final_dummy_most_frequent']
ca_list_1242 = ['gender_final_dummy_most_frequent','age_final_dummy_most_frequent']
ca_list_1252 = ['gender_final_dummy_most_frequent', 'it_final_dummy_most_frequent', 'age_final_dummy_most_frequent']

ca_list_32 = ['gender_final_0', 'gender_final_1']
ca_list_33 = ['it_final_0', 'it_final_1']
ca_list_34 = ['age_final_0', 'age_final_1', 'age_final_2']

ca_list_132 = ['gender_final_0', 'gender_final_1', 'it_final_0', 'it_final_1']
ca_list_133 = ['it_final_0', 'it_final_1', 'age_final_0', 'age_final_1', 'age_final_2']
ca_list_134 = ['gender_final_0', 'gender_final_1', 'age_final_0', 'age_final_1', 'age_final_2']
ca_list_135 = ['gender_final_0', 'gender_final_1', 'it_final_0', 'it_final_1', 'age_final_0', 'age_final_1', 'age_final_2']

In [44]:
df_metrics = pd.DataFrame()

for i in range(0, 20):
    rand.shuffle(list_authors_aa_selection)

    now = datetime.now()

    print("Executing simulation number: " + str(i) + " data: " + str(now.strftime("%d/%m/%Y %H:%M:%S")))

    for j in range(2, limit_authors_aa_selection + 2, 2):
        
        author_list_filter = list_authors_aa_selection[:j]

        df_filter = df_final_aa[df_final_aa['id_author'].isin(author_list_filter)]
        
        df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_11, author_list_filter)) 
        
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_02, author_list_filter)) 
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_03, author_list_filter)) 
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_04, author_list_filter))
        
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_12, author_list_filter)) 
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_13, author_list_filter)) 
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_14, author_list_filter))
        
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_221, author_list_filter)) 
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_231, author_list_filter)) 
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_241, author_list_filter)) 

        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_222, author_list_filter)) 
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_232, author_list_filter)) 
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_242, author_list_filter)) 

        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_32, author_list_filter)) 
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_33, author_list_filter)) 
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_34, author_list_filter)) 
        
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_112, author_list_filter)) 
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_113, author_list_filter)) 
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_114, author_list_filter)) 
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_115, author_list_filter)) 

        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_132, author_list_filter)) 
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_133, author_list_filter)) 
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_134, author_list_filter)) 
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_135, author_list_filter)) 

        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_1222, author_list_filter)) 
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_1232, author_list_filter)) 
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_1242, author_list_filter)) 
        #df_metrics = df_metrics.append(run_simulation(df_filter, j, i, ca_list_1252, author_list_filter)) 
        
    df_metrics.to_excel(os.path.join('output', 'kbest_proprio_baseline_apenas_EACH_USP_' + str(now.strftime("%d_%m_%Y__%H_%M_%S")) + ".xlsx"))

        #df_metrics = df_metrics.append(run_simulation(df_train, j, i, ca_list))
        #print(str(authors_shuffle[:j]))


Executing simulation number: 0 data: 20/07/2020 15:42:22
Executing simulation number: 1 data: 20/07/2020 15:43:19
Executing simulation number: 2 data: 20/07/2020 15:44:22


Traceback (most recent call last):
  File "<ipython-input-42-691b63822783>", line 10, in run_simulation
    p.fit(x_train['text'], y_train)
  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py", line 248, in fit
    Xt, fit_params = self._fit(X, y, **fit_params)
  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py", line 213, in _fit
    **fit_params_steps[name])
  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/memory.py", line 362, in __call__
    return self.func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py", line 581, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/decomposition/pca.py", line 348, in fit_transform
    U, S, V = self._fit(X)
  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/decomposition/pca.py", line 392, in _fit
    r

None


Traceback (most recent call last):
  File "<ipython-input-42-691b63822783>", line 10, in run_simulation
    p.fit(x_train['text'], y_train)
  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py", line 248, in fit
    Xt, fit_params = self._fit(X, y, **fit_params)
  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py", line 213, in _fit
    **fit_params_steps[name])
  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/memory.py", line 362, in __call__
    return self.func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py", line 581, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/decomposition/pca.py", line 348, in fit_transform
    U, S, V = self._fit(X)
  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/decomposition/pca.py", line 392, in _fit
    r

None
Executing simulation number: 3 data: 20/07/2020 15:45:20
Executing simulation number: 4 data: 20/07/2020 15:46:20
Executing simulation number: 5 data: 20/07/2020 15:47:21
Executing simulation number: 6 data: 20/07/2020 15:48:16
Executing simulation number: 7 data: 20/07/2020 15:49:19
Executing simulation number: 8 data: 20/07/2020 15:50:19
Executing simulation number: 9 data: 20/07/2020 15:51:22
Executing simulation number: 10 data: 20/07/2020 15:52:26
Executing simulation number: 11 data: 20/07/2020 15:53:27
Executing simulation number: 12 data: 20/07/2020 15:54:28
Executing simulation number: 13 data: 20/07/2020 15:55:25
Executing simulation number: 14 data: 20/07/2020 15:56:27
Executing simulation number: 15 data: 20/07/2020 15:57:32
Executing simulation number: 16 data: 20/07/2020 15:58:29
Executing simulation number: 17 data: 20/07/2020 15:59:31
Executing simulation number: 18 data: 20/07/2020 16:00:33
Executing simulation number: 19 data: 20/07/2020 16:01:33


### Teste McNemar's

In [40]:
from statsmodels.stats.contingency_tables import mcnemar

In [41]:
j=20
ca_classf_list_1 = ca_list_12
ca_classf_list_2 = ca_list_32

In [71]:
def test_mcnemar(j, ca_classf_list_1, ca_classf_list_2):

    clf_final_classif_1 = LogisticRegression(random_state=42, multi_class='multinomial', solver='newton-cg')
    clf_final_classif_2 = LogisticRegression(random_state=42, multi_class='multinomial', solver='newton-cg')

    author_list_filter = list_authors_aa_selection[:j]

    df_filter = df_final_aa[df_final_aa['id_author'].isin(author_list_filter)]

    x_train, x_test, y_train, y_test = train_test_split(df_filter[['text'] + ca_classf_list_1], 
                                                    df_filter['id_author'], 
                                                test_size=0.3,random_state=42)

    pipelines = [pipelineCharacter, pipelineObfuscator, pipelineWord]

    for p in pipelines:
        p.fit(x_train['text'], y_train)

    predict_text_train_aa_mix = np.hstack([p.predict_proba(x_train['text']) for p in pipelines])

    df_train_mix = pd.DataFrame(data=predict_text_train_aa_mix)

    for ca in ca_classf_list_1:
        df_train_mix[ca] = x_train[ca].values

    clf_final_classif_1.fit(df_train_mix, y_train)

    predict_text_test_aa_mix = np.hstack([p.predict_proba(x_test['text']) for p in pipelines])
    df_test_mix = pd.DataFrame(data=predict_text_test_aa_mix)

    for ca in ca_classf_list_1:
        df_test_mix[ca] = x_test[ca].values

    test_pred = clf_final_classif_1.predict(df_test_mix)

    x_train, x_test, y_train, y_test = train_test_split(df_filter[['text'] + ca_classf_list_2], 
                                                    df_filter['id_author'], 
                                                test_size=0.3,random_state=42)

    pipelines = [pipelineCharacter, pipelineObfuscator, pipelineWord]

    for p in pipelines:
        p.fit(x_train['text'], y_train)

    predict_text_train_aa_mix = np.hstack([p.predict_proba(x_train['text']) for p in pipelines])

    df_train_mix = pd.DataFrame(data=predict_text_train_aa_mix)

    for ca in ca_classf_list_2:
        df_train_mix[ca] = x_train[ca].values

    clf_final_classif_2.fit(df_train_mix, y_train)

    predict_text_test_aa_mix = np.hstack([p.predict_proba(x_test['text']) for p in pipelines])
    df_test_mix = pd.DataFrame(data=predict_text_test_aa_mix)

    for ca in ca_classf_list_2:
        df_test_mix[ca] = x_test[ca].values

    test_pred2 = clf_final_classif_2.predict(df_test_mix)

    df_mcnemar_test = pd.DataFrame()
    df_mcnemar_test['model_pred_classif_1'] = test_pred
    df_mcnemar_test['model_pred_classif_2'] = test_pred2
    df_mcnemar_test['original_label'] = y_test.values

    df_mcnemar_test['classf_1'] = np.where(df_mcnemar_test['model_pred_classif_1']==df_mcnemar_test['original_label'], 0, 1)
    df_mcnemar_test['classf_2'] = np.where(df_mcnemar_test['model_pred_classif_2']==df_mcnemar_test['original_label'], 0, 1)

    print("classf_1 acc: " + str(1-sum(df_mcnemar_test['classf_1'])/len(df_mcnemar_test)))
    print("classf_2 acc: " + str(1-sum(df_mcnemar_test['classf_2'])/len(df_mcnemar_test)))

    data_crosstab = pd.crosstab(df_mcnemar_test['classf_1'],  
                                df_mcnemar_test['classf_2'], 
                                margins = False) 
    print(data_crosstab)

    
    count_contingence_table = np.where(data_crosstab>25, 1, 0).sum()
    
    if count_contingence_table==4:
        result = mcnemar(data_crosstab, exact=False, correction=True)
        print("Todos os termos maiores que 25")
    else:
        print("Algum termo menor que 25")
        result = mcnemar(data_crosstab, exact=True)

    print('statistic=%.3f, p-value=%.3f' % (result.statistic, result.pvalue))
    # interpret the p-value
    alpha = 0.05
    if result.pvalue > alpha:
        print('Same proportions of errors (fail to reject H0)')
    else:
        print('Different proportions of errors (reject H0)')

In [72]:
test_mcnemar(20, ca_list_11, ca_list_14)

classf_1 acc: 0.3733333333333333
classf_2 acc: 0.4995833333333334
classf_2    0     1
classf_1           
0         894     2
1         305  1199
Algum termo menor que 25
statistic=2.000, p-value=0.000
Different proportions of errors (reject H0)


In [73]:
test_mcnemar(20, ca_list_11, ca_list_34)

classf_1 acc: 0.3733333333333333
classf_2 acc: 0.48124999999999996
classf_2    0     1
classf_1           
0         895     1
1         260  1244
Algum termo menor que 25
statistic=1.000, p-value=0.000
Different proportions of errors (reject H0)


In [74]:
test_mcnemar(20, ca_list_14, ca_list_34)

classf_1 acc: 0.4995833333333334
classf_2 acc: 0.48124999999999996
classf_2     0     1
classf_1            
0         1051   148
1          104  1097
Todos os termos maiores que 25
statistic=7.337, p-value=0.007
Different proportions of errors (reject H0)


###### Debug Teste McNemar

In [115]:
clf_final_classif_1 = LogisticRegression(random_state=42, multi_class='multinomial', solver='newton-cg')
clf_final_classif_2 = LogisticRegression(random_state=42, multi_class='multinomial', solver='newton-cg')

In [116]:
author_list_filter = list_authors_aa_selection[:j]

In [117]:
df_filter = df_final_aa[df_final_aa['id_author'].isin(author_list_filter)]

In [118]:
x_train, x_test, y_train, y_test = train_test_split(df_filter[['text'] + ca_classf_list_1], 
                                                df_filter['id_author'], 
                                            test_size=0.3,random_state=42)

pipelines = [pipelineCharacter, pipelineObfuscator, pipelineWord]

In [119]:
for p in pipelines:
    p.fit(x_train['text'], y_train)

In [120]:
predict_text_train_aa_mix = np.hstack([p.predict_proba(x_train['text']) for p in pipelines])

In [121]:
df_train_mix = pd.DataFrame(data=predict_text_train_aa_mix)

In [122]:
for ca in ca_classf_list_1:
    df_train_mix[ca] = x_train[ca].values

In [123]:
clf_final_classif_1.fit(df_train_mix, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=42, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)

In [124]:
predict_text_test_aa_mix = np.hstack([p.predict_proba(x_test['text']) for p in pipelines])
df_test_mix = pd.DataFrame(data=predict_text_test_aa_mix)

In [125]:
for ca in ca_classf_list_1:
    df_test_mix[ca] = x_test[ca].values

In [126]:
test_pred = clf_final_classif_1.predict(df_test_mix)

In [127]:
x_train, x_test, y_train, y_test = train_test_split(df_filter[['text'] + ca_classf_list_2], 
                                                df_filter['id_author'], 
                                            test_size=0.3,random_state=42)

pipelines = [pipelineCharacter, pipelineObfuscator, pipelineWord]

In [128]:
for p in pipelines:
    p.fit(x_train['text'], y_train)

In [129]:
predict_text_train_aa_mix = np.hstack([p.predict_proba(x_train['text']) for p in pipelines])

In [130]:
df_train_mix = pd.DataFrame(data=predict_text_train_aa_mix)

In [131]:
for ca in ca_classf_list_2:
    df_train_mix[ca] = x_train[ca].values

In [132]:
clf_final_classif_2.fit(df_train_mix, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=42, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)

In [133]:
predict_text_test_aa_mix = np.hstack([p.predict_proba(x_test['text']) for p in pipelines])
df_test_mix = pd.DataFrame(data=predict_text_test_aa_mix)

In [137]:
for ca in ca_classf_list_2:
    df_test_mix[ca] = x_test[ca].values

In [138]:
test_pred2 = clf_final_classif_2.predict(df_test_mix)

In [139]:
df_mcnemar_test = pd.DataFrame()
df_mcnemar_test['model_pred_classif_1'] = test_pred
df_mcnemar_test['model_pred_classif_2'] = test_pred2
df_mcnemar_test['original_label'] = y_test.values

In [140]:
df_mcnemar_test['classf_1'] = np.where(df_mcnemar_test['model_pred_classif_1']==df_mcnemar_test['original_label'], 0, 1)
df_mcnemar_test['classf_2'] = np.where(df_mcnemar_test['model_pred_classif_2']==df_mcnemar_test['original_label'], 0, 1)

In [141]:
print("classf_1 acc: " + str(1-sum(df_mcnemar_test['classf_1'])/len(df_mcnemar_test)))
print("classf_2 acc: " + str(1-sum(df_mcnemar_test['classf_2'])/len(df_mcnemar_test)))

classf_1 acc: 0.43999999999999995
classf_2 acc: 0.4575


In [142]:
data_crosstab = pd.crosstab(df_mcnemar_test['classf_1'],  
                            df_mcnemar_test['classf_2'], 
                            margins = False) 
print(data_crosstab)

classf_2     0     1
classf_1            
0         1002    54
1           96  1248


In [147]:
result = mcnemar(data_crosstab, exact=True)
result = mcnemar(data_crosstab, exact=False, correction=True)

In [148]:
print('statistic=%.3f, p-value=%.3f' % (result.statistic, result.pvalue))
# interpret the p-value
alpha = 0.05
if result.pvalue > alpha:
    print('Same proportions of errors (fail to reject H0)')
else:
    print('Different proportions of errors (reject H0)')

statistic=54.000, p-value=0.001
Different proportions of errors (reject H0)
