# EXIST 2021 - Feature extraction train <a class="anchor" id="feat-train"></a>

    ÁLVARO FAUBEL SANCHIS
    CLARA MARTÍ TORREGROSA

#####  Table of contents :
- [Required functions](#functions)
 * [Required libraries and configuration](#libraries)
 * [Pre-processing](#pre-pro)
 * [Feature extraction](#feat)
 * [Models](#models)
     
- [EXIST Task](#exist)
 * [Data load](#data-load)
 * [Modeling](#model)
     - [Baseline](#baseline)
         - [Task 1](#b-t1)
         - [Task 2](#b-t2)
     - [Tuning parameters](#tunning)
         - [Task 1](#tp-t1)
         - [Task 2](#tp-t2)
     - [Ensemble methods](#ensemble)
         - [Task 1](#e-t1)
         - [Task 2](#e-t2)




### Required functions  <a class="anchor" id="functions"></a>

#### Required libraries and configuration  <a class="anchor" id="libraries"></a>

In [None]:
# Data
import pandas as pd
import numpy as np

# Pre-processing
import preprocessor as p
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer

# Feature extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from gensim.models import word2vec
import gensim.downloader as api

# Models
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn import svm
from sklearn import tree

#### Pre-processing  <a class="anchor" id="pre-pro"></a>

In [2]:
def preprocess_tweets(l_tweets, l_langs, keep_hastags = False):
    """Clean, tokenize and normalize a collection of tweets.

    Every tweet is lower-cased, passed through ``preprocessor.clean``,
    stripped of digit-led tokens, hyphens and any character outside
    ``a-z``, ``ñáéíóúü``, ``#`` and whitespace, tokenized with
    ``TweetTokenizer`` and filtered against the English + Spanish NLTK
    stop-word lists.

    Parameters
    ----------
    l_tweets : iterable of str
        Raw tweet texts.
    l_langs : iterable of str
        Language code of each tweet, aligned with ``l_tweets``.
    keep_hastags : bool, default False
        When True the HASHTAG option is not handed to ``preprocessor``,
        so hashtags are not removed by ``p.clean``.

    Returns
    -------
    tweets_res : list of list of str
        Stop-word-filtered tokens of each tweet.
    tweets_stem : list of list of str
        Same tokens, Snowball-stemmed for ``'es'`` and WordNet-lemmatized
        for ``'en'``. Any other language code keeps the filtered tokens
        unchanged, so both returned lists always have one entry per tweet.
    """
    tweets_res = []
    tweets_stem = []
    tw_tknz = TweetTokenizer()
    stem_sp = SnowballStemmer('spanish')
    lem_en = WordNetLemmatizer()
    stopwords_es_en = set(stopwords.words(['english', 'spanish']))

    # Configure preprocessor once; set_options is called on the imported
    # module, so the selection presumably persists after this function returns.
    clean_options = [p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.SMILEY]
    if not keep_hastags:
        clean_options.append(p.OPT.HASHTAG)
    p.set_options(*clean_options)

    def clean(tweet):
        text = p.clean(tweet.lower())
        text = re.sub(r'(?:\d+)(?:\w)*', ' ', text)          # digit-led tokens ("123", "2nd")
        text = re.sub(r'-', ' ', text)                        # split hyphenated compounds
        text = re.sub(r'[^a-zñáéíóúü#\s]', ' ', text)         # drop remaining punctuation/symbols
        return re.sub(' +', ' ', text).strip()

    # Per-language normalizer; languages without an entry are left as-is.
    normalizers = {'es': stem_sp.stem, 'en': lem_en.lemmatize}

    for tw, lang in zip(l_tweets, l_langs):
        tw_nstop = [w for w in tw_tknz.tokenize(clean(tw)) if w not in stopwords_es_en]
        normalize = normalizers.get(lang)
        if normalize is None:
            # Previously nothing was appended here, which shifted every
            # later entry of tweets_stem relative to tweets_res.
            tweets_stem.append(list(tw_nstop))
        else:
            tweets_stem.append([normalize(w) for w in tw_nstop])
        tweets_res.append(tw_nstop)

    return tweets_res, tweets_stem

#### Feature extraction <a class="anchor" id="feat"></a>

*  Bag of words

In [3]:
def create_bag_of_words(l_docs):
    """Fit a bag-of-words model on already-tokenized documents.

    ``l_docs`` is handed to ``CountVectorizer`` with an identity tokenizer
    and ``lowercase=False``, so each document is expected to be a
    sequence of tokens rather than a raw string.

    Returns the fitted vectorizer and the result of ``fit_transform``.
    """
    def keep_tokens(doc):
        return doc

    bow_vectorizer = CountVectorizer(tokenizer=keep_tokens, lowercase=False)
    return bow_vectorizer, bow_vectorizer.fit_transform(l_docs)

*  N-grams

In [4]:
def create_ngrams(l_docs, analyzer_type = 'word', ngram = 1):
    """Count n-grams of a fixed order over tokenized documents.

    Parameters
    ----------
    l_docs : list of list of str
        Tokenized documents.
    analyzer_type : str, default 'word'
        Forwarded as ``CountVectorizer(analyzer=...)``. For any value other
        than ``'word'`` each token list is first joined with single spaces
        into one string.
    ngram : int, default 1
        Used as both bounds of ``ngram_range``, i.e. only n-grams of exactly
        this order are counted.

    Returns the fitted vectorizer and the result of ``fit_transform``.
    """
    if analyzer_type == 'word':
        docs = l_docs
    else:
        docs = [' '.join(tokens) for tokens in l_docs]

    ngram_vectorizer = CountVectorizer(
        tokenizer=lambda doc: doc,
        lowercase=False,
        analyzer=analyzer_type,
        ngram_range=(ngram, ngram),
    )
    return ngram_vectorizer, ngram_vectorizer.fit_transform(docs)

* TF-IDF

In [5]:
def codify_tfidf_vec(l_docs, idf=True, analyzer_type = 'word', ngram=1):
    """Encode tokenized documents with ``TfidfVectorizer``.

    Parameters
    ----------
    l_docs : list of list of str
        Tokenized documents.
    idf : bool, default True
        Forwarded as ``use_idf``.
    analyzer_type : str, default 'word'
        Forwarded as ``analyzer``. For any value other than ``'word'`` each
        token list is first joined with single spaces into one string.
    ngram : int, default 1
        Used as both bounds of ``ngram_range``.

    Returns the fitted vectorizer and the result of ``fit_transform``.
    """
    if analyzer_type == 'word':
        docs = l_docs
    else:
        docs = [' '.join(tokens) for tokens in l_docs]

    vectorizer_options = dict(
        tokenizer=lambda doc: doc,
        lowercase=False,
        smooth_idf=True,
        use_idf=idf,
        analyzer=analyzer_type,
        ngram_range=(ngram, ngram),
    )
    tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)
    return tfidf_vectorizer, tfidf_vectorizer.fit_transform(docs)

* Word Embeddings

In [6]:
def get_vectors_embeddings(model_pretrained, model_data, l_tweets):
    """Build one fixed-size vector per tweet from word embeddings.

    Each token is looked up in ``model_pretrained`` first and, on
    ``KeyError``, in ``model_data.wv``; tokens missing from both are
    skipped. The vectors found are summed and divided by the total number
    of tokens in the tweet (skipped tokens still count in the divisor).

    Parameters
    ----------
    model_pretrained : object exposing ``get_vector(word)``
        Primary embedding lookup. Its ``vector_size`` attribute, when
        present, sets the width of the zero vector used for tweets with no
        known token; otherwise 200 is assumed.
    model_data : object exposing ``wv.get_vector(word)``
        Fallback embedding lookup.
    l_tweets : iterable of list of str
        Tokenized tweets.

    Returns
    -------
    numpy.ndarray
        One row per tweet.
    """
    # Width of the fallback zero vector: follow the model instead of a
    # hard-coded 200 so other embedding sizes do not produce ragged rows.
    dim = getattr(model_pretrained, 'vector_size', 200)

    def lookup(word):
        try:
            return model_pretrained.get_vector(word)
        except KeyError:
            try:
                return model_data.wv.get_vector(word)
            except KeyError:
                return None

    vectors = []
    for tw in l_tweets:
        found = [vec for vec in (lookup(w) for w in tw) if vec is not None]
        if found:
            # Divisor is the tweet length, not len(found): unknown tokens
            # therefore pull the average towards zero, as before.
            vectors.append(sum(found) / len(tw))
        else:
            vectors.append(np.zeros(dim))
    return np.array(vectors)

#### Models <a class="anchor" id="models"></a>

* Cross validation

In [40]:
def crossval_kfold(X, y, model, k, shuffle = True, random_state = None):
    """Evaluate ``model`` with k-fold cross-validation.

    Parameters
    ----------
    X : indexable feature matrix
        Must support row selection with an integer index array
        (``X[train_index]``), e.g. a NumPy array or SciPy sparse matrix.
    y : array-like
        Targets. Converted with ``np.asarray`` so that fold indices are
        always applied positionally, even for a pandas Series whose index
        labels are not 0..n-1.
    model : estimator
        Object with ``fit(X, y)`` returning the fitted estimator and
        ``predict(X)``. It is refitted in place on every fold.
    k : int
        Number of folds.
    shuffle : bool, default True
        Forwarded to ``KFold``.
    random_state : int or None, default None
        Seed for the shuffling; only forwarded when ``shuffle`` is True.

    Returns
    -------
    accuracys : list of float
        Accuracy on each held-out fold.
    f1 : list of float
        Macro-averaged F1 on each held-out fold.
    """
    y = np.asarray(y)
    kfs = KFold(k, shuffle=shuffle, random_state=random_state if shuffle else None)

    accuracys = []
    f1 = []
    for train_index, test_index in kfs.split(X):
        clf = model.fit(X[train_index], y[train_index])
        y_test = y[test_index]
        y_pred = clf.predict(X[test_index])
        accuracys.append(accuracy_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred, average='macro'))
    return accuracys, f1

---

# EXIST Task <a class="anchor" id="exist"></a>

## Data load <a class="anchor" id="data-load"></a>

#### Train data

In [8]:
# Load the EXIST 2021 training split (tab-separated file).
df = pd.read_csv('../EXIST2021_dataset/training/EXIST2021_training.tsv', sep='\t')

# Encode the Task 1 label as a number: sexist -> 1, non-sexist -> 0.
# Series.map is the label-lookup idiom; a label missing from the mapping
# becomes NaN instead of silently staying a string in a numeric column.
df['task1_encoding'] = df['task1'].map({'sexist': 1, 'non-sexist': 0})

df.head()

Unnamed: 0,test_case,id,source,language,text,task1,task2,task1_encoding,task2_encoding
0,EXIST2021,1,twitter,en,"She calls herself ""anti-feminazi"" how about sh...",sexist,ideological-inequality,1,1
1,EXIST2021,2,twitter,en,"Now, back to these women, the brave and the be...",non-sexist,non-sexist,0,0
2,EXIST2021,3,twitter,en,"@CurvyBandida @Xalynne_B Wow, your skirt is ve...",sexist,objectification,1,5
3,EXIST2021,4,twitter,en,@AurelieGuiboud Incredible! Beautiful!But I l...,non-sexist,non-sexist,0,0
4,EXIST2021,5,twitter,en,i find it extremely hard to believe that kelly...,non-sexist,non-sexist,0,0


* Separate the data for task 2

In [45]:
# Task 2 is modelled on the sexist tweets only: keep every row whose Task 1
# label is not 'non-sexist'. The explicit copy makes the column assignment
# below act on an independent frame; the original row labels are retained.
df_t2 = df.loc[df['task1'] != 'non-sexist'].copy()

# Integer code for each of the five Task 2 categories. Series.map turns any
# label missing from the mapping into NaN rather than leaving a string.
df_t2['task2_encoding'] = df_t2['task2'].map({'ideological-inequality' : 0,
                                              'stereotyping-dominance' : 1, 'misogyny-non-sexual-violence': 2,
                                              'sexual-violence' : 3, 'objectification': 4})
df_t2.head()

Unnamed: 0,test_case,id,source,language,text,task1,task2,task1_encoding,task2_encoding
0,EXIST2021,1,twitter,en,"She calls herself ""anti-feminazi"" how about sh...",sexist,ideological-inequality,1,0
2,EXIST2021,3,twitter,en,"@CurvyBandida @Xalynne_B Wow, your skirt is ve...",sexist,objectification,1,4
5,EXIST2021,6,twitter,en,@Smithcouple971 Hello....m raj....m with good ...,sexist,sexual-violence,1,3
10,EXIST2021,11,twitter,en,@hapyshoper79 @Dis_Critic @MairiJCam @cazadams...,sexist,ideological-inequality,1,0
15,EXIST2021,16,twitter,en,@Ponderer_O_Purg @BynameRose @GameOverRos @nat...,sexist,ideological-inequality,1,0


* Process and clean the texts

In [10]:
# Task 1 corpus (all rows of df): stop-word-filtered tokens plus their
# stemmed (es) / lemmatized (en) counterparts; hashtags are removed.
tweets_clean_token, tweets_stem = preprocess_tweets(df['text'], df['language'], keep_hastags=False)
# One space-joined string per tweet, built from the filtered tokens.
tweets_clean = [' '.join(tw) for tw in tweets_clean_token]

In [47]:
# Task 2 corpus (rows of df_t2 only), processed exactly like the Task 1 corpus.
tweets_clean_token2, tweets_stem2 = preprocess_tweets(df_t2['text'], df_t2['language'], keep_hastags=False)
# One space-joined string per tweet, built from the filtered tokens.
tweets_clean2 = [' '.join(tw) for tw in tweets_clean_token2]

---

## Modeling <a class="anchor" id="model"></a>

### Baseline <a class="anchor" id="baseline"></a>

In [60]:
# Classifiers with library-default hyper-parameters, evaluated as baselines
# on every feature set of both tasks. The same instances are refitted each time.
# NOTE(review): no random_state is passed to any estimator, so repeated runs
# of the tree-based models are not expected to reproduce the same scores.
baseline_models = [svm.SVC(),
                   RandomForestClassifier(),
                   LogisticRegression(),
                   tree.DecisionTreeClassifier()]

#### Task1 <a class="anchor" id="b-t1"></a>

* TF-IDF

In [12]:
# TF-IDF encoding of the stemmed/lemmatized Task 1 tweets.
idf_vect, idf_matrix = codify_tfidf_vec(tweets_stem)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back to the old name otherwise.
idf_feature_names = getattr(idf_vect, 'get_feature_names_out', None) or idf_vect.get_feature_names
# Dense view for inspection only; the models below are fed the sparse idf_matrix.
idf_df = pd.DataFrame(idf_matrix.toarray(), columns=idf_feature_names())

In [13]:
# Features: TF-IDF matrix of the Task 1 tweets. Target: binary Task 1 label.
X = idf_matrix
y = df.task1_encoding

In [14]:
# 5-fold cross-validation of every baseline on the current X/y, reporting
# the per-fold accuracies and their mean. The divisor 5 must match the
# number of folds passed to crossval_kfold.
for model in baseline_models:
    print(model)
    print('Evaluando...')
    model_acc, model_f1 = crossval_kfold(X, y, model, 5)
    print('Accuracy obtenido k-fold:', model_acc) 
    print('Accuracy medio:', sum(model_acc)/5, '\n')

SVC()
Evaluando...
Accuracy obtenido k-fold: [0.7299426934097422, 0.7220630372492837, 0.7311827956989247, 0.7211469534050179, 0.7211469534050179]
Accuracy medio: 0.7250964866335974 

RandomForestClassifier()
Evaluando...
Accuracy obtenido k-fold: [0.7084527220630372, 0.7306590257879656, 0.7025089605734767, 0.7304659498207885, 0.6931899641577061]
Accuracy medio: 0.7130553244805948 

LogisticRegression()
Evaluando...
Accuracy obtenido k-fold: [0.7191977077363897, 0.7034383954154728, 0.7455197132616488, 0.7232974910394265, 0.7017921146953405]
Accuracy medio: 0.7186490844296556 

DecisionTreeClassifier()
Evaluando...
Accuracy obtenido k-fold: [0.6568767908309455, 0.6790830945558739, 0.6659498207885305, 0.6537634408602151, 0.6781362007168459]
Accuracy medio: 0.6667618695504821 



*  Word embeddings 

In [15]:
# Pre-trained GloVe Twitter vectors fetched through the gensim downloader,
# plus a 200-dimensional Word2Vec model trained on the Task 1 tokens
# (unstemmed) that serves as fallback for words missing from GloVe.
glove_twitter = api.load("glove-twitter-200")
w2v_data = word2vec.Word2Vec(tweets_clean_token, vector_size=200)

# One averaged embedding vector per tweet.
vectors_embeddings = get_vectors_embeddings(glove_twitter, w2v_data, tweets_clean_token)

In [16]:
# Features: averaged word embeddings of the Task 1 tweets. Target: binary Task 1 label.
X = vectors_embeddings
y = df.task1_encoding

In [17]:
# NOTE(review): this import sits mid-notebook rather than in the imports cell,
# and the filter below has no category or scope, so it silences every warning
# for all cells executed after this one, not just for this loop.
import warnings
warnings.filterwarnings("ignore")
# 5-fold cross-validation of every baseline on the embedding features,
# reporting per-fold accuracies and their mean (divisor 5 == number of folds).
for model in baseline_models:
    print(model)
    print('Evaluando...')
    model_acc, model_f1 = crossval_kfold(X, y, model, 5)
    print('Accuracy obtenido k-fold:', model_acc) 
    print('Accuracy medio:', sum(model_acc)/5, '\n')

SVC()
Evaluando...
Accuracy obtenido k-fold: [0.7335243553008596, 0.7156160458452722, 0.7111111111111111, 0.7046594982078853, 0.7161290322580646]
Accuracy medio: 0.7162080085446386 

RandomForestClassifier()
Evaluando...
Accuracy obtenido k-fold: [0.6984240687679083, 0.7012893982808023, 0.7053763440860215, 0.6845878136200717, 0.7189964157706094]
Accuracy medio: 0.7017348081050827 

LogisticRegression()
Evaluando...
Accuracy obtenido k-fold: [0.6891117478510028, 0.6812320916905444, 0.6881720430107527, 0.6652329749103942, 0.6982078853046595]
Accuracy medio: 0.6843913485534707 

DecisionTreeClassifier()
Evaluando...
Accuracy obtenido k-fold: [0.579512893982808, 0.576647564469914, 0.6064516129032258, 0.578494623655914, 0.5913978494623656]
Accuracy medio: 0.5865009088948454 



---

 #### Task2 <a class="anchor" id="b-t2"></a>

* TF-IDF

In [58]:
# TF-IDF encoding of the stemmed/lemmatized Task 2 tweets.
idf_vect2, idf_matrix2 = codify_tfidf_vec(tweets_stem2)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back to the old name otherwise.
idf_feature_names2 = getattr(idf_vect2, 'get_feature_names_out', None) or idf_vect2.get_feature_names
# Dense view for inspection only; the models below are fed the sparse idf_matrix2.
idf_df2 = pd.DataFrame(idf_matrix2.toarray(), columns=idf_feature_names2())

In [66]:
# Features: TF-IDF matrix of the Task 2 tweets. Target: Task 2 category code.
# y is converted to a NumPy array because df_t2 keeps the row labels of df,
# while the cross-validation folds are positional indices.
X = idf_matrix2
y = np.array((df_t2.task2_encoding))

In [68]:
# 5-fold cross-validation of every baseline on the current X/y, reporting
# the per-fold macro-F1 scores and their mean (divisor 5 == number of folds).
for model in baseline_models:
    print(model)
    print('Evaluando...')
    model_acc, model_f1 = crossval_kfold(X, y, model, 5)
    print('F1 macro obtenido k-fold:', model_f1) 
    print('F1 medio:', sum(model_f1)/5, '\n')

SVC()
Evaluando...
F1 macro obtenido k-fold: [0.5825359329088223, 0.5826084629297744, 0.5753695404025984, 0.6031302944051457, 0.5617143824188529]
F1 medio: 0.5810717226130387 

RandomForestClassifier()
Evaluando...
F1 macro obtenido k-fold: [0.5873319585358323, 0.5690920638654616, 0.5560789881066166, 0.6350467885992981, 0.6068623663839959]
F1 medio: 0.590882433098241 

LogisticRegression()
Evaluando...
F1 macro obtenido k-fold: [0.5805940128738465, 0.6184385181956813, 0.6253407755957346, 0.5762944250725195, 0.6090636932494895]
F1 medio: 0.6019462849974543 

DecisionTreeClassifier()
Evaluando...
F1 macro obtenido k-fold: [0.5097184735530071, 0.5446841837203866, 0.5165066333595962, 0.5383358382946508, 0.559528367403811]
F1 medio: 0.5337546992662903 



* Word embeddings

In [76]:
# Fallback Word2Vec model trained on the Task 2 tokens only; the GloVe
# vectors loaded for Task 1 (glove_twitter) are reused as primary lookup.
w2v_data2 = word2vec.Word2Vec(tweets_clean_token2, vector_size=200)

# One averaged embedding vector per Task 2 tweet.
vectors_embeddings2 = get_vectors_embeddings(glove_twitter, w2v_data2, tweets_clean_token2)

In [77]:
# Features: averaged word embeddings of the Task 2 tweets. Target: Task 2
# category code as a NumPy array (positional indexing in the CV folds).
X = vectors_embeddings2
y =  np.array((df_t2.task2_encoding))

In [71]:
# 5-fold cross-validation of every baseline on the embedding features,
# reporting per-fold macro-F1 scores and their mean (divisor 5 == number of folds).
for model in baseline_models:
    print(model)
    print('Evaluando...')
    model_acc, model_f1 = crossval_kfold(X, y, model, 5)
    print('F1 macro obtenido k-fold:', model_f1) 
    print('F1 medio:', sum(model_f1)/5, '\n')

SVC()
Evaluando...
F1 macro obtenido k-fold: [0.55446551772001, 0.5642424397746209, 0.5774007002051638, 0.5497517038781369, 0.5445463449790106]
F1 medio: 0.5580813413113884 

RandomForestClassifier()
Evaluando...
F1 macro obtenido k-fold: [0.4886601492041721, 0.5420315534264546, 0.5141001290950256, 0.5101046270865016, 0.4883489987274273]
F1 medio: 0.5086490915079163 

LogisticRegression()
Evaluando...
F1 macro obtenido k-fold: [0.5719980679008865, 0.5437477110435744, 0.5404612121249144, 0.5311972365155213, 0.5480726087950496]
F1 medio: 0.5470953672759892 

DecisionTreeClassifier()
Evaluando...
F1 macro obtenido k-fold: [0.3788693107855562, 0.39387786395283475, 0.3302587664662731, 0.34007942133922897, 0.32936361440168993]
F1 medio: 0.35448979538911657 



---

### Tuning parameters <a class="anchor" id="tunning"></a>

In [72]:
# Search space for the SVM. gamma is searched for every kernel here even
# though not every kernel uses it.
svm_params = {'C': [0.1, 1, 10, 100],
              'gamma': [1, 0.1, 0.01, 0.001],
              'kernel': ['rbf', 'poly', 'linear', 'sigmoid']}

# Search space for the random forest.
rf_params = {'bootstrap': [True, False],
             'max_depth': [5, 10, 100, None],
             'criterion': ['gini', 'entropy']}

# Search space for logistic regression. The default solver (lbfgs) cannot fit
# an l1 penalty, so with a single flat grid every l1 candidate failed and only
# l2 was really searched. l1 gets its own sub-grid with a solver that supports
# it; the l2 candidates keep the default solver, and the total stays at 14.
lr_C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
lr_params = [{'penalty': ['l2'], 'C': lr_C_values},
             {'penalty': ['l1'], 'solver': ['saga'], 'C': lr_C_values}]

# The three searches are reused for both tasks and both feature sets.
# All of them select by accuracy and refit the best candidate on the full data.
grid_svm = GridSearchCV(svm.SVC(), svm_params,
                        refit = True, verbose = 2, n_jobs=12, scoring='accuracy')

grid_rf = GridSearchCV(RandomForestClassifier(), rf_params,
                       refit = True, verbose = 2, n_jobs=12, scoring='accuracy')

grid_lr = GridSearchCV(LogisticRegression(), lr_params,
                       refit = True, verbose = 2, n_jobs=12, scoring='accuracy')

#### Task1 <a class="anchor" id="tp-t1"></a>

* TF-IDF

In [19]:
# Features: TF-IDF matrix of the Task 1 tweets. Target: binary Task 1 label.
X = idf_matrix
y = df.task1_encoding

In [20]:
# SVM grid search on the current X/y; best_score_ is the mean
# cross-validated score (accuracy) of the best parameter combination.
grid_svm.fit(X, y)
print(grid_svm.best_score_)
print(grid_svm.best_params_)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
0.7157807766172681
{'C': 1, 'gamma': 1, 'kernel': 'sigmoid'}


In [21]:
# Random-forest grid search on the current X/y; prints the best mean
# cross-validated accuracy and the parameters that achieved it.
grid_rf.fit(X, y)
print(grid_rf.best_score_)
print(grid_rf.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
0.7077564161814093
{'bootstrap': True, 'criterion': 'gini', 'max_depth': 100}


In [22]:
# Logistic-regression grid search on the current X/y; prints the best mean
# cross-validated accuracy and the parameters that achieved it.
grid_lr.fit(X, y)
print(grid_lr.best_score_)
print(grid_lr.best_params_)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
0.7140636329091825
{'C': 1, 'penalty': 'l2'}


* Word embeddings

In [23]:
# Features: averaged word embeddings of the Task 1 tweets. Target: binary Task 1 label.
X = vectors_embeddings
y = df.task1_encoding

In [24]:
# SVM grid search on the embedding features; refitting replaces the results
# the same grid object held from the previous feature set.
grid_svm.fit(X, y)
print(grid_svm.best_score_)
print(grid_svm.best_params_)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
0.7162090355444639
{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}


In [25]:
# Random-forest grid search on the embedding features (best mean CV accuracy).
grid_rf.fit(X, y)
print(grid_rf.best_score_)
print(grid_rf.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
0.695570241653059
{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 100}


In [26]:
# Logistic-regression grid search on the embedding features (best mean CV accuracy).
grid_lr.fit(X, y)
print(grid_lr.best_score_)
print(grid_lr.best_params_)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
0.6855412802579824
{'C': 0.1, 'penalty': 'l2'}


---

#### Task2 <a class="anchor" id="tp-t2"></a>

* TF-IDF

In [78]:
# Features: TF-IDF matrix of the Task 2 tweets. Target: Task 2 category
# code as a NumPy array.
X = idf_matrix2
y = np.array((df_t2.task2_encoding))

In [79]:
# SVM grid search on the Task 2 TF-IDF features.
# NOTE(review): the grid was built with scoring='accuracy', whereas the
# Task 2 baseline and ensemble cells report macro-F1; the printed score is
# therefore not directly comparable with those numbers.
grid_svm.fit(X, y)
print(grid_svm.best_score_)
print(grid_svm.best_params_)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
0.6017357001972388
{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}


In [80]:
# Random-forest grid search on the Task 2 TF-IDF features.
# NOTE(review): selection metric is accuracy, not the macro-F1 reported
# elsewhere for Task 2.
grid_rf.fit(X, y)
print(grid_rf.best_score_)
print(grid_rf.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
0.5904742493973263
{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 100}


In [81]:
# Logistic-regression grid search on the Task 2 TF-IDF features.
# NOTE(review): selection metric is accuracy, not the macro-F1 reported
# elsewhere for Task 2.
grid_lr.fit(X, y)
print(grid_lr.best_score_)
print(grid_lr.best_params_)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
0.6014358974358973
{'C': 1, 'penalty': 'l2'}


* Word embeddings

In [82]:
# Features: averaged word embeddings of the Task 2 tweets. Target: Task 2
# category code as a NumPy array.
X = vectors_embeddings2
y =  np.array((df_t2.task2_encoding))

In [83]:
# SVM grid search on the Task 2 embedding features.
# NOTE(review): selection metric is accuracy, not the macro-F1 reported
# elsewhere for Task 2.
grid_svm.fit(X, y)
print(grid_svm.best_score_)
print(grid_svm.best_params_)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
0.5724028051720359
{'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}


In [84]:
# Random-forest grid search on the Task 2 embedding features.
# NOTE(review): selection metric is accuracy, not the macro-F1 reported
# elsewhere for Task 2.
grid_rf.fit(X, y)
print(grid_rf.best_score_)
print(grid_rf.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
0.5335998246767477
{'bootstrap': False, 'criterion': 'gini', 'max_depth': None}


In [85]:
# Logistic-regression grid search on the Task 2 embedding features.
# NOTE(review): selection metric is accuracy, not the macro-F1 reported
# elsewhere for Task 2.
grid_lr.fit(X, y)
print(grid_lr.best_score_)
print(grid_lr.best_params_)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
0.5558181021257945
{'C': 0.1, 'penalty': 'l2'}


---

### Ensemble methods <a class="anchor" id="ensemble"></a>

#### Task1 <a class="anchor" id="e-t1"></a>

* TF-IDF

In [29]:
# Task 1 on TF-IDF features.
X = idf_matrix
y = df.task1_encoding

# Estimators rebuilt with the best parameters printed by the Task 1 / TF-IDF
# grid searches. clf_rf is only referenced by the commented-out AdaBoost
# lines further down in this section.
clf_svm = svm.SVC(C=1, gamma=1, kernel='sigmoid')
clf_lr = LogisticRegression(C=1, penalty='l2')
clf_rf = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth= 100)

In [30]:
# Bagging ensembles of 10 tuned SVMs / logistic regressions, scored by mean
# accuracy over 5 folds. The wrapped estimator is passed positionally: its
# keyword was renamed from base_estimator to estimator in newer scikit-learn
# releases, and the positional form is accepted by both.
bagging_svm = BaggingClassifier(clf_svm, n_estimators=10)
print('Acc. medio Bagging con SVM:', sum(crossval_kfold(X, y, bagging_svm, 5)[0])/5)

bagging_lr = BaggingClassifier(clf_lr, n_estimators=10)
print('Acc. medio Bagging con LR:', sum(crossval_kfold(X, y, bagging_lr, 5)[0])/5)

Acc. medio Bagging con SVM: 0.7146318719125817
Acc. medio Bagging con LR: 0.7220889176448839


In [31]:
# AdaBoost over the tuned random forest was left disabled by the authors:
#boost_rf = AdaBoostClassifier(clf_rf, n_estimators=100)
#print('Acc. medio AdaBoost con RF:', sum(crossval_kfold(X, y, boost_rf, 5)[0])/5)

# AdaBoost with 100 logistic-regression learners, mean accuracy over 5 folds.
# The wrapped estimator is passed positionally because its keyword was
# renamed from base_estimator to estimator in newer scikit-learn releases.
boost_lr = AdaBoostClassifier(clf_lr, n_estimators=100)
print('Acc. medio AdaBoost con LR:', sum(crossval_kfold(X, y, boost_lr, 5)[0])/5)

Acc. medio AdaBoost con LR: 0.5645706627229873


In [32]:
# Stacked model: the tuned SVM is the only base estimator and the tuned
# logistic regression is the final estimator. Mean accuracy over 5 folds.
stack = StackingClassifier(estimators=[('svm', clf_svm)], final_estimator= clf_lr)
print('Acc. medio Stack SVM LR:', sum(crossval_kfold(X, y, stack, 5)[0])/5)

Acc. medio Stack SVM LR: 0.7172116954740118


* Word embeddings

In [33]:
# Task 1 on word-embedding features.
X = vectors_embeddings
y = df.task1_encoding

# Estimators rebuilt with the best parameters printed by the Task 1 /
# word-embedding grid searches. clf_rf is only referenced by the
# commented-out AdaBoost lines further down in this section.
clf_svm = svm.SVC(C=1, gamma=0.1, kernel='rbf')
clf_lr = LogisticRegression(C=0.1, penalty='l2')
clf_rf = RandomForestClassifier(bootstrap=True, criterion='entropy', max_depth= 100)

In [34]:
# Bagging ensembles of 10 tuned SVMs / logistic regressions on the embedding
# features, scored by mean accuracy over 5 folds. The wrapped estimator is
# passed positionally: its keyword was renamed from base_estimator to
# estimator in newer scikit-learn releases, and both accept the positional form.
bagging_svm = BaggingClassifier(clf_svm, n_estimators=10)
print('Acc. medio Bagging con SVM:', sum(crossval_kfold(X, y, bagging_svm, 5)[0])/5)

bagging_lr = BaggingClassifier(clf_lr, n_estimators=10)
print('Acc. medio Bagging con LR:', sum(crossval_kfold(X, y, bagging_lr, 5)[0])/5)

Acc. medio Bagging con SVM: 0.7152050405151431
Acc. medio Bagging con LR: 0.6899833626028283


In [35]:
# AdaBoost over the tuned random forest was left disabled by the authors:
#boost_rf = AdaBoostClassifier(clf_rf, n_estimators=100)
#print('Acc. medio AdaBoost con RF:', sum(crossval_kfold(X, y, boost_rf, 5)[0])/5)

# AdaBoost with 100 logistic-regression learners, mean accuracy over 5 folds.
# The wrapped estimator is passed positionally because its keyword was
# renamed from base_estimator to estimator in newer scikit-learn releases.
boost_lr = AdaBoostClassifier(clf_lr, n_estimators=100)
print('Acc. medio AdaBoost con LR:', sum(crossval_kfold(X, y, boost_lr, 5)[0])/5)

Acc. medio AdaBoost con LR: 0.6508485072557537


In [36]:
# Stacked model: the tuned SVM is the only base estimator and the tuned
# logistic regression is the final estimator. Mean accuracy over 5 folds.
stack = StackingClassifier(estimators=[('svm', clf_svm)], final_estimator= clf_lr)
print('Acc. medio Stack SVM LR:', sum(crossval_kfold(X, y, stack, 5)[0])/5)

Acc. medio Stack SVM LR: 0.718358340779082


---

#### Task2 <a class="anchor" id="e-t2"></a>

* TF-IDF

In [88]:
# Task 2 on TF-IDF features.
X = idf_matrix2
y = np.array((df_t2.task2_encoding))

# Estimators rebuilt with the best parameters printed by the Task 2 / TF-IDF
# grid searches. clf_rf is not used by the cells that follow in this section.
clf_svm = svm.SVC(C=10, gamma=0.1, kernel='rbf')
clf_lr = LogisticRegression(C=1, penalty='l2')
clf_rf = RandomForestClassifier(bootstrap=True, criterion='entropy', max_depth= 100)

In [89]:
# Bagging ensembles of 10 tuned SVMs / logistic regressions, scored by mean
# macro-F1 over 5 folds. The wrapped estimator is passed positionally: its
# keyword was renamed from base_estimator to estimator in newer scikit-learn
# releases, and the positional form is accepted by both.
bagging_svm = BaggingClassifier(clf_svm, n_estimators=10)
print('F1 medio Bagging con SVM:', sum(crossval_kfold(X, y, bagging_svm, 5)[1])/5)

bagging_lr = BaggingClassifier(clf_lr, n_estimators=10)
print('F1 medio Bagging con LR:', sum(crossval_kfold(X, y, bagging_lr, 5)[1])/5)

F1 medio Bagging con SVM: 0.5943091181591337
F1 medio Bagging con LR: 0.591539937982972


In [90]:
# AdaBoost with 100 logistic-regression learners, mean macro-F1 over 5 folds.
# The wrapped estimator is passed positionally because its keyword was
# renamed from base_estimator to estimator in newer scikit-learn releases.
boost_lr = AdaBoostClassifier(clf_lr, n_estimators=100)
print('F1 medio AdaBoost con LR:', sum(crossval_kfold(X, y, boost_lr, 5)[1])/5)

F1 medio AdaBoost con LR: 0.240439684342536


In [91]:
# Stacked model: the tuned SVM is the only base estimator and the tuned
# logistic regression is the final estimator. Mean macro-F1 over 5 folds.
stack = StackingClassifier(estimators=[('svm', clf_svm)], final_estimator= clf_lr)
print('F1 medio Stack SVM LR:', sum(crossval_kfold(X, y, stack, 5)[1])/5)

F1 medio Stack SVM LR: 0.602023876247787


* Word Embeddings

In [96]:
# Task 2 on word-embedding features.
X = vectors_embeddings2
y = np.array((df_t2.task2_encoding))

# Estimators rebuilt with the best parameters printed by the Task 2 /
# word-embedding grid searches. clf_rf is not used by the cells that follow
# in this section.
clf_svm = svm.SVC(C=10, gamma=0.01, kernel='rbf')
clf_lr = LogisticRegression(C=0.1, penalty='l2')
clf_rf = RandomForestClassifier(bootstrap=False, criterion='gini', max_depth= None)

In [93]:
# Bagging ensembles of 10 tuned SVMs / logistic regressions on the embedding
# features, scored by mean macro-F1 over 5 folds. The wrapped estimator is
# passed positionally: its keyword was renamed from base_estimator to
# estimator in newer scikit-learn releases, and both accept the positional form.
bagging_svm = BaggingClassifier(clf_svm, n_estimators=10)
print('F1 medio Bagging con SVM:', sum(crossval_kfold(X, y, bagging_svm, 5)[1])/5)

bagging_lr = BaggingClassifier(clf_lr, n_estimators=10)
print('F1 medio Bagging con LR:', sum(crossval_kfold(X, y, bagging_lr, 5)[1])/5)

F1 medio Bagging con SVM: 0.5660414326833407
F1 medio Bagging con LR: 0.5612940944956113


In [94]:
# AdaBoost with 100 logistic-regression learners, mean macro-F1 over 5 folds.
# The wrapped estimator is passed positionally because its keyword was
# renamed from base_estimator to estimator in newer scikit-learn releases.
boost_lr = AdaBoostClassifier(clf_lr, n_estimators=100)
print('F1 medio AdaBoost con LR:', sum(crossval_kfold(X, y, boost_lr, 5)[1])/5)

F1 medio AdaBoost con LR: 0.4243169227181928


In [95]:
# Stacked model: the tuned SVM is the only base estimator and the tuned
# logistic regression is the final estimator. Mean macro-F1 over 5 folds.
stack = StackingClassifier(estimators=[('svm', clf_svm)], final_estimator= clf_lr)
print('F1 medio Stack SVM LR:', sum(crossval_kfold(X, y, stack, 5)[1])/5)

F1 medio Stack SVM LR: 0.5741816754845492


---