# EXIST 2021 - Feature extraction predicts <a class="anchor" id="feat-preds"></a>

    ÁLVARO FAUBEL SANCHIS
    CLARA MARTÍ TORREGROSA

#####  Table of contents :
- [Requiered functions](#functions)
 * [Required libraries and configuration](#libraries)
 * [Pre-processing](#pre-pro)
 * [Feature extraction](#feat)
     
- [EXIST Task](#exist)
 * [Data load](#data-load)
 * [Predictions](#preds)
     - [IDF](#idf)
         - [Task 1](#idf-t1)
         - [Task 2](#idf-t2)
     - [Word Embeddings](#word-emb)
         - [Task 1](#we-t1)
         - [Task 2](#we-t2)
 * [Submission results](#submission)
      - [IDF](#sub-idf)
      - [Word Embeddings](#sub-idf)





### Required functions <a class="anchor" id="functions"></a>

#### Required libraries and configuration <a class="anchor" id="libraries"></a>

In [None]:
# Data
import pandas as pd
import numpy as np

# Pre-processing
import preprocessor as p
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer

# Feature extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from gensim.models import word2vec
import gensim.downloader as api

# Models
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn import svm
from sklearn import tree

#### Pre-processing <a class="anchor" id="pre-pro"></a>

In [2]:
def preprocess_tweets(l_tweets, l_langs, keep_hastags = False):
    tweets_res = []
    tweets_stem = []
    tw_tknz = TweetTokenizer()
    stem_sp = SnowballStemmer('spanish')
    lem_en = WordNetLemmatizer()
    stopwords_es_en = set(stopwords.words(['english', 'spanish']))
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED,  
                  p.OPT.SMILEY, p.OPT.HASHTAG)
    if keep_hastags:
        p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED,
                      p.OPT.SMILEY)
    
    def clean(tweet):
        clean_tw = p.clean(tweet.lower())
        clean_numbers =  re.sub(r'(?:\d+)(?:\w)*', ' ', clean_tw)
        separe_compound = re.sub(r'-', ' ', clean_numbers)
        clean_punct = re.sub(r'[^a-zñáéíóúü#\s]', ' ', separe_compound)
        clean_spaces = re.sub(' +', ' ', clean_punct)
        return clean_spaces.strip()

    def tokenize(tweet_clean):
        return tw_tknz.tokenize(tweet_clean)
    
    def del_stopwords(tweet_token, l_stopwords):
        return [w for w in tweet_token if w not in l_stopwords]
    
    def do_stemming(tweet_token_nstop, stemer):
        return [stemer.stem(w) for w in tweet_token_nstop]

    def do_lemmatization(tweet_token_nstop, lemmatizer):
        return [lemmatizer.lemmatize(w) for w in tweet_token_nstop]
    
    for tw, lang in zip(l_tweets, l_langs):
        tw_clean = clean(tw)
        tw_token = tokenize(tw_clean)
        tw_nstop = del_stopwords(tw_token, stopwords_es_en)
        if lang == 'es':
            tweets_stem.append(do_stemming(tw_nstop, stem_sp))
        elif lang == 'en':
            tweets_stem.append(do_lemmatization(tw_nstop, lem_en))
        tweets_res.append(tw_nstop)
        
    return tweets_res, tweets_stem

#### Feature extraction <a class="anchor" id="feat"></a>

* TF-IDF

In [3]:
def codify_tfidf_vec(l_docs, idf=True, analyzer_type = 'word', ngram=1):
    if analyzer_type != 'word':
        l_docs = [' '.join(tw) for tw in l_docs]
    tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc, lowercase=False, 
                                       smooth_idf=True, use_idf=idf,
                                       analyzer=analyzer_type,
                                       ngram_range=(ngram,ngram))
    counts = tfidf_vectorizer.fit_transform(l_docs)
    #df_counts = pd.DataFrame(counts.toarray(), columns=tfidf_vectorizer.get_feature_names())
    return tfidf_vectorizer, counts

* Word embeddings

In [4]:
def get_vectors_embeddings(model_pretrained, model_data, l_tweets):
    vectors = []
    for i,tw in enumerate(l_tweets):
        tw_vector = []
        for w in tw:
            try:
                tw_vector.append(model_pretrained.get_vector(w))
            except KeyError:
                try:
                    tw_vector.append(model_data.wv.get_vector(w))
                except KeyError:
                    pass
        sum_vec = sum(tw_vector)
        if type(sum_vec) == int:
            vectors.append(np.zeros(200))
        else:
            vectors.append(sum_vec/len(tw))
    return np.array(vectors)

---

# EXIST Task <a class="anchor" id="exist"></a>

## Data load <a class="anchor" id="data-load"></a>

#### Train data

In [5]:
df = pd.read_csv('../EXIST2021_dataset/training/EXIST2021_training.tsv', sep='\t')

df['task1_encoding'] = df['task1'].replace({'sexist': 1, 'non-sexist':  0})      # Codificamos la variable a numérica

df.head()

Unnamed: 0,test_case,id,source,language,text,task1,task2,task1_encoding
0,EXIST2021,1,twitter,en,"She calls herself ""anti-feminazi"" how about sh...",sexist,ideological-inequality,1
1,EXIST2021,2,twitter,en,"Now, back to these women, the brave and the be...",non-sexist,non-sexist,0
2,EXIST2021,3,twitter,en,"@CurvyBandida @Xalynne_B Wow, your skirt is ve...",sexist,objectification,1
3,EXIST2021,4,twitter,en,@AurelieGuiboud Incredible! Beautiful!But I l...,non-sexist,non-sexist,0
4,EXIST2021,5,twitter,en,i find it extremely hard to believe that kelly...,non-sexist,non-sexist,0


* Separate the data for task 2

In [6]:
df_t2 = df.drop(df.loc[df['task1']=='non-sexist'].index)

df_t2['task2_encoding'] = df_t2['task2'].replace({'ideological-inequality' : 0,
                                                  'stereotyping-dominance' : 1, 'misogyny-non-sexual-violence': 2,
                                                  'sexual-violence' : 3, 'objectification': 4})
df_t2.head()

Unnamed: 0,test_case,id,source,language,text,task1,task2,task1_encoding,task2_encoding
0,EXIST2021,1,twitter,en,"She calls herself ""anti-feminazi"" how about sh...",sexist,ideological-inequality,1,0
2,EXIST2021,3,twitter,en,"@CurvyBandida @Xalynne_B Wow, your skirt is ve...",sexist,objectification,1,4
5,EXIST2021,6,twitter,en,@Smithcouple971 Hello....m raj....m with good ...,sexist,sexual-violence,1,3
10,EXIST2021,11,twitter,en,@hapyshoper79 @Dis_Critic @MairiJCam @cazadams...,sexist,ideological-inequality,1,0
15,EXIST2021,16,twitter,en,@Ponderer_O_Purg @BynameRose @GameOverRos @nat...,sexist,ideological-inequality,1,0


* Process and clean the texts

In [7]:
tweets_clean_token, tweets_stem = preprocess_tweets(df['text'], df['language'], keep_hastags=False)
tweets_clean = [' '.join(tw) for tw in tweets_clean_token]

In [8]:
tweets_clean_token2, tweets_stem2 = preprocess_tweets(df_t2['text'], df_t2['language'], keep_hastags=False)
tweets_clean2 = [' '.join(tw) for tw in tweets_clean_token2]

#### Test data

In [9]:
df_test = pd.read_csv('../EXIST2021_dataset/test/EXIST2021_test.tsv', sep='\t')

df_test.head()

Unnamed: 0,test_case,id,source,language,text
0,EXIST2021,6978,gab,en,Pennsylvania State Rep horrifies with opening ...
1,EXIST2021,6979,twitter,en,"@iilovegrapes He sounds like as ass, and very ..."
2,EXIST2021,6980,twitter,en,"@averyangryskel1 @4ARealistParty LOL! ""This be..."
3,EXIST2021,6981,twitter,en,@WanderOrange @stalliontwink Rights?I mean yea...
4,EXIST2021,6982,twitter,en,the jack manifold appreciation i’m seeing is o...


* Process and clean the texts

In [10]:
tweets_clean_token_test, tweets_stem_test = preprocess_tweets(df_test['text'], df_test['language'], keep_hastags=False)
tweets_clean_test = [' '.join(tw) for tw in tweets_clean_token_test]

---

## Predictions <a class="anchor" id="preds"></a>

### IDF <a class="anchor" id="idf"></a>

#### Task1 <a class="anchor" id="idf-t1"></a>

* We fit the best model to the whole train data

In [54]:
idf_vect1, idf_matrix1 = codify_tfidf_vec(tweets_stem + tweets_stem_test)
idf_df1 = pd.DataFrame(idf_matrix1.toarray(), columns=idf_vect1.get_feature_names())

In [55]:
Xtrain_idf1 = idf_matrix1[:len(df),]
ytrain_idf1 = df.task1_encoding

In [56]:
clf_idf1 = LogisticRegression(C=1, penalty='l2')
bagging_idf1 = BaggingClassifier(base_estimator=clf_idf1, n_estimators=10)
bagging_idf1.fit(Xtrain_idf1, ytrain_idf1)

BaggingClassifier(base_estimator=LogisticRegression(C=1))

* Predict over test dataset

In [57]:
Xtest_idf1 = idf_matrix1[len(df):,]
preds_idf1 = bagging_idf1.predict(Xtest_idf1)

In [58]:
df_test_idf = df_test
df_test_idf['task1_encoding'] = preds_idf1

---

#### Task2

* Remove the examples classified as non-sexist

In [59]:
df_test2_idf = df_test_idf.drop(df_test_idf.loc[df_test_idf['task1_encoding']==0].index)

In [60]:
tweets_clean_token_test2, tweets_stem_test2 = preprocess_tweets(df_test2_idf['text'], df_test2_idf['language'], keep_hastags=False)
tweets_clean_test2 = [' '.join(tw) for tw in tweets_clean_token_test2]

* We fit the best model to the whole train data

In [61]:
idf_vect2, idf_matrix2 = codify_tfidf_vec(tweets_stem2 + tweets_stem_test2)
idf_df2 = pd.DataFrame(idf_matrix2.toarray(), columns=idf_vect2.get_feature_names())

In [62]:
Xtrain_idf2 = idf_matrix2[:len(df_t2),]
ytrain_idf2 = np.array(df_t2.task2_encoding)

In [63]:
clf_idf21 = svm.SVC(C=10, gamma=0.1, kernel='rbf')
clf_idf22 = LogisticRegression(C=1, penalty='l2')

stack_idf2 = StackingClassifier(estimators=[('svm', clf_idf21)], final_estimator=clf_idf22)
stack_idf2.fit(Xtrain_idf2, ytrain_idf2)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


StackingClassifier(estimators=[('svm', SVC(C=10, gamma=0.1))],
                   final_estimator=LogisticRegression(C=1))

* Predict over the test dataset

In [64]:
Xtest_idf2 = idf_matrix2[len(df_t2):,]
preds_idf2 = stack_idf2.predict(Xtest_idf2)

In [65]:
df_test2_idf['task2_encoding'] = preds_idf2

---

### Word embeddings <a class="anchor" id="word-emb"></a>

#### Task1 <a class="anchor" id="we-t1"></a>

* We fit the best model to the whole train data

In [67]:
#glove_twitter = api.load("glove-twitter-200")
w2v_data1 = word2vec.Word2Vec(tweets_clean_token, vector_size=200)

vec_train1 = get_vectors_embeddings(glove_twitter, w2v_data1, tweets_clean_token)

In [68]:
X_train_we1 = vec_train1
y_train_we1 = df.task1_encoding

In [69]:
clf_we11 = svm.SVC(C=1, gamma=0.1, kernel='rbf')
clf_we12 = LogisticRegression(C=0.1, penalty='l2')

stack_we1 = StackingClassifier(estimators=[('svm', clf_we11)], final_estimator=clf_we12)
stack_we1.fit(X_train_we1, y_train_we1)

StackingClassifier(estimators=[('svm', SVC(C=1, gamma=0.1))],
                   final_estimator=LogisticRegression(C=0.1))

* Predict over the test dataset

In [70]:
vec_test1 = get_vectors_embeddings(glove_twitter, w2v_data1, tweets_clean_token_test)

In [71]:
Xtest_we1 = vec_test1
preds_we1 = stack_we1.predict(Xtest_we1)

In [72]:
df_test_we = df_test
df_test_we['task1_encoding'] = preds_we1

#### Task2 <a class="anchor" id="we-t2"></a>

* We remove the examples classified as non-sexist

In [74]:
df_test2_we = df_test_we.drop(df_test_we.loc[df_test_we['task1_encoding']==0].index)

In [75]:
tweets_clean_token_test2, tweets_stem_test2 = preprocess_tweets(df_test2_we['text'], df_test2_we['language'], keep_hastags=False)
tweets_clean_test2 = [' '.join(tw) for tw in tweets_clean_token_test2]

* We fit the best model to the whole train data

In [76]:
w2v_data2 = word2vec.Word2Vec(tweets_clean_token2, vector_size=200)

vec_train2 = get_vectors_embeddings(glove_twitter, w2v_data2, tweets_clean_token2)

In [77]:
X_train_we2 = vec_train2
y_train_we2 = np.array(df_t2.task2_encoding)

In [78]:
clf_we21 = svm.SVC(C=10, gamma=0.1, kernel='rbf')
clf_we22 = LogisticRegression(C=1, penalty='l2')

stack_we2 = StackingClassifier(estimators=[('svm', clf_we21)], final_estimator=clf_we22)
stack_we2.fit(X_train_we2, y_train_we2)

StackingClassifier(estimators=[('svm', SVC(C=10, gamma=0.1))],
                   final_estimator=LogisticRegression(C=1))

* Predict over the test dataset

In [79]:
vec_test2 = get_vectors_embeddings(glove_twitter, w2v_data2, tweets_clean_token_test2)

In [80]:
Xtest_we2 = vec_test2
preds_we2 = stack_we1.predict(Xtest_we2)

In [81]:
df_test2_we['task2_encoding'] = preds_we2

---

## Submission results <a class="anchor" id="submission"></a>

### IDF <a class="anchor" id="sub-idf"></a>

* Concatenate the dataframes obtained

In [99]:
df_both_idf = pd.merge(df_test_idf, df_test2_idf[['id', 'task2_encoding']], left_on='id',  right_on='id', how='outer')

df_both_idf['task2_encoding'] = df_both_idf['task2_encoding'].fillna('non-sexist')
df_both_idf['task1'] = df_both_idf['task1_encoding'].replace({0: 'non-sexist',
                                                      1: 'sexist'})

df_both_idf['task2'] = df_both_idf['task2_encoding'].replace({0: 'ideological-inequality',
                                                      1: 'stereotyping-dominance',
                                                      2: 'misogyny-non-sexual-violence',
                                                      3: 'sexual-violence', 
                                                      4: 'objectification'})
df_both_idf.head()

Unnamed: 0,test_case,id,source,language,text,task1_encoding,task2_encoding,task1,task2
0,EXIST2021,6978,gab,en,Pennsylvania State Rep horrifies with opening ...,0,non-sexist,non-sexist,non-sexist
1,EXIST2021,6979,twitter,en,"@iilovegrapes He sounds like as ass, and very ...",0,non-sexist,non-sexist,non-sexist
2,EXIST2021,6980,twitter,en,"@averyangryskel1 @4ARealistParty LOL! ""This be...",1,0.0,sexist,ideological-inequality
3,EXIST2021,6981,twitter,en,@WanderOrange @stalliontwink Rights?I mean yea...,1,1.0,sexist,stereotyping-dominance
4,EXIST2021,6982,twitter,en,the jack manifold appreciation i’m seeing is o...,0,non-sexist,non-sexist,non-sexist


* Format the results for submission

In [90]:
df_final1_idf = pd.DataFrame({'test_case': df_both_idf['test_case'],
                             'id': df_both_idf['id'].apply(lambda x: str(x).zfill(6)),
                             'task1': df_both_idf['task1']})
df_final1_idf.head()

Unnamed: 0,test_case,id,task1
0,EXIST2021,6978,non-sexist
1,EXIST2021,6979,non-sexist
2,EXIST2021,6980,sexist
3,EXIST2021,6981,sexist
4,EXIST2021,6982,non-sexist


In [91]:
df_final1_idf.to_csv('../Submission/exist2021_Alclatos/task1_Alclatos_2.tsv', sep='\t', header=None, index=False)

In [92]:
df_final2_idf = pd.DataFrame({'test_case': df_both_idf['test_case'],
                             'id': df_both_idf['id'].apply(lambda x: str(x).zfill(6)),
                             'task2': df_both_idf['task2']})
df_final2_idf.head()

Unnamed: 0,test_case,id,task2
0,EXIST2021,6978,non-sexist
1,EXIST2021,6979,non-sexist
2,EXIST2021,6980,ideological-inequality
3,EXIST2021,6981,stereotyping-dominance
4,EXIST2021,6982,non-sexist


In [93]:
df_final2_idf.to_csv('../Submission/exist2021_Alclatos/task2_Alclatos_2.tsv', sep='\t', header=None, index=False)

---

### Word embeddings <a class="anchor" id="sub-we"></a>

* Concatenate the dataframes obtained

In [98]:
df_both_we = pd.merge(df_test_we, df_test2_we[['id', 'task2_encoding']], left_on='id',  right_on='id', how='outer')

df_both_we['task2_encoding'] = df_both_we['task2_encoding'].fillna('non-sexist')
df_both_we['task1'] = df_both_we['task1_encoding'].replace({0: 'non-sexist',
                                                      1: 'sexist'})

df_both_we['task2'] = df_both_we['task2_encoding'].replace({0: 'ideological-inequality',
                                                      1: 'stereotyping-dominance',
                                                      2: 'misogyny-non-sexual-violence',
                                                      3: 'sexual-violence', 
                                                      4: 'objectification'})
df_both_we.head()

Unnamed: 0,test_case,id,source,language,text,task1_encoding,task2_encoding,task1,task2
0,EXIST2021,6978,gab,en,Pennsylvania State Rep horrifies with opening ...,0,non-sexist,non-sexist,non-sexist
1,EXIST2021,6979,twitter,en,"@iilovegrapes He sounds like as ass, and very ...",0,non-sexist,non-sexist,non-sexist
2,EXIST2021,6980,twitter,en,"@averyangryskel1 @4ARealistParty LOL! ""This be...",1,1.0,sexist,stereotyping-dominance
3,EXIST2021,6981,twitter,en,@WanderOrange @stalliontwink Rights?I mean yea...,1,1.0,sexist,stereotyping-dominance
4,EXIST2021,6982,twitter,en,the jack manifold appreciation i’m seeing is o...,0,non-sexist,non-sexist,non-sexist


* Format the results for submission

In [94]:
df_final1_we = pd.DataFrame({'test_case': df_both_we['test_case'],
                             'id': df_both_we['id'].apply(lambda x: str(x).zfill(6)),
                             'task1': df_both_we['task1']})
df_final1_we.head()

Unnamed: 0,test_case,id,task1
0,EXIST2021,6978,non-sexist
1,EXIST2021,6979,non-sexist
2,EXIST2021,6980,sexist
3,EXIST2021,6981,sexist
4,EXIST2021,6982,non-sexist


In [95]:
df_final1_we.to_csv('../Submission/exist2021_Alclatos/task1_Alclatos_3.tsv', sep='\t', header=None, index=False)

In [96]:
df_final2_we = pd.DataFrame({'test_case': df_both_we['test_case'],
                             'id': df_both_we['id'].apply(lambda x: str(x).zfill(6)),
                             'task2': df_both_we['task2']})
df_final2_we.head()

Unnamed: 0,test_case,id,task2
0,EXIST2021,6978,non-sexist
1,EXIST2021,6979,non-sexist
2,EXIST2021,6980,stereotyping-dominance
3,EXIST2021,6981,stereotyping-dominance
4,EXIST2021,6982,non-sexist


In [97]:
df_final2_we.to_csv('../Submission/exist2021_Alclatos/task2_Alclatos_3.tsv', sep='\t', header=None, index=False)

---