### Dependencies

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import spacy
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn import svm
import re


In [2]:
# Shared spaCy pipeline; the tokenizing and cleaning helpers in this notebook call it as `nlp`.
nlp = spacy.load("en_core_web_sm")

# Allow up to 1000 characters per cell when pandas displays a column.
pd.set_option("display.max_colwidth", 1000)

### Load Data

In [3]:
# Paths of the two CSV inputs, resolved relative to the working directory.
train_file, test_file = 'train_all_tasks.csv', 'dev_task_a_entries.csv'

# Read both files into DataFrames, in the same order as the paths above.
train_ds, test_ds = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))


### Tokenizing

In [4]:
# Keyword lists grouped by theme; AllWords is simply their concatenation.
maleWords = ['man', 'men', 'guy', 'male', 'gay']
womenWords = ['woman', 'women', 'female', 'girl', 'wife', 'lady']
generalWords = [
    'inside', 'pussy', 'fuck', 'fucking', 'f_ck', 'bitch', 'SON', 'sex',
    'sexual', 'chick', 'xxx', 'sexxy', 'penetrate', 'shit', 'fuckoff',
    'tits', 'tit', 'femoid', 'ass',
]
AllWords = maleWords + womenWords + generalWords

# Token filter used by the getMeaningful* helpers: spaCy's default stop words
# plus punctuation marks and the '[USER]' / '[URL]' strings.
# .union() returns a new set, so nlp.Defaults.stop_words itself is not modified.
stopwords = nlp.Defaults.stop_words.union([
    '.', ',', '\"', '?', '!', '[', ']', '...', '#', '(', ')', '{', '}',
    '[USER]', ';', ':', '[URL]', '>', '<',
])
def getNumberOfTokens(text):
  """Return how many tokens the spaCy pipeline `nlp` produces for `text`."""
  return len(nlp(text))

def getNumberOfMeaningfulTokens(text):
  """Count the tokens of `text` whose stripped text is not in `stopwords`.

  The membership check is done on the token text as-is (no case folding).
  """
  kept = 0
  for token in nlp(text):
    if str(token).strip() not in stopwords:
      kept += 1
  return kept
def getMeaningfulTokens_flatten(text):
  """Return the lemmas of the non-stop-word tokens of `text` as one space-joined string.

  A token is kept when its stripped text is not in `stopwords`; the check is
  done on the token text as-is (no case folding).
  """
  lemmas = [
    token.lemma_
    for token in nlp(text)
    if str(token).strip() not in stopwords
  ]
  return ' '.join(lemmas)
def getMeaningfulTokens(text):
  """Return the list of lemmas for the tokens of `text` that are not in `stopwords`.

  A token is kept when its stripped text is not in `stopwords`; the check is
  done on the token text as-is (no case folding).
  """
  lemmas = []
  for token in nlp(text):
    if str(token).strip() in stopwords:
      continue  # filtered out: stop word, punctuation mark or placeholder
    lemmas.append(token.lemma_)
  return lemmas

def checkProfanityPresence(text):
  """Return True if at least one element of `text` appears in `generalWords`.

  `text` is iterated element by element, so a list of tokens is checked word
  by word, whereas a plain string would be checked character by character.
  NOTE(review): presumably callers pass a token list - confirm.
  """
  return any(item in generalWords for item in text)

### Cleaning

In [5]:
# Default stop-word set of the loaded spaCy pipeline; clean_text() drops words found in it.
sw = nlp.Defaults.stop_words


def clean_text(text):
    """Normalize a raw post into a space-separated string of lemmas.

    Processing order:
      1. lowercase;
      2. remove URLs (``http...`` up to the next whitespace);
      3. remove HTML-like tags (``<tag ...>`` / ``</tag>``);
      4. replace every run of characters outside ``a-z ? . ! , ¿`` with a space
         (this also discards digits, emoji and any other non-ASCII symbols);
      5. strip the remaining punctuation listed in ``punctuations``;
      6. drop words found in the module-level stop-word set ``sw``;
      7. lemmatize each remaining word with the module-level spaCy pipeline ``nlp``.

    Steps 2 and 3 must run before step 4: once ':' , '/' , '<' and '>' have
    been turned into spaces, a URL or tag can no longer be recognized and its
    fragments (domain, path, tag name) would leak into the output as words.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. NaN read from an empty CSV cell)
        is treated as empty.

    Returns
    -------
    str
        Lemmas joined by single spaces; "" when nothing is left.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # Strip URLs and markup while their delimiters are still present.
    # A space is substituted so neighbouring words are not glued together.
    text = re.sub(r"http\S+", " ", text)
    # Only tag-shaped spans are removed, so stray '<' / '>' (e.g. "<3") do not
    # swallow the text between them.
    text = re.sub(r"</?[a-z][^<>]*>", " ", text)

    # Whitelist: keep letters and a few punctuation marks, everything else
    # (digits, symbols, emoji, other scripts) becomes a space.
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)

    # Delete leftover punctuation characters in a single pass.
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    text = text.translate(str.maketrans('', '', punctuations))

    # Stop-word removal; the text is already lowercase at this point.
    words = [word for word in text.split() if word not in sw]

    # Each word is lemmatized on its own (first token of its own spaCy doc).
    lemmas = [nlp(word)[0].lemma_ for word in words]

    return " ".join(lemmas)

In [6]:
# Work on copies so the frames loaded from CSV keep their original columns.
tr, tst = train_ds.copy(), test_ds.copy()

# 'ntext' holds the cleaned version of each row's 'text'.
tr['ntext'] = tr['text'].apply(clean_text)
tst['ntext'] = tst['text'].apply(clean_text)

### Split data (80-20)

In [None]:
# Hold out 20% of the cleaned rows as a dev set. The split is stratified on
# 'label_sexist' so both parts keep the same label proportions, and the fixed
# random_state makes it repeatable.
X_train, X_dev, y_train, y_dev = train_test_split(
    tr['ntext'].values,
    tr['label_sexist'].values,
    test_size=0.2,
    random_state=123,
    stratify=tr['label_sexist'].values,
)

### TF-IDF

In [9]:
# TF-IDF features with the vectorizer's default settings.
tfidf_vectorizer = TfidfVectorizer() 
# Fit on the training texts only, then vectorize them.
tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
# The dev texts are only transformed, reusing what was fitted on the training split.
tfidf_dev_vectors = tfidf_vectorizer.transform(X_dev)

### Training-Testing

In [18]:
# Three classifiers trained on the same TF-IDF features.
lrClassifier = LogisticRegression(
    C=5e1,
    solver='saga',  # other optimizers: sag, lbfgs
    multi_class='ovr',
    random_state=17,
    n_jobs=4,
)
svmClassifier = svm.SVC(
    C=1.0,
    kernel='linear',
    degree=3,
    gamma='auto',
)
rfClassifier = RandomForestClassifier(max_depth=50, random_state=0)

models = [lrClassifier, svmClassifier, rfClassifier]

# Fit each model on the training split and report per-class metrics on the dev split.
for model in models:
    model.fit(tfidf_train_vectors, y_train)
    y_pred = model.predict(tfidf_dev_vectors)
    print(f'model:{type(model).__name__}')
    print(classification_report(y_dev, y_pred))




Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  indices = (scores > 0).astype(np.int)


model:LogisticRegression
              precision    recall  f1-score   support

  not sexist       0.85      0.90      0.88      2120
      sexist       0.62      0.52      0.56       680

    accuracy                           0.81      2800
   macro avg       0.74      0.71      0.72      2800
weighted avg       0.80      0.81      0.80      2800

model:SVC
              precision    recall  f1-score   support

  not sexist       0.84      0.98      0.90      2120
      sexist       0.88      0.41      0.56       680

    accuracy                           0.84      2800
   macro avg       0.86      0.70      0.73      2800
weighted avg       0.85      0.84      0.82      2800



Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_store_unique_indices = np.zeros(y.shape, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_encoded = np.zeros(y.shape, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_encoded = np.zeros(y.shape, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_encoded = np.zeros(y.shape, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_encoded = np.zeros(y.shape, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_encoded = np.zeros(y.

model:RandomForestClassifier
              precision    recall  f1-score   support

  not sexist       0.79      0.99      0.88      2120
      sexist       0.92      0.19      0.31       680

    accuracy                           0.80      2800
   macro avg       0.86      0.59      0.60      2800
weighted avg       0.82      0.80      0.74      2800

