In [18]:
import tensorflow as tf
from tensorflow import keras
import psycopg2
from sqlalchemy import create_engine
import config
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
import joblib
from sklearn.compose import make_column_transformer

In [4]:
DATABASE_URL = config.postgresURI

In [5]:
engine = create_engine(DATABASE_URL)

In [6]:
sexism_df = pd.read_sql('SELECT * FROM sexism_data', engine)
sexism_df

Unnamed: 0,dataset,text,toxicity,sexist,of_id,id
0,other,MENTION3394 MENTION2031 MENTION3544 curious as...,0.087480,False,-1,1
1,callme,females should not commentate on sport,0.286217,True,-1,2
2,other,"""We're serving leftovers for breakfast"" #mkr",0.122916,False,-1,3
3,hostile,MENTION4416 I like a multimedia approach.,0.077411,False,-1,4
4,other,.MENTION3582 MENTION4612 05 SB. Getting Ike in...,0.042951,False,-1,5
...,...,...,...,...,...,...
9995,other,Would you really be surprised? #gamergate #SVU...,0.070119,False,-1,9996
9996,benevolent,You've got a strong grip for a kid. https://t....,0.264434,False,10477,9997
9997,other,MENTION3574 MENTION2415 i'm married with kids....,0.286828,False,-1,9998
9998,other,Anyone eliminated yet? #mkr,0.353405,False,-1,9999


In [7]:
sexism_data = sexism_df[['text', 'sexist']]

In [8]:
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
url_pattern = re.compile('''((https?:\/\/)?(?:www\.|(?!www))[a-zA-Z0-9]([a-zA-Z0-9-]+[a-zA-Z0-9])?\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})''', 
                         flags=re.UNICODE)
mention_pattern = re.compile('([^a-zA-Z0-9]|^)@\S+', flags=re.UNICODE)
mention_pattern2 = re.compile('([^0-9]|^)MENTION\S+', flags=re.UNICODE)
hashtag_pattern = re.compile('([^a-zA-Z0-9]|^)#\S+', flags=re.UNICODE)
rt_pattern = re.compile('([^a-zA-Z0-9]|^)(rt|ht|cc|RT)([^a-zA-Z0-9]|$)', flags=re.UNICODE)

In [9]:
def detweet(text):
    return re.sub(url_pattern, '', 
               re.sub(rt_pattern, '', 
                      re.sub(mention_pattern2, '',
                         re.sub(mention_pattern, '',
                             re.sub(hashtag_pattern, '', 
                                 re.sub(emoji_pattern, '', 
                                    text))))))
def normalize(text):
    return re.sub(r"\s+", " ", #remove extra spaces
                  re.sub(r'[^a-zA-Z0-9]', ' ', #remove non alphanumeric, incl punctuation
                         text)).lower().strip() #lowercase and strip
def fix_encoding_and_unescape(text):
    return BeautifulSoup(text.decode('unicode-escape')).get_text()
def preprocess(text, fix_encoding=False):
    if (type(text)==str) or (type(text)==unicode):
        if fix_encoding:
            return normalize(detweet(fix_encoding_and_unescape(text)))
        else:
            return normalize(detweet(text))
    else:
        return text


In [10]:
sexism_data['text_preprocessed'] = sexism_data['text'].apply(preprocess)
sexism_data['sexist_target'] = sexism_data['sexist'].astype(int)
sexism_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,text,toxicity,sexist,text_preprocessed,sexist_target
0,MENTION3394 MENTION2031 MENTION3544 curious as...,0.087480,False,curious as to if the ap style guide has anythi...,0
1,females should not commentate on sport,0.286217,True,females should not commentate on sport,1
2,"""We're serving leftovers for breakfast"" #mkr",0.122916,False,we re serving leftovers for breakfast,0
3,MENTION4416 I like a multimedia approach.,0.077411,False,i like a multimedia approach,0
4,.MENTION3582 MENTION4612 05 SB. Getting Ike in...,0.042951,False,05 sb getting ike in 4th parker undrafted a no...,0
...,...,...,...,...,...
9995,Would you really be surprised? #gamergate #SVU...,0.070119,False,would you really be surprised,0
9996,You've got a strong grip for a kid. https://t....,0.264434,False,you ve got a strong grip for a kid,0
9997,MENTION3574 MENTION2415 i'm married with kids....,0.286828,False,i m married with kids also 33,0
9998,Anyone eliminated yet? #mkr,0.353405,False,anyone eliminated yet,0


In [11]:
adapt_data = tf.constant(sexism_data['text_preprocessed'])

2021-11-20 13:13:26.029043: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
text_vectorizer = keras.layers.TextVectorization(output_mode="tf-idf", ngrams=2)

In [13]:
text_vectorizer.adapt(adapt_data)

In [21]:
train, test = train_test_split(sexism_data[['text_preprocessed', 'sexist_target']])
train.shape

(7500, 2)

In [22]:
train = tf.data.Dataset.from_tensor_slices((train['text_preprocessed'].values, train['sexist_target'].values))
test = tf.data.Dataset.from_tensor_slices((test['text_preprocessed'].values, test['sexist_target'].values))

In [23]:
train_dataset = train.batch(2).map(lambda x, y: (text_vectorizer(x), y))
test_dataset = test.batch(2).map(lambda x, y: (text_vectorizer(x), y))

In [24]:
model.compile(optimizer="rmsprop", loss="mse")
model.fit(train_dataset)



<keras.callbacks.History at 0x7f858b1d7a50>

In [25]:
embedding_dim = 128

# A integer input for vocab indices.
inputs = keras.Input(shape=(None,), dtype="int64")

# Next, we add a layer to map those vocab indices into a space of dimensionality
# 'embedding_dim'.
x = keras.layers.Embedding(text_vectorizer.vocabulary_size(), embedding_dim)(inputs)
x = keras.layers.Dropout(0.5)(x)

# Conv1D + global max pooling
x = keras.layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = keras.layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = keras.layers.GlobalMaxPooling1D()(x)

# We add a vanilla hidden layer:
x = keras.layers.Dense(128, activation="relu")(x)
x = keras.layers.Dropout(0.5)(x)

# We project onto a single unit output layer, and squash it with a sigmoid:
predictions = keras.layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = keras.Model(inputs, predictions)

# Compile the model with binary crossentropy loss and an adam optimizer.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [26]:
epochs = 3

# Fit the model using the train and test datasets.
model.fit(train_dataset, epochs=epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f858bb2efd0>

In [27]:
model.evaluate(test_dataset)



[0.38373079895973206, 0.8715999722480774]

In [28]:
new_tweets = [
    "waaaaah the working class’s income is keeping pace with or outstripping inflation but my capital gains aren’t boo fucking hooooo",
    "I got a haircut today! Woman getting haircut I will now accept (polite, non-sexual) compliments, as is traditional at such times Smiling face",
    "Am I nervous about Baz bringing someone home for Thanksgiving after we’ve been living by pandemic standards for ~2 years? I just bought a shower curtain and matching towels and bathmats. So evidently, yes."
]
new_data = pd.DataFrame({'text':new_tweets})
new_data['text_preprocessed'] = new_data['text'].apply(preprocess)
new_dataset = tf.data.Dataset.from_tensor_slices((new_data['text_preprocessed'].values))
new_dataset = new_dataset.batch(2).map(lambda x: (text_vectorizer(x)))

In [29]:
model.predict(new_dataset)

array([[0.17639154],
       [0.13177627],
       [0.14955333]], dtype=float32)

In [30]:
model.save('vectorizedNN.h5')