In [1]:
import tensorflow as tf
from tensorflow import keras
import psycopg2
from sqlalchemy import create_engine
import config
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup


In [3]:
# Build a SQLAlchemy engine from the connection URI held in the local
# `config` module, then load every row and column of the `sexism_data`
# table into a DataFrame.
DATABASE_URL = config.postgresURI
engine = create_engine(DATABASE_URL)
sexism_df = pd.read_sql('SELECT * FROM sexism_data', engine)

In [4]:
# Compiled regular expressions used by `detweet` to strip Twitter-specific
# artifacts. Every literal that contains regex escapes is a raw string so that
# `\.`, `\/`, `\s` and `\S` reach the regex engine verbatim instead of relying
# on Python's deprecated pass-through of unrecognized string escapes. The
# compiled pattern text is unchanged.

# Runs of characters from four emoji code-point ranges. These literals are
# deliberately NOT raw: Python itself must resolve the `\U........` escapes.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)

# URLs, with or without scheme / "www." prefix.
# NOTE(review): re.VERBOSE is not set, so the two line breaks inside this
# literal are part of the pattern: the first alternative only matches when it
# is immediately followed by a newline, and the last one only when preceded by
# a newline. Left as-is on purpose, because changing it alters the text the
# vectorizer is adapted on -- presumably that must stay in sync with how the
# saved model was trained; confirm before fixing.
url_pattern = re.compile(r'''((https?:\/\/)?(?:www\.|(?!www))[a-zA-Z0-9]([a-zA-Z0-9-]+[a-zA-Z0-9])?\.[^\s]{2,}
|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|
www\.[a-zA-Z0-9]+\.[^\s]{2,})''', flags=re.UNICODE)

# "@" followed by non-whitespace, at the start of the string or right after a
# non-alphanumeric character (that leading character is part of the match).
mention_pattern = re.compile(r'([^a-zA-Z0-9]|^)@\S+', flags=re.UNICODE)

# Literal "MENTION" followed by at least one non-whitespace character, at the
# start of the string or right after a non-digit character.
mention_pattern2 = re.compile(r'([^0-9]|^)MENTION\S+', flags=re.UNICODE)

# "#" followed by non-whitespace, same leading-boundary rule as mention_pattern.
hashtag_pattern = re.compile(r'([^a-zA-Z0-9]|^)#\S+', flags=re.UNICODE)

# Stand-alone "rt", "ht", "cc" or "RT" tokens. The delimiting non-alphanumeric
# characters on both sides belong to the match.
rt_pattern = re.compile(r'([^a-zA-Z0-9]|^)(rt|ht|cc|RT)([^a-zA-Z0-9]|$)', flags=re.UNICODE)

def detweet(text):
    """Strip Twitter-specific artifacts from ``text``.

    Every match of each pattern is replaced by the empty string, applied in
    this order: emoji, hashtags, ``@`` mentions, ``MENTION...`` placeholders,
    rt/ht/cc/RT markers, and finally URLs.
    """
    stripping_order = (
        emoji_pattern,
        hashtag_pattern,
        mention_pattern,
        mention_pattern2,
        rt_pattern,
        url_pattern,
    )
    cleaned = text
    for pattern in stripping_order:
        cleaned = pattern.sub('', cleaned)
    return cleaned
def normalize(text):
    """Reduce ``text`` to lowercase ASCII letters/digits separated by single spaces."""
    # Anything that is not an ASCII letter or digit (punctuation included)
    # becomes a space.
    alnum_only = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    # Collapse the resulting whitespace runs into one space each.
    single_spaced = re.sub(r"\s+", " ", alnum_only)
    return single_spaced.lower().strip()
def fix_encoding_and_unescape(text):
    """Decode backslash escape sequences in ``text`` and strip HTML markup.

    Args:
        text: ``str`` or ``bytes`` that may contain literal backslash escapes
            and HTML tags/entities.

    Returns:
        The plain text extracted by BeautifulSoup's ``get_text()``.

    Raises:
        UnicodeDecodeError: if ``text`` contains a malformed escape sequence.
    """
    if isinstance(text, bytes):
        decoded = text.decode('unicode-escape')
    else:
        # ``str`` has no ``.decode`` on Python 3, so go through bytes first.
        # Characters outside Latin-1 are written out as backslash escapes and
        # restored by the ``unicode-escape`` decoder, so existing non-ASCII
        # text survives the round trip.
        decoded = text.encode('latin-1', 'backslashreplace').decode('unicode-escape')
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    return BeautifulSoup(decoded, 'html.parser').get_text()
def preprocess(text, fix_encoding=False):
    """Clean one piece of text for vectorization.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for example
            ``None`` or ``NaN`` coming from a missing cell) is returned
            unchanged.
        fix_encoding: When true, run ``fix_encoding_and_unescape`` on the text
            before the other cleaning steps.

    Returns:
        ``normalize(detweet(...))`` of the text for ``str`` input, otherwise
        ``text`` itself.
    """
    # ``unicode`` does not exist on Python 3; comparing against it raised
    # NameError for every non-str value instead of passing it through.
    if not isinstance(text, str):
        return text
    if fix_encoding:
        text = fix_encoding_and_unescape(text)
    return normalize(detweet(text))

In [5]:
# Clean every entry of the `text` column with `preprocess` (fix_encoding left
# at its default of False) and store the result in a new column.
sexism_df['text_preprocessed'] = sexism_df['text'].apply(preprocess)
# Wrap the cleaned column in a tensor; the next cell adapts the vectorizer on it.
# NOTE(review): presumably every entry is a string at this point -- confirm
# the `text` column holds no missing values.
adapt_data = tf.constant(sexism_df['text_preprocessed'])

In [6]:
# TF-IDF text vectorizer with ngrams=2; `adapt` fits its state on the
# preprocessed corpus.
# NOTE(review): the adapted vocabulary presumably has to match the one the
# loaded model was trained with -- confirm before changing preprocessing.
text_vectorizer = keras.layers.TextVectorization(output_mode="tf-idf", ngrams=2)
text_vectorizer.adapt(adapt_data)

In [2]:
# Load the previously saved Keras model from `vectorizedNN.h5` and print its
# layer summary.
# NOTE(review): this cell carries execution count 2 although it sits after
# cells 3-6; confirm the notebook still works under Restart & Run All.
model = keras.models.load_model('vectorizedNN.h5')
model.summary()

2021-11-21 13:56:24.564973: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 128)         9014016   
                                                                 
 dropout (Dropout)           (None, None, 128)         0         
                                                                 
 conv1d (Conv1D)             (None, None, 128)         114816    
                                                                 
 conv1d_1 (Conv1D)           (None, None, 128)         114816    
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                           

In [7]:
# Chain the adapted vectorizer and the loaded model into a single model that
# takes raw strings as input.
# NOTE(review): `preprocess` is not part of this graph, so callers presumably
# must apply it to their text before calling the model -- confirm.
string_input = keras.Input(shape=(1,), dtype="string")  # one string per example
x = text_vectorizer(string_input)
preds = model(x)
end_to_end_model = keras.Model(string_input, preds)
# Compile with binary cross-entropy loss, the Adam optimizer and an accuracy metric.
end_to_end_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [8]:
# Print the layers, output shapes and parameter counts of the combined model.
end_to_end_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 70422)            1         
 torization)                                                     
                                                                 
 model_1 (Functional)        (None, 1)                 9260289   
                                                                 
Total params: 9,260,290
Trainable params: 9,260,289
Non-trainable params: 1
_________________________________________________________________


In [22]:
# Write the combined model to the `vectorizedNNe2e` path using save_format='tf'.
end_to_end_model.save('vectorizedNNe2e', save_format='tf')

2021-11-20 14:57:54.205899: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: vectorizedNNe2e/assets
