In [10]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.model_selection import train_test_split
import string
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [11]:
# Show what is in the working directory so the dataset file names can be checked.
os.listdir(os.curdir)

['twitter_parsed_dataset.csv',
 'twitter_sentiment.ipynb',
 'twitter_racism_parsed_dataset.csv',
 'twitter_sexism_parsed_dataset.csv',
 '.ipynb_checkpoints',
 'bert.ipynb',
 '.git']

In [12]:
# Read the three annotated tweet datasets in one pass (same files, same order).
parsed, racism, sexism = (
    pd.read_csv(csv_path)
    for csv_path in (
        'twitter_parsed_dataset.csv',
        'twitter_racism_parsed_dataset.csv',
        'twitter_sexism_parsed_dataset.csv',
    )
)

# Stack the frames row-wise, then discard every row that has a missing value
# in any column.
twitter_data = pd.concat([parsed, racism, sexism]).dropna()
twitter_data.head()

Unnamed: 0,index,id,Text,Annotation,oh_label
0,5.74948705591165e+17,5.74948705591165e+17,@halalflaws @biebervalue @greenlinerzjm I read...,none,0.0
1,5.71917888690393e+17,5.71917888690393e+17,@ShreyaBafna3 Now you idiots claim that people...,none,0.0
2,3.90255841338601e+17,3.90255841338601e+17,"RT @Mooseoftorment Call me sexist, but when I ...",sexism,1.0
3,5.68208850655916e+17,5.68208850655916e+17,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0
4,5.75596338802373e+17,5.75596338802373e+17,#mkr No No No No No No,none,0.0


In [13]:
### Cleaning tweets

def cleaning_tweets(tweet):
    """Normalise one raw tweet into lowercase text without handles, URLs,
    punctuation or digits.

    Steps, in order:
      1. drop Twitter handles (``@`` followed by any run of word characters);
      2. drop URLs (``http`` followed by any run of non-whitespace);
      3. turn ``. , ? !`` into spaces, delete every other character found in
         ``string.punctuation`` and delete all ASCII digits;
      4. lowercase the result.

    Whitespace is not collapsed, so removed tokens leave their surrounding
    spaces behind.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet.
    """
    # 1. Remove Twitter handles (@user) in a single pass.
    #    Substituting each found handle one by one (as a regex) corrupts the
    #    text when one handle is a prefix of another: removing "@a" first turns
    #    "@ab" into "b". A lone "@" would likewise strip the "@" from every
    #    later handle and leave the user names in the text.
    tweet = re.sub(r'@\w*', '', tweet)

    # 2. Remove urls
    tweet = re.sub(r'http\S+', '', tweet)

    # 3. Remove punctuation, numbers and special characters.
    #    Sentence punctuation becomes a space so adjacent words stay separated.
    tweet = tweet.replace(".", " ").replace(",", " ").replace("?", " ").replace("!", " ")
    #    '#' is part of string.punctuation, so the hashtag *text* is kept but
    #    the '#' symbol itself is dropped here.
    tweet = "".join(char for char in tweet if char not in string.punctuation)
    tweet = re.sub('[0-9]+', '', tweet)

    # 4. Lowercase all
    tweet = tweet.lower()

    return tweet

# Run the cleaner over every raw tweet and store the result alongside the original text.
twitter_data['cleaned_tweets'] = twitter_data['Text'].map(cleaning_tweets)
twitter_data.head()

Unnamed: 0,index,id,Text,Annotation,oh_label,cleaned_tweets
0,5.74948705591165e+17,5.74948705591165e+17,@halalflaws @biebervalue @greenlinerzjm I read...,none,0.0,i read them in context no change in meaning...
1,5.71917888690393e+17,5.71917888690393e+17,@ShreyaBafna3 Now you idiots claim that people...,none,0.0,now you idiots claim that people who tried to...
2,3.90255841338601e+17,3.90255841338601e+17,"RT @Mooseoftorment Call me sexist, but when I ...",sexism,1.0,rt call me sexist but when i go to an auto p...
3,5.68208850655916e+17,5.68208850655916e+17,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0,wrong isis follows the example of mohammed a...
4,5.75596338802373e+17,5.75596338802373e+17,#mkr No No No No No No,none,0.0,mkr no no no no no no


In [14]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    twitter_data['cleaned_tweets'],
    twitter_data['oh_label'],
    test_size=0.2,
    random_state=42,
)

X_train.head()

9327     there is such a diff between reality amp what ...
14633    katies a fatty   model     hahahaha mkr killer...
4197        it is really funny all the assumptions they...
3534                    origin is a flaming piece of shit 
4500     no  you dont   i thought of a really funny jok...
Name: cleaned_tweets, dtype: object

In [15]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [16]:
from transformers import BertTokenizer, TFBertModel

In [17]:
# Load the pretrained 'bert-base-uncased' checkpoint as a TensorFlow model;
# it is used below as the encoder layer of the classifier.
bert_layer = TFBertModel.from_pretrained('bert-base-uncased')

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [18]:
# Fixed sequence length of the three BERT inputs. It has to equal the padded
# width of the tokenizer output that is fed to model.fit.
MAX_SEQ_LEN = 49

# `shape` must be a tuple: `(49)` is just the int 49, `(49,)` is a 1-tuple.
input_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype='int32', name='input_ids')
masks = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype='int32', name='mask')
token_type_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype='int32', name='token_types')

# Inputs are passed positionally: ids, attention mask, token types.
bert_output = bert_layer([input_ids, masks, token_type_ids])

# Take the vector at token position 0 of every sequence (the leading special
# token the tokenizer inserts) as the sentence representation.
cls = bert_output[0][:, 0, :]

hidden = tf.keras.layers.Dense(200, activation='relu')(cls)

# Single sigmoid unit: probability of the positive (oh_label == 1) class.
classification = tf.keras.layers.Dense(1, activation='sigmoid')(hidden)

model = tf.keras.Model(inputs=[input_ids, masks, token_type_ids], outputs=classification)
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    # `lr` is a deprecated alias of `learning_rate`. 0.01 is far too large for
    # fine-tuning pretrained BERT weights; the usual range is about 2e-5 to 5e-5.
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    metrics=['acc'],
)



In [19]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Pad (and, if necessary, truncate) every tweet to exactly 49 tokens, the width
# declared by the model's Input layers. `padding=True` would pad to the longest
# tweet of whatever split is tokenized, which only matches the model by accident.
bert_inputs = tokenizer(
    list(X_train),
    padding='max_length',
    truncation=True,
    max_length=49,
    return_tensors='tf',
)

In [20]:
# Display the tokenizer output (input_ids, token_type_ids, attention_mask).
bert_inputs

{'input_ids': <tf.Tensor: shape=(36157, 49), dtype=int32, numpy=
array([[  101,  2045,  2003, ...,     0,     0,     0],
       [  101,  9734,  2015, ...,     0,     0,     0],
       [  101,  2009,  2003, ...,     0,     0,     0],
       ...,
       [  101,  2216,  3057, ...,     0,     0,     0],
       [  101, 19387,  2122, ...,     0,     0,     0],
       [  101, 19387, 18301, ...,     0,     0,     0]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(36157, 49), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(36157, 49), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=in

In [1]:
# Feed the three tokenizer outputs in the same order as the model's inputs:
# ids, attention mask, token types.
model.fit(
    x=[
        np.array(bert_inputs[input_name])
        for input_name in ('input_ids', 'attention_mask', 'token_type_ids')
    ],
    y=y_train,
    epochs=4,
    batch_size=12,
)


NameError: name 'model' is not defined