<a href="https://colab.research.google.com/github/skyblue0123/twitter-flask/blob/main/BERT_offensive_tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#@title Run: Import Libraries and Data
"""
Install necessary libraries and functions
"""
 
!pip install -q colorama
!pip install -q sentencepiece
# !wget --quiet https://storage.cloud.google.com/bertlink/tokenization.py
"""
Import libraries
"""

import os
import gc
import io
import zipfile
import requests
import warnings as wrn
wrn.filterwarnings('ignore')

import numpy as np
import pandas as pd
import tokenization
import seaborn as sns
import tensorflow as tf
import tensorflow_hub as hub
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from colorama import Fore, Back, Style

from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.callbacks import ModelCheckpoint

"""
Define data-loading function
"""

data_link = 'https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv'

def get_dataset(url=None, timeout=30):
    """Download the labeled-tweets CSV and parse it into a DataFrame.

    Args:
        url: Address of the CSV file. When omitted, the module-level
            ``data_link`` is used (looked up at call time).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A pandas DataFrame parsed from the downloaded CSV text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(data_link if url is None else url, timeout=timeout)
    # Fail loudly instead of handing an HTML error page to read_csv.
    response.raise_for_status()
    return pd.read_csv(io.StringIO(response.text))

"""
Load the dataset and create a 'label' column
"""

# Fetch the table and drop its 'Unnamed: 0' column.
train_df = get_dataset()
train_df = train_df.drop(columns=['Unnamed: 0'])
# Binary target: 0 where 'class' equals 2, 1 for every other class.
train_df['label'] = (train_df['class'] != 2).astype(np.int32)

def display_training_curves(training, validation, title, subplot):
    """Draw a training curve and a validation curve on one subplot.

    Args:
        training: Sequence of values plotted as the 'Training' line.
        validation: Sequence of values plotted as the 'Validation' line.
        title: Metric name; used as the y-axis label and, prefixed with
            'Model ', as the panel title.
        subplot: Subplot code forwarded to ``plt.subplot`` (e.g. 211).
            When its last digit is 1, a new figure is created first.
    """
    starts_new_figure = (subplot % 10 == 1)
    if starts_new_figure:
        plt.subplots(figsize=(30, 15), facecolor='#e6e6e6')
        plt.tight_layout()

    panel = plt.subplot(subplot)
    panel.set_facecolor('#f8f8f8')
    for curve in (training, validation):
        panel.plot(curve)
    panel.set(title='Model ' + title, ylabel=title, xlabel='Epoch')
    panel.legend(['Training', 'Validation'])

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Raw tweet text and the binary 'label' column of the training frame.
sentences = train_df['tweet'].values
labels = train_df['label'].values

# Hold out 20% of the tweets; the fixed random_state makes the split repeatable.
sentences_train, sentences_test, labels_train, labels_test = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    random_state=1000,
)

# Bag-of-words encoding: the vocabulary is learned from the training split only.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(sentences_train)
X_test = vectorizer.transform(sentences_test)

# max_iter=5 is deliberate: to contrast with the results when BERT is used
# after only a few epochs.
classifier = LogisticRegression(max_iter=5)
classifier.fit(X_train, labels_train)

print("Training Accuracy:", classifier.score(X_train, labels_train))
print("Validation Accuracy:", classifier.score(X_test, labels_test))


NameError: ignored

In [None]:
#Load the BERT model from TensorFlow Hub and generate a vocabulary file
# import tensorflow.compat.v1 as tf
# import tensorflow as tf

# !pip uninstall tensorflow
# !pip install tensorflow==2.4
#1.15

# import tensorflow as tf
# print(tf.__version__)
import tokenization

# Address of the BERT module on TensorFlow Hub, split into base URL and model id.
hub_url = 'https://tfhub.dev/tensorflow/'
hub_id = 'bert_en_uncased_L-24_H-1024_A-16/1'
bert_layer = hub.KerasLayer(f'{hub_url}{hub_id}', trainable=True)

# Read the vocabulary file path and the lower-casing flag from the loaded module.
vocab_file, do_lower_case = (
    bert_layer.resolved_object.vocab_file.asset_path.numpy(),
    bert_layer.resolved_object.do_lower_case.numpy(),
)

# Tokenizer that converts tweets into token vectors.
tokenizer = tokenization.FullTokenizer(
    vocab_file=vocab_file,
    do_lower_case=do_lower_case,
)

def get_input_tokens(text, tokenizer, max_length):
  """Tokenize ``text`` and wrap it in [CLS] ... [SEP] markers.

  Args:
    text: Raw string to tokenize.
    tokenizer: Object exposing ``tokenize(text)`` and
      ``convert_tokens_to_ids(tokens)``.
    max_length: Upper bound on the length of the returned sequence,
      the two marker tokens included. Must be at least 2.

  Returns:
    A tuple ``(sequence, input_ids, seq_length, pad_length)`` where
    ``sequence`` is the list of tokens with markers, ``input_ids`` the
    ids the tokenizer assigns to them, ``seq_length`` the number of
    tokens and ``pad_length`` how many positions remain up to
    ``max_length``.

  Raises:
    ValueError: If ``max_length`` is smaller than 2, i.e. too small to
      hold the [CLS] and [SEP] markers.
  """
  # A negative slice bound would silently keep almost every token and
  # yield a sequence longer than max_length with a negative pad_length.
  if max_length < 2:
    raise ValueError(
        "max_length must be at least 2 to fit [CLS] and [SEP], got %r" % (max_length,))

  size = max_length - 2  # room left for real tokens
  tokens = tokenizer.tokenize(text)
  sequence = ["[CLS]"] + tokens[:size] + ["[SEP]"]

  seq_length = len(sequence)
  pad_length = max_length - seq_length
  input_ids = tokenizer.convert_tokens_to_ids(sequence)

  return sequence, input_ids, seq_length, pad_length

# Sanity check: print every token of a sample tweet next to its vocabulary id.
sample_tweet = "any string hehe"
sequence, input_tokens, seq_length, pad_length = get_input_tokens(
    sample_tweet, tokenizer, len(sample_tweet) + 2
)
for token, token_id in zip(sequence, input_tokens):
  print(f"{token}\t{token_id}")

[CLS]	101
any	2151
string	5164
he	2002
##he	5369
[SEP]	102


In [None]:
def bert_encode(texts, tokenizer, max_length=32):
    """Encode texts into fixed-length id, mask and segment arrays.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Tokenizer forwarded to ``get_input_tokens``.
        max_length: Length every encoded row is padded (or truncated) to.

    Returns:
        A tuple of three NumPy arrays ``(token_ids, masks, segments)``:
        the padded token ids, a mask holding 1 for real tokens and 0 for
        padding, and an all-zero array shaped like ``token_ids``.
    """
    all_tokens, all_masks = [], []

    for text in tqdm(texts):
        _, input_tokens, seq_length, pad_length = get_input_tokens(text, tokenizer, max_length)

        # Right-pad the ids with 0 and build the matching attention mask.
        # append (rather than `lst = lst + [row]`) avoids copying the whole
        # list on every iteration, keeping the loop linear in len(texts).
        all_tokens.append(input_tokens + [0] * pad_length)
        all_masks.append([1] * seq_length + [0] * pad_length)

    token_ids = np.array(all_tokens)
    return token_ids, np.array(all_masks), np.zeros_like(token_ids)

In [None]:
# Labels as a column vector of shape (n_rows, 1).
train_labels = train_df['label'].to_numpy().reshape(-1, 1)
# Tuple of (token ids, attention masks, all-zero segment ids) for every tweet.
train_input = bert_encode(texts=train_df['tweet'], tokenizer=tokenizer, max_length=32)

HBox(children=(FloatProgress(value=0.0, max=24783.0), HTML(value='')))




In [None]:
def build_bert_model(bert_layer, max_len):
    """Assemble a binary classifier on top of a BERT layer.

    Args:
        bert_layer: Callable layer that takes ``[word_ids, input_mask,
            segment_ids]`` and returns a pair whose second element is
            indexed here as (batch, position, feature).
        max_len: Number of token positions per example.

    Returns:
        A Keras ``Model`` mapping the three int32 inputs to a single
        sigmoid output.
    """
    # Three int32 inputs created in order: word ids, mask, segment ids (just zeros).
    word_ids, input_mask, segment_ids = (
        Input(shape=(max_len,), dtype=tf.int32) for _ in range(3)
    )
    bert_inputs = [word_ids, input_mask, segment_ids]
    _, sequence_output = bert_layer(bert_inputs)

    # Keep only the vector at position 0 of every sequence.
    first_token_vector = sequence_output[:, 0, :]
    # A single sigmoid unit is enough because the target is one 0/1 label.
    probability = Dense(1, activation='sigmoid')(first_token_vector)
    return Model(inputs=bert_inputs, outputs=probability)

# Optimizer and loss for the single 0/1 output.
optimizer = Adam(1e-5)
loss = 'binary_crossentropy'

# max_len must match the max_length used when encoding the tweets (32).
model = build_bert_model(bert_layer, max_len=32)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 32)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 32)]         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 32)]         0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_1[0][0]                    
                                                                 input_2[0][0]                

In [None]:
# Two epochs over the encoded tweets, with 20% of the rows held out for
# validation; shuffle=False keeps the batches in dataset order.
train_history = model.fit(
    x=train_input,
    y=train_labels,
    batch_size=16,
    epochs=2,
    validation_split=0.2,
    shuffle=False,
)

Epoch 1/2
Epoch 2/2


In [None]:
# Score two hand-written examples with the trained model.
tweets = [
    "i hate hate hate u",
    "have a good best day",
]
encoded = bert_encode(texts=tweets, tokenizer=tokenizer, max_length=32)
model.predict(x=encoded)
# arr = np.array((len(tweets), 2))
## TODO: find a better dataset; this one gives poor results



HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




array([[0.74531955],
       [0.01711299]], dtype=float32)

In [None]:
!pip install flask-ngrok
from flask_ngrok import run_with_ngrok
from flask import Flask
app = Flask(__name__)
run_with_ngrok(app)   #starts ngrok when the app is run
@app.route("/")
def home():
    return "<h1>Running Flask on Google Colab!</h1>"
  
app.run()

Collecting flask-ngrok
  Downloading https://files.pythonhosted.org/packages/af/6c/f54cb686ad1129e27d125d182f90f52b32f284e6c8df58c1bae54fa1adbc/flask_ngrok-0.0.25-py3-none-any.whl
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25
 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


INFO:werkzeug: * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://97710297d9c2.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


INFO:werkzeug:127.0.0.1 - - [25/May/2021 20:22:23] "[37mGET / HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [25/May/2021 20:22:23] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [25/May/2021 20:22:23] "[37mGET / HTTP/1.1[0m" 200 -
