In [13]:
import re
import string
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import keras.preprocessing.sequence as sequence
from tqdm import tqdm
import keras
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from keras.utils import pad_sequences
from keras.layers import Embedding

from nltk.tokenize import word_tokenize




# Fetch the NLTK data before anything reads it: stopwords.words() raises a
# LookupError when the stopwords corpus is not installed locally, so on a
# fresh machine the download has to run first.
nltk.download('popular')

stop_words = stopwords.words('english')
# Extra informal spellings appended to NLTK's English stop word list.
more_stopwords = ['u', 'im', 'ur', 'dont', 'doin', 'ure']
stop_words = stop_words + more_stopwords
stemmer = nltk.stem.SnowballStemmer('english')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/alessandraperniciano/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/alessandraperniciano/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/alessandraperniciano/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/alessandraperniciano/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/alessandraperniciano/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/alessandraperniciano/nltk_data...
[nltk_d

True

In [3]:
def read_csv_file(file_name):
    """Load the CSV at `file_name` with pandas and return the resulting DataFrame."""
    frame = pd.read_csv(file_name)
    return frame

def clean_text(text):
    """Normalise a raw comment into lowercase text without markup, links or digits.

    Steps, in order: lowercase; drop ``[...]`` spans; drop ``http(s)://`` and
    ``www.`` links; drop ``<...>`` tags; drop ASCII punctuation; drop every
    word that contains a digit. Whitespace is not collapsed, so removed
    pieces can leave consecutive spaces behind.

    Args:
        text: The comment to clean. ``None`` and float NaN (how pandas
            represents a missing cell) are treated as an empty comment.

    Returns:
        The cleaned string.
    """
    # Missing values have no .lower(); return an empty comment instead of raising.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Lowercase
    text = text.lower()

    # All patterns below are raw strings: in a plain string literal the regex
    # escapes used here are invalid escape sequences and trigger a warning on
    # recent Python versions.

    # Remove text in square brackets
    text = re.sub(r'\[.*?\]', '', text)

    # Remove links, then anything that looks like an HTML/XML tag
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)

    # Remove punctuation (escaped so the characters are literal inside the class)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

    # Remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)

    return text

def remove_stopwords(text):
    """Return `text` without the whitespace-separated words listed in `stop_words`.

    The surviving words are re-joined with single spaces.
    """
    kept = [token for token in text.split() if token not in stop_words]
    return ' '.join(kept)

def stem_words(text):
    """Stem every whitespace-separated word of `text` with `stemmer`.

    The stems are re-joined with single spaces.
    """
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)

def preprocess_data(text):
    """Run the full text pipeline: clean_text, then remove_stopwords, then stem_words."""
    return stem_words(remove_stopwords(clean_text(text)))

In [4]:
# Load the three CSV files into dataframes, in the order train / test / test labels.
# Expected columns, per file, are listed next to each path.
train, test, test_labels = (
    read_csv_file(csv_path)
    for csv_path in (
        # id, comment_text, toxic, severe_toxic, obscene, threat, insult, identity_hate
        'data/train.csv',
        # id, comment_text
        'data/test.csv',
        # id, toxic, severe_toxic, obscene, threat, insult, identity_hate
        'data/test_labels copy.csv',
    )
)


In [5]:
# Add a 'text_clean' column to both frames holding the preprocessed comment text.
# preprocess_data takes a single string, so it is passed to apply directly.
train['text_clean'] = train['comment_text'].apply(preprocess_data)
test['text_clean'] = test['comment_text'].apply(preprocess_data)

In [7]:

# Model inputs: the preprocessed comment text of each split.
x_train, x_test = train['text_clean'], test['text_clean']

# Targets: the six label columns, taken from `train` and from `test_labels`.
y_train, y_test = (
    frame[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
    for frame in (train, test_labels)
)

In [8]:
# Vectorizing the text: bag-of-words token counts.

vect = CountVectorizer()

# Document-term matrices for the train and test sets.
# fit_transform learns the vocabulary and builds the training matrix in a
# single pass, instead of analysing the training corpus twice (fit, then
# transform). The test set is only transformed with the fitted vocabulary.
x_train_dtm = vect.fit_transform(x_train)
x_test_dtm = vect.transform(x_test)


In [9]:
# Alternative vectorizer configuration; it is only constructed here, not fitted.
vect_tunned = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),   # unigrams and bigrams
    min_df=0.1,           # float document-frequency bounds are proportions of documents
    max_df=0.7,
    max_features=10000,
)


In [10]:
# TF-IDF: reweight the raw term counts of the training document-term matrix.
tfidf_transformer = TfidfTransformer()

# fit_transform learns the IDF weights and applies them in one call; the
# fitted transformer stays available for transforming other matrices.
x_train_tfidf = tfidf_transformer.fit_transform(x_train_dtm)



In [11]:
# Learn the word -> integer index mapping from the cleaned training comments.
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(x_train)

# Number of distinct words plus one; used below as the row count of the
# embedding matrix, which is indexed directly by the tokenizer's word indices.
vocab_length = 1 + len(word_tokenizer.word_index)


In [14]:

x = train['text_clean']


def embed(corpus):
    """Encode every text of `corpus` as a list of integer word indices.

    Uses the module-level `word_tokenizer`, which must already be fitted.
    """
    return word_tokenizer.texts_to_sequences(corpus)


# Encode the corpus once and measure sentence length on the encoded sequences
# themselves. Measuring with NLTK's word_tokenize instead can disagree with the
# Keras tokenizer that actually produces the sequences (the two split text
# differently), and it tokenizes the whole corpus two extra times.
train_sequences = embed(x)
longest_position = max(
    range(len(train_sequences)), key=lambda i: len(train_sequences[i]))

# Text of the longest training comment and its length in tokens.
longest_train = x.iloc[longest_position]
length_long_sentence = len(train_sequences[longest_position])

# Zero-pad every sequence at the end up to the longest one.
train_padded_sentences = pad_sequences(
    train_sequences, maxlen=length_long_sentence, padding='post')


In [16]:

# Maps each word to its pre-trained vector (a float32 numpy array).
embeddings_dictionary = {}
embedding_dim = 50

# Each line is parsed as: the word, followed by its vector components, all
# separated by whitespace.
# - The encoding is given explicitly so the read does not depend on the
#   platform's default encoding.
# - The file object is iterated directly so lines are streamed instead of
#   being loaded into memory all at once by readlines().
with open('data/glove.6B.50d.txt', encoding='utf-8') as fp:
    for line in fp:
        records = line.split()
        if not records:
            # Skip blank lines rather than failing on records[0].
            continue
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions


In [18]:
# Build the embedding weight matrix: row i receives the GloVe vector of the
# word whose tokenizer index is i. Words that are missing from the GloVe
# dictionary keep their all-zero row.

embedding_matrix = np.zeros((vocab_length, embedding_dim))

for word, index in word_tokenizer.word_index.items():
    if word in embeddings_dictionary:
        embedding_matrix[index] = embeddings_dictionary[word]
        

In [21]:
#BERT
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint

import transformers
from tqdm.notebook import tqdm
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer

In [23]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def bert_encode(data, maximum_length):
    """Tokenize texts for BERT into fixed-length id and attention-mask arrays.

    Args:
        data: Iterable of strings to encode.
        maximum_length: Number of tokens (special tokens included) that every
            encoded text is padded or truncated to.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of numpy arrays, each with
        one row per text and ``maximum_length`` columns.
    """
    input_ids = []
    attention_masks = []

    for text in tqdm(data):
        encoded = tokenizer.encode_plus(
            text=text,
            add_special_tokens=True,
            max_length=maximum_length,
            # padding=True pads to the longest sequence of the call, which
            # does nothing for a single text; 'max_length' pads every text to
            # the same width so the rows can be stacked into a 2-D array.
            padding='max_length',
            # Without truncation, texts longer than max_length keep their
            # full length.
            truncation=True,
            return_attention_mask=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids), np.array(attention_masks)


In [None]:
# Encode the cleaned training comments for BERT with a maximum length of 160.
train_input_ids, train_attention_masks = bert_encode(
    data=x_train,
    maximum_length=160,
)
