<a href="https://colab.research.google.com/github/bhattacharjee/mtu-nlp-assignment/blob/main/assignment1/NLP_Assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import requests
def get_train_test_files():
    TRAIN_FILE = 'https://raw.githubusercontent.com/bhattacharjee/mtu-nlp-assignment/main/assignment1/Assessment1_Toxic_Train.csv'
    TEST_FILE = 'https://raw.githubusercontent.com/bhattacharjee/mtu-nlp-assignment/main/assignment1/Assessment1_Toxic_Test_For_Evaluation.csv'
    TRAIN_FILE_LOCAL = 'Assessment1_Toxic_Train.csv'
    TEST_FILE_LOCAL = 'Assessment1_Toxic_Test.csv'

    def download(url, localfile):
        with open(localfile, 'wb') as f:
            r = requests.get(url, allow_redirects=True)
            f.write(r.content)

    download(TRAIN_FILE, TRAIN_FILE_LOCAL)
    download(TEST_FILE, TEST_FILE_LOCAL)

    return TRAIN_FILE_LOCAL, TEST_FILE_LOCAL


In [8]:
!pip install spacy  nltk spacymoji huggingface -q       >/dev/null 2>&1         
!python -m spacy download de_core_news_sm               >/dev/null 2>&1
!python -m spacy download de_dep_news_trf               >/dev/null 2>&1
!pip install -q -U tensorflow-text                      >/dev/null 2>&1
!pip install -q tf-models-official                      >/dev/null 2>&1

In [11]:
import pandas as pd
def get_train_test_df():
    train_csv, test_csv = get_train_test_files()

    train_df = pd.read_csv(train_csv)
    test_df = pd.read_csv(test_csv)

    return train_df, test_df

In [12]:
import re
def remove_roles(line:str)->str:
    # Remove texts like @USER, @MODERATOR etc
    pat = re.compile(u'\@[A-Za-z]+')
    return re.sub(pat, '', line)

In [13]:
import re
def remove_emojis(line:str)->str:
    pat = re.compile("["
                u"\U0001F600-\U0001F64F"  # emoticons
                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                u"\U00002702-\U000027B0"
                u"\U000024C2-\U0001F251"
                u"\U0001f926-\U0001f937"
                u'\U00010000-\U0010ffff'
                u"\u200d"
                u"\u2640-\u2642"
                u"\u2600-\u2B55"
                u"\u23cf"
                u"\u23e9"
                u"\u231a"
                u"\u3030"
                u"\ufe0f"
        "]+", flags=re.UNICODE)
    return re.sub(pat, '', line)


In [14]:
import re
def remove_ellipses(line:str)->str:
    pat = re.compile(u'\.\.+')
    return re.sub(pat, ' ', line)

In [15]:
def to_lower(line:str)->str:
    return line.lower()

In [16]:
def replace_number_with_tag(line:str)->str:
    line = re.sub("\s\d*((\.|\,)\d+)?\s", " nummer ", line)
    line = re.sub('\s\d+$', '', line)
    line = re.sub('^\d+\s', '', line)
    return line

In [17]:
def remove_urls(line:str)->str:
    return re.sub('https?:\/\/\S+', ' hyperlink ', line)

In [18]:
def basic_clean(s:pd.Series)->pd.Series:
    return s.map(to_lower)                                                  \
            .map(remove_emojis)                                             \
            .map(remove_roles)                                              \
            .map(remove_ellipses)                                           \
            .map(replace_number_with_tag)                                   \
            .map(remove_urls)

def get_clean_train_test_df()->tuple:
    train_df, test_df = get_train_test_df()
    train_df['comment_text'] = basic_clean(train_df['comment_text'])
    test_df['comment_text'] = basic_clean(test_df['comment_text'])
    return train_df, test_df


In [19]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import string
import spacy
from spacymoji import Emoji


def is_punct_only(token:str)->bool:
    for c in list(token):
        if c not in string.punctuation:
            return False
    return True

def is_same(l1:list, l2:list)->bool:
    if (len(l1) != len(l2)):
        return False
    for x, y in zip(l1, l2):
        if x != y:
            return False
    return True

def do_basic_nlp_cleaning(line:str)->str:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('wordnet', quiet=True)

    # Tokenize
    tokens = word_tokenize(line)

    # Some tokens start with a punctuation, remove the first one
    def remove_first_punctuation(tok:str)->str:
        return                                                              \
            tok[1:]                                                         \
            if tok[0] in set(string.punctuation) and len(tok) != 0          \
            else tok

    tokens = [remove_first_punctuation(w) for w in tokens]

    # Remove stop words
    stop_words = set(stopwords.words("german"))
    tokens = [w for w in tokens if w not in stop_words]

    # Remove punctuations
    tokens = [w for w in tokens if not is_punct_only(w)]

    # Stem words
    stem = SnowballStemmer('german')
    tokens = [stem.stem(w) for w in tokens]

    return " ".join(tokens)

import spacy
def get_cleaning_function():
    #nlp = spacy.load("de_dep_news_trf")
    nlp = spacy.load("de_core_news_sm")
    emoji = Emoji(nlp)
    nlp.add_pipe(emoji, first=True)
    stopwords = spacy.lang.de.stop_words.STOP_WORDS

    def do_basic_nlp_cleaning(line:str)->str:
        def is_interesting_token(token):
            if token.pos_ in set(['NUM', 'SYM']):
                return False
            if token.text in stopwords:
                return False
            if (token.is_punct):
                return False
            #if token._.is_emoji:
            #    return False
            return True

        def remove_terminal_punctuations(word):
            word = word.strip()
            while word != "" and word[0] in list(string.punctuation):
                word = word[1:]
            while word != "" and word[-1] in list(string.punctuation):
                word = word[:-1]
            return word

        doc = nlp(line)
        words = [tok.lemma_.lower() for tok in doc if is_interesting_token(tok)]
        words = [remove_terminal_punctuations(word) for word in words]
        words = [word for word in words if word != ""]
        return  " ".join(words)

    return do_basic_nlp_cleaning




def is_empty_string(s:str)->bool:
    if s == '' or s == None:
        return True
    return False

cleaning_fn = get_cleaning_function()
train_df, test_df = get_clean_train_test_df()
train_df['comment_text'] = train_df['comment_text'].map(cleaning_fn)
empty_rows = train_df['comment_text'].map(is_empty_string)
train_df = train_df[~ empty_rows]

In [20]:
from sklearn.naive_bayes import MultinomialNB, CategoricalNB, BernoulliNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

def get_initial_feel(model, vectorizer, y_column:str):
    print('*' * 80)
    print(f"Classifying for : {y_column}")
    
    trainX, testX, trainY, testY = train_test_split(\
                        train_df['comment_text'], train_df[y_column])

    cv =  vectorizer.fit(trainX)
    trainX = cv.transform(trainX)
    testX = cv.transform(testX)

    model.fit(trainX, trainY)

    predY = model.predict(testX)

    print(confusion_matrix(testY, predY))
    print(f"Accuracy: {accuracy_score(testY, predY)}")
    print(f"f1-score: {f1_score(testY, predY)}")

def tryout(model, vectorizer):
    print(repr(model), repr(vectorizer))
    print()
    get_initial_feel(model, vectorizer, 'Sub1_Toxic')
    get_initial_feel(model, vectorizer, 'Sub2_Engaging')
    get_initial_feel(model, vectorizer, 'Sub3_FactClaiming')
    print()


tryout(MultinomialNB(), CountVectorizer(ngram_range=(1, 3)))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 3), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

********************************************************************************
Classifying for : Sub1_Toxic
[[438  54]
 [211  80]]
Accuracy: 0.6615581098339719
f1-score: 0.3764705882352941
********************************************************************************
Classifying for : Sub2_Engaging
[[488  84]
 [128  83]]
Accuracy: 0.7292464878671775
f1-score: 0.4391534391534392
********************************************************************************
Classifying for : Sub3_FactClaiming
[[417 106]
 [100 160]]


## BERT MODEL

Use code from https://www.tensorflow.org/text/tutorials/classify_text_with_bert


In [14]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization
import tensorflow as tf
import tensorflow_hub as hub


tfhub_handle_encoder = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1"
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
text_test = ["""this is such an amazing movie! evaluationScriptGermeval2018.pl

represents the evaluation tool. It can be used to evaluated the predictions for both tasks of the shared task.

In order to obtain further information on the usage of that tool please type:
As discussed in Section 2, our pretrained language models will learn pre-existing biases from the
training datasets. The main portion (89%) of our training data, namely the OSCAR dataset, uses texts
scraped from the internet, which is in some respects problematic. First off, this dataset contains a lot of
explicit and indecent material. While we filtered out many of these documents through keyword match-
ing, we cannot guarantee that this method was successful in every case. Furthermore, many websites
contain unverified information and any dataset containing this kind of text can lead to a skewed model
that reflects commonly found lies and misconceptions. This includes gender, racial and religious biases
which are found in textual data of all registers and so we advise that anyone using our model to recognise
that it will not always build true and accurate representation of real world concepts. We implore users
of the model to seriously consider these issues before deploying it in a production setting, especially
in situations where impartiality matter, such as journalism, and institutional decision making like job

The underground cities were well designed for protection against attacks. The few entrances were hidden by foliage and not easily spotted from outside. Inside, they took the form of a labyrinth of passageways which were unnavigable for outsiders, and could be sealed with large rock doors, around a metre high and shaped like mill-stones. These doors were built such that they could be rolled into a closed position relatively easily, but could not be moved from the outside. They had a hole in the centre which was probably used as a kind of peephole. In some cities there were holes in ceiling above, through which the enemy could be attacked with spears.[6] The cities descended up to twelve stories – over 100 metres – under the ground and had everything necessary for a long siege. The upper stories were largely used as stables and storerooms, with a constant temperature of around 10 °C. In the walls of the caverns there were receptacles for various kinds of food, as well as hollows for vessels in which liquids could be stored. Further down, were the living and working spaces, where furniture, including seats, tables, and beds were carved out of the rock. Working spaces include a wine press at Derinkuyu, a copper foundry in Kaymakli, as well as cisterns and wells which ensured a supply of drinking water during a long siege.[9] There were also prisons and toilets.
applications or insurance assessments
"""]
text_preprocessed = bert_preprocess_model(text_test)

#print(f'Keys       : {list(text_preprocessed.keys())}')
#print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
#print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
#print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
#print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')


bert_model = hub.KerasLayer(tfhub_handle_encoder)
bert_results = bert_model(text_preprocessed)

#print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Pooled Outputs Shape:(1, 512)
Pooled Outputs Values:[ 0.9107527   0.4435962  -0.0445334   0.07857981  0.13065211  0.95631254
  0.9006179  -0.79465264 -0.56049114 -0.94223607 -0.3001277  -0.973857  ]
Sequence Outputs Shape:(1, 128, 512)
Sequence Outputs Values:[[ 0.01486441  0.7203154   0.14484876 ... -0.8022307   0.06363815
   0.95406747]
 [-0.65695393  0.08590551  0.15920955 ...  0.64430267 -0.19999222
   0.7745597 ]
 [-0.81394494  0.74874014 -0.43530738 ...  0.07674243 -0.4267268
   0.6159239 ]
 ...
 [-0.65857285  1.4859799   0.7598989  ... -0.02287899  0.23170051
   0.85067797]
 [-0.77028626  1.3962332   0.19560781 ... -0.1806682   0.76875293
   1.0642036 ]
 [ 0.44916624  0.7581736   0.42707348 ... -0.8116752   1.2785095
   1.1305085 ]]


In [15]:
print(bert_results.keys())

dict_keys(['sequence_output', 'encoder_outputs', 'default', 'pooled_output'])


In [16]:
print(text_preprocessed['input_mask'])

tf.Tensor(
[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]], shape=(1, 128), dtype=int32)


## HUGGING FACE TRANSFORMERS : GERMAN

In [None]:
!pip install transformers

In [3]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import tensorflow as tf
  
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")

bert_model = AutoModelForMaskedLM.from_pretrained("bert-base-german-cased")

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
def build_classifier_model():
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessed_input = bert_tokenizer(text_input)
  outputs = bert_model(preprocessed_input)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(0.1)(net)
  net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
  return tf.keras.Model(text_input, net)

In [7]:
full_model = build_classifier_model()

ValueError: ignored