<a href="https://colab.research.google.com/github/bhattacharjee/mtu-nlp-assignment/blob/main/assignment1/best_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install spacy  nltk spacymoji huggingface -q       >/dev/null 2>&1         
!pip install -q -U tensorflow-text                      >/dev/null 2>&1
!pip install -q tf-models-official                      >/dev/null 2>&1
!python -m spacy download de_core_news_sm               >/dev/null 2>&1
!python -m spacy download de_dep_news_trf               >/dev/null 2>&1
!pip install transformers                               >/dev/null 2>&1

!python -m spacy download de_core_news_sm               >/dev/null 2>&1
!python -m spacy download de_dep_news_trf               >/dev/null 2>&1

!pip install mlxtend                                    >/dev/null 2>&1
!pip install imblearn                                   >/dev/null 2>&1

# handling emojis
!pip install demoji                                     >/dev/null 2>&1

In [2]:
import requests
from functools import lru_cache
import sklearn

@lru_cache(maxsize=10)
def get_train_test_files():
    TRAIN_FILE = 'https://raw.githubusercontent.com/bhattacharjee/mtu-nlp-assignment/main/assignment1/Assessment1_Toxic_Train.csv'
    TEST_FILE = 'https://raw.githubusercontent.com/bhattacharjee/mtu-nlp-assignment/main/assignment1/Assessment1_Toxic_Test_For_Evaluation.csv'
    EXTRA_FILE = 'https://raw.githubusercontent.com/bhattacharjee/mtu-nlp-assignment/main/assignment1/germeval2018_a.txt'
    TRAIN_FILE_LOCAL = 'Assessment1_Toxic_Train.csv'
    TEST_FILE_LOCAL = 'Assessment1_Toxic_Test.csv'
    EXTRA_FILE_LOCAL = 'germeval2018.csv'

    def download(url, localfile):
        with open(localfile, 'wb') as f:
            r = requests.get(url, allow_redirects=True)
            f.write(r.content)

    download(TRAIN_FILE, TRAIN_FILE_LOCAL)
    download(TEST_FILE, TEST_FILE_LOCAL)
    download(EXTRA_FILE, EXTRA_FILE_LOCAL)

    return TRAIN_FILE_LOCAL, TEST_FILE_LOCAL, EXTRA_FILE_LOCAL

def seed_random():
    import numpy as np
    import random
    np.random.seed(0)
    random.seed(0)

sklearn.set_config(display="diagram")

# Functions to read the CSV and do basic cleaning

In [34]:
import re
import pandas as pd
import demoji

from functools import lru_cache

def remove_roles(line:str)->str:
    # Remove texts like @USER, @MODERATOR etc
    pat = re.compile(u'\@[A-Za-z]+')
    return re.sub(pat, '', line)

@lru_cache(maxsize=3)
def get_train_test_df_cached():
    train_csv, test_csv, extra_csv = get_train_test_files()
    train_df = pd.read_csv(train_csv)
    test_df = pd.read_csv(test_csv)
    extra_df = pd.read_csv(extra_csv)
    return train_df, test_df, extra_df

def get_train_test_df():
    tr, te, ex = get_train_test_df_cached()
    return tr.copy(), te.copy(), ex.copy()

def remove_emojis(line:str)->str:
    # Replace emojis with their description, eg __thumbs_down__
    demoji_str = demoji.replace_with_desc(line, sep=" ::: ")
    if (demoji_str == line):
        return line
    
    inEmoji = False
    currentEmojiWords = []
    allWords = []

    def accumulate(word:str)->None:
        nonlocal inEmoji
        nonlocal currentEmojiWords
        nonlocal allWords
        if not inEmoji and word != ":::":
            allWords.append(word)
        elif inEmoji:
            if word == ':::':
                currentEmoji = "_".join(currentEmojiWords)
                currentEmoji = "__" + currentEmoji + "__"
                allWords.append(currentEmoji)
                currentEmojiWords = []
            else:
                currentEmojiWords.append(word)
        else: # Not in emoji but ::: is true
            inEmoji = True

    [accumulate(word) for word in demoji_str.split()]

    sentence = " ".join(allWords)
    return sentence


def remove_ellipses(line:str)->str:
    pat = re.compile(u'\.\.+')
    return re.sub(pat, ' ', line)

def to_lower(line:str)->str:
    return line.lower()

def replace_number_with_tag(line:str)->str:
    line = re.sub("\s\d*((\.|\,)\d+)?\s", " nummer ", line)
    line = re.sub('\s\d+$', '', line)
    line = re.sub('^\d+\s', '', line)
    return line

def remove_urls(line:str)->str:
    return re.sub('https?:\/\/\S+', ' hyperlink ', line)

def basic_clean(s:pd.Series)->pd.Series:
    return s.map(to_lower)                                                  \
            .map(remove_emojis)                                             \
            .map(remove_roles)                                              \
            .map(remove_ellipses)                                           \
            .map(replace_number_with_tag)                                   \
            .map(remove_urls)

def get_clean_train_test_df()->tuple:
    train_df, test_df, extra_df = get_train_test_df()
    train_df['comment_text'] = basic_clean(train_df['comment_text'])
    test_df['comment_text'] = basic_clean(test_df['comment_text'])
    extra_df['comment_text'] = basic_clean(extra_df['comment_text'])
    return train_df, test_df, extra_df


# Clean using Spacy and Enrich

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import string
import spacy
from spacymoji import Emoji
import  de_core_news_sm

def is_punct_only(token:str)->bool:
    for c in list(token):
        if c not in string.punctuation:
            return False
    return True

def is_same(l1:list, l2:list)->bool:
    if (len(l1) != len(l2)):
        return False
    for x, y in zip(l1, l2):
        if x != y:
            return False
    return True

def get_num_of_allcap_words(s:str)->int:
    def is_allcaps(s:str)->bool:
        if (len(s) < 3):
            return False
        for c in list(s):
            if not (\
                    (ord(c) <=ord('Z') and ord(c) >= ord('A')) or           \
                    (ord(c) >= ord('0') and ord(c) <= ord('9'))             \
                    ):
                return False
        return True

    if len(s) < 3:
        return 0
    tokens = [w.strip() for w in s.split()]
    return sum([1 for t in tokens if is_allcaps(t)])

def get_percentage_of_excalamations(s:str)->float:
    if len(s) == 0:
        return 0.0
    exclamation_count = sum([1 for c in list(s) if c == '!'])
    return exclamation_count / len(s)


def is_empty_string(s:str)->bool:
    if s == '' or s == None:
        return True
    return False

def do_basic_nlp_cleaning(line:str)->str:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('wordnet', quiet=True)

    # Tokenize
    tokens = word_tokenize(line)

    # Some tokens start with a punctuation, remove the first one
    def remove_first_punctuation(tok:str)->str:
        return                                                              \
            tok[1:]                                                         \
            if tok[0] in set(string.punctuation) and len(tok) != 0          \
            else tok

    tokens = [remove_first_punctuation(w) for w in tokens]

    # Remove stop words
    stop_words = set(stopwords.words("german"))
    tokens = [w for w in tokens if w not in stop_words]

    # Remove punctuations
    tokens = [w for w in tokens if not is_punct_only(w)]

    # Stem words
    stem = SnowballStemmer('german')
    tokens = [stem.stem(w) for w in tokens]

    return " ".join(tokens)

def get_cleaning_function(remove_named_ents:bool=True, pos_tagging:bool=False):
    nlp = de_core_news_sm.load()
    emoji = Emoji(nlp)
    nlp.add_pipe(emoji, first=True)
    stopwords = spacy.lang.de.stop_words.STOP_WORDS

    def do_basic_nlp_cleaning(line:str)->str:
        def is_interesting_token(token, doc):
            if token.pos_ in set(['NUM', 'SYM']):
                return False
            if remove_named_ents:
                for e in doc.ents:
                    for t in e:
                        if token.text == t.text:
                            return False
            if token.text in stopwords:
                return False
            if (token.is_punct):
                return False
            #if token._.is_emoji:
            #    return False
            return True

        def remove_terminal_punctuations(word):
            word = word.strip()
            while word != "" and word[0] in list(string.punctuation):
                word = word[1:]
            while word != "" and word[-1] in list(string.punctuation):
                word = word[:-1]
            return word

        def get_final_string(tok, doc):
            lemma = tok.lemma_.lower()
            if pos_tagging:
                lemma = lemma + ":" + tok.pos_
                lemma = lemma + ":" + tok.tag_
            return lemma

        doc = nlp(line)
        words = [get_final_string(tok, doc) for tok in doc if is_interesting_token(tok, doc)]
        words = [remove_terminal_punctuations(word) for word in words]
        words = [word for word in words if word != ""]
        return  " ".join(words)

    return do_basic_nlp_cleaning

def get_enriched_dataset(df):
    cleaning_fn = get_cleaning_function(remove_named_ents=True, pos_tagging=True)
    df['cleaned_comment_text'] = df['comment_text'].map(cleaning_fn)
    df['n_all_caps'] = df['comment_text'].map(get_num_of_allcap_words)
    df['perc_exclamations'] = df['comment_text'].map(get_percentage_of_excalamations)
    df['num_exclamations'] = df['comment_text'].map(lambda s: sum([1 for x in list(s) if x == '!']))
    return df

import functools
from functools import lru_cache

@lru_cache(maxsize=128)
def get_all_enriched_dfs_cached():
    train_df, test_df, extra_df = get_clean_train_test_df()
    train_df = get_enriched_dataset(train_df)
    test_df = get_enriched_dataset(test_df)
    extra_df = get_enriched_dataset(extra_df)
    return train_df, test_df, extra_df
    
def get_all_enriched_dataframes():
    tr, te, ex = get_all_enriched_dfs_cached()
    return tr.copy(), te.copy(), ex.copy()

train_df, test_df, extra_df = get_all_enriched_dfs_cached()

# Multinomial NB (original)

In [38]:
from sklearn.naive_bayes import MultinomialNB, CategoricalNB, BernoulliNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
#from imblearn.pipeline import Pipeline
#from imblearn.over_sampling import SMOTE 
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from mlxtend.preprocessing import DenseTransformer
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

def get_feature_column_names(df):
    return [cname for cname in df.columns if not cname.startswith('Sub')]

def get_target_column_names(df):
    return [cname for cname in df.columns if cname.startswith('Sub')]

def is_text_column(colname:str)->bool:
    if 'text' in colname:
        return True
    return False

def get_text_columns(df)->list:
    return [cn for cn in df.columns if is_text_column(cn)]

def get_nontext_columns(df)->list:
    return [cn for cn in df.columns if not is_text_column(cn)]

def run_classification(                                                     \
                       dataset:pd.DataFrame,                                \
                       extra_dataset:pd.DataFrame,                          \
                       test_dataset:pd.DataFrame,                           \
                       target_column:str,                                   \
                       clf_gen_fn,                                          \
                       grid_search_dict=None,                               \
                       use_extra_dataset=False,                             \
                       )->tuple:
    dataset = dataset[[cn for cn in dataset.columns if cn != 'comment_text']]

    seed_random()

    X = dataset[get_feature_column_names(dataset)]
    y = dataset[target_column]
    trainX, testX, trainY, testY = train_test_split(X, y, random_state=0)

    if use_extra_dataset:
        extraX = extra_dataset[get_feature_column_names(dataset)]
        extray = extra_dataset[target_column]
        trainX = pd.concat([trainX, extraX])
        trainY = pd.concat([trainY, extray])
        print("Added additional data from GermEval 2018")

    if None != grid_search_dict and isinstance(grid_search_dict, dict):
        # TODO: Update grid for the vectorizers
        # Right now, we're hitting RAM constraints if we turn these on
        gridupd = {
        }
        grid_search_dict = grid_search_dict.copy()
        grid_search_dict.update(gridupd)

                        
    column_trans = make_column_transformer(                                 \
                            (CountVectorizer(ngram_range=(1,1)), 'cleaned_comment_text'),   \
                            (TfidfVectorizer(use_idf=True), 'cleaned_comment_text'),    \
                            (TfidfVectorizer(use_idf=False), 'cleaned_comment_text'),    \
                            remainder=MinMaxScaler(),                       \
                        )
    
    classif_pipeline = Pipeline(                                        \
                            [                                           \
                                ('column_transformer', column_trans),   \
                                ('dense', DenseTransformer()),          \
                                ('clf', clf_gen_fn()),                  \
                            ])

    if None != grid_search_dict and isinstance(grid_search_dict, dict):
        search = GridSearchCV(classif_pipeline, grid_search_dict, cv=3, n_jobs=-1, scoring='f1')
        search.fit(trainX, trainY)
        classif_pipeline = search.best_estimator_
        print("best params: ", search.best_params_)
        print("best f1 score: ", search.best_score_)
    else:
        classif_pipeline.fit(trainX, trainY)

    y_pred = classif_pipeline.predict(testX)
    print('-' * 40)

    return accuracy_score(testY, y_pred), f1_score(testY, y_pred), classif_pipeline

def predict_on_test_set(\
                        dataset,\
                        colname,\
                        classif_pipeline):
    seed_random()
    dataset = dataset[[cn for cn in dataset.columns if cn != 'comment_text']]
    y_pred = classif_pipeline.predict(dataset)
    return y_pred.tolist()

def run_classifiers():

    linearsvc_gen = lambda: LinearSVC()
    linearsvc_paramgrid = {'clf__class_weight': [None, 'balanced'], 'clf__max_iter': [1000, 10000]}
    classifiers = {
        "LinearSVC": [
                      "Sub1_Toxic",
                      linearsvc_gen,
                      linearsvc_paramgrid,
                      True, # Use additional data
                      ]
    }

    rfc_gen = lambda: RandomForestClassifier()
    rfc_paramgrid = {'clf__criterion': ['entropy', 'gini'], 'clf__min_samples_split': [2, 10], 'clf__class_weight': ['balanced', 'balanced_subsample']}
    tempdict = {
        'RandomForest1': [
                          "Sub2_Engaging",
                          rfc_gen,
                          rfc_paramgrid,
                          False # Don't use additional data
        ]
    }
    classifiers.update(tempdict)
    
    mnb_gen = lambda: MultinomialNB()
    mnb_paramgrid = {'clf__fit_prior': [True, False]}
    tempdict = {
        'MultinomialNB1': [
                           "Sub3_FactClaiming",
                           mnb_gen,
                           mnb_paramgrid,
                           False # Don't use additional data
        ]
    }
    classifiers.update(tempdict)

    best_classifiers = list() # list of tuples: (columnname, clfname, pipeline)

    for clfname, val in classifiers.items():
        colname = val[0]
        generator = val[1]
        gridsearch = val[2]
        use_extra_data = val[3]

        print("Running: ", colname, clfname)

        train_df, test_df, extra_df = get_all_enriched_dfs_cached()
        acc, f1, classif_pipeline = run_classification(\
            dataset=train_df,\
            extra_dataset=extra_df if use_extra_data else None,\
            test_dataset=test_df,\
            target_column=colname,\
            clf_gen_fn=generator,\
            grid_search_dict=gridsearch,\
            use_extra_dataset=use_extra_data)
        
        best_classifiers.append((colname, clfname, classif_pipeline))

        print(colname, clfname, acc, f1,)


    or_train_df, or_test_df, or_extra_df = get_train_test_df()
    test_pred_df_dict = {
        'comment_text': or_test_df['comment_text'].to_numpy().tolist()
    }

    retval = list()
    for colname, clfname, clf_pipeline in best_classifiers:
        train_df, test_df, extra_df = get_all_enriched_dfs_cached()
        y_pred = predict_on_test_set(test_df, clfname, clf_pipeline)
        temp_dict = {colname: y_pred}
        test_pred_df_dict.update(temp_dict)
        tempdict = {'clfname': clfname, 'colname': colname, 'clf_pipeline': clf_pipeline}
        retval.append(tempdict)

    result_df = pd.DataFrame(test_pred_df_dict)
    return result_df, retval

seed_random()
result_df, retval = run_classifiers()


for colname, clfname, clf_pipeline in retval:
    print(colname, clfname)
    clf_pipeline

Running:  Sub1_Toxic LinearSVC
Added additional data from GermEval 2018




best params:  {'clf__class_weight': 'balanced', 'clf__max_iter': 10000}
best f1 score:  0.5168579314509509
----------------------------------------
Sub1_Toxic LinearSVC 0.655819774718398 0.4701348747591522
Running:  Sub2_Engaging RandomForest1
best params:  {'clf__class_weight': 'balanced', 'clf__criterion': 'gini', 'clf__min_samples_split': 10}
best f1 score:  0.5965191250907029
----------------------------------------
Sub2_Engaging RandomForest1 0.8385481852315394 0.6485013623978202
Running:  Sub3_FactClaiming MultinomialNB1
best params:  {'clf__fit_prior': False}
best f1 score:  0.6029812346017276
----------------------------------------
Sub3_FactClaiming MultinomialNB1 0.704630788485607 0.5902777777777777
clfname colname
clfname colname
clfname colname


In [39]:
print(retval)

[{'clfname': 'LinearSVC', 'colname': 'Sub1_Toxic', 'clf_pipeline': Pipeline(steps=[('column_transformer',
                 ColumnTransformer(remainder=MinMaxScaler(),
                                   transformers=[('countvectorizer',
                                                  CountVectorizer(),
                                                  'cleaned_comment_text'),
                                                 ('tfidfvectorizer-1',
                                                  TfidfVectorizer(),
                                                  'cleaned_comment_text'),
                                                 ('tfidfvectorizer-2',
                                                  TfidfVectorizer(use_idf=False),
                                                  'cleaned_comment_text')])),
                ('dense', DenseTransformer()),
                ('clf', LinearSVC(class_weight='balanced', max_iter=10000))])}, {'clfname': 'RandomForest1', 'colname': 'Sub2_E