<a href="https://colab.research.google.com/github/bhattacharjee/mtu-nlp-assignment/blob/main/assignment1/NLP_Assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
def get_train_test_files():
    TRAIN_FILE = 'https://raw.githubusercontent.com/bhattacharjee/mtu-nlp-assignment/main/assignment1/Assessment1_Toxic_Train.csv'
    TEST_FILE = 'https://raw.githubusercontent.com/bhattacharjee/mtu-nlp-assignment/main/assignment1/Assessment1_Toxic_Test_For_Evaluation.csv'
    TRAIN_FILE_LOCAL = 'Assessment1_Toxic_Train.csv'
    TEST_FILE_LOCAL = 'Assessment1_Toxic_Test.csv'

    def download(url, localfile):
        with open(localfile, 'wb') as f:
            r = requests.get(url, allow_redirects=True)
            f.write(r.content)

    download(TRAIN_FILE, TRAIN_FILE_LOCAL)
    download(TEST_FILE, TEST_FILE_LOCAL)

    return TRAIN_FILE_LOCAL, TEST_FILE_LOCAL


In [2]:
!pip install spacy nltk huggingface -q                  >/dev/null 2>&1
!python -m spacy download de_core_news_sm               >/dev/null 2>&1

In [3]:
import pandas as pd
def get_train_test_df():
    train_csv, test_csv = get_train_test_files()

    train_df = pd.read_csv(train_csv)
    test_df = pd.read_csv(test_csv)

    return train_df, test_df

In [4]:
import re
def remove_roles(line:str)->str:
    # Remove texts like @USER, @MODERATOR etc
    pat = re.compile(u'\@[A-Za-z]+')
    return re.sub(pat, '', line)

In [5]:
import re
def remove_emojis(line:str)->str:
    pat = re.compile("["
                u"\U0001F600-\U0001F64F"  # emoticons
                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                u"\U00002702-\U000027B0"
                u"\U000024C2-\U0001F251"
                u"\U0001f926-\U0001f937"
                u'\U00010000-\U0010ffff'
                u"\u200d"
                u"\u2640-\u2642"
                u"\u2600-\u2B55"
                u"\u23cf"
                u"\u23e9"
                u"\u231a"
                u"\u3030"
                u"\ufe0f"
        "]+", flags=re.UNICODE)
    return re.sub(pat, ' EMOJI ', line)


In [6]:
import re
def remove_ellipses(line:str)->str:
    pat = re.compile(u'\.\.+')
    return re.sub(pat, ' ', line)

In [7]:
def to_lower(line:str)->str:
    return line.lower()

In [8]:
def replace_number_with_tag(line:str)->str:
    line = re.sub("\s\d*((\.|\,)\d+)?\s", " nummer ", line)
    line = re.sub('\s\d+$', ' nummer ', line)
    line = re.sub('^\d+\s', ' nummer ', line)
    return line

In [9]:
def remove_urls(line:str)->str:
    return re.sub('https?:\/\/\S+', ' hyperlink ', line)

In [10]:
def basic_clean(s:pd.Series)->pd.Series:
    return s.map(to_lower)                                                  \
            .map(remove_emojis)                                             \
            .map(remove_roles)                                              \
            .map(remove_ellipses)                                           \
            .map(replace_number_with_tag)                                   \
            .map(remove_urls)

def get_clean_train_test_df()->tuple:
    train_df, test_df = get_train_test_df()
    train_df['comment_text'] = basic_clean(train_df['comment_text'])
    test_df['comment_text'] = basic_clean(test_df['comment_text'])
    return train_df, test_df


In [33]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import string


def is_punct_only(token:str)->bool:
    for c in list(token):
        if c not in string.punctuation:
            return False
    return True

def is_same(l1:list, l2:list)->bool:
    if (len(l1) != len(l2)):
        return False
    for x, y in zip(l1, l2):
        if x != y:
            return False
    return True

def do_basic_nlp_cleaning(line:str)->str:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('wordnet', quiet=True)

    # Tokenize
    tokens = word_tokenize(line)

    # Some tokens start with a punctuation, remove the first one
    def remove_first_punctuation(tok:str)->str:
        return                                                              \
            tok[1:]                                                         \
            if tok[0] in set(string.punctuation) and len(tok) != 0          \
            else tok

    tokens = [remove_first_punctuation(w) for w in tokens]

    # Remove stop words
    stop_words = set(stopwords.words("german"))
    tokens = [w for w in tokens if w not in stop_words]

    # Remove punctuations
    tokens = [w for w in tokens if not is_punct_only(w)]

    # Stem words
    stem = SnowballStemmer('german')
    tokens = [stem.stem(w) for w in tokens]

    return " ".join(tokens)

import spacy
def get_cleaning_function():
    nlp = spacy.load("de_core_news_sm")
    doc = nlp(line)

    def do_basic_nlp_cleaning(line:str)->str:
        stopwords = spacy.lang.de.stop_words.STOP_WORDS
    
        def is_interesting_token(token):
            if token.text in stopwords:
                return False
            if (token.is_punct):
                return False
            for c in list(token.text):
                if c in set(list("0123456789&")):
                    return False
            return True

        line = " ".join([tok.lemma_ for tok in doc if is_interesting_token(tok)])
        return line

    return do_basic_nlp_cleaning





cleaning_fn = get_cleaning_function()
train_df, test_df = get_clean_train_test_df()
print(train_df['comment_text'])
train_df['comment_text'] = train_df['comment_text'].map(cleaning_fn)
print(train_df['comment_text'])

0       gestern bei illner, montag bei nummer  ist das...
1       mein gott der war erst gestern bei illner. die...
2        die cdu lässt das so wie so nicht zu . sagen ...
3       bei meiner beschissenen rente als 2x geschiede...
4       wer nummer jahre zum mindestlohn arbeiten muß,...
                              ...                        
3189    hier mal eine info. flüchtlinge werden nummer ...
3190    .aha .mal abwarten kommt bei uns auch .firmen ...
3191                                           .so ist es
3192                                       .die warten da
3193     .das bekommen die gesagt wie sich verhalten s...
Name: comment_text, Length: 3194, dtype: object
0       gestern illner montag nummer   langsam ör-part...
1                gott gestern illner redaktionen versagen
2         cdu lässt sagen reich bekommen nummer millia...
3       beschissen rente geschieden mann steuern krank...
4       nummer mindestlohn arbeiten erhalten € rente n...
                        

In [29]:
nlp = spacy.load("de_core_news_sm")

In [30]:
mytext = [
          "gestern bei illner, montag bei nummer  ist das nicht langsam ör-parteihilfe wie bei den grünlingen ? nummer EMOJ",
          "die cdu lässt das so wie so nicht zu . sagen doch nur wenn sie reichen was bekommen sollen ( nummer milliarden für die hotellobby)"
]

out = nlp(mytext)

TypeError: ignored

In [34]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import train_test_split

def get_initial_feel(y_column:str):
    print('*' * 80)
    print(f"Classifying for : {y_column}")
    
    trainX, testX, trainY, testY = train_test_split(\
                        train_df['comment_text'], train_df[y_column])

    cv =  CountVectorizer(ngram_range=(1,4))
    trainX = cv.fit_transform(trainX)
    testX = cv.transform(testX)

    classifier = MultinomialNB()
    classifier.fit(trainX, trainY)

    predY = classifier.predict(testX)

    print(confusion_matrix(testY, predY))
    print(f"Accuracy: {accuracy_score(testY, predY)}")
    print(f"f1-score: {f1_score(testY, predY)}")

get_initial_feel('Sub1_Toxic')
get_initial_feel('Sub2_Engaging')
get_initial_feel('Sub3_FactClaiming')

********************************************************************************
Classifying for : Sub1_Toxic
[[432  93]
 [171 103]]
Accuracy: 0.6695869837296621
f1-score: 0.4382978723404255
********************************************************************************
Classifying for : Sub2_Engaging
[[514  77]
 [129  79]]
Accuracy: 0.7421777221526908
f1-score: 0.4340659340659341
********************************************************************************
Classifying for : Sub3_FactClaiming
[[456  56]
 [128 159]]
Accuracy: 0.769712140175219
f1-score: 0.6334661354581673
