<a href="https://colab.research.google.com/github/bhattacharjee/mtu-nlp-assignment/blob/main/assignment1/NLP_Assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
import requests
def get_train_test_files():
    TRAIN_FILE = 'https://raw.githubusercontent.com/bhattacharjee/mtu-nlp-assignment/main/assignment1/Assessment1_Toxic_Train.csv'
    TEST_FILE = 'https://raw.githubusercontent.com/bhattacharjee/mtu-nlp-assignment/main/assignment1/Assessment1_Toxic_Test_For_Evaluation.csv'
    TRAIN_FILE_LOCAL = 'Assessment1_Toxic_Train.csv'
    TEST_FILE_LOCAL = 'Assessment1_Toxic_Test.csv'

    def download(url, localfile):
        with open(localfile, 'wb') as f:
            r = requests.get(url, allow_redirects=True)
            f.write(r.content)

    download(TRAIN_FILE, TRAIN_FILE_LOCAL)
    download(TEST_FILE, TEST_FILE_LOCAL)

    return TRAIN_FILE_LOCAL, TEST_FILE_LOCAL


In [49]:
!pip install spacy nltk huggingface -q                  >/dev/null 2>&1
!python -m spacy download de_core_news_sm               >/dev/null 2>&1

In [96]:
import pandas as pd
def get_train_test_df():
    train_csv, test_csv = get_train_test_files()

    train_df = pd.read_csv(train_csv)
    test_df = pd.read_csv(test_csv)

    return train_df, test_df

In [111]:
train_df, test_df = get_train_test_df()

In [98]:
import re
def remove_roles(line:str)->str:
    # Remove texts like @USER, @MODERATOR etc
    pat = re.compile(u'\@[A-Za-z]+')
    return re.sub(pat, '', line)

In [99]:
import re
def remove_emojis(line:str)->str:
    pat = re.compile(
        "["
            u"\U0001F600-\U0001F64F"
            u"\U0001F300-\U0001F5FF"
            u"\U0001F680-\U0001F6FF"
            u"\U0001F1E0-\U0001F1FF"
        "]", flags=re.UNICODE)
    return re.sub(pat, '', line)

In [100]:
import re
def remove_ellipses(line:str)->str:
    pat = re.compile(u'\.\.+')
    return re.sub(pat, ' ', line)

In [101]:
def to_lower(line:str)->str:
    return line.lower()

In [106]:
def replace_number_with_tag(line:str)->str:
    iline = line
    line = re.sub("\s\d*((\.|\,)\d+)?\s", " nummer ", line)
    line = re.sub('\s\d+$', ' nummer ', line)
    line = re.sub('^\d+\s', ' nummer ', line)
    return line

In [112]:
def remove_urls(line:str)->str:
    pat = re.compile(u'https?:\/\/\S+')
    retval = re.sub('https?:\/\/\S+', ' ', line)

In [113]:
def basic_clean(s:pd.Series)->pd.Series:
    return s.map(to_lower)                                                  \
            .map(remove_emojis)                                             \
            .map(remove_roles)                                              \
            .map(remove_ellipses)                                           \
            .map(replace_number_with_tag)                                   \
            .map(remove_urls)

train_df['comment_text'] = basic_clean(train_df['comment_text'])
test_df['comment_text'] = basic_clean(test_df['comment_text'])



wer nummer jahre zum mindestlohn arbeiten mu√ü, erh√§lt 514‚Ç¨ rente daf√ºr nummer das ist doch das grundproblem ü§¶‚Äç‚ôÄÔ∏è wir steuern somit sehenden auges in die massenaltersarmut ! habe selbst jetzt jahrelang zum mindestlohn gearbeitet , netto 1147‚Ç¨ bei stkl.1 . ergebnis ist jetzt ein alg1 von 769‚Ç¨ . nummer anderen besch√§ftigten im callcenter geht es da auch so nummer und da hilft kein gesabbel ! l√∂sungen m√ºssen her ! ü§¶‚Äç‚ôÄÔ∏è zum gie√ükannenprinzip halte ich entgegen , was ist mit dieser gie√ükanne hier ?? ü§¨ https://www.nzz.ch/meinung/kommentare/die-fluechtlingskosten-sind-ein-deutsches-tabuthema-ld.1316333 das angebliche nummer milliardenloch vom scholz ist √ºbrigens hier nummer https://www.welt.de/wirtschaft/article181239154/haushaltsreserve-warum-die-regierung-24-milliarden-euro-fuer-fluechtlinge-zurueckhaelt.html
wer nummer jahre zum mindestlohn arbeiten mu√ü, erh√§lt 514‚Ç¨ rente daf√ºr nummer das ist doch das grundproblem ü§¶‚Äç‚ôÄÔ∏è wir steuern somit seh

In [20]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import string


def is_punct_only(token:str)->bool:
    for c in list(token):
        if c not in string.punctuation:
            return False
    return True

def is_same(l1:list, l2:list)->bool:
    if (len(l1) != len(l2)):
        return False
    for x, y in zip(l1, l2):
        if x != y:
            return False
    return True

def do_basic_nlp_cleaning(line:str)->str:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('wordnet', quiet=True)

    # Tokenize
    tokens = word_tokenize(line)

    # Remove stop words
    stop_words = set(stopwords.words("german"))
    tokens = [w for w in tokens if w not in stop_words]

    # Remove punctuations
    tokens = [w for w in tokens if not is_punct_only(w)]

    # Stem words
    stem = SnowballStemmer('german')
    tokens = [stem.stem(w) for w in tokens]
        
    # Some tokens start with a punctuation, remove the first one
    def remove_first_punctuation(tok:str)->str:
        return                                                              \
            tok[1:]                                                         \
            if tok[0] in set(string.punctuation) and len(tok) != 0          \
            else tok
    tokens = [remove_first_punctuation(w) for w in tokens]


    return " ".join(tokens)


new_series = train_df['comment_text'].map(do_basic_nlp_cleaning)


"""
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

sentence = "I'm a dog and it's great! You're cool and Sandy's book is big. Don't tell her, you'll regret it! 'Hey', she'll say!"
nlp = English()
tokenizer = nlp.tokenizer
for x in nlp.tokenizer(sentence):
    print(x)
"""

'\nfrom spacy.tokenizer import Tokenizer\nfrom spacy.lang.en import English\n\nsentence = "I\'m a dog and it\'s great! You\'re cool and Sandy\'s book is big. Don\'t tell her, you\'ll regret it! \'Hey\', she\'ll say!"\nnlp = English()\ntokenizer = nlp.tokenizer\nfor x in nlp.tokenizer(sentence):\n    print(x)\n'

In [None]:
train_df['comment_text'].map(remove_roles).map(remove_emojis)