<a href="https://colab.research.google.com/github/bhattacharjee/mtu-nlp-assignment/blob/main/assignment1/NLP_Assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
import requests
def get_train_test_files():
    TRAIN_FILE = 'https://raw.githubusercontent.com/bhattacharjee/mtu-nlp-assignment/main/assignment1/Assessment1_Toxic_Train.csv'
    TEST_FILE = 'https://raw.githubusercontent.com/bhattacharjee/mtu-nlp-assignment/main/assignment1/Assessment1_Toxic_Test_For_Evaluation.csv'
    TRAIN_FILE_LOCAL = 'Assessment1_Toxic_Train.csv'
    TEST_FILE_LOCAL = 'Assessment1_Toxic_Test.csv'

    def download(url, localfile):
        with open(localfile, 'wb') as f:
            r = requests.get(url, allow_redirects=True)
            f.write(r.content)

    download(TRAIN_FILE, TRAIN_FILE_LOCAL)
    download(TEST_FILE, TEST_FILE_LOCAL)

    return TRAIN_FILE_LOCAL, TEST_FILE_LOCAL


In [108]:
!pip install spacy nltk huggingface -q                  >/dev/null 2>&1
!python -m spacy download de_core_news_sm               >/dev/null 2>&1

In [89]:
import pandas as pd
def get_train_test_df():
    train_csv, test_csv = get_train_test_files()

    train_df = pd.read_csv(train_csv)
    test_df = pd.read_csv(test_csv)

    return train_df, test_df

In [90]:
train_df, test_df = get_train_test_df()

In [91]:
import re
def remove_roles(line:str)->str:
    # Remove texts like @USER, @MODERATOR etc
    pat = re.compile(u'\@[A-Za-z]+')
    return re.sub(pat, '', line)

In [92]:
import re
def remove_emojis(line:str)->str:
    pat = re.compile(
        "["
            u"\U0001F600-\U0001F64F"
            u"\U0001F300-\U0001F5FF"
            u"\U0001F680-\U0001F6FF"
            u"\U0001F1E0-\U0001F1FF"
        "]", flags=re.UNICODE)
    return re.sub(pat, '', line)

In [93]:
import re
def remove_ellipses(line:str)->str:
    pat = re.compile(u'\.\.+')
    return re.sub(pat, ' ', line)

In [94]:
def to_lower(line:str)->str:
    return line.lower()

In [95]:
def basic_clean(s:pd.Series)->pd.Series:
    return s.map(to_lower)                                                  \
            .map(remove_emojis)                                             \
            .map(remove_roles)                                              \
            .map(remove_ellipses)
    
print(train_df.columns)
print(train_df['comment_text'])

train_df['comment_text'] = basic_clean(train_df['comment_text'])
test_df['comment_text'] = basic_clean(test_df['comment_text'])


Index(['comment_text', 'Sub1_Toxic', 'Sub2_Engaging', 'Sub3_FactClaiming'], dtype='object')
0       Gestern bei Illner, Montag bei @MODERATOR ...i...
1       Mein Gott der war erst gestern bei Illner. Die...
2       @USER Die CDU lässt das so wie so nicht zu . S...
3       Bei meiner beschissenen Rente als 2x Geschiede...
4       Wer 40 Jahre zum Mindestlohn arbeiten muß, erh...
                              ...                        
3189    Hier mal eine Info. Flüchtlinge werden 10 km v...
3190    @USER.aha .Mal abwarten kommt bei uns auch .Fi...
3191                                     @USER .So ist es
3192                                 @USER .Die warten da
3193    @USER .Das bekommen die gesagt wie sich verhal...
Name: comment_text, Length: 3194, dtype: object


In [106]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import string


def is_punct_only(token:str)->bool:
    for c in list(token):
        if c not in string.punctuation:
            return False
    return True

def is_same(l1:list, l2:list)->bool:
    if (len(l1) != len(l2)):
        return False
    for x, y in zip(l1, l2):
        if x != y:
            return False
    return True

def do_basic_nlp_cleaning(line:str)->str:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('wordnet', quiet=True)

    # Tokenize
    tokens = word_tokenize(line)

    # Remove stop words
    stop_words = set(stopwords.words("german"))
    tokens = [w for w in tokens if w not in stop_words]

    # Remove punctuations
    tokens = [w for w in tokens if not is_punct_only(w)]

    # Stem words
    stem = SnowballStemmer('german')
    tokens = [stem.stem(w) for w in tokens]
        
    # Some tokens start with a punctuation, remove the first one
    def remove_first_punctuation(tok:str)->str:
        return tok[1:] if tok[0] in set(string.punctuation) else tok
    tokens = [remove_first_punctuation(w) for w in tokens]


    return " ".join(tokens)


new_series = train_df['comment_text'].map(do_basic_nlp_cleaning)

print(len(new_series))
print(new_series)
print(new_series)


"""
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

sentence = "I'm a dog and it's great! You're cool and Sandy's book is big. Don't tell her, you'll regret it! 'Hey', she'll say!"
nlp = English()
tokenizer = nlp.tokenizer
for x in nlp.tokenizer(sentence):
    print(x)
"""

3194
0       gest illn montag langsam or-parteihilf grunlin...
1                    gott erst gest illn redaktion versag
2       cdu lasst sag reich bekomm soll 10 milliard ho...
3       beschiss rent 2x geschied mann steu krankenkas...
4       wer 40 jahr mindestlohn arbeit muss erhalt 514...
                              ...                        
3189    mal info fluchtling 10 km kust schlauchboot st...
3190    aha mal abwart kommt firm entlass gerad viel m...
3191                                                   so
3192                                             die wart
3193    das bekomm gesagt verhalt soll kameras gericht...
Name: comment_text, Length: 3194, dtype: object
0       gest illn montag langsam or-parteihilf grunlin...
1                    gott erst gest illn redaktion versag
2       cdu lasst sag reich bekomm soll 10 milliard ho...
3       beschiss rent 2x geschied mann steu krankenkas...
4       wer 40 jahr mindestlohn arbeit muss erhalt 514...
                   

'\nfrom spacy.tokenizer import Tokenizer\nfrom spacy.lang.en import English\n\nsentence = "I\'m a dog and it\'s great! You\'re cool and Sandy\'s book is big. Don\'t tell her, you\'ll regret it! \'Hey\', she\'ll say!"\nnlp = English()\ntokenizer = nlp.tokenizer\nfor x in nlp.tokenizer(sentence):\n    print(x)\n'

In [9]:
train_df['comment_text'].map(remove_roles).map(remove_emojis)

0       Gestern bei Illner, Montag bei  ...ist das nic...
1       Mein Gott der war erst gestern bei Illner. Die...
2        Die CDU lässt das so wie so nicht zu . Sagen ...
3       Bei meiner beschissenen Rente als 2x Geschiede...
4       Wer 40 Jahre zum Mindestlohn arbeiten muß, erh...
                              ...                        
3189    Hier mal eine Info. Flüchtlinge werden 10 km v...
3190    .aha .Mal abwarten kommt bei uns auch .Firmen ...
3191                                           .So ist es
3192                                       .Die warten da
3193     .Das bekommen die gesagt wie sich verhalten s...
Name: comment_text, Length: 3194, dtype: object