<a href="https://colab.research.google.com/github/bhattacharjee/mtu-nlp-assignment/blob/main/assignment1/NLP_Assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [64]:
import requests
def get_train_test_files():
    TRAIN_FILE = 'https://raw.githubusercontent.com/bhattacharjee/mtu-nlp-assignment/main/assignment1/Assessment1_Toxic_Train.csv'
    TEST_FILE = 'https://raw.githubusercontent.com/bhattacharjee/mtu-nlp-assignment/main/assignment1/Assessment1_Toxic_Test_For_Evaluation.csv'
    TRAIN_FILE_LOCAL = 'Assessment1_Toxic_Train.csv'
    TEST_FILE_LOCAL = 'Assessment1_Toxic_Test.csv'

    def download(url, localfile):
        with open(localfile, 'wb') as f:
            r = requests.get(url, allow_redirects=True)
            f.write(r.content)

    download(TRAIN_FILE, TRAIN_FILE_LOCAL)
    download(TEST_FILE, TEST_FILE_LOCAL)

    return TRAIN_FILE_LOCAL, TEST_FILE_LOCAL


In [65]:
!pip install spacy nltk huggingface -q                  >/dev/null 2>&1
!python -m spacy download de_core_news_sm               >/dev/null 2>&1

In [66]:
import pandas as pd
def get_train_test_df():
    train_csv, test_csv = get_train_test_files()

    train_df = pd.read_csv(train_csv)
    test_df = pd.read_csv(test_csv)

    return train_df, test_df

In [67]:
import re
def remove_roles(line:str)->str:
    # Remove texts like @USER, @MODERATOR etc
    pat = re.compile(u'\@[A-Za-z]+')
    return re.sub(pat, '', line)

In [77]:
import re
def remove_emojis(line:str)->str:
    pat = re.compile("["
                u"\U0001F600-\U0001F64F"  # emoticons
                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                u"\U00002702-\U000027B0"
                u"\U000024C2-\U0001F251"
                u"\U0001f926-\U0001f937"
                u'\U00010000-\U0010ffff'
                u"\u200d"
                u"\u2640-\u2642"
                u"\u2600-\u2B55"
                u"\u23cf"
                u"\u23e9"
                u"\u231a"
                u"\u3030"
                u"\ufe0f"
        "]+", flags=re.UNICODE)
    return re.sub(pat, ' EMOJI ', line)


In [69]:
import re
def remove_ellipses(line:str)->str:
    pat = re.compile(u'\.\.+')
    return re.sub(pat, ' ', line)

In [70]:
def to_lower(line:str)->str:
    return line.lower()

In [71]:
def replace_number_with_tag(line:str)->str:
    line = re.sub("\s\d*((\.|\,)\d+)?\s", " nummer ", line)
    line = re.sub('\s\d+$', ' nummer ', line)
    line = re.sub('^\d+\s', ' nummer ', line)
    return line

In [72]:
def remove_urls(line:str)->str:
    return re.sub('https?:\/\/\S+', ' hyperlink ', line)

In [73]:
def basic_clean(s:pd.Series)->pd.Series:
    return s.map(to_lower)                                                  \
            .map(remove_emojis)                                             \
            .map(remove_roles)                                              \
            .map(remove_ellipses)                                           \
            .map(replace_number_with_tag)                                   \
            .map(remove_urls)

def get_clean_train_test_df()->tuple:
    train_df, test_df = get_train_test_df()
    train_df['comment_text'] = basic_clean(train_df['comment_text'])
    test_df['comment_text'] = basic_clean(test_df['comment_text'])
    return train_df, test_df


In [82]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import string


def is_punct_only(token:str)->bool:
    for c in list(token):
        if c not in string.punctuation:
            return False
    return True

def is_same(l1:list, l2:list)->bool:
    if (len(l1) != len(l2)):
        return False
    for x, y in zip(l1, l2):
        if x != y:
            return False
    return True

def do_basic_nlp_cleaning(line:str)->str:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('wordnet', quiet=True)

    # Tokenize
    tokens = word_tokenize(line)

    # Some tokens start with a punctuation, remove the first one
    def remove_first_punctuation(tok:str)->str:
        return                                                              \
            tok[1:]                                                         \
            if tok[0] in set(string.punctuation) and len(tok) != 0          \
            else tok

    tokens = [remove_first_punctuation(w) for w in tokens]

    # Remove stop words
    stop_words = set(stopwords.words("german"))
    tokens = [w for w in tokens if w not in stop_words]

    # Remove punctuations
    tokens = [w for w in tokens if not is_punct_only(w)]

    # Stem words
    stem = SnowballStemmer('german')
    tokens = [stem.stem(w) for w in tokens]

    return " ".join(tokens)


train_df, test_df = get_clean_train_test_df()
new_series = train_df['comment_text'].map(do_basic_nlp_cleaning)


"""
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

sentence = "I'm a dog and it's great! You're cool and Sandy's book is big. Don't tell her, you'll regret it! 'Hey', she'll say!"
nlp = English()
tokenizer = nlp.tokenizer
for x in nlp.tokenizer(sentence):
    print(x)
"""

3 ==>   was denn?
4 ==>   was denn?
2 ==>   es war einmal 
3 ==>   es war einmal 
4 ==>   es war einmal 
3 ==>   aber sie?
4 ==>   aber sie?
2 ==>   doch
3 ==>   doch
4 ==>   doch
2 ==>  es sollte aber so sein
3 ==>  es sollte aber so sein
4 ==>  es sollte aber so sein
3 ==>   ich auch nicht !
4 ==>   ich auch nicht !
2 ==>   werde ich auch
3 ==>   werde ich auch
4 ==>   werde ich auch
3 ==>   ich auch nicht !
4 ==>   ich auch nicht !
2 ==>  keiner 
3 ==>  keiner 
4 ==>  keiner 
3 ==>   ???????????
4 ==>   ???????????
3 ==>   so wie ihrer hier??
4 ==>   so wie ihrer hier??
3 ==>   von was??
4 ==>   von was??
2 ==>   und
3 ==>   und
4 ==>   und
3 ==>  und nun?
4 ==>  und nun?
2 ==>   ich auch
3 ==>   ich auch
4 ==>   ich auch
2 ==>   .so ist es
3 ==>   .so ist es
4 ==>   .so ist es


'\nfrom spacy.tokenizer import Tokenizer\nfrom spacy.lang.en import English\n\nsentence = "I\'m a dog and it\'s great! You\'re cool and Sandy\'s book is big. Don\'t tell her, you\'ll regret it! \'Hey\', she\'ll say!"\nnlp = English()\ntokenizer = nlp.tokenizer\nfor x in nlp.tokenizer(sentence):\n    print(x)\n'

In [75]:
train_df['comment_text'].map(remove_roles).map(remove_emojis)

0       gestern bei illner, montag bei nummer  ist das...
1       mein gott der war erst gestern bei illner. die...
2        die cdu lässt das so wie so nicht zu . sagen ...
3       bei meiner beschissenen rente als 2x geschiede...
4       wer nummer jahre zum mindestlohn arbeiten muß,...
                              ...                        
3189    hier mal eine info. flüchtlinge werden nummer ...
3190    .aha .mal abwarten kommt bei uns auch .firmen ...
3191                                           .so ist es
3192                                       .die warten da
3193     .das bekommen die gesagt wie sich verhalten s...
Name: comment_text, Length: 3194, dtype: object

In [75]:
print(new_series)