In [None]:
def clean_text(text):
    """Lower-case ``text``, mask mentions/URLs and strip punctuation.

    Processing order:

    1. coerce the input to ``str``, lower-case it and trim the ends;
    2. replace every token starting with ``@`` or ``http(s)://`` with the
       placeholder ``HTTPURL``;
    3. delete every character that is neither a word character nor
       whitespace;
    4. collapse whitespace runs into a single space and trim again.

    Args:
        text: The raw text; it is passed through ``str()`` first.

    Returns:
        The cleaned string.
    """
    # Coerce before calling any str method so the str() conversion is effective.
    text = str(text).lower().strip()
    # Mask mentions/URLs *before* removing punctuation: once '@', ':' and '/'
    # have been stripped this pattern can no longer match anything.
    text = re.sub(r"(?:\@|https?\://)\S+", "HTTPURL", text)
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [None]:
# As parts of the data were not in English, we checked for other languages in order to exclude e.g. French or German text.
# We found about 200 entries with mixed languages.

import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

def get_lang_detector(nlp, name):
    """Factory callback that builds a fresh ``LanguageDetector`` component.

    ``nlp`` and ``name`` are accepted to satisfy the factory call signature
    but are not used by this function.
    """
    detector = LanguageDetector()
    return detector

# Load the "en_core_web_trf" spaCy pipeline; it is shared as the global `nlp`.
nlp = spacy.load("en_core_web_trf")
# Register get_lang_detector as the factory for the component name
# "language_detector" so that add_pipe can look it up by that name.
Language.factory("language_detector", func=get_lang_detector)
# Append the detector as the last component of the pipeline.
nlp.add_pipe('language_detector', last=True)


In [None]:
def get_lang(text):
    """Run ``text`` through the global ``nlp`` pipeline and return its language.

    The value returned is the ``'language'`` entry of the ``doc._.language``
    extension filled in by the pipeline (presumably a language code such as
    ``'en'`` -- confirm against the detector's documentation).
    """
    detection = nlp(text)._.language
    return detection['language']

In [None]:
# !pip install -U spacy[cuda112]
# Ask spaCy to prefer the GPU; the call's result is stored in `gpu` and printed.
# NOTE(review): spaCy's documentation recommends calling prefer_gpu() before
# any pipeline is loaded, but this cell appears after the spacy.load(...) cell
# -- confirm the intended execution order.
gpu = spacy.prefer_gpu()
print('GPU:', gpu)

In [None]:
# Detect the language of every entry in the 'text' column and store it in 'lang'.
complete_data['lang'] = complete_data['text'].apply(get_lang)

In [None]:
import re
import textacy

def normalize_text(text):
    """Normalize raw text with a sequence of textacy and regex clean-up steps.

    The steps run in this order: strip emojis, strip unicode characters,
    remove bullet points, normalize hyphens and quotes, replace URLs with
    ``HTTPURL``, strip hashtags, replace e-mail addresses with ``EMAIL``,
    drop HTML tags, and finally collapse whitespace.

    NOTE(review): the ``textacy.preprocess.*`` helper names used below should
    be verified against the installed textacy version -- newer releases
    expose their preprocessing helpers under ``textacy.preprocessing``.

    Args:
        text: The raw input string.

    Returns:
        The normalized string with leading/trailing whitespace removed.
    """
    # remove emojis
    text = textacy.preprocess.replace_emojis(text, '')
    # remove unicode characters
    text = textacy.preprocess.replace_unicode(text, '')
    # remove noisy characters (bullet points), normalize hyphens and quotes
    text = textacy.preprocess.remove_punct(text, marks='•')
    text = textacy.preprocess.replace_hyphens(text, '-')
    text = textacy.preprocess.replace_quotes(text, '"')
    # replace URLs with the placeholder token
    text = textacy.preprocess.replace_urls(text, 'HTTPURL')
    # remove hashtags
    text = textacy.preprocess.replace_hashtags(text, '')
    # replace email addresses with the placeholder token
    text = textacy.preprocess.replace_emails(text, 'EMAIL')
    # remove HTML tags
    text = re.sub(r'<[^<]+?>', '', text)
    # collapse whitespace runs; raw string avoids the invalid '\s' escape
    text = re.sub(r'\s+', ' ', text).strip()

    return text


## Other Preprocessing Steps
- splitting the longer text into pieces
- for Roberta-large, we removed: 
  *   emojis
  *   unicode
  *   'noisy' characters (bullet points, hyphenated words, quotes)
  *   urls
  *   hashtags
  *   email
  *   html tags
  * this was based on textacy: https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html
  * our model did not improve with those preprocessing steps, so they are not part of the final model
* We removed a list of signs and symbols: 
{'"',
 '#',
 '%',
 '&',
 "'",
 '(',
 ')',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '=',
 '?',
 '[',
 ']',
 '_',
 '°',
 '±',
 'é',
 'č',
 'š',
 '\u200b',
 '‐',
 '–',
 '—',
 '‘',
 '’',
 '“',
 '”',
 '•'}