In [1]:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
import unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tqdm import tqdm
import codecs
import nltk
import matplotlib.pyplot as plt
import spacy

In [2]:
# NOTE(review): hard-coded absolute cluster path; the notebook only runs where this file exists.
fdir = "/work/tadesse/beichen/Tweets_DIR/Data/DIR/DIR_2011_2020.csv"
df_dir = pd.read_csv(fdir)
# The last seven columns are selected by position and cast to an integer matrix.
# This assumes the label columns are the final seven in the CSV -- confirm against the file.
labels = df_dir.iloc[:, -7:].values.astype("int")
labels

array([[0, 0, 0, ..., 1, 0, 1],
       [0, 0, 0, ..., 1, 0, 1],
       [0, 0, 0, ..., 1, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 1]])

In [3]:
#remove html tags
def remove_html_tags(text):
    """
    Return the visible text of an HTML fragment.

    ``iframe`` and ``script`` elements are removed together with their
    contents, the remaining markup is discarded by ``get_text()``, and every
    run of double quotes, pipes, newlines or carriage returns is deleted
    (deleted, not replaced by a space, so adjacent lines are joined).
    """
    parsed = BeautifulSoup(text, "html.parser")
    for unwanted in parsed(['iframe', 'script']):
        unwanted.extract()
    visible_text = parsed.get_text()
    # Inside [...] the "|" characters are literal members of the class,
    # which is why pipe characters are stripped as well.
    return re.sub(r'["\|\n|\r|\n\r]+', '', visible_text)

#remove URLs (http/https links):
def remove_html(text):
    """
    Strip http/https URLs from ``text``.

    Despite the name, no HTML is touched here: each ``http://`` or
    ``https://`` prefix and the non-whitespace characters following it are
    replaced by the empty string. Surrounding whitespace is kept.
    """
    url_pattern = r'https?:\/\/\S*'
    return re.sub(url_pattern, '', text, flags=re.MULTILINE)

In [4]:
#removing accented characters
def remove_accented_chars(text):
    """
    Fold ``text`` to plain ASCII.

    Accented letters are decomposed with NFKD and their combining marks are
    dropped (e.g. "é" becomes "e"). Characters that still have no ASCII form
    after decomposition are removed.

    Typographic apostrophes (U+2018, U+2019, U+02BC) have no NFKD
    decomposition, so they are first mapped to the ASCII apostrophe. Without
    that step they would be dropped outright, and a contraction such as
    "don’t" would reach later processing as "dont", where it can no longer
    be recognised as a contraction.
    """
    apostrophe_table = str.maketrans({"\u2018": "'", "\u2019": "'", "\u02bc": "'"})
    text = text.translate(apostrophe_table)
    text = unicodedata.normalize('NFKD', text).encode('ascii','ignore').decode('utf-8','ignore')
    return text

In [5]:
#expanding contractions
from contractions import CONTRACTION_MAP
import re

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """
    Replace contractions in ``text`` with their expansions, then delete
    every remaining apostrophe.

    text: the string to process.
    contraction_mapping: dict of contraction -> expansion. Matching is
        case-insensitive; an exact-case key wins over a lower-case lookup.

    Returns the expanded string with all "'" characters removed.

    Details that matter for correctness:
      * Keys are tried longest first, so a short key cannot shadow a longer
        key that starts with it.
      * Keys are regex-escaped, so they are always matched literally.
      * A contraction is only matched when it is not glued to other letters,
        so a key is not expanded inside a longer word (for example a
        possessive whose tail happens to equal a key).
      * The case of the first character of the matched text is transferred
        to the expansion instead of splicing the character itself, which
        keeps expansions intact when they start with a different character
        than the contraction.
      * A match with no usable expansion is left unchanged.
    """
    if not contraction_mapping:
        # Nothing to expand; still honour the apostrophe-stripping contract.
        return re.sub("'", "", text)

    # Fallback table for matches whose exact casing is not a key.
    lowered_mapping = {key.lower(): value
                       for key, value in contraction_mapping.items()}
    # Regex alternation picks the first alternative that matches, so order
    # the keys from longest to shortest.
    alternatives = sorted(contraction_mapping.keys(), key=len, reverse=True)
    contractions_pattern = re.compile(
        r'(?<![A-Za-z])({})(?![A-Za-z])'.format(
            '|'.join(re.escape(key) for key in alternatives)),
        flags=re.IGNORECASE|re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = (contraction_mapping.get(match)
                                or lowered_mapping.get(match.lower()))
        if not expanded_contraction:
            return match
        first_char = match[0]
        if first_char.isupper():
            return expanded_contraction[0].upper() + expanded_contraction[1:]
        if first_char.islower():
            return expanded_contraction[0].lower() + expanded_contraction[1:]
        # Match starts with a non-letter (e.g. an apostrophe): use the
        # expansion exactly as given.
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [6]:
#removing special characters:
def remove_special_characters(text):
    """
    Remove every character that is not an ASCII letter or whitespace.

    Digits and punctuation are deleted; whitespace is kept as-is.

    The character class uses ``A-Z``: the range ``A-z`` also covers the
    characters between "Z" and "a" in ASCII ([ \\ ] ^ _ and the backtick),
    which would then survive the clean-up.

    Note: a later cell in this notebook defines a function with the same
    name and an extra ``remove_digits`` parameter, which replaces this one.
    """
    text = re.sub(r'[^a-zA-Z\s]','',text)
    return text

In [7]:
def remove_special_characters(text, remove_digits=False):
    """
    Remove punctuation and other special characters from ``text``.

    text: the string to clean.
    remove_digits: when False (default) ASCII letters, digits and whitespace
        are kept; when True digits are removed as well.

    Returns the cleaned string. Whitespace is never removed or collapsed.

    The character classes use ``A-Z``: the range ``A-z`` also covers the
    characters between "Z" and "a" in ASCII ([ \\ ] ^ _ and the backtick),
    which would then survive the clean-up.
    """
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text

# Quick check: punctuation and, with remove_digits=True, the digits are stripped.
remove_special_characters("Well this was fun! What do you think? 123#@!", remove_digits=True)

'Well this was fun What do you think '

In [8]:
def edits0(word):
    """
    Zero-edit candidates: a set whose only member is ``word`` itself.
    """
    return set((word,))



def edits1(word):
    """
    Build the set of strings exactly one edit operation away from ``word``.

    The operations are: deleting one character, swapping two adjacent
    characters, replacing one character with a lower-case ASCII letter, and
    inserting one lower-case ASCII letter at any position. A replacement by
    the same letter yields ``word`` itself, so a non-empty ``word`` is part
    of the result.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    # Every way of cutting the word into a (head, tail) pair, including the
    # cuts with an empty head or an empty tail.
    cuts = [(word[:pos], word[pos:]) for pos in range(len(word) + 1)]

    candidates = set()
    # deletions: drop the first character of the tail
    candidates.update(head + tail[1:] for head, tail in cuts if tail)
    # transpositions: swap the first two characters of the tail
    candidates.update(head + tail[1] + tail[0] + tail[2:]
                      for head, tail in cuts if len(tail) > 1)
    # replacements: overwrite the first character of the tail
    candidates.update(head + letter + tail[1:]
                      for head, tail in cuts for letter in letters if tail)
    # insertions: put a letter between head and tail
    candidates.update(head + letter + tail
                      for head, tail in cuts for letter in letters)
    return candidates


def edits2(word):
    """
    Build the set of strings reachable from ``word`` by applying
    ``edits1`` twice in a row.
    """
    twice_edited = set()
    for once_edited in edits1(word):
        twice_edited.update(edits1(once_edited))
    return twice_edited
def known(words):
    """
    Keep only the entries of ``words`` that are present in WORD_COUNTS and
    return them as a set.

    NOTE(review): WORD_COUNTS is not defined in the visible part of this
    notebook; it has to exist before this function is called.
    """
    recognised = set()
    for candidate in words:
        if candidate in WORD_COUNTS:
            recognised.add(candidate)
    return recognised
def correct(word):
    """
    Pick the most likely spelling for ``word``.

    Candidate sets are tried in order of edit distance (0, then 1, then 2);
    the first non-empty set of known words is used, and the candidate with
    the highest WORD_COUNTS value is returned. If no known candidate is
    found at any distance, ``word`` itself is returned.
    """
    for generate in (edits0, edits1, edits2):
        candidates = known(generate(word))
        if candidates:
            break
    else:
        # Nothing known within two edits: fall back to the input word.
        candidates = [word]
    return max(candidates, key=WORD_COUNTS.get)
def correct_match(match):
    """
    Substitution callback: spell-correct the word held by a regex match
    object and give the result the casing style of the original word
    (all upper, all lower, title case, or otherwise left as returned).
    """
    original = match.group()
    corrected = correct(original.lower())
    if original.isupper():
        return corrected.upper()
    if original.islower():
        return corrected.lower()
    if original.istitle():
        return corrected.title()
    # Mixed casing: return the correction as-is.
    return str(corrected)

    
def correct_text_generic(text):
    """
    Spell-correct every run of ASCII letters in ``text`` via
    ``correct_match`` and return the resulting string. Characters outside
    those runs are left untouched.
    """
    word_pattern = re.compile('[a-zA-Z]+')
    return word_pattern.sub(correct_match, text)

In [9]:
#removing stopwords
#not necessary for bert
# Shared tokenizer instance used by remove_stopwords below.
tokenizer = ToktokTokenizer()
# Fetches the NLTK stop-word corpus into the local nltk_data directory.
nltk.download('stopwords')
stopword_list = nltk.corpus.stopwords.words('english')
def remove_stopwords(text, is_lower_case=False, stopwords=stopword_list):
    """
    Tokenize ``text`` with the module-level ``tokenizer`` and drop the
    tokens found in ``stopwords``.

    text: the string to filter.
    is_lower_case: when True, tokens are compared to ``stopwords`` as they
        are; when False (default) each token is lower-cased for the
        comparison only.
    stopwords: collection of words to drop.

    Returns the surviving tokens, in their original casing, joined by
    single spaces.
    """
    comparable = (lambda tok: tok) if is_lower_case else str.lower
    kept_tokens = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        if comparable(token) not in stopwords:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tadesse/beichen/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
def preprocessing_text(text_arr):
    """
    Clean every string in ``text_arr`` and return the results as a list,
    in the same order.

    Each text goes through these steps in sequence: URL removal, HTML tag
    removal, accent removal, contraction expansion, and special-character
    removal with digits kept. Lower-casing and stop-word removal are
    deliberately not applied. A completion message is printed once all
    texts have been processed.

    NOTE(review): every element is passed straight to ``re.sub``, so the
    elements are expected to be strings.
    """
    def _clean(raw_text):
        without_urls = remove_html(raw_text)
        without_markup = remove_html_tags(without_urls)
        ascii_text = remove_accented_chars(without_markup)
        expanded = expand_contractions(ascii_text, contraction_mapping=CONTRACTION_MAP)
        return remove_special_characters(expanded, remove_digits=False)

    preprocessed_text = [_clean(text) for text in text_arr]
    print('Data Preprocessing finished.')
    return preprocessed_text

In [11]:
# Number of whitespace-separated tokens in each raw title.
df_dir["Title"].str.split().str.len()

0         5
1         5
2         7
3        15
4         7
         ..
14173     8
14174    27
14175    11
14176    13
14177    10
Name: Title, Length: 14178, dtype: int64

In [12]:
# Cleaned titles, as a list aligned with the rows of df_dir.
title = preprocessing_text(df_dir["Title"])

Data Preprocessing finished.


In [13]:
# Spot check: cleaned title first, raw title second.
print(title[20], df_dir["Title"].iloc[20], sep="\n")

Relief sought for dry wells in New Hampshire
Relief sought for dry wells in New Hampshire


In [14]:
# Cleaned descriptions, as a list aligned with the rows of df_dir.
description = preprocessing_text(df_dir["Description"])

Data Preprocessing finished.


In [15]:
# Spot check: cleaned description first, raw description second.
print(description[100], df_dir["Description"].iloc[100], sep="\n")

Utahs fire season has been busy and started early with firefighters responding to hundreds of wildfires statewide by midJune after that state experienced its third driest spring on record  The fire season typically begins at the start of June and extends through October  Daily Herald Provo Utah June 19 2020
Utah’s fire season has been busy and started early, with firefighters responding to hundreds of wildfires statewide by mid-June after that state experienced its third driest spring on record.  The fire season typically begins at the start of June and extends through October.  Daily Herald (Provo, Utah), June 19, 2020


In [18]:
# Attach the cleaned descriptions as a new column (appended after the existing columns).
df_dir["processed_description"] = description

In [20]:
# Attach the cleaned titles as a new column (appended after the existing columns).
df_dir["processed_title"] = title

In [23]:
# Write the frame, including the processed_* columns, without the row index.
# `index` is a boolean flag, so pass False explicitly instead of relying on None being falsy.
# NOTE(review): the processed_* columns were appended after the original columns, so
# selecting labels by position (e.g. the last seven columns) will not give the same
# columns when this output file is read back.
df_dir.to_csv("/work/tadesse/beichen/Tweets_DIR/Data/DIR/DIR_processed.csv", index=False)