In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import re
import json
from nltk.corpus import stopwords
import nltk
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
stopwords = stopwords.words('english')
tqdm.pandas()

In [2]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F" # emoticons
                           u"\U0001F300-\U0001F5FF" # symbols & pictographs
                           u"\U0001F680-\U0001F6FF" # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF" # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [3]:
# Some imports and global variables related to lemmatization
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from collections import defaultdict

tree_tag_map = defaultdict(lambda : wn.NOUN)
tree_tag_map['J'], tree_tag_map['V'], tree_tag_map['R'] = wn.ADJ, wn.VERB, wn.ADV
lemmatize_function = WordNetLemmatizer()
lemmatized_words_dp, LEMMATIZE_DP_LIMIT = {}, 500000

In [4]:
def do_lemmatize(token):
    if lemmatized_words_dp.get(token):
        return lemmatized_words_dp[token]
    new_token = [lemmatize_function.lemmatize(token, tree_tag_map[tag[0]]) for token, tag in pos_tag([token])]
    if len(lemmatized_words_dp) != LEMMATIZE_DP_LIMIT:
        lemmatized_words_dp[token] = new_token[0]
    return new_token[0]

In [5]:
def clean_text(x):
    x = str(x).lower()
    x = remove_emoji(x)
    x = re.sub("@[A-Za-z0-9]+"," ", x) # remove user IDs
    x = re.sub("(http://.*?\s)|(http://.*)",' ',str(x)) #remove http links in the text
    x = re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",' ',str(x)) # remove IP addresses
    x = re.sub('[!"#$%&\()*+,.-/:;<=>?@[\\]^_`{|}~]',' ',str(x)) #remove all punctuation
    #x = x.split(" ")
    x = re.split("\||/|\=|_|\:|\+|,|~|\^|#|\[|\]|\(|\)|\{|\}|<|>|\!|&|;|\?|\*|%|\$|@|`| |-", x)
    x = [i.strip("|/=.-_:'+,~^#[](){}<>!&;?*%$@`\"") for i in x if i not in stopwords]
    x = [do_lemmatize(w) for w in x]
    x = " ".join(x)
    x = ' '.join(re.sub("[\.]", " ", x).split())
    return x

In [6]:
train = pd.read_csv("unclean/train.csv")
test = pd.read_csv("unclean/test.csv")
val = pd.read_csv("unclean/val.csv")

In [7]:
train["heading"] = train["heading"].progress_apply(lambda x: clean_text(x))
train["body"] = train["body"].progress_apply(lambda x: clean_text(x))
test["heading"] = test["heading"].progress_apply(lambda x: clean_text(x))
test["body"] = test["body"].progress_apply(lambda x: clean_text(x))
val["heading"] = val["heading"].progress_apply(lambda x: clean_text(x))
val["body"] = val["body"].progress_apply(lambda x: clean_text(x))

  0%|          | 0/93652 [00:00<?, ?it/s]

  0%|          | 0/93652 [00:00<?, ?it/s]

  0%|          | 0/11707 [00:00<?, ?it/s]

  0%|          | 0/11707 [00:00<?, ?it/s]

  0%|          | 0/11706 [00:00<?, ?it/s]

  0%|          | 0/11706 [00:00<?, ?it/s]

In [8]:
train.to_csv("clean/train.csv",index=False)
test.to_csv("clean/test.csv",index=False)
val.to_csv("clean/val.csv",index=False)

In [9]:
print("Done")

Done


In [10]:
df = pd.read_csv("clean/train.csv")

In [11]:
df

Unnamed: 0,heading,body,label
0,two gop congressman say suspect terrorist caug...,31 year old canadian israeli woman travel iraq...,1
1,jasmine tridevil woman three breast denies sur...,saudi arabia’s national airline carrier planni...,1
2,drake's late project playlist — format hadn’t ...,almost decade drake star also curator artist r...,0
3,police woman “trained” poison kim jong un’s ha...,kuala lumpur malaysia two woman suspect fatall...,0
4,u official russian drone may turn camera right...,david choi business insider 8 04 2017 03 00 1 ...,0
...,...,...,...
93647,there’s catch opec’s plan cut production,’ ’ ’ opec ’ ’ ’ ’ additionally also agree tag...,0
93648,pumpkin spice condom real thing,man know jihadi john photo reuters the world k...,1
93649,video show isil beheading photojournalist jame...,touch tribute victim charlie hebdo shoot post ...,1
93650,canada probe michael zehaf bibeau possible sus...,man behind ottawa shooting canadian convert is...,0
