In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

pd.set_option('display.max_colwidth', 1000)

import os
import nltk
import csv
from bs4 import BeautifulSoup
import re
import itertools
import emoji

In [2]:
# Input location, relative to the notebook's working directory.
DATA_PATH = 'data/'
TRAIN_FEATURES = os.path.join(DATA_PATH, 'train_example.csv')

# The first CSV column holds the row labels, so it is used as the index
# instead of being loaded as a regular column.
train_df = pd.read_csv(
    TRAIN_FEATURES,
    encoding='utf-8',
    index_col=0,
)

In [3]:
# Quick sanity check of the loaded frame: (rows, columns), then the column names.
for summary in (train_df.shape, train_df.columns.values):
    print(summary)

(7520, 2)
['text' 'target']


In [4]:
# Preview the first five raw tweets together with their labels.
train_df.head(n=5)

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,"13,000 people receive #wildfires evacuation orders in California",1
4,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


Tweets require a lot of cleaning, but cleaning every single tweet by hand is impractical because it would take far too much time.
A general, automated cleaning approach must be implemented instead.

Furthermore, tweets are short messages that contain many emojis, contractions, hashtags, misspelled words and slang.
In their raw form most of these have little value for sentiment analysis and need to be normalized or removed.

### Data cleaning

- Punctuation marks #, @, !, ?, +, &, -, $, =, <, >, |, {, }, ^, ', (, ), [, ], *, %, ..., ', ., :, ; are separated from words
- Special characters that are attached to words are removed completely
- Contractions are expanded
- URLs are removed
- Character entity references are replaced with their actual symbols
- Typos and slang are corrected, and informal abbreviations are written in their long forms
- Usernames are removed

In [5]:
def load_dict_smileys():
    """Return a mapping from text emoticons to a coarse sentiment word.

    Values are one of ``"smiley"``, ``"sad"``, ``"playful"`` or ``"love"``.

    Many of the literal keys below are written with a typographic
    (non-breaking) hyphen rather than the ASCII ``-`` that people actually
    type, so an emoticon such as ``:-)`` would never be found. For every such
    key an ASCII-hyphen alias is added; the original keys are all kept, so
    existing lookups behave exactly as before.

    Returns:
        dict[str, str]: a new dictionary on every call.
    """
    smileys = {
        ":‑)":"smiley",
        ":-]":"smiley",
        ":-3":"smiley",
        ":->":"smiley",
        "8-)":"smiley",
        ":-}":"smiley",
        ":)":"smiley",
        ":]":"smiley",
        ":3":"smiley",
        ":>":"smiley",
        "8)":"smiley",
        ":}":"smiley",
        ":o)":"smiley",
        ":c)":"smiley",
        ":^)":"smiley",
        "=]":"smiley",
        "=)":"smiley",
        ":-))":"smiley",
        ":‑D":"smiley",
        "8‑D":"smiley",
        "x‑D":"smiley",
        "X‑D":"smiley",
        ":D":"smiley",
        "8D":"smiley",
        "xD":"smiley",
        "XD":"smiley",
        ":‑(":"sad",
        ":‑c":"sad",
        ":‑<":"sad",
        ":‑[":"sad",
        ":(":"sad",
        ":c":"sad",
        ":<":"sad",
        ":[":"sad",
        ":-||":"sad",
        ">:[":"sad",
        ":{":"sad",
        ":@":"sad",
        ">:(":"sad",
        ":'‑(":"sad",
        ":'(":"sad",
        ":‑P":"playful",
        "X‑P":"playful",
        "x‑p":"playful",
        ":‑p":"playful",
        ":‑Þ":"playful",
        ":‑þ":"playful",
        ":‑b":"playful",
        ":P":"playful",
        "XP":"playful",
        "xp":"playful",
        ":p":"playful",
        ":Þ":"playful",
        ":þ":"playful",
        ":b":"playful",
        "<3":"love"
        }

    # Unicode hyphen/dash look-alikes that must also match a plain ASCII "-".
    to_ascii_hyphen = dict.fromkeys(
        map(ord, "\u2010\u2011\u2012\u2013\u2014\u2212"), "-"
    )
    for emoticon, label in list(smileys.items()):
        # setdefault: an explicitly listed ASCII key always wins over an alias.
        smileys.setdefault(emoticon.translate(to_ascii_hyphen), label)

    return smileys


def load_dict_contractions():
    """Return a mapping from contractions / informal words to their long form.

    The table is meant for whole-word lookup. Two of the literal keys
    (``"I've"`` and ``"Whatcha"``) are not lowercase while every other key is,
    so text that has been lowercased before the lookup never matched them.
    A lowercase alias (with a lowercase expansion) is therefore added for every
    key that is not already lowercase; the original keys are kept unchanged.

    Returns:
        dict[str, str]: a new dictionary on every call.
    """
    contractions = {
        "ain't":"is not",
        "amn't":"am not",
        "aren't":"are not",
        "can't":"cannot",
        "'cause":"because",
        "couldn't":"could not",
        "couldn't've":"could not have",
        "could've":"could have",
        "daren't":"dare not",
        "daresn't":"dare not",
        "dasn't":"dare not",
        "didn't":"did not",
        "doesn't":"does not",
        "don't":"do not",
        "e'er":"ever",
        "em":"them",
        "everyone's":"everyone is",
        "finna":"fixing to",
        "gimme":"give me",
        "gonna":"going to",
        "gon't":"go not",
        "gotta":"got to",
        "hadn't":"had not",
        "hasn't":"has not",
        "haven't":"have not",
        "he'd":"he would",
        "he'll":"he will",
        "he's":"he is",
        "he've":"he have",
        "how'd":"how would",
        "how'll":"how will",
        "how're":"how are",
        "how's":"how is",
        "i'd":"i would",
        "i'll":"i will",
        "i'm":"i am",
        "i'm'a":"i am about to",
        "i'm'o":"i am going to",
        "isn't":"is not",
        "it'd":"it would",
        "it'll":"it will",
        "it's":"it is",
        "I've":"I have",
        "kinda":"kind of",
        "let's":"let us",
        "mayn't":"may not",
        "may've":"may have",
        "mightn't":"might not",
        "might've":"might have",
        "mustn't":"must not",
        "mustn't've":"must not have",
        "must've":"must have",
        "needn't":"need not",
        "ne'er":"never",
        "o'":"of",
        "o'er":"over",
        "ol'":"old",
        "oughtn't":"ought not",
        "shalln't":"shall not",
        "shan't":"shall not",
        "she'd":"she would",
        "she'll":"she will",
        "she's":"she is",
        "shouldn't":"should not",
        "shouldn't've":"should not have",
        "should've":"should have",
        "somebody's":"somebody is",
        "someone's":"someone is",
        "something's":"something is",
        "that'd":"that would",
        "that'll":"that will",
        "that're":"that are",
        "that's":"that is",
        "there'd":"there would",
        "there'll":"there will",
        "there're":"there are",
        "there's":"there is",
        "these're":"these are",
        "they'd":"they would",
        "they'll":"they will",
        "they're":"they are",
        "they've":"they have",
        "this's":"this is",
        "those're":"those are",
        "'tis":"it is",
        "'twas":"it was",
        "wanna":"want to",
        "wasn't":"was not",
        "we'd":"we would",
        "we'd've":"we would have",
        "we'll":"we will",
        "we're":"we are",
        "weren't":"were not",
        "we've":"we have",
        "what'd":"what did",
        "what'll":"what will",
        "what're":"what are",
        "what's":"what is",
        "what've":"what have",
        "when's":"when is",
        "where'd":"where did",
        "where're":"where are",
        "where's":"where is",
        "where've":"where have",
        "which's":"which is",
        "who'd":"who would",
        "who'd've":"who would have",
        "who'll":"who will",
        "who're":"who are",
        "who's":"who is",
        "who've":"who have",
        "why'd":"why did",
        "why're":"why are",
        "why's":"why is",
        "won't":"will not",
        "wouldn't":"would not",
        "would've":"would have",
        "y'all":"you all",
        "you'd":"you would",
        "you'll":"you will",
        "you're":"you are",
        "you've":"you have",
        "Whatcha":"What are you",
        "luv":"love",
        "sux":"sucks"
        }

    # Make mixed-case entries reachable from lowercased text as well.
    # setdefault leaves every key that is already lowercase untouched.
    for contraction, expansion in list(contractions.items()):
        contractions.setdefault(contraction.lower(), expansion.lower())

    return contractions


def strip_accents(text):
    """Reduce ``text`` to plain ASCII, keeping the base letter of accented characters.

    The text is first decomposed (Unicode NFD) so that an accented letter
    becomes "base letter + combining mark"; the ASCII encoding step then drops
    only the mark. Without the decomposition the whole letter would be deleted
    (``"café"`` would turn into ``"caf"`` instead of ``"cafe"``).
    Characters with no ASCII base letter (emoji, symbols, ...) are removed.

    Args:
        text (str): input string.

    Returns:
        str: the ASCII-only string, or ``text`` unchanged when it contains
        ``ø`` / ``Ø`` (those letters have no decomposition and would simply be
        lost, so such text is deliberately left as it is).
    """
    # Local import so this cell does not depend on the notebook's import cell.
    import unicodedata

    if 'ø' in text or 'Ø' in text:
        return text
    decomposed = unicodedata.normalize('NFD', text)
    return decomposed.encode('ascii', 'ignore').decode('ascii')


def special_characters():
    """Return the replacement table for garbled (mis-encoded) character sequences.

    Each key is a sequence as it appears in the raw text and each value is the
    text to put in its place (often the empty string). A fresh dictionary is
    built on every call; entries keep the order in which they are listed.
    """
    replacements = [
        ("\x89Û_", ""),
        ("\x89ÛÒ", ""),
        ("\x89ÛÓ", ""),
        ("\x89ÛÏ", ""),
        ("\x89Ûª", "'"),
        ("\x89Û÷", ""),
        ("\x89Û\x9d", ""),
        ("å_", ""),
        ("\x89Û¢", ""),
        ("\x89Û¢åÊ", ""),
        ("\x92", "'"),
        ("åÊ", " "),
        ("åÈ", ""),
        ("Ì_", "a"),
        ("Ì©", "e"),
        ("å¨", ""),
        ("Ì¤", "c"),
        ("åÇ", ""),
        ("å£", ""),
        ("åÀ", ""),
        ("%20", " "),
    ]
    return dict(replacements)

def replaceTwoOrMore(s):
    """Shorten every run of a repeated character in ``s`` to exactly two occurrences.

    Runs of one or two characters are left as they are. ``re.DOTALL`` makes the
    rule apply to newline characters too.
    """
    return re.sub(r"(.)\1{1,}", r"\1\1", s, flags=re.DOTALL)

In [6]:
def _replace_words(text, mapping):
    """Replace each whitespace-separated token of ``text`` that is a key of ``mapping``."""
    return ' '.join(mapping.get(word, word) for word in text.split())


def tweet_cleaning_for_sentiment_analysis(tweet):
    """Normalize one raw tweet into lowercase, space-separated plain words.

    Steps, in order:

    1. Decode HTML character entities / drop markup (BeautifulSoup).
    2. Remove @usernames and URLs.
    3. Replace garbled character sequences (``special_characters``). This is a
       substring replacement done *before* lowercasing, because the sequences
       are case-sensitive and normally glued to a word; longer sequences are
       handled first so a short one cannot split a longer one.
    4. Replace text emoticons with a sentiment word (``load_dict_smileys``).
       This must happen before lowercasing and before punctuation is stripped,
       otherwise emoticons such as ``:)`` or ``:D`` no longer exist.
    5. Lowercase, turn sentence punctuation into spaces, expand contractions.
    6. Shorten character runs to two, convert emoji to their names, strip
       accents / non-ASCII, drop digits and remaining symbols, and collapse
       whitespace.

    Args:
        tweet: the raw tweet text. Anything that is not a ``str`` (for example
            a missing value read from a CSV) is treated as an empty tweet.

    Returns:
        str: the cleaned tweet without leading or trailing whitespace.
    """
    if not isinstance(tweet, str):
        return ""

    # An explicit parser keeps the result independent of which optional
    # parsers happen to be installed and avoids bs4's "no parser" warning.
    tweet = BeautifulSoup(tweet, "html.parser").get_text()

    tweet = re.sub(r"@\w+", "", tweet)        # usernames
    tweet = re.sub(r"\w+://\S+", "", tweet)   # URLs

    garbled = special_characters()
    for sequence in sorted(garbled, key=len, reverse=True):
        tweet = tweet.replace(sequence, garbled[sequence])
    tweet = tweet.replace("’", "'")

    tweet = _replace_words(tweet, load_dict_smileys())

    tweet = tweet.lower()
    tweet = re.sub(r"[.,!?:;\-=]", " ", tweet)

    # The text is lowercase at this point, so look contractions up by
    # lowercase key regardless of how the table spells them.
    contractions = {
        key.lower(): value.lower()
        for key, value in load_dict_contractions().items()
    }
    tweet = _replace_words(tweet, contractions)

    # "coooool" -> "cool": keep at most two of any repeated character.
    tweet = replaceTwoOrMore(tweet)

    tweet = emoji.demojize(tweet)   # emoji -> ":name_of_emoji:"
    tweet = strip_accents(tweet)
    tweet = re.sub(r"[0-9]", "", tweet)
    # Remaining symbols (including the ":" and "_" added by demojize) become
    # spaces. The backslash is listed explicitly: the previous non-raw pattern
    # only ever matched "/".
    tweet = re.sub(r"[@#+&*\[\]%\\/()$><|{}^'_:]", " ", tweet)

    # \s already covers newlines, so one pass collapses all whitespace.
    return re.sub(r"\s+", " ", tweet).strip()

In [7]:
# Clean every tweet; the function is passed directly, no wrapper lambda needed.
train_df['text_cleaned'] = train_df['text'].apply(tweet_cleaning_for_sentiment_analysis)

In [8]:
# Compare raw and cleaned text side by side for the first 20 tweets.
train_df.head(n=20)

Unnamed: 0,text,target,text_cleaned
0,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,our deeds are the reason of this earthquake may allah forgive us all
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected
3,"13,000 people receive #wildfires evacuation orders in California",1,people receive wildfires evacuation orders in california
4,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,just got sent this photo from ruby alaska as smoke from wildfires pours into a school
5,#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires,1,rockyfire update california hwy closed in both directions due to lake county fire cafire wildfires
6,"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas",1,flood disaster heavy rain causes flash flooding of streets in manitou colorado springs areas
7,I'm on top of the hill and I can see a fire in the woods...,1,i am on top of the hill and i can see a fire in the woods
8,There's an emergency evacuation happening now in the building across the street,1,there is an emergency evacuation happening now in the building across the street
9,I'm afraid that the tornado is coming to our area...,1,i am afraid that the tornado is coming to our area


### Write csv file after text cleaning

In [9]:
# Destination for the cleaned data set, next to the input file.
TRAIN_NEW = os.path.join(DATA_PATH, 'train_text_cleaned.csv')

# NOTE: df_train is another name for the same frame as train_df, not a copy.
df_train = train_df
df_train.to_csv(
    TRAIN_NEW,
    header=True,
)