In [6]:
# from ipynb.fs.full.Preprocessing import *
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *

In [13]:
# function to remove user handles (or any other regex pattern)
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of ``pattern`` deleted.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing what to remove, e.g. ``r"@[\\w]*"`` for
        Twitter-style user handles.

    Returns
    -------
    str
        The text with all non-overlapping matches of ``pattern`` removed.
    """
    # Substitute with the pattern itself. Re-using each matched substring as a
    # new regex (the previous approach) misinterprets metacharacters in the
    # match and lets a short match such as "@a" chew the front off "@ab".
    return re.sub(pattern, '', input_txt)

def handle_emojis(tweet):
    """
    Replace text emoticons in ``tweet`` with a word describing them.

    Each emoticon family is swapped for a space-padded word: ' smiling ',
    ' laughing ', ' love ', ' wink ', ' sad ' or ' cry '. The families are
    applied in the order listed below, and the resulting string is returned.
    """
    emoticon_words = (
        # Smile -- :), : ), :-), (:, ( :, (-:, :')
        (r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' smiling '),
        # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
        (r'(:\s?D|:-D|x-?D|X-?D)', ' laughing '),
        # Love -- <3, :*
        (r'(<3|:\*)', ' love '),
        # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
        (r'(;-?\)|;-?D|\(-?;)', ' wink '),
        # Sad -- :-(, : (, :(, ):, )-:
        (r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' sad '),
        # Cry -- :,(, :'(, :"(
        (r'(:,\(|:\'\(|:"\()', ' cry '),
    )

    for emoticon_regex, word in emoticon_words:
        tweet = re.sub(emoticon_regex, word, tweet)

    return tweet

# pre-processing function
def preprocess_text(data, col_name):
    """Clean the tweet text held in ``data[col_name]``.

    The cleaned text is written back to the same column, so the frame passed
    in is modified and that same object is returned. The column is only
    assigned once every step has succeeded.

    Steps, in order:

    1. lower-case the text;
    2. strip ``@handle`` mentions;
    3. run the text through BeautifulSoup to turn HTML into plain text;
    4. replace emoticons with words (``handle_emojis``);
    5. replace URLs with the placeholder `` URL ``;
    6. blank out dots / ellipsis characters and expand chat abbreviations
       using ``short_word_dict``;
    7. delete characters above U+FFFF and replace other non-ASCII characters
       with a space;
    8. delete words made of one or two word characters;
    9. squeeze any run of a repeated character down to two;
    10. tokenize with ``nltk.word_tokenize`` and drop punctuation tokens;
    11. join the tokens back into one space-separated string.

    Stop-word removal, stemming and lemmatization are not applied.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the text column. Its values are expected to be
        strings, since each one is passed to regex and string functions.
    col_name : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with ``col_name`` replaced by the cleaned text.
    """
    # converting string to lower case
    # NOTE(review): because lower-casing comes first, the upper-case emoticon
    # alternatives in handle_emojis (":D", "XD", ";D") cannot match below --
    # confirm whether emoticons should be handled before this step.
    tweets = data[col_name].str.lower()

    # remove user handles (raw string: "\w" is not a valid escape in a plain str)
    tweets = tweets.apply(lambda text: remove_pattern(text, r"@[\w]*"))

    # converting html characters to strings
    tweets = tweets.apply(lambda text: BeautifulSoup(text, 'lxml').get_text())

    # converting emojis to the string they represent
    tweets = tweets.apply(handle_emojis)

    # replacing URLs in the tweets with a placeholder token
    tweets = tweets.apply(
        lambda text: re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' URL ', text)
    )

    # Blank out dots and ellipsis characters so that e.g. "lol..." is looked up
    # as "lol". This must be the regex-aware ``.str.replace``: plain
    # ``Series.replace`` only swaps cells that equal the pattern text exactly.
    # A space (not '') is used so "wait...what" does not become "waitwhat".
    tweets = tweets.str.replace(r'[.…]', ' ', regex=True)

    # converting abbreviations to their full forms
    tweets = tweets.str.split().apply(
        lambda words: ' '.join(short_word_dict.get(word, word) for word in words)
    )

    # Remove any space remaining at the front of the tweet.
    tweets = tweets.str.lstrip(' ')

    # Delete characters beyond the Basic Multilingual Plane (BMP), then replace
    # every remaining non-ASCII character with a space.
    tweets = tweets.apply(
        lambda text: ''.join(
            ch if ord(ch) < 128 else ' ' for ch in text if ch <= '\uFFFF'
        )
    )

    # Remove words with 2 or fewer word characters
    tweets = tweets.apply(lambda text: re.sub(r'\b\w{1,2}\b', '', text))

    # Convert more than 2 letter repetitions to 2 letter
    # funnnnny --> funny
    tweets = tweets.apply(lambda text: re.sub(r'(.)\1+', r'\1\1', text))

    # tokenizing data
    token_lists = tweets.apply(nltk.word_tokenize)

    # Drop punctuation and tokenizer artefacts; a set gives O(1) membership
    # tests. English stop words are deliberately left in place.
    noise_tokens = set(punctuation) | {'``', "'s", "...", "n't", "'re", "''"}
    token_lists = token_lists.apply(
        lambda words: [word for word in words if word not in noise_tokens]
    )

    # Write the column once, after every step has succeeded.
    data[col_name] = token_lists.apply(lambda words: ' '.join(words))

    return data

# Lookup table mapping chat / SMS abbreviations to their full wording.
# preprocess_text looks up each whitespace-separated, lower-cased token here and
# substitutes the expansion when the token is a key; other tokens are kept as is.
short_word_dict = {
"121": "one to one",
"a/s/l": "age, sex, location",
"ab": "about",
"adn": "any day now",
"afaik": "as far as I know",
"afk": "away from keyboard",
"aight": "alright",
"alol": "actually laughing out loud",
"b4": "before",
"b4n": "bye for now",
"bak": "back at the keyboard",
"bf": "boyfriend",
"bff": "best friends forever",
"bfn": "bye for now",
"bg": "big grin",
"bta": "but then again",
"btw": "by the way",
"cid": "crying in disgrace",
"cnp": "continued in my next post",
"cp": "chat post",
"cu": "see you",
"cul": "see you later",
"cul8r": "see you later",
"cya": "bye",
"cyo": "see you online",
"dbau": "doing business as usual",
"fud": "fear, uncertainty, and doubt",
"fwiw": "for what it's worth",
"fyi": "for your information",
"g": "grin",
"g2g": "got to go",
"ga": "go ahead",
"gal": "get a life",
"gf": "girlfriend",
"gfn": "gone for now",
"gmbo": "giggling my butt off",
"gmta": "great minds think alike",
"h8": "hate",
"hagn": "have a good night",
"hdop": "help delete online predators",
"hhis": "hanging head in shame",
"iac": "in any case",
"ianal": "I am not a lawyer",
"ic": "I see",
"idk": "I don't know",
"imao": "in my arrogant opinion",
"imnsho": "in my not so humble opinion",
"imo": "in my opinion",
"iow": "in other words",
"ipn": "I’m posting naked",
"irl": "in real life",
"jk": "just kidding",
"l8r": "later",
"ld": "later, dude",
"ldr": "long distance relationship",
"llta": "lots and lots of thunderous applause",
"lmao": "laugh my ass off",
"lmirl": "let's meet in real life",
"lol": "laugh out loud",
"ltr": "longterm relationship",
"lulab": "love you like a brother",
"lulas": "love you like a sister",
"luv": "love",
"m/f": "male or female",
"m8": "mate",
"milf": "mother I would like to fuck",
"mkt": "market",
"oll": "online love",
"omg": "oh my god",
"otoh": "on the other hand",
"pir": "parent in room",
"ppl": "people",
"r": "are",
"rofl": "roll on the floor laughing",
"rpg": "role playing games",
"ru": "are you",
"shid": "slaps head in disgust",
"somy": "sick of me yet",
"sot": "short of time",
"thanx": "thanks",
"thx": "thanks",
"ttyl": "talk to you later",
"u": "you",
"ur": "you are",
"uw": "you’re welcome",
"wb": "welcome back",
"wfm": "works for me",
"wibni": "wouldn't it be nice if",
"wtf": "what the fuck",
"wtg": "way to go",
"wtgp": "want to go private",
"yrs": "years",
"ym": "young man",
"gr8": "great"
}

In [14]:
# Load the training tweets; the path is relative to the notebook's working directory.
raw_data = pd.read_csv('Data/train.csv')
# Preview the first rows of the loaded frame.
raw_data.head()

Unnamed: 0,tweet_id,tweet,sentiment
0,1701,#sxswnui #sxsw #apple defining language of tou...,1
1,1851,Learning ab Google doodles! All doodles should...,1
2,2689,one of the most in-your-face ex. of stealing t...,2
3,4525,This iPhone #SXSW app would b pretty awesome i...,0
4,3604,Line outside the Apple store in Austin waiting...,1


In [15]:
# Drop rows containing missing values; this modifies raw_data itself rather than
# producing a new frame.
raw_data.dropna(inplace=True)

In [16]:
# Clean the 'tweet' column. preprocess_text writes the result back onto the frame
# it receives and returns that frame, so processed_data and raw_data are the same
# (now cleaned) object -- the original tweet text is no longer available in raw_data.
processed_data = preprocess_text(raw_data, 'tweet')

In [21]:
# Preview the first 10 rows after cleaning.
processed_data.head(10)

Unnamed: 0,tweet_id,tweet,sentiment
0,1701,sxswnui sxsw apple defining language touch wit...,1
1,1851,learning about google doodles all doodles shou...,1
2,2689,one the most -your-face stealing the show year...,2
3,4525,this iphone sxsw app would pretty awesome didn...,0
4,3604,line outside the apple store austin waiting fo...,1
5,966,technews one lone dude awaits ipad apple sxsw ...,1
6,1395,sxsw tips prince npr videos toy shopping with ...,1
7,8182,user new ubersocial for iphone now the app sto...,1
8,8835,free sxsw sampler itunes link freemusic,2
9,883,think might all weekend without seeing the sam...,2


In [22]:
def remove_specialchars(tweet):
    """Return ``tweet`` with every non-ASCII character replaced by one space.

    Parameters
    ----------
    tweet : str
        Text to clean.

    Returns
    -------
    str
        The text with each character outside the ASCII range (code point
        above 0x7F) swapped for a single space. This is the same rule as the
        ``ord(i) < 128`` filter applied to the tweet column elsewhere in this
        notebook.
    """
    # Work on the str directly: the earlier encode/substitute/decode version
    # never returned its result (so callers got None) and would have produced
    # one space per UTF-8 byte instead of one per character.
    return re.sub(r'[^\x00-\x7f]', ' ', tweet)

In [23]:
# Replace every non-ASCII character in the cleaned tweets with a space.
# NOTE(review): preprocess_text already applies this same replacement, so this
# pass looks redundant on its output -- confirm before keeping it.
processed_data['tweet'] = processed_data['tweet'].apply(lambda x: ''.join([i if ord(i) < 128 else ' ' for i in x]))

In [24]:
# Preview the first 20 rows after the extra non-ASCII pass.
processed_data.head(20)

Unnamed: 0,tweet_id,tweet,sentiment
0,1701,sxswnui sxsw apple defining language touch wit...,1
1,1851,learning about google doodles all doodles shou...,1
2,2689,one the most -your-face stealing the show year...,2
3,4525,this iphone sxsw app would pretty awesome didn...,0
4,3604,line outside the apple store austin waiting fo...,1
5,966,technews one lone dude awaits ipad apple sxsw ...,1
6,1395,sxsw tips prince npr videos toy shopping with ...,1
7,8182,user new ubersocial for iphone now the app sto...,1
8,8835,free sxsw sampler itunes link freemusic,2
9,883,think might all weekend without seeing the sam...,2
