### All the abbreviated keywords

In [1]:
CHAT_WORDS_STR = """
AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=ILU: I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
KISS=Keep It Simple, Stupid
LDR=Long Distance Relationship
LMAO=Laugh My A.. Off
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The A..
PRT=Party
PRW=Parents Are Watching
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My A.. Off
SK8=Skate
STATS=Your sex and age
ASL=Age, Sex, Location
THX=Thank You
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The F...
WTG=Way To Go!
WUF=Where Are You From?
W8=Wait...
7K=Sick:-D Laugher
"""

In [2]:
# First, we're going to convert this long string into set of words and its shortcut
chat_words_map_dict = {}
chat_shortcut_list = set()
for line in CHAT_WORDS_STR.split("\n"):
    if line != '':
        shortcut = line.split('=')[0] # split the line from `=` sign and select shortcut
        chat_words = line.split('=')[1]
        chat_shortcut_list.add(shortcut) # add the chat  shortcut to the set
        chat_words_map_dict[shortcut] = chat_words # add each chat_words corresponding to its shortcut

chat_words_map_dict

{'AFAIK': 'As Far As I Know',
 'AFK': 'Away From Keyboard',
 'ASAP': 'As Soon As Possible',
 'ATK': 'At The Keyboard',
 'ATM': 'At The Moment',
 'A3': 'Anytime, Anywhere, Anyplace',
 'BAK': 'Back At Keyboard',
 'BBL': 'Be Back Later',
 'BBS': 'Be Back Soon',
 'BFN': 'Bye For Now',
 'B4N': 'Bye For Now',
 'BRB': 'Be Right Back',
 'BRT': 'Be Right There',
 'BTW': 'By The Way',
 'B4': 'Before',
 'CU': 'See You',
 'CUL8R': 'See You Later',
 'CYA': 'See You',
 'FAQ': 'Frequently Asked Questions',
 'FC': 'Fingers Crossed',
 'FWIW': "For What It's Worth",
 'FYI': 'For Your Information',
 'GAL': 'Get A Life',
 'GG': 'Good Game',
 'GN': 'Good Night',
 'GMTA': 'Great Minds Think Alike',
 'GR8': 'Great!',
 'G9': 'Genius',
 'IC': 'I See',
 'ICQ': 'I Seek you (also a chat program)',
 'ILU': 'ILU: I Love You',
 'IMHO': 'In My Honest/Humble Opinion',
 'IMO': 'In My Opinion',
 'IOW': 'In Other Words',
 'IRL': 'In Real Life',
 'KISS': 'Keep It Simple, Stupid',
 'LDR': 'Long Distance Relationship',
 'LM

In [4]:
def chat_words_conversion(text):
    new_text = []
    for word in text.split():
        if word.upper() in chat_words_map_dict:
            new_text.append(chat_words_map_dict[word.upper()])
        else:
            new_text.append(word)
    return " ".join(new_text)

In [5]:
context = 'Guys! brb need to go. I will be AFK next couple of mins'
chat_words_conversion(context)

'Guys! Be Right Back need to go. I will be Away From Keyboard next couple of mins'

### Apply Feature Engineering

In [15]:
# parse the hashtags
import re
def parse_hashtags(text):
    hashtag=re.compile(r'(\#)(\w*)(_?)(\w*)', re.IGNORECASE)
    text = hashtag.sub(r'\2 \4',text)
    return text.replace('_', ' ')

In [16]:
parse_hashtags('sfjslf sfjslfj fjsfwi  j #kjfjs #sfsk #yta_doko')

'sfjslf sfjslfj fjsfwi  j kjfjs  sfsk  yta doko '

In [17]:
# Remove Markdown links
def md_links(text):
    markdown_link=re.compile(r'\[.*?\]\(.*?\)')
    return markdown_link.sub(r'',text)

In [18]:
# Remove links from the text
def scrape_links(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

In [19]:
example = '@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C'

In [20]:
scrape_links(example)

'@bbcmtd Wholesale Markets ablaze '

In [23]:
# Dealing with Emojis
import emoji

def is_emoji(text):
    for label in text:
        if label in emoji.UNICODE_EMOJI_ENGLISH:
            return True
    return False

In [29]:
example02 = 'Omg another Earthquake 😔😔'
example03 = 'How Missing Jet\x89Ûªs Debris Could Have Floated to RÌ©union - The New York Times '

In [27]:
def decode_emoji(text):
    txt_emoji_list = emoji.distinct_emoji_list(text)
    rx = '[' + re.escape(''.join(txt_emoji_list)) + ']'

    if not txt_emoji_list:
        return text
    elif '©' not in txt_emoji_list:
        return re.sub(rx, '', text) +\
        ''.join([emoji.demojize(emoj).replace(':', ' ') for emoj in txt_emoji_list])
    elif ('©' in txt_emoji_list and len(txt_emoji_list) == 1):
        return text.replace('©', '')
    elif ('©' in txt_emoji_list and len(txt_emoji_list) > 1):
        text = text.replace('©', '')
        return re.sub(rx, '', text) +\
        ''.join([emoji.demojize(emoj).replace(':', ' ') for emoj in txt_emoji_list])


In [28]:
decode_emoji(example02)

'Omg another Earthquake  pensive_face '

In [30]:
decode_emoji(example03)

'How Missing Jet\x89Ûªs Debris Could Have Floated to RÌunion - The New York Times '

In [34]:
# English contractions

import contractions
def en_contractions(text):
    return ' '.join([contractions.fix(word)
                     if word in contractions.contractions_dict else word
                     for word in text.split()])

In [35]:
example = "I've worked so hard today. I'm going to run to home!"
en_contractions(example)

'I have worked so hard today. I am going to run to home!'

### Dealing with Tags and Special Character

In [36]:
# Dealing with Tags
example = """<div>
<h1>Real or Fake</h1>
<p>Kaggle </p>
<a href="https://www.kaggle.com/c/nlp-getting-started">getting started</a>
</div>"""

In [37]:
def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

print(remove_html(example))


Real or Fake
Kaggle 
getting started



In [39]:
# Dealing with Special Characters
s_chars = '¥₽ÏïŰŬĎŸæ₿₪ÚŇÀèÅ”ĜåŽÖéříÿý€ŝĤ₹áŜŮÂ₴ûÌÇšŘúüëÓ₫ŠčÎŤÆÒœ₩öËäøÍťìĈôàĥÝ¢ç“žðÙÊĉŭÈŒÐÉÔĵùÁů„âÄűĴóêĝÞîØòď฿ČÜþňÛ'
PUNC = '+@«#_\-!$%%^&*¬()£<>?/\\|}\]\[{;\,~:\"\''

In [40]:
def special_char(text):
    # first, let's remove any unicode strings
    text = text.encode('ascii', 'ignore').decode()
    # remove printable bachslashes
    text = re.sub(r'[\t\s\n\r\b\a]', ' ', text)
    # Special letters
    text = re.sub(r'[{}]'.format(s_chars), '', text)
    # Punctuation
    text = re.sub(r'[{}]'.format(PUNC), ' ', text)
    # space at the start or the end of the context
    text = re.sub(r'(^\s)|(\s$)', '', text)
    # Single character
    text = re.sub(r'(\s[^iIaA]\s)', ' ', text)
    return text

In [41]:
ex = 'How Missing Jet\x89Ûªs Debris Could Have Floated to RÌunion - The New York Times'

In [42]:
special_char(ex)

'How Missing Jets Debris Could Have Floated to Runion The New York Times'

In [43]:
# Special Cases

def sp_cases(df, col):
    df[col] = df[col].str.replace("yr", 'year', regex=False)
    df[col] = df[col].str.replace(r"\.|==|=|'|`|(PM)|(AM)|(UTC)", ' ', regex=True)
    df[col] = df[col].str.replace("...", '', regex=False)
    df[col] = df[col].str.replace(r"(\s[^iIaA]\s)", ' ', regex=True)
    df[col] = df[col].str.replace(r'\s(\w$)|\s(\w[^me]$)', '', regex=True)
    df[col] = df[col].str.replace(r'\s ', ' ', regex=True)
    df[col] = df[col].str.replace(r'(^\s)|(\s$)', '', regex=True)
    return df[col]
