<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Description" data-toc-modified-id="Description-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Description</a></span></li><li><span><a href="#Load-the-libraries" data-toc-modified-id="Load-the-libraries-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load the libraries</a></span></li><li><span><a href="#Load-the-data" data-toc-modified-id="Load-the-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Load the data</a></span></li><li><span><a href="#Useful-Functions" data-toc-modified-id="Useful-Functions-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Useful Functions</a></span></li><li><span><a href="#Text-Data-Processing" data-toc-modified-id="Text-Data-Processing-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Text Data Processing</a></span><ul class="toc-item"><li><span><a href="#Process-text" data-toc-modified-id="Process-text-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Process text</a></span></li><li><span><a href="#Emoticons-and-emojis" data-toc-modified-id="Emoticons-and-emojis-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Emoticons and emojis</a></span></li></ul></li><li><span><a href="#Text-Features-Generation" data-toc-modified-id="Text-Features-Generation-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Text Features Generation</a></span></li><li><span><a href="#Script" data-toc-modified-id="Script-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Script</a></span></li><li><span><a href="#Script-for-emoji" data-toc-modified-id="Script-for-emoji-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Script for emoji</a></span></li></ul></div>

# Description
Twitter sentiment analysis.

# Load the libraries

In [1]:
import sys
sys.path.append('/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages')

In [31]:
import numpy as np
import pandas as pd
pd.set_option('max_colwidth',1000)

#========= NLP
import re
import string
import nltk
import spacy
import textblob
import gensim
import texthero
from urllib.parse import urlparse
from nltk.corpus import stopwords

print([(x.__name__,x.__version__) for x in [nltk,spacy,textblob,gensim]])

#=======OTHERS
import scipy
import multiprocessing as mp

[('nltk', '3.4.4'), ('spacy', '2.2.3'), ('textblob', '0.15.3'), ('gensim', '3.8.3')]


# Load the data

In [46]:
df_train_raw = pd.read_csv('../data/raw/train.csv',nrows=50)
df_test_raw = pd.read_csv('../data/raw/test.csv',nrows=50)

df = df_train_raw.append(df_test_raw)
df = df.reset_index()

print(f"shape df_train_raw: {df_train_raw.shape}")
print(f"shape df_test_raw: {df_test_raw.shape}")

df.head(2).append(df.tail(2))

target = 'label'
maincol = 'tweet'
mc = maincol + '_clean'
mcl = maincol + '_lst_clean'
mce = mc + '_emoji'
mcle = mcl + '_emoji'

shape df_train_raw: (50, 3)
shape df_test_raw: (50, 2)


# Useful Functions

In [33]:
import multiprocessing as mp
def parallelize_dataframe(df, func):
    ncores = mp.cpu_count()
    df_split = np.array_split(df, ncores)
    pool = mp.Pool(ncores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

# Text Data Processing

In [34]:
from urllib.parse import urlparse
def is_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False

## Process text

In [35]:
def process_text(text):
    """
    Do a basic text processing.

    Parameters
    -----------
    text : string

    Returns
    --------
    This function returns pandas series having one list
    with clean text.
    1: split combined text
    2: lowercase
    3: expand apostrophes
    4: remove punctuation
    5: remove digits
    6: remove repeated substring
    7: remove stop words
    8: lemmatize

    Example:
    ========
    import re
    import string
    from nltk.corpus import stopwords
    import nltk
    
    text = "I'm typing text2num! areYou ? If yesyes say yes pals!"
    process_text(text)
    # ['typing', 'textnum', 'yes', 'say', 'yes', 'pal']

    """
    s = pd.Series([text])
    
    # step: Split combined words areYou ==> are You
    #s = s.apply(lambda x: re.sub(r'([a-z])([A-Z])',r'\1 \2',x))

    # step: lowercase
    s = s.str.lower()
    
    # step: remove ellipsis
    #s = s.str.replace(r'(\w)\u2026+',r'\1',regex=True)
    s = s.str.replace(r'…+',r'')

    # step: remove url
    #s = s.str.replace('http\S+|www.\S+', '', case=False)
    s = pd.Series([' '.join(y for y in x.split() if not is_url(y)) for x in s])

    # step: expand apostrophes
    map_apos = {
        "you're": 'you are',
        "i'm": 'i am',
        "he's": 'he is',
        "she's": 'she is',
        "it's": 'it is',
        "they're": 'they are',
        "can't": 'can not',
        "couldn't": 'could not',
        "don't": 'do not',
        "don;t": 'do not',
        "didn't": 'did not',
        "doesn't": 'does not',
        "isn't": 'is not',
        "wasn't": 'was not',
        "aren't": 'are not',
        "weren't": 'were not',
        "won't": 'will not',
        "wouldn't": 'would not',
        "hasn't": 'has not',
        "haven't": 'have not',
        "what's": 'what is',
        "that's": 'that is',
    }

    sa = pd.Series(s.str.split()[0])
    sb = sa.map(map_apos).fillna(sa)
    sentence = sb.str.cat(sep=' ')
    s = pd.Series([sentence])
    
    # step: expand shortcuts
    shortcuts = {'u': 'you', 'y': 'why', 'r': 'are',
                 'doin': 'doing', 'hw': 'how',
                 'k': 'okay', 'm': 'am', 'b4': 'before',
                 'idc': "i do not care", 'ty': 'thankyou',
                 'wlcm': 'welcome', 'bc': 'because',
                 '<3': 'love', 'xoxo': 'love',
                 'ttyl': 'talk to you later', 'gr8': 'great',
                 'bday': 'birthday', 'awsm': 'awesome',
                 'gud': 'good', 'h8': 'hate',
                 'lv': 'love', 'dm': 'direct message',
                 'rt': 'retweet', 'wtf': 'hate',
                 'idgaf': 'hate','irl': 'in real life',
                 'yolo': 'you only live once'}

    sa = pd.Series(s.str.split()[0])
    sb = sa.map(shortcuts).fillna(sa)
    sentence = sb.str.cat(sep=' ')
    s = pd.Series([sentence])

    # step: remove punctuation
    s = s.str.translate(str.maketrans(' ',' ',
                                        string.punctuation))
    # step: remove digits
    s = s.str.translate(str.maketrans(' ', ' ', '\n'))
    s = s.str.translate(str.maketrans(' ', ' ', string.digits))

    # step: remove repeated substring yesyes ==> yes
    s = s.str.replace(r'(\w+)\1',r'\1',regex=True)

    # step: remove stop words
    stop = set(stopwords.words('English'))
    extra_stop_words = ['...']
    stop.update(extra_stop_words) # inplace operation
    s = s.str.split()
    s = s.apply(lambda x: [I for I in x if I not in stop])

    # step: convert word to base form or lemmatize
    lemmatizer = nltk.stem.WordNetLemmatizer()
    s = s.apply(lambda lst: [lemmatizer.lemmatize(word) 
                               for word in lst])

    return s.to_numpy()[0]

text = "rt text2num! yesyes gud www.xy.com amazing"
process_text(text)

['retwet', 'textnum', 'yes', 'god', 'wwxycom', 'amazing']

In [39]:
def add_features(df):
    df[mcl] = df[maincol].apply(process_text)
    df[mc] = df[mcl].str.join(' ')
    df['hashtags_lst'] = df[maincol].str.findall(r'#.*?(?=\s|$)')
    
    #df['hastags'] = df[mc].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
    
    df['hashtags'] = df['hashtags_lst'].str.join(' ')

    return df

In [40]:
%%time
df = parallelize_dataframe(df, add_features)

CPU times: user 16.4 ms, sys: 22.8 ms, total: 39.3 ms
Wall time: 346 ms


In [41]:
df.head()

Unnamed: 0,index,id,label,tweet,tweet_lst_clean,tweet_clean,hashtags_lst,hashtags
0,0,1,0.0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,"[fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone]",fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone,"[#fingerprint, #Pregnancy, #android, #apps, #beautiful, #cute, #health, #igers, #iphoneonly, #iphonesia, #iphone]",#fingerprint #Pregnancy #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,1,2,0.0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,"[finaly, transparant, silicon, case, thanks, uncle, yay, sony, xperia, sonyexperias]",finaly transparant silicon case thanks uncle yay sony xperia sonyexperias,"[#yay, #Sony, #Xperia, #S, #sonyexperias…]",#yay #Sony #Xperia #S #sonyexperias…
2,2,3,0.0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,"[love, would, go, talk, makemories, unplug, relax, iphone, smartphone, wifi, conect]",love would go talk makemories unplug relax iphone smartphone wifi conect,"[#talk, #makememories, #unplug, #relax, #iphone, #smartphone, #wifi, #connect...]",#talk #makememories #unplug #relax #iphone #smartphone #wifi #connect...
3,3,4,0.0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,"[wired, know, george, made, way, iphone, cute, daventry, home]",wired know george made way iphone cute daventry home,"[#iphone, #cute, #daventry, #home]",#iphone #cute #daventry #home
4,4,5,1.0,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,"[amazing, service, aple, wil, even, talk, question, unles, pay, stupid, suport]",amazing service aple wil even talk question unles pay stupid suport,[],


## Emoticons and emojis

In [17]:
%run emoticons.py

In [18]:
%run emojis.py

In [19]:
def convert_emoticons(text):
    for emot in EMOTICONS:
        text = re.sub(u'('+emot+')', "_".join(EMOTICONS[emot].replace(",","").split()), text)
    return text

text1 = "Hello :-) :-)"
text2 = "Thanks to my uncle :) #yay"

print(convert_emoticons(text1))
print(convert_emoticons(text2))

Hello Happy_face_smiley Happy_face_smiley
Thanks to my uncle Happy_face_or_smiley #yay


In [20]:
def convert_emojis(text):
    for emot in UNICODE_EMO:
        text = re.sub(r'('+emot+')', "_".join(UNICODE_EMO[emot].replace(",","").replace(":","").split()), text)
    return text

text = "game is on 🔥"
convert_emojis(text)

'game is on fire'

In [21]:
def process_text_emoji(text):
    """
    Do a basic text processing.

    Parameters
    -----------
    text : string
        
    Returns
    --------
    This function returns pandas series having one list
    with clean text.
    1: split combined text
    2: lowercase
    3: expand apostrophes
    4: remove punctuation
    5: remove digits
    6: remove repeated substring
    7: remove stop words
    8: lemmatize

    Example:
    ========
    import re
    import string
    from nltk.corpus import stopwords
    import nltk
    
    text = "I'm typing text2num! areYou ? If yesyes say yes pals!"
    process_text(text)
    # ['typing', 'textnum', 'yes', 'say', 'yes', 'pal']

    """
    s = pd.Series([text])
    
    # step: expand emoticons and emojis
    s = s.apply(convert_emoticons)
    s = s.apply(convert_emojis)

    # step: Split combined words areYou ==> are You
    #s = s.apply(lambda x: re.sub(r'([a-z])([A-Z])',r'\1 \2',x))

    # step: lowercase
    s = s.str.lower()
    
    # step: remove ellipsis
    #s = s.str.replace(r'(\w)\u2026+',r'\1',regex=True)
    s = s.str.replace(r'…+',r'')

    # step: remove url
    #s = s.str.replace('http\S+|www.\S+', '', case=False)
    s = pd.Series([' '.join(y for y in x.split() if not is_url(y)) for x in s])

    # step: expand apostrophes
    map_apos = {
        "you're": 'you are',
        "i'm": 'i am',
        "he's": 'he is',
        "she's": 'she is',
        "it's": 'it is',
        "they're": 'they are',
        "can't": 'can not',
        "couldn't": 'could not',
        "don't": 'do not',
        "don;t": 'do not',
        "didn't": 'did not',
        "doesn't": 'does not',
        "isn't": 'is not',
        "wasn't": 'was not',
        "aren't": 'are not',
        "weren't": 'were not',
        "won't": 'will not',
        "wouldn't": 'would not',
        "hasn't": 'has not',
        "haven't": 'have not',
        "what's": 'what is',
        "that's": 'that is',
    }

    sa = pd.Series(s.str.split()[0])
    sb = sa.map(map_apos).fillna(sa)
    sentence = sb.str.cat(sep=' ')
    s = pd.Series([sentence])
    
    # step: expand shortcuts
    shortcuts = {'u': 'you', 'y': 'why', 'r': 'are',
                 'doin': 'doing', 'hw': 'how',
                 'k': 'okay', 'm': 'am', 'b4': 'before',
                 'idc': "i do not care", 'ty': 'thankyou',
                 'wlcm': 'welcome', 'bc': 'because',
                 '<3': 'love', 'xoxo': 'love',
                 'ttyl': 'talk to you later', 'gr8': 'great',
                 'bday': 'birthday', 'awsm': 'awesome',
                 'gud': 'good', 'h8': 'hate',
                 'lv': 'love', 'dm': 'direct message',
                 'rt': 'retweet', 'wtf': 'hate',
                 'idgaf': 'hate','irl': 'in real life',
                 'yolo': 'you only live once'}

    sa = pd.Series(s.str.split()[0])
    sb = sa.map(shortcuts).fillna(sa)
    sentence = sb.str.cat(sep=' ')
    s = pd.Series([sentence])

    # step: remove punctuation
    s = s.str.translate(str.maketrans(' ',' ',
                                        string.punctuation))
    # step: remove digits
    s = s.str.translate(str.maketrans(' ', ' ', '\n'))
    s = s.str.translate(str.maketrans(' ', ' ', string.digits))

    # step: remove repeated substring yesyes ==> yes
    s = s.str.replace(r'(\w+)\1',r'\1',regex=True)

    # step: remove stop words
    stop = set(stopwords.words('English'))
    extra_stop_words = ['...']
    stop.update(extra_stop_words) # inplace operation
    s = s.str.split()
    s = s.apply(lambda x: [I for I in x if I not in stop])

    # step: convert word to base form or lemmatize
    lemmatizer = nltk.stem.WordNetLemmatizer()
    s = s.apply(lambda lst: [lemmatizer.lemmatize(word) 
                               for word in lst])

    return s.to_numpy()[0]

text = "rt text2num! yesyes gud www.xy.com amazing"
process_text(text)

['retwet', 'textnum', 'yes', 'god', 'wwxycom', 'amazing']

In [47]:
def add_features_emoji(df):
    # we need to remove url first
    df[mcle] = df[maincol].str.replace('http\S+|www.\S+', '', case=False)
    df[mcle] = df[mcle].apply(process_text_emoji)
    df[mce] = df[mcle].str.join(' ')

    return df

add_features_emoji(df.head().copy())

Unnamed: 0,index,id,label,tweet,tweet_lst_clean_emoji,tweet_clean_emoji
0,0,1,0.0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,"[fingerprint, pregnancy, test, android, aps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone]",fingerprint pregnancy test android aps beautiful cute health igers iphoneonly iphonesia iphone
1,1,2,0.0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,"[finaly, transparant, silicon, case, thanks, uncle, hapyfaceorsmiley, yay, sony, xperia, sonyexperias]",finaly transparant silicon case thanks uncle hapyfaceorsmiley yay sony xperia sonyexperias
2,2,3,0.0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,"[love, would, go, talk, makemories, unplug, relax, iphone, smartphone, wifi, conect]",love would go talk makemories unplug relax iphone smartphone wifi conect
3,3,4,0.0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,"[wired, know, george, made, way, winkorsmirk, iphone, cute, daventry, home]",wired know george made way winkorsmirk iphone cute daventry home
4,4,5,1.0,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,"[amazing, service, aple, wil, even, talk, question, unles, pay, stupid, suport]",amazing service aple wil even talk question unles pay stupid suport


In [24]:
%%time

# This takes long time for full data
df = parallelize_dataframe(df, add_features_emoji)

CPU times: user 21.3 ms, sys: 21.6 ms, total: 42.9 ms
Wall time: 7.31 s


In [25]:
df.head()

Unnamed: 0,index,id,label,tweet,tweet_lst_clean,tweet_clean,hashtags_lst,hashtags,tweet_lst_clean_emoji,tweet_clean_emoji
0,0,1,0.0,#fingerprint #Pregnancy Test https://goo.gl/h1...,"[fingerprint, pregnancy, test, android, aps, b...",fingerprint pregnancy test android aps beautif...,[],,"[fingerprint, pregnancy, test, htpskepticalano...",fingerprint pregnancy test htpskepticalanoyedu...
1,1,2,0.0,Finally a transparant silicon case ^^ Thanks t...,"[finaly, transparant, silicon, case, thanks, u...",finaly transparant silicon case thanks uncle y...,[],,"[finaly, transparant, silicon, case, thanks, u...",finaly transparant silicon case thanks uncle h...
2,2,3,0.0,We love this! Would you go? #talk #makememorie...,"[love, would, go, talk, makemories, unplug, re...",love would go talk makemories unplug relax iph...,[],,"[love, would, go, talk, makemories, unplug, re...",love would go talk makemories unplug relax iph...
3,3,4,0.0,I'm wired I know I'm George I was made that wa...,"[wired, know, george, made, way, iphone, cute,...",wired know george made way iphone cute daventr...,[],,"[wired, know, george, made, way, winkorsmirk, ...",wired know george made way winkorsmirk iphone ...
4,4,5,1.0,What amazing service! Apple won't even talk to...,"[amazing, service, aple, wil, even, talk, ques...",amazing service aple wil even talk question un...,[],,"[amazing, service, aple, wil, even, talk, ques...",amazing service aple wil even talk question un...


In [26]:
note = """
Look the clean tweet properly:
- look for url links
- look for ellipsis e.g. #sonyexperias…
- convert emoji and emoticons eg. :) with library emot
""";

# Text Features Generation

In [27]:
def create_text_features(df):
    # total
    df['total_length'] = df[maincol].apply(len)

    # num of word and sentence
    df['num_words'] = df[maincol].apply(lambda x: len(x.split()))

    df['num_sent']=df[maincol].apply(lambda x: 
                                len(re.findall("\n",str(x)))+1)

    df['num_unique_words'] = df[maincol].apply(
        lambda x: len(set(w for w in x.split())))

    df["num_words_title"] = df[maincol].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))

    df['num_uppercase'] = df[maincol].apply(
        lambda x: sum(1 for c in x if c.isupper()))

    # num of certain characters ! ? . @
    df['num_exclamation_marks'] = df[maincol].apply(lambda x: x.count('!'))

    df['num_question_marks'] = df[maincol].apply(lambda x: x.count('?'))

    df['num_punctuation'] = df[maincol].apply(
        lambda x: sum(x.count(w) for w in '.,;:'))

    df['num_symbols'] = df[maincol].apply(
        lambda x: sum(x.count(w) for w in '*&$%'))
    
    df['num_digits'] = df[maincol].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

    # average
    df["avg_word_len"] = df[maincol].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
    
    df['avg_uppercase'] = df.apply(
        lambda row: float(row['num_uppercase'])/float(row['total_length']),
                                    axis=1)
    
    df['avg_unique'] = df['num_unique_words'] / df['num_words']
    
    return df

In [28]:
%%time
df = parallelize_dataframe(df, create_text_features)

CPU times: user 24.3 ms, sys: 23.4 ms, total: 47.7 ms
Wall time: 143 ms


In [29]:
df.head()

Unnamed: 0,index,id,label,tweet,tweet_lst_clean,tweet_clean,hashtags_lst,hashtags,tweet_lst_clean_emoji,tweet_clean_emoji,...,num_words_title,num_uppercase,num_exclamation_marks,num_question_marks,num_punctuation,num_symbols,num_digits,avg_word_len,avg_uppercase,avg_unique
0,0,1,0.0,#fingerprint #Pregnancy Test https://goo.gl/h1...,"[fingerprint, pregnancy, test, android, aps, b...",fingerprint pregnancy test android aps beautif...,[],,"[fingerprint, pregnancy, test, htpskepticalano...",fingerprint pregnancy test htpskepticalanoyedu...,...,2,5,0,0,2,0,0,8.923077,0.039062,1.0
1,1,2,0.0,Finally a transparant silicon case ^^ Thanks t...,"[finaly, transparant, silicon, case, thanks, u...",finaly transparant silicon case thanks uncle y...,[],,"[finaly, transparant, silicon, case, thanks, u...",finaly transparant silicon case thanks uncle h...,...,5,12,0,0,3,0,0,6.764706,0.091603,1.0
2,2,3,0.0,We love this! Would you go? #talk #makememorie...,"[love, would, go, talk, makemories, unplug, re...",love would go talk makemories unplug relax iph...,[],,"[love, would, go, talk, makemories, unplug, re...",love would go talk makemories unplug relax iph...,...,2,6,1,1,5,0,0,7.266667,0.04878,1.0
3,3,4,0.0,I'm wired I know I'm George I was made that wa...,"[wired, know, george, made, way, iphone, cute,...",wired know george made way iphone cute daventr...,[],,"[wired, know, george, made, way, winkorsmirk, ...",wired know george made way winkorsmirk iphone ...,...,3,7,0,0,3,0,0,5.647059,0.0625,0.882353
4,4,5,1.0,What amazing service! Apple won't even talk to...,"[amazing, service, aple, wil, even, talk, ques...",amazing service aple wil even talk question un...,[],,"[amazing, service, aple, wil, even, talk, ques...",amazing service aple wil even talk question un...,...,4,4,2,0,1,1,0,4.434783,0.032258,0.956522


# Script

In [204]:
%%writefile sentiment_analysis_data_processing.py

# load the path
import sys
sys.path.append('/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages')

# load the libraries
import numpy as np
import pandas as pd
import time
import re
import string
from urllib.parse import urlparse
import multiprocessing as mp
import nltk
from nltk.corpus import stopwords

time_start = time.time()

# Load the data
df_train_raw = pd.read_csv('../data/raw/train.csv')
df_test_raw = pd.read_csv('../data/raw/test.csv')
df = df_train_raw.append(df_test_raw)
df = df.reset_index()

# Variables
target = 'label'
maincol = 'tweet'
mc = maincol + '_clean'
mcl = maincol + '_lst_clean'
mce = mc + '_emoji'
mcle = mcl + '_emoji'

# ==================== Useful functions ==============
def parallelize_dataframe(df, func):
    ncores = mp.cpu_count()
    df_split = np.array_split(df, ncores)
    pool = mp.Pool(ncores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

def is_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False
    
#================== Text processing =================
def process_text(text):
    """
    Do a basic text processing.

    Parameters
    -----------
    text : string

    Returns
    --------
    This function returns pandas series having one list
    with clean text.
    1: split combined text
    2: lowercase
    3: expand apostrophes
    4: remove punctuation
    5: remove digits
    6: remove repeated substring
    7: remove stop words
    8: lemmatize

    Example:
    ========
    import re
    import string
    from nltk.corpus import stopwords
    import nltk
    
    text = "I'm typing text2num! areYou ? If yesyes say yes pals!"
    process_text(text)
    # ['typing', 'textnum', 'yes', 'say', 'yes', 'pal']

    """
    s = pd.Series([text])
    
    # step: Split combined words areYou ==> are You
    #s = s.apply(lambda x: re.sub(r'([a-z])([A-Z])',r'\1 \2',x))

    # step: lowercase
    s = s.str.lower()
    
    # step: remove ellipsis
    #s = s.str.replace(r'(\w)\u2026+',r'\1',regex=True)
    s = s.str.replace(r'…+',r'')

    # step: remove url
    #s = s.str.replace('http\S+|www.\S+', '', case=False)
    s = pd.Series([' '.join(y for y in x.split() if not is_url(y)) for x in s])

    # step: expand apostrophes
    map_apos = {
        "you're": 'you are',
        "i'm": 'i am',
        "he's": 'he is',
        "she's": 'she is',
        "it's": 'it is',
        "they're": 'they are',
        "can't": 'can not',
        "couldn't": 'could not',
        "don't": 'do not',
        "don;t": 'do not',
        "didn't": 'did not',
        "doesn't": 'does not',
        "isn't": 'is not',
        "wasn't": 'was not',
        "aren't": 'are not',
        "weren't": 'were not',
        "won't": 'will not',
        "wouldn't": 'would not',
        "hasn't": 'has not',
        "haven't": 'have not',
        "what's": 'what is',
        "that's": 'that is',
    }

    sa = pd.Series(s.str.split()[0])
    sb = sa.map(map_apos).fillna(sa)
    sentence = sb.str.cat(sep=' ')
    s = pd.Series([sentence])
    
    # step: expand shortcuts
    shortcuts = {'u': 'you', 'y': 'why', 'r': 'are',
                 'doin': 'doing', 'hw': 'how',
                 'k': 'okay', 'm': 'am', 'b4': 'before',
                 'idc': "i do not care", 'ty': 'thankyou',
                 'wlcm': 'welcome', 'bc': 'because',
                 '<3': 'love', 'xoxo': 'love',
                 'ttyl': 'talk to you later', 'gr8': 'great',
                 'bday': 'birthday', 'awsm': 'awesome',
                 'gud': 'good', 'h8': 'hate',
                 'lv': 'love', 'dm': 'direct message',
                 'rt': 'retweet', 'wtf': 'hate',
                 'idgaf': 'hate','irl': 'in real life',
                 'yolo': 'you only live once'}

    sa = pd.Series(s.str.split()[0])
    sb = sa.map(shortcuts).fillna(sa)
    sentence = sb.str.cat(sep=' ')
    s = pd.Series([sentence])

    # step: remove punctuation
    s = s.str.translate(str.maketrans(' ',' ',
                                        string.punctuation))
    # step: remove digits
    s = s.str.translate(str.maketrans(' ', ' ', '\n'))
    s = s.str.translate(str.maketrans(' ', ' ', string.digits))

    # step: remove repeated substring yesyes ==> yes
    s = s.str.replace(r'(\w+)\1',r'\1',regex=True)

    # step: remove stop words
    stop = set(stopwords.words('English'))
    extra_stop_words = ['...']
    stop.update(extra_stop_words) # inplace operation
    s = s.str.split()
    s = s.apply(lambda x: [I for I in x if I not in stop])

    # step: convert word to base form or lemmatize
    lemmatizer = nltk.stem.WordNetLemmatizer()
    s = s.apply(lambda lst: [lemmatizer.lemmatize(word) 
                               for word in lst])

    return s.to_numpy()[0]

def add_features(df):
    df[mcl] = df[maincol].apply(process_text)
    df[mc] = df[mcl].str.join(' ')
    df['hashtags_lst'] = df[maincol].str.findall(r'#.*?(?=\s|$)')
    
    #df['hashtags'] = df[maincol].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
    
    df['hashtags'] = df['hashtags_lst'].str.join(' ')

    return df

print("Creating clean tweet and hashtags ...")
df = parallelize_dataframe(df, add_features)

#======================= Text Feature Generation =====
def create_text_features(df):
    # total
    df['total_length'] = df[maincol].apply(len)

    # num of word and sentence
    df['num_words'] = df[maincol].apply(lambda x: len(x.split()))

    df['num_sent']=df[maincol].apply(lambda x: 
                                len(re.findall("\n",str(x)))+1)

    df['num_unique_words'] = df[maincol].apply(
        lambda x: len(set(w for w in x.split())))

    df["num_words_title"] = df[maincol].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))

    df['num_uppercase'] = df[maincol].apply(
        lambda x: sum(1 for c in x if c.isupper()))

    # num of certain characters ! ? . @
    df['num_exclamation_marks'] = df[maincol].apply(lambda x: x.count('!'))

    df['num_question_marks'] = df[maincol].apply(lambda x: x.count('?'))

    df['num_punctuation'] = df[maincol].apply(
        lambda x: sum(x.count(w) for w in '.,;:'))

    df['num_symbols'] = df[maincol].apply(
        lambda x: sum(x.count(w) for w in '*&$%'))
    
    df['num_digits'] = df[maincol].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

    # average
    df["avg_word_len"] = df[maincol].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
    
    df['avg_uppercase'] = df.apply(
        lambda row: float(row['num_uppercase'])/float(row['total_length']),
                                    axis=1)

    df['avg_unique'] = df['num_unique_words'] / df['num_words']
    
    return df

print("Adding Text features ...")
df = parallelize_dataframe(df, create_text_features)

#===================== Emoticons =====================
from emoticons.py import *
def convert_emoticons(text):
    for emot in EMOTICONS:
        text = re.sub(u'('+emot+')', "_".join(EMOTICONS[emot].replace(",","").split()), text)
    return text

#===================== Save clean data =========================
df.to_csv('../data/processed/df_combined_clean.csv',index=False)

time_taken = time.time() - time_start
m,s = divmod(time_taken,60)
print(f"Data cleaning finished in {m} min {s:.2f} sec.")

Overwriting sentiment_analysis_data_processing.py


# Script for emoji

In [49]:
%%writefile sentiment_analysis_data_processing.py

# load the path
import sys
sys.path.append('/Users/poudel/opt/miniconda3/envs/nlp/lib/python3.7/site-packages')

# load the libraries
import numpy as np
import pandas as pd
import time
import re
import string
from urllib.parse import urlparse
import multiprocessing as mp
import nltk
from nltk.corpus import stopwords

time_start = time.time()

# Load the data
df_train_raw = pd.read_csv('../data/raw/train.csv')
df_test_raw = pd.read_csv('../data/raw/test.csv')
df = df_train_raw.append(df_test_raw)
df = df.reset_index()

# Variables
target = 'label'
maincol = 'tweet'
mc = maincol + '_clean'
mcl = maincol + '_lst_clean'
mce = mc + '_emoji'
mcle = mcl + '_emoji'

# ==================== Useful functions ==============
def parallelize_dataframe(df, func):
    ncores = mp.cpu_count()
    df_split = np.array_split(df, ncores)
    pool = mp.Pool(ncores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

def is_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False
    
#================== Text processing =================
def process_text(text):
    """
    Do a basic text processing.

    Parameters
    -----------
    text : string

    Returns
    --------
    This function returns pandas series having one list
    with clean text.
    1: split combined text
    2: lowercase
    3: expand apostrophes
    4: remove punctuation
    5: remove digits
    6: remove repeated substring
    7: remove stop words
    8: lemmatize

    Example:
    ========
    import re
    import string
    from nltk.corpus import stopwords
    import nltk
    
    text = "I'm typing text2num! areYou ? If yesyes say yes pals!"
    process_text(text)
    # ['typing', 'textnum', 'yes', 'say', 'yes', 'pal']

    """
    s = pd.Series([text])
    
    # step: Split combined words areYou ==> are You
    #s = s.apply(lambda x: re.sub(r'([a-z])([A-Z])',r'\1 \2',x))

    # step: lowercase
    s = s.str.lower()
    
    # step: remove ellipsis
    #s = s.str.replace(r'(\w)\u2026+',r'\1',regex=True)
    s = s.str.replace(r'…+',r'')

    # step: remove url
    #s = s.str.replace('http\S+|www.\S+', '', case=False)
    s = pd.Series([' '.join(y for y in x.split() if not is_url(y)) for x in s])

    # step: expand apostrophes
    map_apos = {
        "you're": 'you are',
        "i'm": 'i am',
        "he's": 'he is',
        "she's": 'she is',
        "it's": 'it is',
        "they're": 'they are',
        "can't": 'can not',
        "couldn't": 'could not',
        "don't": 'do not',
        "don;t": 'do not',
        "didn't": 'did not',
        "doesn't": 'does not',
        "isn't": 'is not',
        "wasn't": 'was not',
        "aren't": 'are not',
        "weren't": 'were not',
        "won't": 'will not',
        "wouldn't": 'would not',
        "hasn't": 'has not',
        "haven't": 'have not',
        "what's": 'what is',
        "that's": 'that is',
    }

    sa = pd.Series(s.str.split()[0])
    sb = sa.map(map_apos).fillna(sa)
    sentence = sb.str.cat(sep=' ')
    s = pd.Series([sentence])
    
    # step: expand shortcuts
    shortcuts = {'u': 'you', 'y': 'why', 'r': 'are',
                 'doin': 'doing', 'hw': 'how',
                 'k': 'okay', 'm': 'am', 'b4': 'before',
                 'idc': "i do not care", 'ty': 'thankyou',
                 'wlcm': 'welcome', 'bc': 'because',
                 '<3': 'love', 'xoxo': 'love',
                 'ttyl': 'talk to you later', 'gr8': 'great',
                 'bday': 'birthday', 'awsm': 'awesome',
                 'gud': 'good', 'h8': 'hate',
                 'lv': 'love', 'dm': 'direct message',
                 'rt': 'retweet', 'wtf': 'hate',
                 'idgaf': 'hate','irl': 'in real life',
                 'yolo': 'you only live once'}

    sa = pd.Series(s.str.split()[0])
    sb = sa.map(shortcuts).fillna(sa)
    sentence = sb.str.cat(sep=' ')
    s = pd.Series([sentence])

    # step: remove punctuation
    s = s.str.translate(str.maketrans(' ',' ',
                                        string.punctuation))
    # step: remove digits
    s = s.str.translate(str.maketrans(' ', ' ', '\n'))
    s = s.str.translate(str.maketrans(' ', ' ', string.digits))

    # step: remove repeated substring yesyes ==> yes
    s = s.str.replace(r'(\w+)\1',r'\1',regex=True)

    # step: remove stop words
    stop = set(stopwords.words('English'))
    extra_stop_words = ['...']
    stop.update(extra_stop_words) # inplace operation
    s = s.str.split()
    s = s.apply(lambda x: [I for I in x if I not in stop])

    # step: convert word to base form or lemmatize
    lemmatizer = nltk.stem.WordNetLemmatizer()
    s = s.apply(lambda lst: [lemmatizer.lemmatize(word) 
                               for word in lst])

    return s.to_numpy()[0]

def add_features(df):
    df[mcl] = df[maincol].apply(process_text)
    df[mc] = df[mcl].str.join(' ')
    df['hashtags_lst'] = df[maincol].str.findall(r'#.*?(?=\s|$)')
    
    #df['hashtags'] = df[maincol].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
    
    df['hashtags'] = df['hashtags_lst'].str.join(' ')

    return df

print("Creating clean tweet and hashtags ...")
df = parallelize_dataframe(df, add_features)

#======================= Text Feature Generation =====
def create_text_features(df):
    # total
    df['total_length'] = df[maincol].apply(len)

    # num of word and sentence
    df['num_words'] = df[maincol].apply(lambda x: len(x.split()))

    df['num_sent']=df[maincol].apply(lambda x: 
                                len(re.findall("\n",str(x)))+1)

    df['num_unique_words'] = df[maincol].apply(
        lambda x: len(set(w for w in x.split())))

    df["num_words_title"] = df[maincol].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))

    df['num_uppercase'] = df[maincol].apply(
        lambda x: sum(1 for c in x if c.isupper()))

    # num of certain characters ! ? . @
    df['num_exclamation_marks'] = df[maincol].apply(lambda x: x.count('!'))

    df['num_question_marks'] = df[maincol].apply(lambda x: x.count('?'))

    df['num_punctuation'] = df[maincol].apply(
        lambda x: sum(x.count(w) for w in '.,;:'))

    df['num_symbols'] = df[maincol].apply(
        lambda x: sum(x.count(w) for w in '*&$%'))
    
    df['num_digits'] = df[maincol].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

    # average
    df["avg_word_len"] = df[maincol].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
    
    df['avg_uppercase'] = df.apply(
        lambda row: float(row['num_uppercase'])/float(row['total_length']),
                                    axis=1)

    df['avg_unique'] = df['num_unique_words'] / df['num_words']
    
    return df

print("Adding Text features ...")
df = parallelize_dataframe(df, create_text_features)

#===================== Manipulating emoticons and emojis
from emojis import *
from emoticons import *

def convert_emoticons(text):
    for emot in EMOTICONS:
        text = re.sub(u'('+emot+')', "_".join(EMOTICONS[emot].replace(",","").split()), text)
    return text

def convert_emojis(text):
    for emot in UNICODE_EMO:
        text = re.sub(r'('+emot+')', "_".join(UNICODE_EMO[emot].replace(",","").replace(":","").split()), text)
    return text

def process_text_emoji(text):
    """
    Do a basic text processing.

    Parameters
    -----------
    text : string
        
    Returns
    --------
    This function returns pandas series having one list
    with clean text.
    1: split combined text
    2: lowercase
    3: expand apostrophes
    4: remove punctuation
    5: remove digits
    6: remove repeated substring
    7: remove stop words
    8: lemmatize

    Example:
    ========
    import re
    import string
    from nltk.corpus import stopwords
    import nltk
    
    text = "I'm typing text2num! areYou ? If yesyes say yes pals!"
    process_text(text)
    # ['typing', 'textnum', 'yes', 'say', 'yes', 'pal']

    """
    s = pd.Series([text])
    
    # step: expand emoticons and emojis
    s = s.apply(convert_emoticons)
    s = s.apply(convert_emojis)

    # step: Split combined words areYou ==> are You
    #s = s.apply(lambda x: re.sub(r'([a-z])([A-Z])',r'\1 \2',x))

    # step: lowercase
    s = s.str.lower()
    
    # step: remove ellipsis
    #s = s.str.replace(r'(\w)\u2026+',r'\1',regex=True)
    s = s.str.replace(r'…+',r'')

    # step: remove url
    #s = s.str.replace('http\S+|www.\S+', '', case=False)
    s = pd.Series([' '.join(y for y in x.split() if not is_url(y)) for x in s])

    # step: expand apostrophes
    map_apos = {
        "you're": 'you are',
        "i'm": 'i am',
        "he's": 'he is',
        "she's": 'she is',
        "it's": 'it is',
        "they're": 'they are',
        "can't": 'can not',
        "couldn't": 'could not',
        "don't": 'do not',
        "don;t": 'do not',
        "didn't": 'did not',
        "doesn't": 'does not',
        "isn't": 'is not',
        "wasn't": 'was not',
        "aren't": 'are not',
        "weren't": 'were not',
        "won't": 'will not',
        "wouldn't": 'would not',
        "hasn't": 'has not',
        "haven't": 'have not',
        "what's": 'what is',
        "that's": 'that is',
    }

    sa = pd.Series(s.str.split()[0])
    sb = sa.map(map_apos).fillna(sa)
    sentence = sb.str.cat(sep=' ')
    s = pd.Series([sentence])
    
    # step: expand shortcuts
    shortcuts = {'u': 'you', 'y': 'why', 'r': 'are',
                 'doin': 'doing', 'hw': 'how',
                 'k': 'okay', 'm': 'am', 'b4': 'before',
                 'idc': "i do not care", 'ty': 'thankyou',
                 'wlcm': 'welcome', 'bc': 'because',
                 '<3': 'love', 'xoxo': 'love',
                 'ttyl': 'talk to you later', 'gr8': 'great',
                 'bday': 'birthday', 'awsm': 'awesome',
                 'gud': 'good', 'h8': 'hate',
                 'lv': 'love', 'dm': 'direct message',
                 'rt': 'retweet', 'wtf': 'hate',
                 'idgaf': 'hate','irl': 'in real life',
                 'yolo': 'you only live once'}

    sa = pd.Series(s.str.split()[0])
    sb = sa.map(shortcuts).fillna(sa)
    sentence = sb.str.cat(sep=' ')
    s = pd.Series([sentence])

    # step: remove punctuation
    s = s.str.translate(str.maketrans(' ',' ',
                                        string.punctuation))
    # step: remove digits
    s = s.str.translate(str.maketrans(' ', ' ', '\n'))
    s = s.str.translate(str.maketrans(' ', ' ', string.digits))

    # step: remove repeated substring yesyes ==> yes
    s = s.str.replace(r'(\w+)\1',r'\1',regex=True)

    # step: remove stop words
    stop = set(stopwords.words('English'))
    extra_stop_words = ['...']
    stop.update(extra_stop_words) # inplace operation
    s = s.str.split()
    s = s.apply(lambda x: [I for I in x if I not in stop])

    # step: convert word to base form or lemmatize
    lemmatizer = nltk.stem.WordNetLemmatizer()
    s = s.apply(lambda lst: [lemmatizer.lemmatize(word) 
                               for word in lst])

    return s.to_numpy()[0]

def add_features_emoji(df):
    # we need to remove url first
    df[mcle] = df[maincol].str.replace('http\S+|www.\S+', '', case=False)
    df[mcle] = df[mcle].apply(process_text_emoji)
    df[mce] = df[mcle].str.join(' ')

    return df

print("Adding Emoticons and emoji features ...")
df = parallelize_dataframe(df, add_features_emoji)

#===================== Save clean data =========================
df.to_csv('../data/processed/df_combined_clean.csv',index=False)

time_taken = time.time() - time_start
m,s = divmod(time_taken,60)
print(f"Data cleaning finished in {m} min {s:.2f} sec.")

# Data cleaning finished in 12.0 min 6.19 sec.

Overwriting sentiment_analysis_data_processing.py
