In [5]:
# Import pandas and NumPy, plus the standard-library re and random modules
import pandas as pd
import numpy as np
import re
import random

In [6]:
class Preprocessor:
    """Regex-based text normaliser that rewrites tweet features as tokens.

    Hashtags, user handles, URLs, emoticons, selected punctuation and
    optional query terms are replaced by placeholder tokens that all start
    with a double underscore (for example ``__HASH_FOO``, ``__HNDL``,
    ``__URL``, ``__EMOT_SMILEY``, ``__PUNC_EXCL``, ``__QUER``). Runs of a
    repeated character are shortened to two characters.

    Every ``process*`` method accepts ``subject`` and ``query`` so that all
    of them share one signature; only ``processQueryTerm`` and
    ``processAll`` actually read ``query``, and none of them read
    ``subject``.
    """

    # "#word" -> "__HASH_WORD" (see hash_repl).
    hash_regex = re.compile(r"#(\w+)")

    # "@user" -> "__HNDL" (see hndl_repl).
    hndl_regex = re.compile(r"@(\w+)")

    # http/https/ftp URLs made of letters, digits, dots and slashes.
    url_regex = re.compile(r"(http|https|ftp)://[a-zA-Z0-9\./]+")

    # Any run of non-word characters (whitespace and punctuation).
    word_bound_regex = re.compile(r"\W+")

    # A character followed by one or more repeats of itself, e.g. the
    # repeated letters in "hurrrryyyyyy". Because of IGNORECASE the repeats
    # may differ in case from the first character.
    rpt_regex = re.compile(r"(.)\1{1,}", re.IGNORECASE)

    # (token, [emoticon spellings]) pairs. The spellings are joined into a
    # regex by get_emoticons_regex(); only parentheses are escaped
    # automatically (see escape_paren), so any other regex metacharacter
    # has to be escaped here by hand, as is done for ':\*'.
    emoticons = [
        ('__EMOT_SMILEY', [':-)', ':)', '(:', '(-:']),
        ('__EMOT_LAUGH', [':-D', ':D', 'X-D', 'XD', 'xD']),
        ('__EMOT_LOVE', ['<3', r':\*']),
        ('__EMOT_WINK', [';-)', ';)', ';-D', ';D', '(;', '(-;']),
        # The mirrored frowns are '):' and ')-:'; the previous entries
        # '(:' and '(-:' duplicated the smiley spellings.
        ('__EMOT_FROWN', [':-(', ':(', '):', ')-:']),
        ('__EMOT_CRY', [':,(', ':\'(', ':"(', ':((']),
    ]

    # (token, [punctuation strings]) pairs looked up by punctuations_repl().
    punctuations = [
        # ('', ['.', ]),
        # ('', [',', ]),
        # ('', ['\'', '\"', ]),
        ('__PUNC_EXCL', ['!']),
        ('__PUNC_QUES', ['?']),
        ('__PUNC_ELLP', ['...']),
    ]

    def __init__(self):
        """Announce on stdout that preprocessing is being set up."""
        print("*** PREPROCESSING ***")

    # ------------------------------------------------------------------
    # Replacement callbacks used with re.sub
    # ------------------------------------------------------------------
    def hash_repl(self, match):
        """Return '__HASH_' followed by the upper-cased hashtag word."""
        return '__HASH_' + match.group(1).upper()

    def hndl_repl(self, match):
        """Return the fixed token '__HNDL'; the handle name is discarded."""
        return '__HNDL'

    def rpt_repl(self, match):
        """Return the repeated character exactly twice."""
        return match.group(1) + match.group(1)

    def punctuations_repl(self, match):
        """Translate a run of non-word characters into punctuation tokens.

        Returns the tokens of every configured punctuation string found in
        the matched run, space-separated and padded with one space on each
        side, or a single space when none is found.
        """
        text = match.group(0)
        repl = []
        for (key, parr) in self.punctuations:
            for punc in parr:
                if punc in text:
                    repl.append(key)
        if repl:
            return ' ' + ' '.join(repl) + ' '
        return ' '

    # ------------------------------------------------------------------
    # Configuration helpers
    # ------------------------------------------------------------------
    def print_config(self, cfg):
        """Print each (token, values) pair of ``cfg`` to stdout."""
        for (x, arr) in cfg:
            print(x, '\t')
            for a in arr:
                print(a, '\t')
            print('')

    def print_emoticons(self):
        """Print the emoticon configuration."""
        self.print_config(self.emoticons)

    def print_punctuations(self):
        """Print the punctuation configuration."""
        self.print_config(self.punctuations)

    def escape_paren(self, arr):
        """Turn the parentheses of each emoticon into regex character classes.

        ')' becomes a class matching ')', '}' or ']' and '(' becomes a class
        matching '(', '{' or '[', so bracket variants of an emoticon match
        as well. No other character is escaped.
        """
        return [text.replace(')', r'[)}\]]').replace('(', r'[({\[]') for text in arr]

    def regex_union(self, arr):
        """Join regex fragments into one capturing alternation group."""
        return '(' + '|'.join(arr) + ')'

    def get_emoticons_regex(self):
        """Return a list of (token, compiled regex) pairs, one per emoticon class."""
        return [(repl, re.compile(self.regex_union(self.escape_paren(regx))))
                for (repl, regx) in self.emoticons]

    # ------------------------------------------------------------------
    # Individual processing steps
    # ------------------------------------------------------------------
    def processHashtags(self, text, subject='', query=()):
        """Replace every '#word' with '__HASH_WORD'."""
        return self.hash_regex.sub(self.hash_repl, text)

    def processHandles(self, text, subject='', query=()):
        """Replace every '@user' with '__HNDL'."""
        return self.hndl_regex.sub(self.hndl_repl, text)

    def processUrls(self, text, subject='', query=()):
        """Replace every URL matched by ``url_regex`` with ' __URL '."""
        return self.url_regex.sub(' __URL ', text)

    def processEmoticons(self, text, subject='', query=()):
        """Replace emoticons with their class token.

        Classes are applied in the order they appear in ``emoticons``, so
        an earlier class consumes text that a later class could also match.
        """
        for (repl, regx) in self.get_emoticons_regex():
            text = regx.sub('  ' + repl + ' ', text)
        return text

    def processPunctuations(self, text, subject='', query=()):
        """Replace each run of non-word characters via ``punctuations_repl``."""
        return self.word_bound_regex.sub(self.punctuations_repl, text)

    def processRepeatings(self, text, subject='', query=()):
        """Shorten every run of a repeated character to two characters."""
        return self.rpt_regex.sub(self.rpt_repl, text)

    def processQueryTerm(self, text, subject='', query=()):
        """Replace each query term (case-insensitively) with '__QUER'.

        ``query`` is an iterable of literal strings. Empty terms are
        ignored, and when no usable term remains the text is returned
        unchanged: an empty pattern would otherwise match at every position
        and insert '__QUER' between all characters.
        """
        terms = [re.escape(q) for q in (query or ()) if q]
        if not terms:
            return text
        return re.sub("|".join(terms), '__QUER', text, flags=re.IGNORECASE)

    # ------------------------------------------------------------------
    # Counters
    # ------------------------------------------------------------------
    def countHandles(self, text):
        """Return the number of '@user' handles in ``text``."""
        return len(self.hndl_regex.findall(text))

    def countHashtags(self, text):
        """Return the number of '#word' hashtags in ``text``."""
        return len(self.hash_regex.findall(text))

    def countUrls(self, text):
        """Return the number of URLs in ``text``."""
        return len(self.url_regex.findall(text))

    def countEmoticons(self, text):
        """Return the total number of emoticon matches over all classes.

        Each class is counted independently against the unmodified text.
        """
        return sum(len(regx.findall(text))
                   for (repl, regx) in self.get_emoticons_regex())

    # ------------------------------------------------------------------
    # Full pipeline
    # ------------------------------------------------------------------
    def processAll(self, text, subject='', query=()):
        """Run the complete normalisation pipeline on ``text``.

        Steps, in order: query terms, hashtags, handles, URLs, emoticons,
        apostrophe removal, punctuation, repeated characters.
        """
        text = self.processQueryTerm(text, subject, query)
        text = self.processHashtags(text, subject, query)
        text = self.processHandles(text, subject, query)
        text = self.processUrls(text, subject, query)
        text = self.processEmoticons(text, subject, query)

        # Apostrophes are dropped only after emoticon handling, because the
        # crying emoticon ":'(" contains one.
        text = text.replace('\'', '')

        text = self.processPunctuations(text, subject, query)
        text = self.processRepeatings(text, subject, query)
        return text



In [7]:
# Load the raw tweets, normalise their text with Preprocessor, write the
# result to CSV and then write a second copy whose data rows are shuffled.
print("reading csv")
# The raw file has no header row, so the column names are supplied here.
tweet_data = pd.read_csv(
    "./data/tweets.csv",
    encoding='ISO-8859-1',
    names=["label", "id", "date", "query_flag", "user", "tweet"],
)
tweets = tweet_data['tweet']

print("pre-processing started")
preprocess = Preprocessor()
tweets_1 = tweets.apply(preprocess.processAll)
tweetdf = pd.DataFrame({'sentiment': tweet_data['label'],
                        'tweet': tweets_1})
tweetdf.to_csv("./data/preprocessed_tweets.csv", index=False)

# to_csv writes UTF-8 by default, so read and write with the same encoding
# explicitly instead of relying on the platform default.
with open("./data/preprocessed_tweets.csv", "r", encoding="utf-8") as fid:
    # Keep the "sentiment,tweet" header out of the shuffle so that it stays
    # the first line of the output file.
    header_line = fid.readline()
    li = fid.readlines()

print("shuffling started")
random.shuffle(li)
with open("./data/preprocessed_tweets_shuffled.csv", "w", encoding="utf-8") as fid1:
    fid1.write(header_line)
    fid1.writelines(li)
print("shuffled")

reading csv
pre-processing started
*** PREPROCESSING ***
shuffling started
shuffled
