In [130]:
import pandas as pd
# Raise the per-cell display width so long tweets are shown in full in rendered frames.
pd.options.display.max_colwidth = 2000

In [131]:
# Load the raw training tweets and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, so the CSV presumably
# contains a saved row index — confirm whether index_col=0 is wanted here.
trainData = pd.read_csv('train.csv')
trainData.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


In [132]:
import re
def removeAtMention(tweet):
    """
    Strip a leading @-mention from every whitespace-separated token
    Example: @user thanks a lot => thanks a lot
    An '@' that is not at the start of a token is left untouched. Tokens are
    re-joined with single spaces, so a mention removed from the middle of the
    tweet leaves two consecutive spaces behind.
    Args:
        tweet (str): tweet text
    Returns:
        str: tweet without leading mentions, stripped at both ends
    """
    mention = re.compile(r'^@\w*')
    cleaned_tokens = []
    for token in tweet.split():
        cleaned_tokens.append(mention.sub('', token))
    return ' '.join(cleaned_tokens).strip()

# Strip @-mentions from every tweet; the 'tweet' column is overwritten in place.
trainData['tweet'] = trainData['tweet'].apply(removeAtMention)

In [133]:
def removeHashTags(tweet):
    """
    Drop one leading '#' from every whitespace-separated token
    Example: #apple is #great => apple is great
    Only the first character is removed ('##x' becomes '#x') and a '#' inside
    a token is kept.
    Args:
        tweet (str): tweet text
    Returns:
        str: tweet with hashtag markers removed, tokens joined by single spaces
    """
    # The '^' anchor already restricts the substitution to tokens that start with '#'.
    without_hash = (re.sub(r'^#', '', token) for token in tweet.split())
    return ' '.join(without_hash).strip()

# Turn hashtags into plain words by dropping their leading '#'.
trainData['tweet'] = trainData['tweet'].apply(removeHashTags)

In [134]:
def removeUrl(text):
    """
    Remove urls from text
    Example: link to latest cricket score. https://xyz.com/a/b => link to latest cricket score.

    A whitespace-separated word is dropped entirely when it starts with
    "www" or "http", ends with ".html", or contains the literal text ".com".
    Any http(s) URL that is still embedded inside a remaining word
    (e.g. "seehttp://bit.ly/abc") is then cut out of that word.

    Args:
        text (str): text
    Returns:
        text (str): text with removed urls, words joined by single spaces
    """
    kept_words = [
        word for word in text.split()
        # ".com" is matched literally; a regex '.com' would also hit ordinary
        # words such as "welcome" or "income" and silently delete them.
        if not (word.startswith(("www", "http"))
                or word.endswith(".html")
                or ".com" in word)
    ]
    urlfree = " ".join(kept_words)

    # Substitute with the URL pattern directly instead of re-using each matched
    # URL as a (unescaped) regular expression.
    return re.sub(r'http[\w]*:\/\/[\w]*\.?[\w-]+\.+[\w]+[\/\w]+', '', urlfree)

# Remove URL-like words from every tweet.
trainData['tweet'] = trainData['tweet'].apply(removeUrl)

In [135]:
def getIsVulgar(sentence):
    """
    Flag sentences that contain the masked-profanity token '$&@*#'
    The token must appear as a whole whitespace-separated word; occurrences
    glued to other characters are not counted.
    Args:
        sentence (str): text
    Returns:
        int: 1 if the token is present, otherwise 0
    """
    return 1 if any(token == '$&@*#' for token in sentence.split()) else 0

# Order matters: this flag must be computed before replaceVulgarPattern rewrites the
# standalone '$&@*#' tokens, otherwise getIsVulgar would find nothing to flag.
trainData['isVulgar'] = trainData['tweet'].apply(getIsVulgar)

In [136]:
def replaceVulgarPattern(sentence):
    """
    Replace every standalone '$&@*#' token with the word 'bad'
    Tokens that merely contain the pattern are left unchanged. The sentence is
    re-joined with single spaces.
    Args:
        sentence (str): text
    Returns:
        str: text with standalone masked-profanity tokens replaced
    """
    swapped = []
    for token in sentence.split():
        if token == '$&@*#':
            swapped.append('bad')
        else:
            swapped.append(token)
    return ' '.join(swapped)

# Replace standalone '$&@*#' tokens with the word 'bad'.
trainData['tweet'] = trainData['tweet'].apply(replaceVulgarPattern)

In [137]:
def replaceMiddleVulgarPattern(sentence):
    """
    Replace '$&@*#' occurring anywhere inside a token with ' wrong '
    Example: x$&@*#y ok => x wrong y ok
    The replacement carries its own surrounding spaces, so a token that
    consists only of the pattern becomes ' wrong ' (spaces included).
    Args:
        sentence (str): text
    Returns:
        str: text with embedded masked-profanity patterns replaced
    """
    vulgar = re.compile(r'\$\&\@\*\#')
    return ' '.join(vulgar.sub(' wrong ', token) for token in sentence.split())

# Handle '$&@*#' patterns that are glued to other characters (standalone ones were replaced above).
trainData['tweet'] = trainData['tweet'].apply(replaceMiddleVulgarPattern)

In [138]:
from emo import emo
def emoticonsScore(tweet):
    """
    Sum the `emo` values of all tokens in the tweet that are keys of `emo`
    Tokens are the whitespace-separated words of the tweet; tokens not found
    in `emo` contribute nothing.
    Args:
        tweet (str): tweet text
    Returns:
        total of the matching `emo` values (0 when no token matches)
    """
    return sum(emo[token] for token in tweet.split() if token in emo)

# Score emoticons before remove_punctuations runs: that step replaces ':', '(' and ')'
# with spaces, which would presumably destroy the emoticon tokens looked up in `emo`.
trainData['emoScore'] = trainData['tweet'].apply(emoticonsScore)

In [139]:
def remove_punctuations(text):
    """
    Replace runs of common punctuation with a single space and delete hyphens
    Example: he: I am going. are you coming? => he  I am going  are you coming
    (each run of punctuation becomes one space, so doubled spaces can appear)

    Characters treated as punctuation: , + : ? ! " ( ) . % [ ] <

    Args:
        text (str): text

    Returns:
        clean_text (str): text with the punctuation replaced and '-' removed
    """
    spaced = re.sub(r'[\,+\:\?\!\"\(\)!\.\%\[\]\<]+', r' ', text)
    # Hyphens are deleted outright (not replaced by a space), joining the two halves.
    return spaced.replace('-', '')

# First punctuation pass: common sentence punctuation becomes spaces, hyphens are deleted.
trainData['tweet'] = trainData['tweet'].apply(remove_punctuations)

In [140]:
import re
from nltk.corpus import wordnet
class RepeatReplacer(object):
    """
    Shorten words with repeated characters until WordNet knows them

    `replace` keeps a word as soon as `wordnet.synsets` returns a non-empty
    result for it. Otherwise it applies one substitution pass that rewrites a
    match of (\\w*)(\\w)\\2(\\w*) as \\1\\2\\3, i.e. drops one of two identical
    adjacent characters, and checks again. It stops when a pass no longer
    changes the word.
    """

    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'

    def replace(self, word):
        """
        Args:
            word (str): a single token
        Returns:
            str: the first variant with WordNet synsets, or the fully collapsed word
        """
        current = word
        while not wordnet.synsets(current):
            shortened = self.repeat_regexp.sub(self.repl, current)
            if shortened == current:
                # Nothing left to collapse; give up and return what we have.
                break
            current = shortened
        return current

def replaceRepeatedChar(tweet):
    """
    Run RepeatReplacer.replace over every whitespace-separated token
    Args:
        tweet (str): tweet text
    Returns:
        str: tweet with each token normalised, joined by single spaces and stripped
    """
    replacer = RepeatReplacer()
    normalised = ' '.join(map(replacer.replace, tweet.split()))
    return normalised.strip()

# Collapse elongated words; each token may trigger several WordNet lookups.
trainData['tweet'] = trainData['tweet'].apply(replaceRepeatedChar)

In [141]:
import re
def separate_digit_text(text):
    """
    Insert a space between a run of digits and the letters that directly follow it
    Example: I will be booking tickets for 2adults => I will be booking tickets for 2 adults
    Only the digits-then-letters direction is handled; letters followed by
    digits (e.g. 'S9') are left as they are.
    Args:
        text (str): text
    Returns:
        clean_text (str): text with digits split from trailing ASCII letters
    """
    return re.sub(r'([\d]+)([a-zA-Z]+)', r'\1 \2', text)

# Split tokens like '2adults' into '2 adults'.
trainData['tweet'] = trainData['tweet'].apply(separate_digit_text)

In [142]:
from slangs import slangs_dict

def slang_look_up(text):
    """
    Expand slang tokens using `slangs_dict`
    Each whitespace-separated token is lower-cased for the lookup; when the
    lower-cased form is a key of `slangs_dict` the mapped value is used,
    otherwise the token is kept exactly as written.
    Args:
        text (str): text
    Returns:
        slanged (str): text with known slang replaced, tokens joined by single spaces
    """
    def expand(token):
        key = token.lower()
        return slangs_dict[key] if key in slangs_dict else token

    return " ".join(expand(token) for token in text.split())

# Replace tokens found in slangs_dict with their mapped form.
trainData['tweet'] = trainData['tweet'].apply(slang_look_up)

In [143]:
from appos import appos_dict

def appos_look_up(text):
    """
    Expand apostrophe contractions using `appos_dict`
    Each whitespace-separated token is lower-cased for the lookup; when the
    lower-cased form is a key of `appos_dict` the mapped value is used,
    otherwise the token is kept exactly as written.
    Args:
        text (str): text
    Returns:
        apposed (str): text with known contractions replaced, tokens joined by single spaces
    """
    def expand(token):
        key = token.lower()
        return appos_dict[key] if key in appos_dict else token

    return " ".join(expand(token) for token in text.split())

# Replace tokens found in appos_dict with their mapped form.
trainData['tweet'] = trainData['tweet'].apply(appos_look_up)

In [144]:
def replace_digits_with_char(text, replace_char=' digit '):
    """
    Replace every run of digits with `replace_char`
    Example (default): I will be there on 22 april. => I will be there on  digit  april.
    A whole run such as '22' is replaced once, not once per digit. The default
    replacement carries its own surrounding spaces.
    Args:
        text (str): text
        replace_char (str): replacement used for each run of digits
    Returns:
        clean_text (str): text with digit runs replaced
    """
    return re.sub(r'[0-9]+', replace_char, text)

# Replace each run of digits with the placeholder word ' digit '.
trainData['tweet'] = trainData['tweet'].apply(replace_digits_with_char)

In [145]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk import word_tokenize

# Stemmer instances created once and shared by every stem_text call.
sb_stem = SnowballStemmer("english", ignore_stopwords=True)
pt_stem = PorterStemmer()

def stem_text(text, stemmer='snowball'):
    """
    Stem every token of the text
    Example: I am playing in ground => I am play in ground
    The text is tokenised with `word_tokenize`; each token is stemmed and the
    stems are joined with single spaces.
    Args:
        text (str): text
        stemmer (str): 'snowball' selects the shared Snowball stemmer; any
            other value selects the shared Porter stemmer

    Returns:
        text_stem (str): text made of the stemmed tokens
    """
    tokens = word_tokenize(text)
    chosen = sb_stem if stemmer == 'snowball' else pt_stem
    return " ".join(chosen.stem(token) for token in tokens)

# Stem every token with the default (Snowball) stemmer.
trainData['tweet'] = trainData['tweet'].apply(stem_text)

In [146]:
def removeNonAscii(tweet):
    """
    Replace every non-ASCII character with a space and normalise whitespace
    Characters with a code point of 128 or above become a space; the result is
    then split on whitespace and re-joined with single spaces.
    Args:
        tweet (str): tweet text
    Returns:
        str: ASCII-only text with single spaces between tokens
    """
    ascii_only = []
    for char in tweet:
        ascii_only.append(char if ord(char) < 128 else ' ')
    return ' '.join(''.join(ascii_only).split())

# Drop non-ASCII characters and collapse the resulting whitespace.
trainData['tweet'] = trainData['tweet'].apply(removeNonAscii)

In [147]:
def removeMorePunctuations(text):
    """
    Second punctuation pass: replace runs of leftover symbols with a space
    Example: it's a_b => it s a b

    Characters treated as symbols: < * # $ > @ & ' ~ _ / ; = %
    Hyphens are deleted outright. Sentence punctuation such as '.', ':' or
    '?' is not touched by this function.

    Args:
        text (str): text

    Returns:
        clean_text (str): text with the symbols replaced and '-' removed
    """
    spaced = re.sub(r'[\<\*\#\$\>\@\&\'\~\_\/\;\=\%]+', r' ', text)
    return spaced.replace('-', '')

# Second punctuation pass for the symbols the first pass does not cover.
trainData['tweet'] = trainData['tweet'].apply(removeMorePunctuations)

In [148]:
# Ad-hoc check: the pattern only splits digits followed by letters, so 'S9'
# (letter followed by digit) comes back unchanged.
separate_digit_text('#SamsungGalaxyS9')

'#SamsungGalaxyS9'

In [149]:
# Persist the cleaned frame. Only the listed columns are written, in this order, so the
# 'Unnamed: 0' column seen in the preview is dropped.
# NOTE(review): index=None is presumably meant to suppress the row index; index=False is
# the documented spelling — confirm.
trainData.to_csv('CleanData.csv',index=None,columns=['id','tweet','isVulgar','emoScore','label'])

In [150]:
# Spot-check the cleaned text of the row with index label 4.
trainData['tweet'][4]

'what amaz servic appl will not even talk to me about a question i have unl i pay them   digit digit for their stupid support'

In [151]:
# Spot-check the cleaned text of the row at position 7.
trainData.iloc[7]['tweet']

'new type see charger cabl uk bay amazon etsi new year rob cross tobi young evemun mcmafia taylor spectr digit newyear start digit recip technolog samsunggalaxi digit iphonex'

In [152]:
# Preview the cleaned frame together with the derived isVulgar / emoScore columns.
trainData.head(50)

Unnamed: 0,id,label,tweet,isVulgar,emoScore
0,1,0,fingerprint pregnanc test android ap beauti cute health iger iphoneon iphonesia iphon,0,0
1,2,0,final a transpar silicon case ^^ thank to my uncl yay soni xperia s sonyexperias,0,2
2,3,0,we love this would you go talk makememori unplug relax iphon smartphon wifi connect,0,0
3,4,0,i am wire i know i am georg i was made that way iphon cute daventri home,0,1
4,5,1,what amaz servic appl will not even talk to me about a question i have unl i pay them digit digit for their stupid support,0,0
5,6,1,iphon softwar updat fuck up my phone big time stupid iphon,0,0
6,7,0,happi for us instap instadaili us soni xperia xperiaz,0,0
7,8,0,new type see charger cabl uk bay amazon etsi new year rob cross tobi young evemun mcmafia taylor spectr digit newyear start digit recip technolog samsunggalaxi digit iphonex,0,0
8,9,0,bout to go shop again listen to music iphon justm music likeforlik folowforfolow,0,0
9,10,0,photo fun selfi pool water soni camera picoftheday sun instagod boy cute outdoor,0,0


In [153]:
# Build a mapping from each emoticon in `emoticonsList` to its value in `emo`.
# NOTE(review): `emoticonsList` is not defined in any cell shown in this notebook, so this
# cell raises NameError on a fresh kernel (see the recorded error output).
# TODO: define the list first, or iterate over `emo` directly — confirm the intent.
emoticonsListValues = {}
for eachEmoticon in emoticonsList:
    emoticonsListValues[eachEmoticon] = emo[eachEmoticon]


NameError: name 'emoticonsList' is not defined

In [None]:
# emoticonsListValues

In [None]:
# NOTE(review): `emoticons_look_up` is not defined in any cell shown here; presumably it
# is a leftover from an earlier version of the notebook — define it or remove this cell.
emoticons_look_up(':P is vamsi:( :) :<(')

In [None]:
def isNonAsciiWord (sentence):
    """
    Tell whether the text contains at least one non-ASCII character
    ASCII covers code points 0..127, so any character with a code point above
    127 counts as non-ASCII (the same boundary removeNonAscii uses).
    Args:
        sentence (str): text
    Returns:
        int: 1 if a non-ASCII character is present, otherwise 0
    """
    for eachChar in sentence:
        # '> 127', not '> 128': U+0080 (code point 128) is already outside ASCII.
        if ord(eachChar) > 127:
            return 1
    return 0
    

In [None]:
# Keep only the rows flagged as containing non-ASCII characters.
# NOTE(review): this reads the 'isNonAscii' column, which is only created by a later
# cell in this notebook; on a fresh kernel run top-to-bottom this raises KeyError.
dataWithNonAsciiChars = trainData[trainData['isNonAscii'] == 1]

In [None]:
# Add a 1/0 flag column with the result of isNonAsciiWord for every tweet.
trainData['isNonAscii'] = trainData['tweet'].apply(isNonAsciiWord)

In [None]:
# Export the flagged rows to 'data.csv' for manual inspection.
# NOTE(review): index=None is presumably meant to suppress the row index — confirm.
dataWithNonAsciiChars.to_csv('data.csv',index=None)

In [None]:
# Enable pandas' 'expand_frame_repr' display option (wide frame reprs may wrap across lines).
pd.set_option('expand_frame_repr', True)

In [None]:
# Ad-hoc check of replaceRepeatedChar on a raw tweet with elongated words.
replaceRepeatedChar('I REALLLLY need a new iphone charger, both of miiiiiiiine spaz and short ouuuuut CONSTANTLY and it pisses me off like no other #$&@*# #apple #iphones  ')

In [None]:
# Look up the code point of a sample non-ASCII character.
ord('¶')