In [1]:
import pandas as pd
pd.options.display.max_colwidth = 2000

In [2]:
testData = pd.read_csv('test.csv')
testData.head()

Unnamed: 0,id,tweet
0,7921,I hate the new #iphone upgrade. Won't let me download apps. #ugh #apple sucks
1,7922,currently shitting my fucking pants. #apple #iMac #cashmoney #raddest #swagswagswag http://instagr.am/p/UUIS0bIBZo/
2,7923,"I'd like to puts some CD-ROMS on my iPad, is that possible?' — Yes, but wouldn't that block the screen?\n"
3,7924,"My ipod is officially dead. I lost all my pictures and videos from the 1D and 5sos concert,and from Vet Camp #hatinglife #sobbing"
4,7925,Been fighting iTunes all night! I only want the music I $&@*# paid for


In [3]:
import re
def removeAtMention(tweet):
    """
    Strip a leading @mention from every whitespace-separated token of a tweet.
    Example: @bob hello @alice! => hello !
    Args:
        tweet (str): tweet text
    Returns:
        str: tokens re-joined with single spaces, outer whitespace trimmed.
             A token that consisted only of a mention becomes an empty slot,
             so two consecutive spaces can remain inside the sentence.
    """
    cleanedTokens = []
    for token in tweet.split():
        # Only an '@' at the very start of the token is treated as a mention;
        # an '@' in the middle of a token (e.g. an e-mail address) is kept.
        cleanedTokens.append(re.sub(r'^@\w*','',token))
    return ' '.join(cleanedTokens).strip()

testData['tweet'] = testData['tweet'].apply(removeAtMention)

In [4]:
def removeHashTags(tweet):
    """
    Drop the leading '#' of every hashtag token while keeping the tag word.
    Example: #apple sucks #ugh => apple sucks ugh
    Args:
        tweet (str): tweet text
    Returns:
        str: tokens re-joined with single spaces, outer whitespace trimmed
    """
    strippedTokens = []
    for token in tweet.split():
        if token.startswith('#'):
            # Remove exactly one leading '#'; "##tag" becomes "#tag".
            token = re.sub(r'^#','',token)
        strippedTokens.append(token)
    return ' '.join(strippedTokens).strip()

testData['tweet'] = testData['tweet'].apply(removeHashTags)

In [5]:
def removeUrl(text):
    """
    Remove urls from text
    Example: link to latest cricket score. https://xyz.com/a/b => link to latest cricket score.
    Args:
        text (str): text
    Returns:
        text (str): text with removed urls; remaining tokens are joined by
                    single spaces
    """
    # First pass: drop whole tokens that look like a link.
    kept = [
        word for word in text.split()
        if not (word.startswith(("www", "http"))
                or word.endswith(".html")
                # Literal substring test. The previous unescaped regex '.com'
                # matched any character followed by "com" and therefore threw
                # away ordinary words such as "welcome" or "income".
                or ".com" in word)
    ]
    urlfree = " ".join(kept)

    # Second pass: cut out links embedded inside a token that survived the
    # first pass (e.g. "(http://bit.ly/abc)"). Substituting with the pattern
    # directly avoids re-using matched text as an unescaped regex.
    return re.sub(r'http[\w]*:\/\/[\w]*\.?[\w-]+\.+[\w]+[\/\w]+', '', urlfree)

testData['tweet'] = testData['tweet'].apply(removeUrl)

In [6]:
def getIsVulgar(sentence):
    """
    Flag sentences that contain the censored-swear placeholder as its own token.
    Args:
        sentence (str): text
    Returns:
        int: 1 when '$&@*#' appears as a standalone whitespace-separated
             token, otherwise 0 (a placeholder glued to other characters
             does not count)
    """
    return int('$&@*#' in sentence.split())

testData['isVulgar'] = testData['tweet'].apply(getIsVulgar)

In [7]:
def replaceVulgarPattern(sentence):
    """
    Replace every standalone '$&@*#' token with the word 'bad'.
    Args:
        sentence (str): text
    Returns:
        str: tokens re-joined with single spaces; tokens that merely contain
             the placeholder next to other characters are left untouched
    """
    replacedTokens = []
    for token in sentence.split():
        if token == '$&@*#':
            replacedTokens.append('bad')
        else:
            replacedTokens.append(token)
    return ' '.join(replacedTokens)

testData['tweet'] = testData['tweet'].apply(replaceVulgarPattern)

In [8]:
def replaceMiddleVulgarPattern(sentence):
    """
    Replace '$&@*#' occurring anywhere inside a token with ' wrong '.
    Args:
        sentence (str): text
    Returns:
        str: tokens re-joined with single spaces. The replacement carries its
             own surrounding spaces, so doubled spaces can appear around it.
    """
    rewrittenTokens = []
    for token in sentence.split():
        rewrittenTokens.append(re.sub(r'\$\&\@\*\#',' wrong ',token))
    return ' '.join(rewrittenTokens)

testData['tweet'] = testData['tweet'].apply(replaceMiddleVulgarPattern)

In [9]:
from emo import emo
def emoticonsScore(tweet):
    """
    Add up the scores of all emoticon tokens found in a tweet.
    Args:
        tweet (str): tweet text
    Returns:
        Sum of emo[token] over every whitespace-separated token that is a key
        of `emo`; 0 when no token matches. Repeated emoticons count each time.
    """
    # `emo` is the lookup imported from the local `emo` module
    # (presumably emoticon -> numeric score; confirm against that module).
    return sum(emo[token] for token in tweet.split() if token in emo)

testData['emoScore'] = testData['tweet'].apply(emoticonsScore)

In [10]:
def remove_punctuations(text):
    """
    Blank out common punctuation and delete hyphens.

    Every run of the characters comma, plus, colon, question mark,
    exclamation mark, double quote, parentheses, period, percent, square
    brackets and less-than is replaced by a single space; hyphens are then
    removed without leaving a space.
    Example: well-known (really)!! => wellknown  really

    Args:
        text (str): text

    Returns:
        clean_text (str): text with the listed characters replaced; existing
                          whitespace is not collapsed
    """
    spaced = re.sub(r'[\,+\:\?\!\"\(\)!\.\%\[\]\<]+', ' ', text)
    return spaced.replace('-', '')

testData['tweet'] = testData['tweet'].apply(remove_punctuations)

In [11]:
import re
from nltk.corpus import wordnet
class RepeatReplacer(object):
    """
    Shorten elongated words ("looooove") by removing repeated characters one
    substitution pass at a time, stopping as soon as WordNet knows the word.
    """
    def __init__(self):
        # group 2 followed by a back-reference to itself finds a doubled
        # character; the replacement keeps only one copy of it.
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'

    def replace(self, word):
        """
        Collapse repeated characters in `word`.
        Args:
            word (str): a single token
        Returns:
            str: the first intermediate form for which wordnet.synsets() is
                 non-empty, or the form in which no doubled character is
                 left for the pattern to remove
        """
        # Iterative on purpose: the recursive version used one stack frame
        # per removed character and raised RecursionError on very long runs.
        while True:
            if wordnet.synsets(word):
                return word
            repl_word = self.repeat_regexp.sub(self.repl, word)
            if repl_word == word:
                return repl_word
            word = repl_word

def replaceRepeatedChar(tweet):
    """
    Run every token of a tweet through RepeatReplacer.replace.
    Args:
        tweet (str): tweet text
    Returns:
        str: processed tokens joined with single spaces, outer whitespace trimmed
    """
    replacer = RepeatReplacer()
    return ' '.join(map(replacer.replace, tweet.split())).strip()

testData['tweet'] = testData['tweet'].apply(replaceRepeatedChar)

In [12]:
import re
def separate_digit_text(text):
    """
    Insert a space between a run of digits and the letters directly after it.
    Example: I will be booking tickets for 2adults => I will be booking tickets for 2 adults
    Only the digits-then-letters order is split; letters followed by digits
    (e.g. "S9") are left as they are.
    Args:
        text (str): text
    Returns:
        clean_text (str): text with digit runs separated from trailing letters
    """
    return re.sub(r'([\d]+)([a-zA-Z]+)', r'\1 \2', text)

testData['tweet'] = testData['tweet'].apply(separate_digit_text)

In [13]:
from slangs import slangs_dict

def slang_look_up(text):
    """
    Swap slang tokens for the replacement stored in slangs_dict.
    Example: hi, thanq so mch => hi, thank you so much
    Args:
        text (str): text
    Returns:
        slanged (str): tokens joined with single spaces; a token is replaced
                       when its lower-cased form is a key of slangs_dict,
                       otherwise it is kept with its original casing
    """
    def _expand(token):
        # Lookup is case-insensitive on the token.
        key = token.lower()
        return slangs_dict[key] if key in slangs_dict else token

    return " ".join(map(_expand, text.split()))

testData['tweet'] = testData['tweet'].apply(slang_look_up)

In [14]:
from appos import appos_dict

def appos_look_up(text):
    """
    Expand apostrophe contractions using appos_dict.
    Example: I don't know what is going on?  => I do not know what is going on?
    Args:
        text (str): text
    Returns:
        apposed (str): tokens joined with single spaces; a token is replaced
                       when its lower-cased form is a key of appos_dict,
                       otherwise it is kept unchanged
    """
    expanded = []
    for token in text.split():
        lowered = token.lower()
        expanded.append(appos_dict[lowered] if lowered in appos_dict else token)
    return " ".join(expanded)

testData['tweet'] = testData['tweet'].apply(appos_look_up)

In [15]:
def replace_digits_with_char(text, replace_char=' digit '):
    """
    Replace every run of digits with `replace_char`
    Example: I will be there on 22 april. => I will be there on  digit  april.
    Args:
        text (str): text
        replace_char (str): replacement inserted once per run of digits
                            (used as a regex replacement template)
    Returns:
        clean_text (str): text with each digit run replaced
    """
    return re.sub(r'[0-9]+', replace_char, text)

testData['tweet'] = testData['tweet'].apply(replace_digits_with_char)

In [16]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk import word_tokenize

sb_stem = SnowballStemmer("english", ignore_stopwords=True)
pt_stem = PorterStemmer()

def stem_text(text, stemmer='snowball'):
    """
    Tokenize text and reduce every token to its stem
    Example: I am playing in ground => I am play in ground
    Args:
        text (str): text
        stemmer (str): 'snowball' selects sb_stem; any other value selects pt_stem

    Returns:
        text_stem (str): stemmed tokens joined with single spaces
    """
    tokens = word_tokenize(text)
    # sb_stem / pt_stem are the module-level stemmer instances.
    chosen = sb_stem if stemmer == 'snowball' else pt_stem
    return " ".join(chosen.stem(token) for token in tokens)

testData['tweet'] = testData['tweet'].apply(stem_text)

In [17]:
def removeNonAscii(tweet):
    """
    Replace every non-ASCII character with a space and normalise whitespace.
    Args:
        tweet (str): tweet text
    Returns:
        str: ASCII-only text whose tokens are separated by single spaces
    """
    # Characters with a code point of 128 or above become separators.
    asciiOnly = ''.join(ch if ord(ch) < 128 else ' ' for ch in tweet)
    # split() with no argument already discards empty pieces and padding.
    return ' '.join(asciiOnly.split())

testData['tweet'] = testData['tweet'].apply(removeNonAscii)

In [18]:
def removeMorePunctuations(text):
    """
    Blank out a second set of symbols and delete hyphens.

    Every run of the characters < * # $ > @ & ' ~ _ / ; = % is replaced by a
    single space; hyphens are then removed without leaving a space.
    Example: don't #tag a_b x-y => don t  tag a b xy

    Args:
        text (str): text

    Returns:
        clean_text (str): text with the listed characters replaced; existing
                          whitespace is not collapsed
    """
    spaced = re.sub(r'[\<\*\#\$\>\@\&\'\~\_\/\;\=\%]+', ' ', text)
    return spaced.replace('-', '')

testData['tweet'] = testData['tweet'].apply(removeMorePunctuations)

In [19]:
separate_digit_text('#SamsungGalaxyS9')

'#SamsungGalaxyS9'

In [21]:
testData.to_csv('CleanDataTest.csv',index=None,columns=['id','tweet','isVulgar','emoScore'])