In [1]:
#Libraries used
import pandas as pd
import nltk
import re
from unidecode import unidecode
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import sent_tokenize 
from nltk.corpus import stopwords


In [2]:
""" Needed for some functions in class """

# English stop words, kept in a set so membership checks are cheap
stop_words = {*stopwords.words("english")}

# Known English vocabulary taken from the NLTK `words` corpus
words_corp = {*nltk.corpus.words.words()}


# regex_dict maps a label to a (pattern, replacement) pair. The patterns are
# substituted one after another, in insertion order, to replace noisy tokens
# (phone numbers, e-mail addresses, URLs, elongated words, tokens containing
# digits) with a fixed word and to expand English contractions.
# Patterns are raw strings so that regex escapes such as \+, \s, \w and \d
# reach the `re` module untouched instead of being parsed as string escapes.
regex_dict = {
    'phone': (r'\+[0-9]{12}', 'phone'),
    # local-part@domain.tld (previously the literal placeholder 'regex')
    'email': (r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+', 'e-mail'),
    'good': (r'goo[o]*d[d]*', 'good'),
    'bad': (r'ba[a]*d[d]*', 'bad'),
    'sad': (r'sa[a]*d[d]*', 'sad'),
    # No ^/$ anchors and no PHP-style @...@iS delimiters: the URL sits inside
    # a longer text, and Python expresses case-insensitivity with (?i).
    'url': (r'(?i)(?:https?|ftp)://[^\s/$.?#]\S*', 'url'),
    'not': (r"n't", " not"),
    'are': (r"'re", " are"),
    'is': (r"'s", " is"),
    'would': (r"'d", " would"),
    'will': (r"'ll", " will"),
    'have': (r"'ve", " have"),
    'am': (r"'m", " am"),
    # Any token that contains a digit is removed entirely
    'int': (r'\w*\d\w*', ''),
    'n': (r"'n", ''),
}

In [3]:

class Preprocessor:
    """Step-by-step text cleaning for a single tweet.

    The working text is held in ``self.string``. Every method rewrites that
    attribute and also returns the new value, so the steps can be applied one
    after another on the same instance.
    """

    def __init__(self, tweet_text):
        """Store the lower-cased tweet text.

        Missing values (``None`` or a float NaN, which pandas yields for
        empty CSV cells) become an empty string; any other non-string value
        is converted with ``str()`` instead of failing on ``.lower()``.
        """
        if not isinstance(tweet_text, str):
            # NaN is the only float that is not equal to itself
            missing = tweet_text is None or (
                isinstance(tweet_text, float) and tweet_text != tweet_text
            )
            tweet_text = "" if missing else str(tweet_text)
        self.string = tweet_text.lower()

    def stemWords(self):
        """Replace every word with its Porter stem and return the text."""
        stemmer = PorterStemmer()
        # split() with no argument collapses whitespace runs, so no empty
        # tokens are produced and tabs/newlines also separate words.
        self.string = ' '.join(stemmer.stem(word) for word in self.string.split())
        return self.string

    def lemWords(self, pos='a'):
        """Lemmatize every word with WordNet and return the text.

        pos: WordNet part-of-speech tag handed to the lemmatizer; the default
             'a' keeps the original adjective-only behaviour.
        """
        lemmatizer = WordNetLemmatizer()
        self.string = ' '.join(
            lemmatizer.lemmatize(word, pos=pos) for word in self.string.split()
        )
        return self.string

    def stopWords(self):
        """Drop every word found in the module-level ``stop_words`` set."""
        self.string = ' '.join(
            word for word in self.string.split() if word not in stop_words
        )
        return self.string

    def removeNoise(self):
        """Keep only plausible words and return the ASCII-only text.

        A token survives when it is alphanumeric, longer than one character,
        and either listed in ``words_corp`` or ends with 's' (to allow
        third-person verbs and plurals).
        """
        # Transliterate before filtering: accented spellings are compared with
        # the word list in their ASCII form instead of being discarded first.
        ascii_text = unidecode(self.string)
        kept = [
            w for w in ascii_text.split()
            if w.isalnum() and len(w) > 1 and (w in words_corp or w.endswith('s'))
        ]
        self.string = ' '.join(kept)
        return self.string

    def wordTokenize(self):
        """Tokenize with NLTK's TweetTokenizer and re-join tokens with spaces.

        The tokenizer is configured with ``strip_handles=True`` and
        ``reduce_len=True``.
        """
        tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
        self.string = ' '.join(tokenizer.tokenize(self.string))
        return self.string

    def phraseTokenize(self):
        """Split the text into sentences and re-join them with single spaces."""
        self.string = ' '.join(sent_tokenize(self.string))
        return self.string

    def extractRegex(self, regex_dict):
        """Apply every (pattern, replacement) pair of ``regex_dict`` in order.

        regex_dict: mapping whose values are ``(pattern, replacement)`` tuples;
                    the keys are labels only and are not used here.
        Returns the text after all substitutions.
        """
        text = self.string
        for pattern, replacement in regex_dict.values():
            text = re.sub(pattern, replacement, text)
        self.string = text
        return self.string

***TESTING IT***

In [4]:
# Load the raw tweets; a comma is read_csv's default separator
df = pd.read_csv("tweets.csv")

In [5]:
# Preview the first two rows of the imported data
df.head(2)

Unnamed: 0,tweet,senti
0,"@united Oh, we are sure it's not planned, but ...",0
1,History exam studying ugh,0


In [6]:
# Run the whole cleaning pipeline on every tweet and store
# (cleaned text, sentiment) under the tweet's position in the frame.
new_tweets = {}
for row_number, (tweet, senti) in enumerate(zip(df['tweet'], df['senti'])):
    tw = Preprocessor(tweet)
    tw.phraseTokenize()
    tw.wordTokenize()
    tw.extractRegex(regex_dict)
    tw.removeNoise()
    tw.stopWords()
    tw.lemWords()
    final_tw = tw.stemWords()
    new_tweets[row_number] = (final_tw, senti)
count = len(new_tweets)  # number of processed tweets

In [7]:
# Dict of preprocessed tweets, keyed by row position: (cleaned text, sentiment)
new_tweets

{0: ('oh sure occur absolut consist usual flight daili', 0),
 1: ('histori exam ugh', 0),
 2: ('yeah look like busi yeah', 0),
 3: ('love twitter', 4),
 4: ('realli dont want phone servic suck come signal', 0),
 5: ('want either might get pilotless plane driverless car', 0),
 6: ('super cool next stop road car', 4),
 7: ('aw night end badli', 0),
 8: ('lot buzz lucki free', 4),
 9: ('got new pair shoe pic late', 2),
 10: ('hey chanc updat flight', 2),
 11: ('learn book review', 2),
 12: ('dentist great expens', 0),
 13: ('love chocol milk yeah', 4),
 14: ('oh good glad feel good realli good crazi week though good way', 4),
 15: ('appl even bare anyth coupl featur got self drive car glass tango project ara',
  2),
 16: ('today rad made final cross countri love ps everythin cool mate', 4),
 17: ('happi', 4),
 18: ('night museum wolverin junk food perfect', 4),
 19: ('say cut small talk new slogan give us money apolog bob', 0),
 20: ('fight sever time week even bother', 0),
 21: ('back wo

In [8]:
# Build a two-column frame from the dict; its keys become the row index
Prep_tweets = pd.DataFrame.from_dict(
    new_tweets,
    orient="index",
    columns=["Tweet", "Senti_val"],
)

In [14]:
# Show the last 15 rows of the new DataFrame
Prep_tweets.tail(15)

Unnamed: 0,Tweet,Senti_val
283,time warner cabl phone rep nail ugh cabl work ...,0
284,day market happi stock,4
285,hi great hear see cruis cannot wait hope well ...,4
286,coupon code expir soon,2
287,own playoff ad,2
288,shame forc make car white hous think sell think,0
289,eat,2
290,fan crazi inde,0
291,sick worri chug thing place,0
292,dad day ate mesa grill last night met bobbi fl...,4
