In [1]:
from bs4 import BeautifulSoup as bs
import re,csv, os, itertools, pandas as pd,docx2txt
from tqdm import tqdm
from pattern.web import PDF
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize
from spacy.lang.id import Indonesian
from html import unescape
from unidecode import unidecode
from bz2 import BZ2File as bz2
from textblob import TextBlob

def LoadStopWords(lang):
    L = lang.lower().strip()
    if L == 'en' or L == 'english' or L == 'inggris':
        lemmatizer = WordNetLemmatizer()
        stops =  set([t.strip() for t in LoadDocuments(file = './Corpus/stopwords_eng.txt')[0]])
    elif L == 'id' or L == 'indonesia' or L=='indonesian':
        lemmatizer = Indonesian() 
        stops = set([t.strip() for t in LoadDocuments(file = './Corpus/stopwords_id.txt')[0]])
    else:
        print('Warning, language not recognized. Empty StopWords Given')
        stops = set(); lemmatizer = None
    return stops, lemmatizer

def fixTags(T):
    getHashtags = re.compile(r"#(\w+)")
    pisahtags = re.compile(r'[A-Z][^A-Z]*')
    t = T
    tagS = re.findall(getHashtags, T)
    for tag in tagS:
        proper_words = ' '.join(re.findall(pisahtags, tag))
        t = t.replace('#'+tag,proper_words)
    return t

def readBz2(file):
    with bz2(file, "r") as bzData:
        txt = []
        for line in bzData:
            try:
                txt.append(line.strip().decode('utf-8','replace'))
            except:
                pass
    return ' '.join(txt)

def LoadDocuments(dPath=None,types=None, file = None): # types = ['pdf','doc','docx','txt','bz2']
    Files, Docs = [], []
    if types:
        for tipe in types:
            Files += crawlFiles(dPath,tipe)
    if file:
        Files = [file]
    if not types and not file: # get all files regardless of their extensions
        Files += crawlFiles(dPath)
    for f in Files:
        if f[-3:].lower()=='pdf':
            try:
                Docs.append(PDF(f).string)
            except:
                print('error reading{0}'.format(f))
        elif f[-3:].lower()=='txt' or f[-3:].lower()=='dic':
            try:
                df=open(f,"r",encoding="utf-8", errors='replace')
                Docs.append(df.readlines());df.close()
            except:
                print('error reading{0}'.format(f))
        elif f[-3:].lower()=='bz2':
            try:
                Docs.append(readBz2(f))
            except:
                print('error reading{0}'.format(f))
        elif f[-4:].lower()=='docx':
            try:
                Docs.append(docx2txt.process(f))
            except:
                print('error reading{0}'.format(f))
        elif f[-3:].lower()=='csv':
            Docs.append(pd.read_csv(f))
        else:
            print('Unsupported format {0}'.format(f))
    if file:
        Docs = Docs[0]
    return Docs, Files

def DelPic(text): #untuk menghilangkan informasi gambar
    D = text.split()
    D = [d for d in D if 'pic.twitter.com' not in d]
    return ' ' .join(D)

def LoadSlang(DirSlang):
    Slangs =LoadDocuments(file = DirSlang)
    SlangDict={}
    for slang in Slangs[0]:
        try:
            key, value = slang.split(':')
            SlangDict[key.strip()] = value.strip()
        except:
            pass
    return SlangDict

#POS Tagging
from nltk.tag import CRFTagger
def postag(text):
    #Tokenisasi Data
    tokenized_sents = word_tokenize(text)
    #pemberian Tag tiap token
    ct = CRFTagger()
    ct.set_model_file('./Corpus/all_indo_man_tag_corpus_model.crf.tagger') 
    #directorynya disesuaikan meletakan file crfnya, harus download dlu file crfnya
    pt = ct.tag(tokenized_sents)
    ptN = []
    noun = set(['NN','NNP', 'NNS','NNPS'])
    tmp = []
    for w in pt:
        if w[1] in noun:
            tmp.append(w[0])
    if len(tmp)>0:
        ptN.append(' '.join(tmp))
    return ' '.join(ptN)


def cleanText(T, fix={}, lang = 'id', lemma=None, stops = set(), symbols_remove = False, min_charLen = 0): 
    # lang & stopS only 2 options : 'en' atau 'id'
    # symbols ASCII atau alnum
    pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    t = re.sub(pattern,' ',T) #remove urls if any
    t = DelPic(t)
    t = unescape(t) # html entities fix
    t = fixTags(t) # fix abcDef
    t = t.lower().strip() # lowercase
    t = unidecode(t)
    t = ''.join(''.join(s)[:2] for _, s in itertools.groupby(t)) # remove repetition
    t = sent_tokenize(t) # sentence segmentation. String to list
    for i, K in enumerate(t):
        if symbols_remove:
            K = re.sub(r'[^.,a-zA-Z0-9 \n\.]',' ',K)
        
        cleanList = []
        if lang =='en':
            listKata = word_tokenize(K) # word tokenize
            for token in listKata:
                if token in fix.keys():
                    token = fix[token]
                if lemma:
                    token = lemma.lemmatize(token)
                if stops:
                    if len(token)>=min_charLen and token not in stops:
                        cleanList.append(token)
                else:
                    if len(token)>=min_charLen:
                        cleanList.append(token)
            t[i] = ' '.join(cleanList)
        else:
            if lemma:
                K = lemma(K)
                listKata = [token.text for token in K]
            else:
                listKata = TextBlob(K).words
                
            for token in listKata:
                if token in fix.keys():
                    token = fix[token]
                
                if lemma:
                    token = lemma(token)[0].lemma_
                if stops:    
                    if len(token)>=min_charLen and token not in stops:
                        cleanList.append(token)
                else:
                    if len(token)>=min_charLen:
                        cleanList.append(token)
            t[i] = ' '.join(cleanList)
    return ' '.join(t) 

DaftarSlang = './Corpus/slang.dic'
Slangs=LoadSlang(DaftarSlang)

In [2]:
fName = './Indosat.csv'
df_isat= pd.read_csv(fName)
df_isat.head()

Unnamed: 0,Time,Username,Tweet,Replies,Retweets,Likes,Language,urlStatus
0,Jan 31,@abdullahnurdien,Udah ngeh blm min?\nItu sms 86mb jam 16.06 tan...,1,0,0,Indonesian,https://twitter.com/abdullahnurdien/status/958...
1,Jan 31,@abdullahnurdien,Dibaca itu yang kemarin min 86 mb... Pas jam 1...,0,0,0,Indonesian,https://twitter.com/abdullahnurdien/status/958...
2,Jan 31,@abdullahnurdien,@IndosatCare pusing saya min pake indosat... \...,2,0,0,Indonesian,https://twitter.com/abdullahnurdien/status/958...
3,Jan 31,@JasaWirya,"Sudah min sudah, lagian apa yang mau di clear ...",0,0,0,Indonesian,https://twitter.com/JasaWirya/status/958414587...
4,Jan 30,@ojieijo,"@IndosatCare masi juga lelet euy,, apakah bts ...",1,0,0,Indonesian,https://twitter.com/ojieijo/status/95837580477...


In [3]:
#Tanpa POS Tag
import pickle
from tqdm import tqdm_notebook as tqdm

listTweet = [d for d in df_isat[' Tweet']]

stops, lemmatizer = LoadStopWords(lang='id')
for i,d in tqdm(enumerate(listTweet)):
    listTweet[i] = cleanText(d,Slangs, lemma=lemmatizer,lang='id', stops = stops, symbols_remove = True, min_charLen = 2)
print("done!!! {0} tweet".format(i+1))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


done!!! 4359 tweet


In [4]:
listTweet[0]

'sadar min sms 86 mb jam 16.06 tanggal 30.01 kemarin paketin 16.07 dapet sms gambar komplain sms 2.59 tweet sisa 160 mb dapet sms'

In [5]:
#Dengan Postag

listTweetPos = listTweet

stops, lemmatizer = LoadStopWords(lang='id')
for i,d in tqdm(enumerate(listTweetPos)):
    listTweetPos[i] = postag(d)
print("done!!! {0} tweet".format(i+1))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


done!!! 4359 tweet


In [6]:
listTweetPos[0]

'sadar min sms mb jam tanggal kemarin paketin tweet sisa'

In [7]:
#save to csv file 
df_TweetCleanNPT=pd.DataFrame(listTweet,columns=['TweetClean_NonPT'])
df_TweetCleanPT=pd.DataFrame(listTweetPos,columns=['TweetClean_PT'])
tweetClean=pd.concat([df_isat[' Tweet'],df_TweetCleanNPT, df_TweetCleanPT], axis=1)

In [8]:
tweetClean.head()

Unnamed: 0,Tweet,TweetClean_NonPT,TweetClean_PT
0,Udah ngeh blm min?\nItu sms 86mb jam 16.06 tan...,sadar min sms mb jam tanggal kemarin paketin t...,sadar min sms mb jam tanggal kemarin paketin t...
1,Dibaca itu yang kemarin min 86 mb... Pas jam 1...,dibaca kemarin min mb jam paketin,dibaca kemarin min mb jam paketin
2,@IndosatCare pusing saya min pake indosat... \...,pusing min kuota,pusing min kuota
3,"Sudah min sudah, lagian apa yang mau di clear ...",min dibilang diuninstal diinstal sebulan min k...,min dibilang diuninstal diinstal sebulan min k...
4,"@IndosatCare masi juga lelet euy,, apakah bts ...",masi didaerah karna kantor,masi didaerah karna kantor


In [9]:
def dataframe_to_csv(filename, DataFrame):
    """Export entire DataFrame to csv."""
    output = DataFrame
    output.to_csv(filename, index=True)

In [11]:
#Save Hasil Prepocessing
dataframe_to_csv('./TweetClean.csv', tweetClean)