<div style='background-color:#1155CC'>
<br>
<p style='text-align:center; font-size:200%; color:white; font-weight:bold'>Basic Text Preprocessing</p><br>
<p style='text-align:center; font-size:150%; color:white; font-weight:500'>Andrean Yonathan - Universitas Diponegoro</p>
<br>
</div>

<p style = 'text-align:center; font-size:250%; font-weight:bold'>Text Preprocessing</p>

# Import Library

In [1]:
import pandas as pd
import numpy as np
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import contractions
import re
pd.set_option('display.max_colwidth', 150)
import warnings
warnings.filterwarnings('ignore')

# 1. Emoji and Emoticon Normalization

In [2]:
def convert_emojis(text):
    """Replace each emoji in ``text`` with an underscore-joined description.

    Every key of ``UNICODE_EMOJI`` is substituted in dictionary order. The
    mapped description has commas and colons removed and its
    whitespace-separated words joined with ``_`` before insertion.
    """
    for symbol, description in UNICODE_EMOJI.items():
        words = description.replace(',', '').replace(':', '').split()
        text = text.replace(symbol, '_'.join(words))
    return text

def convert_emoticons(text):
    """Replace each text emoticon in ``text`` with an underscore-joined description.

    Every key of ``EMOTICONS_EMO`` is substituted in dictionary order using a
    plain (case-sensitive, substring) string replace. The mapped description
    has commas removed and its words joined with ``_``.
    """
    for symbol, description in EMOTICONS_EMO.items():
        words = description.replace(",", "").split()
        text = text.replace(symbol, '_'.join(words))
    return text

# 2. Contractions Normalization

Reference: https://pypi.org/project/contractions/ (the `contractions` package imported above)

In [3]:
def cont_to_meaning(text, mapping=None):
    """Expand contractions in ``text`` one whitespace-delimited token at a time.

    Parameters
    ----------
    text : str
        Input string. Whitespace between tokens is preserved as-is.
    mapping : dict, optional
        Contraction -> expansion lookup. Defaults to
        ``contractions.contractions_dict``.

    Returns
    -------
    str
        ``text`` with every token found in ``mapping`` replaced by its expansion.

    Only whole tokens are replaced. A substring replace over the full text
    would also rewrite longer tokens that merely contain a contraction
    (e.g. "bit's" when "it's" is present). A token wrapped in edge punctuation
    ("can't,") is expanded too, with the punctuation kept in place.
    """
    if mapping is None:
        mapping = contractions.contractions_dict
    # Apostrophes are deliberately absent: they are part of the contraction itself.
    edge_punct = '.,;:!?"()[]{}'

    def expand(match):
        token = match.group(0)
        if token in mapping:
            return mapping[token]
        core = token.strip(edge_punct)
        if core and core in mapping:
            # The first occurrence of ``core`` is the stripped centre, because
            # anything before it consists solely of edge punctuation.
            return token.replace(core, mapping[core], 1)
        return token

    return re.sub(r'\S+', expand, text)

# 3. Text Cleaning

In [4]:
def text_cleaning(tx):
    """Strip social-media noise, digits and punctuation from ``tx``.

    Steps, in order: drop the first retweet marker, remove @mentions and URLs,
    remove non-ASCII characters, remove hashtags, remove digits, turn newlines
    and underscores into spaces, delete punctuation, cap runs of a repeated
    character at two, then trim and collapse spaces.

    Punctuation is deleted rather than replaced by a space, so words separated
    only by punctuation ("a.b") end up merged ("ab").

    Parameters
    ----------
    tx : str
        Text to clean. The retweet pattern is lowercase, so input is expected
        to be lowercased already.

    Returns
    -------
    str
        The cleaned text.
    """
    # Word boundary so that words merely ending in "rt" ("start @user") are left intact.
    tx = re.sub(r'\brt @', '@', tx, count=1)  # remove the first retweet marker
    # The dot after "www" is escaped so that "www" inside a word is not treated as a URL.
    tx = re.sub(r"(?:\@|http|www\.)\S+", " ", tx)  # remove username and URL
    tx = re.sub(r'[^\x00-\x7f]', r'', tx)  # remove non-ASCII characters
    tx = re.sub(r'#[0-9a-zA-Z_]+', r'', tx)  # remove hashtag
    tx = re.sub(r'[\d]', r'', tx)  # remove digits
    tx = tx.replace("\n", ' ')  # newline -> space
    tx = tx.replace("_", ' ')  # split the underscore-joined emoji/emoticon descriptions
    tx = re.sub(r'[^\w\s]', '', tx)  # remove punctuation
    # Cap any run of the same character at two ("hiii" -> "hii").
    tx = re.sub(r"(.)\1{1,}", r"\1\1", tx, flags=re.IGNORECASE)
    tx = tx.strip()  # trim head and tail
    tx = re.sub(' +', ' ', tx)  # collapse multiple spaces
    return tx

# 4. Stop Words Removal

In [5]:
list_stopwords = stopwords.words('english')
# Frozen set copy of the list so each membership test is a hash lookup
# instead of a scan over the whole stop-word list.
_stopword_set = frozenset(list_stopwords)

def stopword_removal(text):
    """Drop English stop words from ``text``.

    ``text`` is split on whitespace, and words that appear in NLTK's English
    stop-word list are removed (case-sensitive comparison). The remaining
    words are re-joined with single spaces.

    NOTE(review): the sample outputs show negations such as "not" missing from
    the final tokens, presumably removed here — confirm that is acceptable for
    sentiment-labelled data.
    """
    return ' '.join(word for word in text.split() if word not in _stopword_set)

# 5. Stemming

In [6]:
# English stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def stem_english(text):
    """Stem every whitespace-separated word of ``text`` with the Porter stemmer.

    Returns the stemmed words joined by single spaces.
    """
    stemmed_words = map(stemmer.stem, text.split())
    return ' '.join(stemmed_words)

# 6. Tokenization

In [7]:
def word_token(tx):
    """Tokenize ``tx`` with NLTK's ``word_tokenize`` and return the result."""
    return word_tokenize(tx)

# Function for Text Preprocessing

In [8]:
def text_processing(text):
    """Run the full preprocessing pipeline on one raw string and return its tokens.

    Steps, in order: emoji conversion, emoticon conversion, lowercasing,
    contraction expansion, text cleaning, stop-word removal, stemming and
    tokenization.

    Emoji/emoticon conversion runs before lowercasing. The emoticon lookup is a
    case-sensitive string replace, so lowercasing first would prevent any
    emoticon key that contains an uppercase letter from ever matching.
    Lowercasing afterwards also lowercases the inserted description words.
    """
    text = convert_emojis(text)  # convert emoji
    text = convert_emoticons(text)  # convert emoticon (needs original casing)
    text = text.lower()  # lower casing
    text = cont_to_meaning(text)  # contraction normalization
    text = text_cleaning(text)  # text cleaning
    text = stopword_removal(text)  # stop words removal
    text = stem_english(text)  # stemming
    text = word_token(text)  # tokenization
    return text

# DATA AMAZON

In [9]:
# Load the Amazon review sentences.
# NOTE(review): header=1 uses the file's second line as the column names (the
# 'Good case' / ' Excellent value.' columns used by later cells come from it),
# so the first two lines of the file are not loaded as data rows — confirm intended.
amazon_csv_path = 'sentiment labelled sentences/amazon_cells_labelled.csv'
data_amazon = pd.read_csv(amazon_csv_path, header=1)
data_amazon.head()

Unnamed: 0,Good case,Excellent value.,1,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,Great for the jawbone.,1,,,,
1,Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!,0,,,,
2,The mic is great.,1,,,,
3,I have to jiggle the plug to get it to line up right to get decent volume.,0,,,,
4,If you have several dozen or several hundred contacts,then imagine the fun of sending each of them one by one.,0.0,,,


In [10]:
# Keep only the 'Good case' (text) and ' Excellent value.' (label) columns and
# strip the stray space and dot from the label column's name.
# rename() is chained instead of called with inplace=True on the column subset,
# so the result is an independent frame that later cells can safely add columns to.
data_amazon = (
    data_amazon[['Good case', ' Excellent value.']]
    .rename(columns={' Excellent value.': 'Excellent value'})
)
data_amazon.head()

Unnamed: 0,Good case,Excellent value
0,Great for the jawbone.,1
1,Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!,0
2,The mic is great.,1
3,I have to jiggle the plug to get it to line up right to get decent volume.,0
4,If you have several dozen or several hundred contacts,then imagine the fun of sending each of them one by one.


## cleaning kolom Good Case

In [11]:
# Run the preprocessing pipeline over every 'Good case' entry and store the
# resulting token lists in a new column.
text_list = [text_processing(raw_text) for raw_text in data_amazon['Good case']]
data_amazon['Good case - after'] = text_list

In [12]:
# Label the raw text column as the "before" view, then show it next to the tokens.
data_amazon = data_amazon.rename(columns={'Good case': 'Good case - before'})
data_amazon[['Good case - before', 'Good case - after']]

Unnamed: 0,Good case - before,Good case - after
0,Great for the jawbone.,"[great, jawbon]"
1,Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!,"[tie, charger, convers, last, minutesmajor, problem]"
2,The mic is great.,"[mic, great]"
3,I have to jiggle the plug to get it to line up right to get decent volume.,"[jiggl, plug, get, line, right, get, decent, volum]"
4,If you have several dozen or several hundred contacts,"[sever, dozen, sever, hundr, contact]"
...,...,...
993,The screen does get smudged easily because it touches your ear and face.,"[screen, get, smudg, easili, touch, ear, face]"
994,What a piece of junk.. I lose more calls on this phone.,"[piec, junk, lose, call, phone]"
995,Item Does Not Match Picture.,"[item, match, pictur]"
996,The only thing that disappoint me is the infra red port (irda).,"[thing, disappoint, infra, red, port, irda]"


## cleaning kolom Excellent value

In [13]:
# Coerce the label column to numbers. Text fragments that landed in this column
# become NaN, and anything other than a clean 0/1 is treated as missing.
# pd.to_numeric avoids calling len() on non-string values and avoids taking a
# median over an object column of strings.
labels = pd.to_numeric(data_amazon['Excellent value'], errors='coerce')
labels = labels.where(labels.isin([0, 1]))
# NOTE(review): missing labels are filled with the column median, i.e. a guess.
# In the loaded data the real label of such rows sits in a later column that is
# no longer part of this frame (the fifth sample row has 0.0 there but is
# filled as 1) — confirm this is acceptable.
# The result is assigned back instead of using fillna(inplace=True) on a column
# selection, which is chained assignment and may not update the frame.
data_amazon['Excellent value'] = labels.fillna(labels.median()).astype('int')

In [14]:
# Show the raw text, its token list and the cleaned label side by side.
amazon_display_columns = ['Good case - before', 'Good case - after', 'Excellent value']
data_amazon[amazon_display_columns]

Unnamed: 0,Good case - before,Good case - after,Excellent value
0,Great for the jawbone.,"[great, jawbon]",1
1,Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!,"[tie, charger, convers, last, minutesmajor, problem]",0
2,The mic is great.,"[mic, great]",1
3,I have to jiggle the plug to get it to line up right to get decent volume.,"[jiggl, plug, get, line, right, get, decent, volum]",0
4,If you have several dozen or several hundred contacts,"[sever, dozen, sever, hundr, contact]",1
...,...,...,...
993,The screen does get smudged easily because it touches your ear and face.,"[screen, get, smudg, easili, touch, ear, face]",0
994,What a piece of junk.. I lose more calls on this phone.,"[piec, junk, lose, call, phone]",0
995,Item Does Not Match Picture.,"[item, match, pictur]",0
996,The only thing that disappoint me is the infra red port (irda).,"[thing, disappoint, infra, red, port, irda]",0


# DATA IMDB

In [15]:
# Load the IMDB review sentences.
# NOTE(review): no header argument is given, so the file's first line becomes
# the column names (see the 'A very' column renamed in the next cell) instead
# of being loaded as a data row — confirm intended.
imdb_csv_path = 'sentiment labelled sentences/imdb_labelled.csv'
data_imdb = pd.read_csv(imdb_csv_path)
data_imdb.head()

Unnamed: 0,A very,very,very slow-moving,aimless movie about a distressed,drifting young man.,0,Unnamed: 6,Unnamed: 7
0,Not sure who was more lost - the flat characters or the audience,nearly half of whom walked out.,0,,,,,
1,Attempting artiness with black & white and clever camera angles,the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.,0,,,,,
2,Very little music or anything to speak of.,0,,,,,,
3,The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.,1,,,,,,
4,The rest of the movie lacks art,charm,meaning... If it's about emptiness,it works I guess because it's empty.,0.0,,,


In [16]:
# Rename column 'A very' to 'text - before'.
data_imdb = data_imdb.rename(columns={'A very': 'text - before'})
data_imdb.head()

Unnamed: 0,text - before,very,very slow-moving,aimless movie about a distressed,drifting young man.,0,Unnamed: 6,Unnamed: 7
0,Not sure who was more lost - the flat characters or the audience,nearly half of whom walked out.,0,,,,,
1,Attempting artiness with black & white and clever camera angles,the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.,0,,,,,
2,Very little music or anything to speak of.,0,,,,,,
3,The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.,1,,,,,,
4,The rest of the movie lacks art,charm,meaning... If it's about emptiness,it works I guess because it's empty.,0.0,,,


## cleaning kolom text

In [17]:
# Run the preprocessing pipeline over every 'text - before' entry and store the
# resulting token lists in a new column.
text_list = [text_processing(raw_text) for raw_text in data_imdb['text - before']]
data_imdb['text - after'] = text_list

In [18]:
# Preview the raw text next to its token list.
imdb_display_columns = ['text - before', 'text - after']
data_imdb[imdb_display_columns].head()

Unnamed: 0,text - before,text - after
0,Not sure who was more lost - the flat characters or the audience,"[sure, lost, flat, charact, audienc]"
1,Attempting artiness with black & white and clever camera angles,"[attempt, arti, black, white, clever, camera, angl]"
2,Very little music or anything to speak of.,"[littl, music, anyth, speak]"
3,The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.,"[best, scene, movi, gerardo, tri, find, song, keep, run, head]"
4,The rest of the movie lacks art,"[rest, movi, lack, art]"


# DATA YELP

In [19]:
# Load the Yelp review sentences. header=None keeps the first line as data and
# gives the columns integer labels.
yelp_csv_path = 'sentiment labelled sentences/yelp_labelled.csv'
data_yelp = pd.read_csv(yelp_csv_path, header=None)
data_yelp.head()

Unnamed: 0,0,1,2,3,4,5
0,Wow... Loved this place.,1,,,,
1,Crust is not good.,0,,,,
2,Not tasty and the texture was just nasty.,0,,,,
3,Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.,1,,,,
4,The selection on the menu was great and so were the prices.,1,,,,


In [20]:
# Rename column 0 to 'text - before'.
data_yelp = data_yelp.rename(columns={0: 'text - before'})
data_yelp.head()

Unnamed: 0,text - before,1,2,3,4,5
0,Wow... Loved this place.,1,,,,
1,Crust is not good.,0,,,,
2,Not tasty and the texture was just nasty.,0,,,,
3,Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.,1,,,,
4,The selection on the menu was great and so were the prices.,1,,,,


## cleaning kolom 'text - before'

In [21]:
# Run the preprocessing pipeline over every 'text - before' entry and store the
# resulting token lists in a new column.
text_list = [text_processing(raw_text) for raw_text in data_yelp['text - before']]
data_yelp['text - after'] = text_list

In [22]:
# Preview the raw text next to its token list.
yelp_display_columns = ['text - before', 'text - after']
data_yelp[yelp_display_columns].head()

Unnamed: 0,text - before,text - after
0,Wow... Loved this place.,"[wow, love, place]"
1,Crust is not good.,"[crust, good]"
2,Not tasty and the texture was just nasty.,"[tasti, textur, nasti]"
3,Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.,"[stop, late, may, bank, holiday, rick, steve, recommend, love]"
4,The selection on the menu was great and so were the prices.,"[select, menu, great, price]"
