# Preprocessing Data Textual - Pembangunan Model Emosi

## Library Preparation

In [1]:
!pip install Sastrawi

Collecting Sastrawi
[?25l  Downloading https://files.pythonhosted.org/packages/6f/4b/bab676953da3103003730b8fcdfadbdd20f333d4add10af949dd5c51e6ed/Sastrawi-1.0.1-py2.py3-none-any.whl (209kB)
[K     |█▋                              | 10kB 15.0MB/s eta 0:00:01[K     |███▏                            | 20kB 20.9MB/s eta 0:00:01[K     |████▊                           | 30kB 18.3MB/s eta 0:00:01[K     |██████▎                         | 40kB 16.0MB/s eta 0:00:01[K     |███████▉                        | 51kB 9.4MB/s eta 0:00:01[K     |█████████▍                      | 61kB 10.8MB/s eta 0:00:01[K     |███████████                     | 71kB 8.9MB/s eta 0:00:01[K     |████████████▌                   | 81kB 9.9MB/s eta 0:00:01[K     |██████████████                  | 92kB 10.2MB/s eta 0:00:01[K     |███████████████▋                | 102kB 8.7MB/s eta 0:00:01[K     |█████████████████▏              | 112kB 8.7MB/s eta 0:00:01[K     |██████████████████▊             | 122kB 8.7

In [28]:
import requests
import os
import joblib
import pandas as pd
from google.colab import drive
from google.colab import files
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

In [3]:
drive.mount('/content/gdrive')
drive_dir = "/content/gdrive/MyDrive/Teknik Informatika/Semester 7/TUGAS AKHIR/TA 2"

Mounted at /content/gdrive


## Preprocess Data

In [29]:
def remove_unknown_tag(text):
    text = text.replace("<unk>","")
    return text

def remove_noise_tag(text):
    text = text.replace("<noise>","")
    return text

def remove_language_tag(text):
    text = text.replace("<eng>","")
    text = text.replace("</eng>","")
    text = text.replace("<arab>","")
    text = text.replace("</arab>","")
    text = text.replace("<jawa>","")
    text = text.replace("</jawa>","")
    return text

def remove_tag(text):
    text = text.replace("<","")
    text = text.replace(">","")
    return text

def word_normalization(text):
    headers = {'x-api-key': 'G9c0AssPEjGQD7Qo0q8IIJKUG9GmoHz8dEcSRner', 'Content-Type': 'application/json'}
    body = {'text': text}
    r_pos = requests.post('https://api.prosa.ai/v1/normals', headers = headers, json = body)
    return r_pos.json()['text']

def stemming(text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return stemmer.stem(text)

def remove_stop_word(text):
  factory = StopWordRemoverFactory()
  stopword = factory.create_stop_word_remover()
  return stopword.remove(text)

### Basic Preprocess

In [9]:
def preprocess(text):
    text = remove_unknown_tag(text)
    text = remove_noise_tag(text)
    text = remove_language_tag(text)
    text = remove_tag(text)
    return text

In [10]:
data = joblib.load(drive_dir + "/Eksperimen/Data/raw.pkl")
texts = data["Transkripsi"]

In [11]:
processed = []
i = 0
for text in texts:
    if i % 100 == 0: print("Preprocessing...", i, "data text already done.")
    processed.append(preprocess(text))
    i = i + 1

Preprocessing... 0 data text already done.
Preprocessing... 100 data text already done.
Preprocessing... 200 data text already done.
Preprocessing... 300 data text already done.
Preprocessing... 400 data text already done.
Preprocessing... 500 data text already done.
Preprocessing... 600 data text already done.
Preprocessing... 700 data text already done.
Preprocessing... 800 data text already done.
Preprocessing... 900 data text already done.
Preprocessing... 1000 data text already done.
Preprocessing... 1100 data text already done.
Preprocessing... 1200 data text already done.
Preprocessing... 1300 data text already done.
Preprocessing... 1400 data text already done.
Preprocessing... 1500 data text already done.
Preprocessing... 1600 data text already done.
Preprocessing... 1700 data text already done.
Preprocessing... 1800 data text already done.
Preprocessing... 1900 data text already done.
Preprocessing... 2000 data text already done.
Preprocessing... 2100 data text already done.


In [12]:
processed_data = data.copy()
processed_data['Transkripsi'] = processed
processed_data = processed_data.sort_values(by=['ID Ucapan'])
processed_data = processed_data.reset_index()
processed_data = processed_data.drop(columns=['index'])
processed_data.head()

Unnamed: 0,ID Ucapan,Transkripsi,Label Emosi,Abstraksi Emosi
0,1001001,waktu satu sma itu kaya ada guru yang ngomongi...,Senang,Senang
1,1001002,gua masih enggak sadar kan,Terkejut,
2,1001003,terus ya udahlah berlalu aja,Sedih,
3,1001004,nah terus gua enggak tahu kenapa tiba-tiba gua...,Senang,
4,1001005,itu kaya ih gua ngestalk dulu ah mau buat capt...,Senang,


In [13]:
processed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10822 entries, 0 to 10821
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ID Ucapan        10822 non-null  object
 1   Transkripsi      10822 non-null  object
 2   Label Emosi      10822 non-null  object
 3   Abstraksi Emosi  2003 non-null   object
dtypes: object(4)
memory usage: 338.3+ KB


In [14]:
vocab = []
for text in processed_data['Transkripsi']:
  for word in text.split():
    vocab.append(word)
vocab = set(vocab)
print("Total kata : ", len(vocab))

Total kata :  8714


In [15]:
joblib.dump(processed_data, "basic.pkl")
files.download("basic.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### + Word Normalization

In [None]:
def preprocess(text):
    word_normalization(text)
    return text

In [None]:
data = joblib.load(drive_dir + "/Eksperimen/Data/basic.pkl")
texts = data["Transkripsi"]

In [None]:
processed = []
i = 0
for text in texts:
    if i % 100 == 0: print("Preprocessing...", i, "data text already done.")
    processed.append(preprocess(text))
    i = i + 1

In [None]:
processed_data = data.copy()
processed_data['Transkripsi'] = processed
processed_data = processed_data.sort_values(by=['ID Ucapan'])
processed_data = processed_data.reset_index()
processed_data = processed_data.drop(columns=['index'])
processed_data.head()

Unnamed: 0,ID Ucapan,Transkripsi,Label Emosi,Abstraksi Emosi
0,1001001,waktu satu sma itu kaya ada guru yang ngomongi...,Senang,Senang
1,1001002,gua masih enggak sadar kan,Terkejut,
2,1001003,terus ya udahlah berlalu aja,Sedih,
3,1001004,nah terus gua enggak tahu kenapa tiba-tiba gua...,Senang,
4,1001005,itu kaya ih gua ngestalk dulu ah mau buat capt...,Senang,


In [None]:
processed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10822 entries, 0 to 10821
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ID Ucapan        10822 non-null  object
 1   Transkripsi      10822 non-null  object
 2   Label Emosi      10822 non-null  object
 3   Abstraksi Emosi  2003 non-null   object
dtypes: object(4)
memory usage: 338.3+ KB


In [None]:
vocab = []
for text in processed_data['Transkripsi']:
  for word in text.split():
    vocab.append(word)
vocab = set(vocab)
print("Total kata : ", len(vocab))

Total kata :  8714


In [None]:
joblib.dump(processed_data, "normalization.pkl")
files.download("normalization.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### + Stemming

In [20]:
def preprocess(text):
    text = stemming(text)
    return text

In [21]:
data = joblib.load(drive_dir + "/Eksperimen/Data/normalization.pkl")
texts = data["Transkripsi"]

In [22]:
processed = []
i = 0
for text in texts:
    if i % 100 == 0: print("Preprocessing...", i, "data text already done.")
    processed.append(preprocess(text))
    i = i + 1

Preprocessing... 0 data text already done.
Preprocessing... 100 data text already done.
Preprocessing... 200 data text already done.
Preprocessing... 300 data text already done.
Preprocessing... 400 data text already done.
Preprocessing... 500 data text already done.
Preprocessing... 600 data text already done.
Preprocessing... 700 data text already done.
Preprocessing... 800 data text already done.
Preprocessing... 900 data text already done.
Preprocessing... 1000 data text already done.
Preprocessing... 1100 data text already done.
Preprocessing... 1200 data text already done.
Preprocessing... 1300 data text already done.
Preprocessing... 1400 data text already done.
Preprocessing... 1500 data text already done.
Preprocessing... 1600 data text already done.
Preprocessing... 1700 data text already done.
Preprocessing... 1800 data text already done.
Preprocessing... 1900 data text already done.
Preprocessing... 2000 data text already done.
Preprocessing... 2100 data text already done.


In [23]:
processed_data = data.copy()
processed_data['Transkripsi'] = processed
processed_data = processed_data.sort_values(by=['ID Ucapan'])
processed_data = processed_data.reset_index()
processed_data = processed_data.drop(columns=['index'])
processed_data.head()

Unnamed: 0,ID Ucapan,Transkripsi,Label Emosi,Abstraksi Emosi
0,1001001,waktu satu sama itu kaya ada guru yang bicara ...,Senang,Senang
1,1001002,gua masih enggak sadar kan,Terkejut,
2,1001003,terus ya udahlah lalu aja,Sedih,
3,1001004,nah terus gua enggak tahu kenapa tiba gua kaya...,Senang,
4,1001005,itu kaya ih gua mengestalk dulu ah mau buat ca...,Senang,


In [24]:
processed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10822 entries, 0 to 10821
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ID Ucapan        10822 non-null  object
 1   Transkripsi      10822 non-null  object
 2   Label Emosi      10822 non-null  object
 3   Abstraksi Emosi  2003 non-null   object
dtypes: object(4)
memory usage: 338.3+ KB


In [25]:
vocab = []
for text in processed_data['Transkripsi']:
  for word in text.split():
    vocab.append(word)
vocab = set(vocab)
print("Total kata : ", len(vocab))

Total kata :  5054


In [27]:
joblib.dump(processed_data, "stemming.pkl")
files.download("stemming.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### + Stop Word Elimination

In [37]:
remove_stop_word('Dengan Menggunakan Python dan Library Sastrawi saya dapat melakukan proses Stopword Removal')

'Dengan Menggunakan Python Library Sastrawi dapat melakukan proses Stopword Removal'

In [38]:
def preprocess(text):
    text = remove_stop_word(text)
    return text

In [39]:
data = joblib.load(drive_dir + "/Eksperimen/Data/stemming.pkl")
texts = data["Transkripsi"]

In [40]:
processed = []
i = 0
for text in texts:
    if i % 100 == 0: print("Preprocessing...", i, "data text already done.")
    processed.append(preprocess(text))
    i = i + 1

Preprocessing... 0 data text already done.
Preprocessing... 100 data text already done.
Preprocessing... 200 data text already done.
Preprocessing... 300 data text already done.
Preprocessing... 400 data text already done.
Preprocessing... 500 data text already done.
Preprocessing... 600 data text already done.
Preprocessing... 700 data text already done.
Preprocessing... 800 data text already done.
Preprocessing... 900 data text already done.
Preprocessing... 1000 data text already done.
Preprocessing... 1100 data text already done.
Preprocessing... 1200 data text already done.
Preprocessing... 1300 data text already done.
Preprocessing... 1400 data text already done.
Preprocessing... 1500 data text already done.
Preprocessing... 1600 data text already done.
Preprocessing... 1700 data text already done.
Preprocessing... 1800 data text already done.
Preprocessing... 1900 data text already done.
Preprocessing... 2000 data text already done.
Preprocessing... 2100 data text already done.


In [41]:
processed_data = data.copy()
processed_data['Transkripsi'] = processed
processed_data = processed_data.sort_values(by=['ID Ucapan'])
processed_data = processed_data.reset_index()
processed_data = processed_data.drop(columns=['index'])
processed_data.head()

Unnamed: 0,ID Ucapan,Transkripsi,Label Emosi,Abstraksi Emosi
0,1001001,waktu satu sama kaya guru bicara dia,Senang,Senang
1,1001002,gua enggak sadar kan,Terkejut,
2,1001003,terus udahlah lalu aja,Sedih,
3,1001004,nah terus gua enggak tahu tiba gua kayak tahu ...,Senang,
4,1001005,kaya ih gua mengestalk dulu ah mau buat captio...,Senang,


In [42]:
processed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10822 entries, 0 to 10821
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ID Ucapan        10822 non-null  object
 1   Transkripsi      10822 non-null  object
 2   Label Emosi      10822 non-null  object
 3   Abstraksi Emosi  2003 non-null   object
dtypes: object(4)
memory usage: 338.3+ KB


In [43]:
vocab = []
for text in processed_data['Transkripsi']:
  for word in text.split():
    vocab.append(word)
vocab = set(vocab)
print("Total kata : ", len(vocab))

Total kata :  5046


In [44]:
joblib.dump(processed_data, "stopword.pkl")
files.download("stopword.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### InaNLP

In [None]:
processed_data_textual = []
i = 0
for text in data_textual['Transkripsi']:
    print("Preprocessing...", i, "data text already done.")
    processed_data_textual.append(preprocess(text))
    i = i + 1

In [None]:
for text in data_textual['Transkripsi'][4570:]:
    print("Preprocessing...", i, "data text already done.")
    processed_data_textual.append(preprocess(text))
    i = i + 1

In [None]:
for text in data_textual['Transkripsi'][5941:]:
    print("Preprocessing...", i, "data text already done.")
    processed_data_textual.append(preprocess(text))
    i = i + 1

Preprocessing... 5941 data text already done.
Preprocessing... 5942 data text already done.
Preprocessing... 5943 data text already done.
Preprocessing... 5944 data text already done.
Preprocessing... 5945 data text already done.
Preprocessing... 5946 data text already done.
Preprocessing... 5947 data text already done.
Preprocessing... 5948 data text already done.
Preprocessing... 5949 data text already done.
Preprocessing... 5950 data text already done.
Preprocessing... 5951 data text already done.
Preprocessing... 5952 data text already done.
Preprocessing... 5953 data text already done.
Preprocessing... 5954 data text already done.
Preprocessing... 5955 data text already done.
Preprocessing... 5956 data text already done.
Preprocessing... 5957 data text already done.
Preprocessing... 5958 data text already done.
Preprocessing... 5959 data text already done.
Preprocessing... 5960 data text already done.
Preprocessing... 5961 data text already done.
Preprocessing... 5962 data text al

In [None]:
processed_data_inanlp = data_textual.copy()
processed_data_inanlp['Transkripsi'] = processed_data_textual
processed_data_inanlp = processed_data_inanlp.sort_values(by=['ID Ucapan'])
processed_data_inanlp = processed_data_inanlp.reset_index()
processed_data_inanlp = processed_data_inanlp.drop(columns=['index'])
processed_data_inanlp.head()

Unnamed: 0,ID Ucapan,Transkripsi,Label Emosi,Abstraksi Emosi
0,1001001,waktu satu sama itu kaya ada guru yang membica...,Senang,Senang
1,1001002,gua masih enggak sadar kan,Terkejut,
2,1001003,terus ya udahlah berlalu aja,Sedih,
3,1001004,nah terus gua enggak tahu kenapa tiba-tiba gua...,Senang,
4,1001005,itu kaya ih gua mengestalk dulu ah mau buat ca...,Senang,


In [None]:
processed_data_inanlp.head()

Unnamed: 0,ID Ucapan,Transkripsi,Label Emosi,Abstraksi Emosi
0,1001001,waktu satu sama itu kaya ada guru yang membica...,Senang,Senang
1,1001002,gua masih enggak sadar kan,Terkejut,
2,1001003,terus ya udahlah berlalu aja,Sedih,
3,1001004,nah terus gua enggak tahu kenapa tiba-tiba gua...,Senang,
4,1001005,itu kaya ih gua mengestalk dulu ah mau buat ca...,Senang,


In [None]:
processed_data_inanlp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10822 entries, 0 to 10821
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ID Ucapan        10822 non-null  object
 1   Transkripsi      10822 non-null  object
 2   Label Emosi      10822 non-null  object
 3   Abstraksi Emosi  2003 non-null   object
dtypes: object(4)
memory usage: 338.3+ KB


In [None]:
vocab = []
for text in processed_data_inanlp['Transkripsi']:
  for word in text.split():
    vocab.append(word)
vocab = set(vocab)
print("Total kata : ", len(vocab))

Total kata :  8339


In [None]:
joblib.dump(processed_data_inanlp, "processed_data_syntax_analyzer.pkl")
files.download("processed_data_syntax_analyzer.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>