## Load Dataset

In [2]:
import pandas as pd
import re

In [3]:
# Load dataset

df = pd.read_csv("train_preprocess.tsv.txt" ,encoding="latin1",sep='\t',header=None,names=["text","label"])

In [4]:
df.head()

Unnamed: 0,text,label
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative


In [5]:
df.shape

(11000, 2)

In [6]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [7]:
#Cek duplikasi data

df.duplicated().sum()

67

In [8]:
# Hapus duplicated data

df = df.drop_duplicates()

In [9]:
print('Jumlah Duplikasi Data = {}'.format(df.duplicated().sum()))
print('Duplikasi Data Tela dihapus...')

Jumlah Duplikasi Data = 0
Duplikasi Data Tela dihapus...


In [10]:
#cek lagi
df.shape

(10933, 2)

In [11]:
df.head()

Unnamed: 0,text,label
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative


- Ada 67 data yang diduplikasi dalam dataset dan sudah dihapus, Update terdiri dari 10933 baris.

## Text Processing

In [12]:
df_alay = pd.read_csv('new_kamusalay.csv', encoding='ISO-8859-1', header=None)
df_alay = df_alay.rename(columns={0: 'alay', 1: 'formal'})

In [13]:
df_alay.head()

Unnamed: 0,alay,formal
0,anakjakartaasikasik,anak jakarta asyik asyik
1,pakcikdahtua,pak cik sudah tua
2,pakcikmudalagi,pak cik muda lagi
3,t3tapjokowi,tetap jokowi
4,3x,tiga kali


In [14]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
list_stopwords = set(stopwords.words('indonesian'))

def data_cleaning (text):
    clean1 = re.sub ('\\n','', text)
    clean2 = re.sub ('RT',' ', clean1)
    clean3 = re.sub ('USER', ' ', clean2)
    clean4 = re.sub ('(http|https):\/\/s+', ' ', clean3)
    clean5 = re.sub ('[^0-9a-zA-Z]+', ' ', clean4)
    clean6 = re.sub ('x[a-z0-9]{2}', ' ', clean5)
    clean7 = re.sub ("\d+", ' ', clean6)
    clean8 = re.sub ('  +', '', clean7)
    return clean8

def case_folding (text):
    return text.lower()

def alay_normalization(text):
    res = ''
    for item in text.split():
        if item in df_alay['alay'].values:
            res += df_alay[df_alay['alay'] == item]['formal'].iloc[0]
        else:
            res += item
        res += ' '
    return res

def stopword_removal(text):
    resp = ''
    for item in text.split():
        if item not in list_stopwords:
            resp += item
        resp +=' '
    clean = re.sub('  +', ' ', resp)
    return clean

def cleansing(text):
    text = data_cleaning(text)
    text = case_folding(text)
    text = alay_normalization(text)
    text = stopword_removal(text)
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Apply cleansing
df['text'] = df['text'].apply(cleansing)

In [16]:
df.head()

Unnamed: 0,text,label
0,warung dimiliki pengusaha pabrik puluhan terke...,positive
1,mohon ulama lurus kmmbri hujjah partai diwlh s...,neutral
2,lokasi strategis jalan sumatra bandung nya nya...,positive
3,betapa bahagia nya unbo paket barang nya bagus...,positive
4,aduh mahasiswa sombong kasih kartu kuning bela...,negative


In [18]:
import sqlite3

# Membuat koneksi ke database SQLite
conn = sqlite3.connect('database.db')

In [19]:
# Menyimpan DataFrame ke dalam tabel 'tabel_cleansed' dalam database
df.to_sql('tabel_cleansed', con=conn, index=False, if_exists='replace')

10933

In [20]:
print("DataFrame yang telah di-cleansing berhasil disimpan ke dalam tabel 'tabel_cleansed' di database.")

DataFrame yang telah di-cleansing berhasil disimpan ke dalam tabel 'tabel_cleansed' di database.


In [21]:
# Close the database

conn.commit()
conn.close()