**UTS MACHINE LEARNING**

Studi Kasus menggunakan dataset tweet_emotions

In [1]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
#Membuat dataframe
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/tweet_emotions.csv')

df.head(5)

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


**PRA PENGOLAHAN DATA**

1. Case Folding - Mengubah semua bentuk huruf dalam sebuah teks atau dokumen menjadi huruf kecil semua

2. Operasi tokenisasi atau Tokenizing -  memisahkan teks menjadi potongan-potongan berupa token, bisa berupa potongan huruf, kata, atau kalimat, sebelum dianalisis lebih lanjut

3. Filtering/Stopword Removal - menghapus kata-kata umum yang tidak memiliki makna misalnya re, is, i, am, was, were, they, you, the, dan lain sebagainya

In [5]:
#CATATAN: PADA DATA TWITTER TERDAPAT MENTION (@something) YANG ANDA HARUS TANGANI SEBELUM MASUK KE TAHAP EKSTRAKSI FITUR

#Membuat fungsi untuk membersihkan text pada kolom content
def cleanTxt(text):
  text = re.sub(r'@[A-Za-z0-9]+', '', text) #Remove @mention
  text = re.sub(r'#', '', text) #Remove the # symbol
  text = re.sub(r'RT[\s]+', '', text) #Remove RT
  text = re.sub(r'https?:\/\/\S+', '', text) #Remove the hyper Link
  return text

cleantext = df['content']=df['content'].apply(cleanTxt) #Memproses clean text dengan menerapkan fungsi yang sudah dibuat

print('Hasil Clean Text: \n')
cleantext.head() #Menampilkan text yang sudah dibersihkan

Hasil Clean Text: 



0     i know  i was listenin to bad habit earlier a...
1    Layin n bed with a headache  ughhhh...waitin o...
2                  Funeral ceremony...gloomy friday...
3                 wants to hang out with friends SOON!
4     We want to trade with someone who has Houston...
Name: content, dtype: object

In [6]:
#PRA PENGOLAHAN DATA - OPERASI CASE FOLDING

#Membuat fungsi untuk membersihkan text pada kolom content
def casefolding(text):
  text = lower_case = text.lower() # Mengubah Teks menjadi Lowercase
  text = re.sub(r"\d+", "", text) # Menghapus Angka
  text = text.translate(str.maketrans("","",string.punctuation)) # Menghapus Tanda Baca
  text = text.strip() # Menghapus karakter kosong (white space)
  text = re.sub('\s+',' ',text) # Menghapus beberapa whitespace menjadi whitespace tunggal
  return text

casefolding = df['casefolding'] = cleantext.apply(casefolding) #Memproses case folding dengan menerapkan fungsi yang sudah dibuat

print('Hasil Case Folding: \n') 
casefolding.head() #Menampilkan text yang sudah melalui case folding

Hasil Case Folding: 



0    i know i was listenin to bad habit earlier and...
1    layin n bed with a headache ughhhhwaitin on yo...
2                        funeral ceremonygloomy friday
3                  wants to hang out with friends soon
4    we want to trade with someone who has houston ...
Name: content, dtype: object

In [7]:
#PRA PENGOLAHAN DATA - OPERASI TOKENIZATION

def tokenization(text):
  #text = re.split('\W+',text)
  text = word_tokenize(text)
  return text

tokenization = df['tokenization'] = casefolding.apply(tokenization) #Memproses tokenizing dengan menerapkan fungsi yang sudah dibuat

print('Hasil Tokenization: \n')
tokenization.head() #Menampilkan hasil tokenizing per kata

Hasil Tokenization: 



0    [i, know, i, was, listenin, to, bad, habit, ea...
1    [layin, n, bed, with, a, headache, ughhhhwaiti...
2                    [funeral, ceremonygloomy, friday]
3          [wants, to, hang, out, with, friends, soon]
4    [we, want, to, trade, with, someone, who, has,...
Name: content, dtype: object

In [8]:
#PRA PENGOLAHAN DATA - OPERASI FILTERING/STOPWORD REMOVAL
#clean stopwords
stop_words = set(stopwords.words('english'))

def clean_stopwords(text):
  text = [w for w in text if w not in stop_words]
  return text

# Buat kolom tambahan untuk data description yang telah distopwordsremoval  
clean_stopwords = df['clean_stopwords'] = tokenization.apply(clean_stopwords)

print('Hasil Filtering: \n') 
clean_stopwords.head()

Hasil Filtering: 



0    [know, listenin, bad, habit, earlier, started,...
1        [layin, n, bed, headache, ughhhhwaitin, call]
2                    [funeral, ceremonygloomy, friday]
3                         [wants, hang, friends, soon]
4        [want, trade, someone, houston, tickets, one]
Name: content, dtype: object

In [9]:
#PRA PENGOLAHAN DATA - PENGGABUNGAN KATA
def join_words(text):
  text =np.array(text)
  text = ' '.join(text)
  return text

join_words = df['join_words'] = clean_stopwords.apply(join_words)

print('Hasil Penggabungan Kata: \n') 
join_words.head()

Hasil Penggabungan Kata: 



0    know listenin bad habit earlier started freaki...
1               layin n bed headache ughhhhwaitin call
2                        funeral ceremonygloomy friday
3                              wants hang friends soon
4               want trade someone houston tickets one
Name: content, dtype: object

In [10]:
# Melihat kolom dan baris dataframe 
df

Unnamed: 0,tweet_id,sentiment,content,casefolding,tokenization,clean_stopwords,join_words
0,1956967341,empty,i know i was listenin to bad habit earlier a...,i know i was listenin to bad habit earlier and...,"[i, know, i, was, listenin, to, bad, habit, ea...","[know, listenin, bad, habit, earlier, started,...",know listenin bad habit earlier started freaki...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...,layin n bed with a headache ughhhhwaitin on yo...,"[layin, n, bed, with, a, headache, ughhhhwaiti...","[layin, n, bed, headache, ughhhhwaitin, call]",layin n bed headache ughhhhwaitin call
2,1956967696,sadness,Funeral ceremony...gloomy friday...,funeral ceremonygloomy friday,"[funeral, ceremonygloomy, friday]","[funeral, ceremonygloomy, friday]",funeral ceremonygloomy friday
3,1956967789,enthusiasm,wants to hang out with friends SOON!,wants to hang out with friends soon,"[wants, to, hang, out, with, friends, soon]","[wants, hang, friends, soon]",wants hang friends soon
4,1956968416,neutral,We want to trade with someone who has Houston...,we want to trade with someone who has houston ...,"[we, want, to, trade, with, someone, who, has,...","[want, trade, someone, houston, tickets, one]",want trade someone houston tickets one
...,...,...,...,...,...,...,...
39995,1753918954,neutral,,,[],[],
39996,1753919001,love,Happy Mothers Day All my love,happy mothers day all my love,"[happy, mothers, day, all, my, love]","[happy, mothers, day, love]",happy mothers day love
39997,1753919005,love,Happy Mother's Day to all the mommies out ther...,happy mothers day to all the mommies out there...,"[happy, mothers, day, to, all, the, mommies, o...","[happy, mothers, day, mommies, woman, man, lon...",happy mothers day mommies woman man long youre...
39998,1753919043,happiness,WASSUP BEAUTIFUL!!! FOLLOW ME!! PEEP OUT MY ...,wassup beautiful follow me peep out my new hit...,"[wassup, beautiful, follow, me, peep, out, my,...","[wassup, beautiful, follow, peep, new, hit, si...",wassup beautiful follow peep new hit singles w...


**ENCODE LABEL**

In [11]:
# Encode Label
# Label Emosi : anger, boredom, empty, enthusiasm, fun, happiness, hate, love, neutral, relief, sadness, surprise, worry

# Membuat data untuk label baru menggunakan angka
new_labels = {
    'anger': 0,
    'boredom': 1,
    'empty': 2,
    'enthusiasm': 3,
    'fun': 4,
    'happiness': 5,
    'hate': 6,
    'love': 7,
    'neutral': 8,
    'relief': 9,
    'sadness': 10,
    'surprise': 11,
    'worry': 12
}

#Encode label
df['sentiment'] = df['sentiment'].map(new_labels)

#Cek data
df.head()

Unnamed: 0,tweet_id,sentiment,content,casefolding,tokenization,clean_stopwords,join_words
0,1956967341,2,i know i was listenin to bad habit earlier a...,i know i was listenin to bad habit earlier and...,"[i, know, i, was, listenin, to, bad, habit, ea...","[know, listenin, bad, habit, earlier, started,...",know listenin bad habit earlier started freaki...
1,1956967666,10,Layin n bed with a headache ughhhh...waitin o...,layin n bed with a headache ughhhhwaitin on yo...,"[layin, n, bed, with, a, headache, ughhhhwaiti...","[layin, n, bed, headache, ughhhhwaitin, call]",layin n bed headache ughhhhwaitin call
2,1956967696,10,Funeral ceremony...gloomy friday...,funeral ceremonygloomy friday,"[funeral, ceremonygloomy, friday]","[funeral, ceremonygloomy, friday]",funeral ceremonygloomy friday
3,1956967789,3,wants to hang out with friends SOON!,wants to hang out with friends soon,"[wants, to, hang, out, with, friends, soon]","[wants, hang, friends, soon]",wants hang friends soon
4,1956968416,8,We want to trade with someone who has Houston...,we want to trade with someone who has houston ...,"[we, want, to, trade, with, someone, who, has,...","[want, trade, someone, houston, tickets, one]",want trade someone houston tickets one


In [12]:
# Memisahkan Fitur dengan Label
x = df['join_words'].values 
y = df['sentiment'].values

**EKSTRAKSI FITUR**

In [13]:
# Ekstraksi Fitur menggunakan TF-IDF Vectorizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

#split data training dan testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=50)

#inisiasi TfidfVectorizer
bow = TfidfVectorizer()

#fitting dan transform x_train dengan TfidfVectorizer
x_train = bow.fit_transform(x_train)

#transform x_test
x_test = bow.transform(x_test)

#display x_train
print(x_train)

  (0, 6205)	0.20376194488024502
  (0, 9378)	0.35656838537159646
  (0, 14936)	0.46772056146677005
  (0, 26810)	0.3523013459787868
  (0, 13429)	0.4423390347510864
  (0, 18032)	0.26530958244734903
  (0, 9838)	0.3580659586106687
  (0, 22705)	0.3069154076552215
  (1, 16692)	0.2372127989963308
  (1, 11125)	0.21941447803529965
  (1, 16452)	0.2870949331813048
  (1, 12437)	0.5196708475580234
  (1, 15068)	0.71374158672116
  (1, 6205)	0.18351847794032036
  (2, 21960)	0.35790631330849804
  (2, 2274)	0.18010586415732605
  (2, 12383)	0.10986579329401797
  (2, 887)	0.16061263294645856
  (2, 9277)	0.3039730032687927
  (2, 24129)	0.3039730032687927
  (2, 5716)	0.24392997941315106
  (2, 12253)	0.23557038394113095
  (2, 15009)	0.22485925192922934
  (2, 7617)	0.2143605447281244
  (2, 346)	0.35790631330849804
  :	:
  (31996, 15038)	0.1602073576984036
  (31996, 10867)	0.21564795598225864
  (31997, 8861)	0.3984053565380047
  (31997, 6013)	0.3832804693343966
  (31997, 24877)	0.36422536499700375
  (31997, 4343

In [14]:
#tranformasi ke bentuk array
x_train.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

**TRAINING DAN EVALUASI MODEL**

In [15]:
#Pembuatan Model dengan algoritma klasifikasi Naive Bayes
#Proses Evaluasi menggunakan metric akurasi

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

#inisiasi MultinominalNB
mnb = MultinomialNB()

#fit model
mnb.fit(x_train, y_train)

#prediksi dengan data training
y_pred_train = mnb.predict(x_train)

#evaluasi akurasi data training
acc_train = accuracy_score(y_train, y_pred_train)

#prediksi dengan data testing
y_pred_test = mnb.predict(x_test)

#evaluasi akurasi data testing
acc_test = accuracy_score(y_test, y_pred_test)

#print hasil evaluasi
print(f'Hasil akurasi data train: {acc_train}')
print(f'Hasil akurasi data test: {acc_test}')

Hasil akurasi data train: 0.45965625
Hasil akurasi data test: 0.30125
