In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import csv
import re
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords

In [None]:
# Tokenizer data for word_tokenize. Newer NLTK releases load the "punkt_tab"
# resource instead of the pickled "punkt" models, so both are requested to keep
# the notebook runnable on either side of that change.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# newline="" lets the csv module handle line endings itself; without it,
# newlines embedded inside quoted fields are not interpreted correctly.
with open("/content/drive/MyDrive/Portofolio/Zero-Shot Classification for Tweeter/dataset_unlabeled.csv", "r", encoding="utf-8", newline="") as file:
    reader = csv.reader(file, delimiter=';', quotechar='"')
    rows = list(reader)

# Convert to a DataFrame: the first row (index 0) supplies the column names,
# every following row becomes a data record.
data = pd.DataFrame(rows[1:], columns=rows[0])

In [None]:
data.head()

Unnamed: 0,IDText,Text
0,TXT0001,Lu mau org2 pro-demokrasi di negara ini bisa p...
1,TXT0002,Prabowo ditanya soal hutang luar negeri dia me...
2,TXT0003,kiki_daliyo Ganjar Pranowo itulah beliau soso...
3,TXT0004,@kumparan Prabowo Gibran yang bisa melakukan i...
4,TXT0005,@sniperruben45 @uda_zulhendra @ainunnajib Lah ...


In [None]:
def remove_special(text):
    """Strip tweet-specific noise from ``text`` and return the cleaned string.

    Steps, in order:
      1. tabs, newlines and literal ``\\u`` become spaces; remaining
         backslashes are dropped;
      2. every non-ASCII character is replaced by ``?``;
      3. @mentions, #hashtags and complete http(s) links become spaces and
         whitespace runs are collapsed to one space (ends trimmed);
      4. any leftover bare ``http://`` / ``https://`` prefix becomes a space.
    """
    # Order matters: '\\u' has to be handled before the generic backslash removal.
    for old, new in (('\t', " "), ('\n', " "), ('\\u', " "), ('\\', "")):
        text = text.replace(old, new)

    # Emoji, CJK characters, etc. are turned into '?' placeholders.
    text = text.encode('ascii', 'replace').decode('ascii')

    without_entities = re.sub(r"([@#][A-Za-z0-9_]+)|(\bhttps?://\S+)", " ", text)
    text = ' '.join(without_entities.split())

    # A scheme with nothing after it is not caught by the pattern above.
    for scheme in ("http://", "https://"):
        text = text.replace(scheme, " ")
    return text

data['Text'] = data['Text'].apply(remove_special)

# Strip digits
def remove_number(text):
    """Return ``text`` with every run of digits deleted (nothing is inserted in its place)."""
    digit_runs = re.compile(r"\d+")
    return digit_runs.sub("", text)

data['Text'] = data['Text'].apply(remove_number)

# Strip punctuation
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Characters are removed, not replaced by a space, so tokens joined by
    punctuation are fused (``pro-demokrasi`` becomes ``prodemokrasi``).
    """
    deletion_table = {ord(symbol): None for symbol in string.punctuation}
    return text.translate(deletion_table)

data['Text'] = data['Text'].apply(remove_punctuation)

# Trim whitespace at both ends
def remove_whitespace_LT(text):
    """Return ``text`` without leading or trailing whitespace."""
    trimmed = text.strip()
    return trimmed

data['Text'] = data['Text'].apply(remove_whitespace_LT)

# Collapse whitespace runs
def remove_whitespace_multiple(text):
    """Return ``text`` with each run of whitespace replaced by a single space.

    Leading and trailing runs are collapsed to one space too, not removed.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

data['Text'] = data['Text'].apply(remove_whitespace_multiple)

# Drop stand-alone single letters
def remove_single_char(text):
    """Return ``text`` with every isolated ASCII letter deleted.

    Only the letter itself is removed; the whitespace around it is kept.
    """
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)

data['Text'] = data['Text'].apply(remove_single_char)

# Lowercase the text
def case_folding(text):
    """Return a lowercase copy of ``text``."""
    lowered = text.lower()
    return lowered

# Apply case folding
data['Text'] = data['Text'].apply(case_folding)

# Tokenize with NLTK
def word_tokenize_wrapper(text):
    """Split ``text`` into tokens using NLTK's ``word_tokenize`` with its default settings."""
    tokens = word_tokenize(text)
    return tokens

data['Text_Token'] = data['Text'].apply(word_tokenize_wrapper)

data.head()

Unnamed: 0,IDText,Text,Text_Token
0,TXT0001,lu mau org prodemokrasi di negara ini bisa pun...,"[lu, mau, org, prodemokrasi, di, negara, ini, ..."
1,TXT0002,prabowo ditanya soal hutang luar negeri dia me...,"[prabowo, ditanya, soal, hutang, luar, negeri,..."
2,TXT0003,kikidaliyo ganjar pranowo itulah beliau sosok ...,"[kikidaliyo, ganjar, pranowo, itulah, beliau, ..."
3,TXT0004,prabowo gibran yang bisa melakukan itu semua d...,"[prabowo, gibran, yang, bisa, melakukan, itu, ..."
4,TXT0005,lah justru yg gak nyambung junjungan elu aomkm...,"[lah, justru, yg, gak, nyambung, junjungan, el..."


In [None]:
# Load the colloquial (slang) Indonesian lexicon and build a slang -> formal lookup.
kamus_alay = pd.read_csv("/content/drive/MyDrive/Portofolio/Zero-Shot Classification for Tweeter/colloquial-indonesian-lexicon.csv")
# dict() keeps the last value seen for a key, so if a slang form appears in
# several rows, the formal form from the last such row is the one used.
kamus_dict = dict(zip(kamus_alay['slang'], kamus_alay['formal']))

kamus_alay.head()

Unnamed: 0,slang,formal,In-dictionary,context,category1,category2,category3
0,woww,wow,1,wow,elongasi,0,0
1,aminn,amin,1,Selamat ulang tahun kakak tulus semoga panjang...,elongasi,0,0
2,met,selamat,1,Met hari netaas kak!? Wish you all the best @t...,abreviasi,0,0
3,netaas,menetas,1,Met hari netaas kak!? Wish you all the best @t...,afiksasi,elongasi,0
4,keberpa,keberapa,0,Birthday yg keberpa kak?,abreviasi,0,0


In [None]:
# Slang normalization helper
def normalize_text(text_tokens, kamus):
    """Map each token through the ``kamus`` lookup and return the new list.

    Args:
        text_tokens: iterable of tokens.
        kamus: mapping from a slang token to its replacement.

    Returns:
        A list of the same length; tokens without an entry in ``kamus`` are
        passed through unchanged.
    """
    normalized = []
    for token in text_tokens:
        normalized.append(kamus.get(token, token))
    return normalized

# Apply the slang normalization to every token list in the Text_Token column
data['Text_Token_Normalized'] = data['Text_Token'].apply(lambda tokens: normalize_text(tokens, kamus_dict))

# Show the result
data.head()

Unnamed: 0,IDText,Text,Text_Token,Text_Token_Normalized
0,TXT0001,lu mau org prodemokrasi di negara ini bisa pun...,"[lu, mau, org, prodemokrasi, di, negara, ini, ...","[lu, mau, orang, prodemokrasi, di, negara, ini..."
1,TXT0002,prabowo ditanya soal hutang luar negeri dia me...,"[prabowo, ditanya, soal, hutang, luar, negeri,...","[prabowo, ditanya, soal, hutang, luar, negeri,..."
2,TXT0003,kikidaliyo ganjar pranowo itulah beliau sosok ...,"[kikidaliyo, ganjar, pranowo, itulah, beliau, ...","[kikidaliyo, ganjar, pranowo, itulah, beliau, ..."
3,TXT0004,prabowo gibran yang bisa melakukan itu semua d...,"[prabowo, gibran, yang, bisa, melakukan, itu, ...","[prabowo, gibran, yang, bisa, melakukan, itu, ..."
4,TXT0005,lah justru yg gak nyambung junjungan elu aomkm...,"[lah, justru, yg, gak, nyambung, junjungan, el...","[lah, justru, yang, enggak, menyambung, junjun..."


In [None]:
# Join each row's normalized tokens back into one space-separated string.
data['Text_Merged'] = data['Text_Token_Normalized'].apply(lambda tokens: ' '.join(tokens))

In [None]:
data = data.drop(columns=['Text_Token', 'Text_Token_Normalized'])

In [None]:
data.head()

Unnamed: 0,IDText,Text,Text_Merged
0,TXT0001,lu mau org prodemokrasi di negara ini bisa pun...,lu mau orang prodemokrasi di negara ini bisa p...
1,TXT0002,prabowo ditanya soal hutang luar negeri dia me...,prabowo ditanya soal hutang luar negeri dia me...
2,TXT0003,kikidaliyo ganjar pranowo itulah beliau sosok ...,kikidaliyo ganjar pranowo itulah beliau sosok ...
3,TXT0004,prabowo gibran yang bisa melakukan itu semua d...,prabowo gibran yang bisa melakukan itu semua d...
4,TXT0005,lah justru yg gak nyambung junjungan elu aomkm...,lah justru yang enggak menyambung junjungan lu...


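Translation (Indonesian to English): the cells below call Google Translate through the `deep-translator` package. The model originally noted here, google-t5/t5-small, is not loaded anywhere in the cells that follow.
https://huggingface.co/google-t5/t5-small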

In [None]:
pip install deep-translator

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep-translator
Successfully installed deep-translator-1.11.4


In [None]:
pip install tqdm



In [None]:
from deep_translator import GoogleTranslator
from tqdm import tqdm
import pandas as pd

# Translation helper
def translate_text(text):
    """Translate ``text`` from Indonesian ('id') to English ('en') with GoogleTranslator.

    This is best-effort: if anything raises during translation, the error is
    printed and the original ``text`` is returned unchanged, so callers may
    receive untranslated input back.
    """
    try:
        translator = GoogleTranslator(source='id', target='en')
        translated = translator.translate(text)
    except Exception as e:
        print(f"Error translating text: {e}")
        translated = text  # fall back to the untranslated input
    return translated

# Register tqdm's progress_apply on pandas objects
tqdm.pandas()

# Translate the Text_Merged column row by row, showing a progress bar
data['Translated_Text'] = data['Text_Merged'].progress_apply(translate_text)


100%|██████████| 1000/1000 [12:11<00:00,  1.37it/s]


In [None]:
data.head(10)

Unnamed: 0,IDText,Text,Text_Merged,Translated_Text
0,TXT0001,lu mau org prodemokrasi di negara ini bisa pun...,lu mau orang prodemokrasi di negara ini bisa p...,You want pro-democracy people in this country ...
1,TXT0002,prabowo ditanya soal hutang luar negeri dia me...,prabowo ditanya soal hutang luar negeri dia me...,"Prabowo was asked about foreign debt, he answe..."
2,TXT0003,kikidaliyo ganjar pranowo itulah beliau sosok ...,kikidaliyo ganjar pranowo itulah beliau sosok ...,kikidaliyo ganjar pranowo that's the figure wh...
3,TXT0004,prabowo gibran yang bisa melakukan itu semua d...,prabowo gibran yang bisa melakukan itu semua d...,Prabowo Gibran who can do all that for the wel...
4,TXT0005,lah justru yg gak nyambung junjungan elu aomkm...,lah justru yang enggak menyambung junjungan lu...,It's actually the one who doesn't connect with...
5,TXT0006,nelayan tak bisa terus terjebak dalam kredit m...,nelayan tak bisa terus terjebak dalam kredit m...,Fishermen cannot continue to be trapped in bad...
6,TXT0007,prabowo anti kebebasan pers,prabowo anti kebebasan pers,Prabowo is against press freedom
7,TXT0008,kontrak terbuka ganjar pranowo adalah cermin d...,kontrak terbuka ganjar pranowo adalah cermin d...,Ganjar Pranowo's open contract is a reflection...
8,TXT0009,ganjar mahfud hebat pak ganjar selalu menjungj...,ganjar mahfud hebat pak ganjar selalu menjungj...,"Ganjar Mahfud is great, Mr. Ganjar always upho..."
9,TXT0010,ganjarmahfud adalah harapan baru untuk pertumb...,ganjarmahfud adalah harapan baru untuk pertumb...,ganjarmahfud is a new hope for our economic gr...


In [None]:
data.to_csv("/content/drive/MyDrive/Portofolio/Zero-Shot Classification for Tweeter/translated_texts.csv", index=False)

In [None]:
data_tr = pd.read_csv("/content/drive/MyDrive/Portofolio/Zero-Shot Classification for Tweeter/translated_texts.csv")

Model used for zero-shot classification: tasksource/deberta-small-long-nli
https://huggingface.co/tasksource/deberta-small-long-nli

In [None]:
from transformers import pipeline

# Initialize the zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="tasksource/deberta-small-long-nli")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/19.1k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/568M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

In [None]:
# Candidate labels: the categories the zero-shot classifier chooses between
candidate_labels = [
    "Ideology",
    "Politics",
    "Economy",
    "Social Culture",
    "Defense and Security",
    "Natural Resources",
    "Geography",
    "Demographics"
]

In [None]:
def classify_text(text):
    """Return the best-matching candidate label for ``text``.

    Args:
        text: the (translated) text to classify.

    Returns:
        The first entry of the pipeline's ``labels`` list (the highest-scoring
        label), or ``None`` when ``text`` is not a string or contains only
        whitespace.
    """
    # Missing values (e.g. NaN for an empty CSV cell) and blank strings cannot
    # be classified; skip them instead of letting the pipeline raise mid-run.
    if not isinstance(text, str) or not text.strip():
        return None
    result = classifier(text, candidate_labels)
    # Take the label with the highest score
    return result['labels'][0]

In [None]:
# Only the first 10 rows are classified (head(10)); the assignment leaves
# Classification_Label missing for every other row of data_tr.
# progress_apply is available because tqdm.pandas() was called in an earlier cell.
data_tr['Classification_Label'] = data_tr['Translated_Text'].head(10).progress_apply(classify_text)

100%|██████████| 10/10 [00:36<00:00,  3.61s/it]


In [None]:
data_tr.head(10)

Unnamed: 0,IDText,Text,Text_Merged,Translated_Text,Classification_Label
0,TXT0001,lu mau org prodemokrasi di negara ini bisa pun...,lu mau orang prodemokrasi di negara ini bisa p...,You want pro-democracy people in this country ...,Politics
1,TXT0002,prabowo ditanya soal hutang luar negeri dia me...,prabowo ditanya soal hutang luar negeri dia me...,"Prabowo was asked about foreign debt, he answe...",Economy
2,TXT0003,kikidaliyo ganjar pranowo itulah beliau sosok ...,kikidaliyo ganjar pranowo itulah beliau sosok ...,kikidaliyo ganjar pranowo that's the figure wh...,Social Culture
3,TXT0004,prabowo gibran yang bisa melakukan itu semua d...,prabowo gibran yang bisa melakukan itu semua d...,Prabowo Gibran who can do all that for the wel...,Social Culture
4,TXT0005,lah justru yg gak nyambung junjungan elu aomkm...,lah justru yang enggak menyambung junjungan lu...,It's actually the one who doesn't connect with...,Economy
5,TXT0006,nelayan tak bisa terus terjebak dalam kredit m...,nelayan tak bisa terus terjebak dalam kredit m...,Fishermen cannot continue to be trapped in bad...,Natural Resources
6,TXT0007,prabowo anti kebebasan pers,prabowo anti kebebasan pers,Prabowo is against press freedom,Politics
7,TXT0008,kontrak terbuka ganjar pranowo adalah cermin d...,kontrak terbuka ganjar pranowo adalah cermin d...,Ganjar Pranowo's open contract is a reflection...,Politics
8,TXT0009,ganjar mahfud hebat pak ganjar selalu menjungj...,ganjar mahfud hebat pak ganjar selalu menjungj...,"Ganjar Mahfud is great, Mr. Ganjar always upho...",Social Culture
9,TXT0010,ganjarmahfud adalah harapan baru untuk pertumb...,ganjarmahfud adalah harapan baru untuk pertumb...,ganjarmahfud is a new hope for our economic gr...,Economy
