In [None]:

import csv
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Download the CSV file
!wget -O dataset.csv https://raw.githubusercontent.com/okasugiarta/SATRIA-DATA/main/data/dataset_penyisihan_bdc_2024.csv


# Open the file and read the first few lines to inspect the file
with open('dataset.csv', 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    print("Inspecting the first few lines of the file:")
    for i in range(5):
        print(next(csvreader))


try:
    # Try reading the file with default delimiter
    df = pd.read_csv('dataset.csv')
except pd.errors.ParserError:
    # If there's a parsing error, try a different delimiter or read without header
    df = pd.read_csv('dataset.csv', delimiter=';')  # Adjust delimiter if needed
    print("\nFile read with an alternative delimiter.\n")

print("\nFirst line (header) looks like this:")
print(df.columns)

print("\nEach data point looks like this:")
print(df.head())


--2024-06-16 03:34:02--  https://raw.githubusercontent.com/okasugiarta/SATRIA-DATA/main/data/dataset_penyisihan_bdc_2024.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1435985 (1.4M) [text/plain]
Saving to: ‘dataset.csv’


2024-06-16 03:34:02 (20.9 MB/s) - ‘dataset.csv’ saved [1435985/1435985]

Inspecting the first few lines of the file:
['text;label']
['Kunjungan Prabowo ini untuk meresmikan dan menyerahkan proyek bantuan air bersih di lima titik. #IndonesiaSentris #IndonesiaHijau #02Melanjutkan #AnakMudaIndonesiaEmas Prabowo Subianto;Sumber Daya Alam']
['RT Anies dapat tepuk tangan meriah saat jadi Rektor mewajibkan mata kuliah anti-korupsi untuk memutus mata rantai korupsi.. #AminMiskinkanKoruptor https://t.co/hgXsoQGaEa [RE ekowboy2];Politik']
['@CI

In [None]:

# parse_data_from_file
def parse_data_from_file(filename, delimiter=';'):
    """
    Extracts sentences and labels from a CSV file with a specified delimiter

    The first row is treated as a header and skipped. Blank lines are ignored.
    The file is decoded as UTF-8.

    Args:
        filename (string): path to the CSV file
        delimiter (string): delimiter used in the CSV file

    Returns:
        sentences, labels (list of string, list of string): tuple containing lists of sentences and labels

    Raises:
        IndexError: if a non-empty row has fewer than two columns
    """
    sentences = []
    labels = []
    # newline='' lets the csv module do its own line-ending handling; the
    # explicit encoding avoids depending on the platform's default locale.
    with open(filename, 'r', encoding='utf-8', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=delimiter)
        next(reader, None)  # Skip the header row
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line (e.g. a trailing
                # newline at the end of the file); there is nothing to extract.
                continue
            sentences.append(row[0])  # First column
            labels.append(row[1])     # Second column
    return sentences, labels


In [None]:
# grader-required-cell

# Test your function

# With original dataset. The download cell saves it as 'dataset.csv', so that is
# the only file name guaranteed to exist after a fresh "Run All".
sentences, labels = parse_data_from_file("dataset.csv")

print("ORIGINAL DATASET:\n")
print(f"There are {len(sentences)} sentences in the dataset.\n")
# No cleaning has been applied at this point, so this is the raw word count.
print(f"First sentence has {len(sentences[0].split())} words.\n")
print(f"There are {len(labels)} labels in the dataset.\n")
print(f"The first 10 labels are {labels[:10]}\n\n")

# With a miniature version of the dataset that contains only first 5 rows.
# Slicing the parsed lists gives exactly those rows without needing a second file.
mini_sentences, mini_labels = sentences[:5], labels[:5]

print("MINIATURE DATASET:\n")
print(f"There are {len(mini_sentences)} sentences in the miniature dataset.\n")
print(f"First sentence has {len(mini_sentences[0].split())} words.\n")
print(f"There are {len(mini_labels)} labels in the miniature dataset.\n")
print(f"The labels are {mini_labels}")

ORIGINAL DATASET:

There are 5000 sentences in the dataset.

First sentence has 20 words (after removing stopwords).

There are 5000 labels in the dataset.

The first 10 labels are ['Sumber Daya Alam', 'Politik', 'Demografi', 'Politik', 'Politik', 'Politik', 'Pertahanan dan Keamanan', 'Politik', 'Politik', 'Politik']


MINIATURE DATASET:

There are 5000 sentences in the miniature dataset.

First sentence has 20 words (after removing stopwords).

There are 5000 labels in the miniature dataset.

The first 10 labels are ['Sumber Daya Alam', 'Politik', 'Demografi', 'Politik', 'Politik', 'Politik', 'Pertahanan dan Keamanan', 'Politik', 'Politik', 'Politik']


In [None]:
# Install NLTK and download stopwords
!pip install nltk

import nltk
import re
import pandas as pd
from nltk.corpus import stopwords

# Download stopwords
nltk.download('stopwords')

# Define the text cleaning function
# Stopword sets already loaded, keyed by language. Loading the NLTK corpus once
# per language instead of once per call matters when clean_text is applied to
# every row of a DataFrame.
STOPWORD_CACHE = {}


def clean_text(text, language='indonesian'):
    """
    Clean text by removing stopwords, special characters, punctuation, mentions, hashtags,
    URLs and very short words

    Args:
        text (string): input text to be cleaned; any non-string value
            (e.g. None or NaN from a missing cell) is treated as empty text
        language (string): language of the stopwords

    Returns:
        string: cleaned text, words separated by single spaces
    """
    if not isinstance(text, str):
        return ''

    stop_words = STOPWORD_CACHE.get(language)
    if stop_words is None:
        stop_words = frozenset(stopwords.words(language))
        STOPWORD_CACHE[language] = stop_words

    # Remove mentions, hashtags, and URLs.
    # A mention is everything from '@' up to the next whitespace: the handles in
    # this dataset are anonymised into strings containing '/', '+' and '=', and
    # a word-character-only pattern would leave part of them behind as junk tokens.
    text = re.sub(r'(@\S+)|(#\w+)|(\bhttps?://\S+)', '', text)
    # Remove non-standard characters and punctuation, while preserving spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Keep words that are not stopwords (case-insensitive) and longer than 2 characters
    filtered_words = [
        word for word in text.split()
        if word.lower() not in stop_words and len(word) > 2
    ]

    return ' '.join(filtered_words)


# Show the leading rows of the raw frame before any cleaning is applied
raw_preview = df.head()
print(raw_preview)

# Run clean_text over every entry of the 'text' column and keep the result
# next to the original in a new 'cleaned_text' column
df['cleaned_text'] = df['text'].map(clean_text)

# Show raw and cleaned text side by side for the leading rows
print(df.loc[:, ['text', 'cleaned_text']].head())




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


                                                text             label
0  Kunjungan Prabowo ini untuk meresmikan dan men...  Sumber Daya Alam
1  RT Anies dapat tepuk tangan meriah saat jadi R...           Politik
2  @CIqXqwGAT04tMtx4OCATxjoVq7vv/Y8HeYaIOgMFg8Y= ...         Demografi
3  RT @L3R8XFBw3WGbxRPSj0/0hHZTbqVGX7qtfwRg9zmhK7...           Politik
4  Anies Baswedan Harap ASN termasuk TNI dan Polr...           Politik
                                                text  \
0  Kunjungan Prabowo ini untuk meresmikan dan men...   
1  RT Anies dapat tepuk tangan meriah saat jadi R...   
2  @CIqXqwGAT04tMtx4OCATxjoVq7vv/Y8HeYaIOgMFg8Y= ...   
3  RT @L3R8XFBw3WGbxRPSj0/0hHZTbqVGX7qtfwRg9zmhK7...   
4  Anies Baswedan Harap ASN termasuk TNI dan Polr...   

                                        cleaned_text  
0  Kunjungan Prabowo meresmikan menyerahkan proye...  
1  Anies tepuk tangan meriah Rektor mewajibkan ma...  
2  YHeYaIOgMFgY emng bener sih pendukung goblok p...  
3  hHZTbqVGXqtfwR

Gunakan Tokenizer dari Keras untuk mengubah teks menjadi fitur numerik.

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary size kept by the tokenizer, and the fixed length every sequence is padded/truncated to
max_features = 5000
max_len = 100

# Step 2: build the vocabulary from the cleaned text, then turn each text into
# a padded sequence of word indices
# NOTE(review): the tokenizer is fitted on all rows, before the train/test split — confirm this is intended
cleaned_texts = df['cleaned_text']
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(cleaned_texts)
X = pad_sequences(tokenizer.texts_to_sequences(cleaned_texts), maxlen=max_len)

# Targets come from the 'label' column
y = df['label']


# Melatih Model dengan TensorFlow
Pilih model pembelajaran mesin yang sesuai, seperti LSTM atau model sederhana dengan Dense layers.

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling are repeatable between runs (the split below is already seeded).
tf.keras.utils.set_random_seed(42)

# Encode the labels for multi-class (assuming y contains categorical labels)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Step 3: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Determine the number of classes in your dataset
num_classes = len(label_encoder.classes_)

# Step 4: Define the model for multi-class classification
model = Sequential()
model.add(Embedding(max_features, 128, input_length=max_len))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='softmax'))  # Use softmax for multi-class

# The targets are integer class ids (LabelEncoder output), hence the sparse loss
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 5: Train the model
# NOTE(review): the held-out test split doubles as validation data here, so
# val_accuracy is not an untouched test estimate.
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), verbose=2)


Epoch 1/5
63/63 - 26s - loss: 1.4302 - accuracy: 0.5870 - val_loss: 1.3456 - val_accuracy: 0.5860 - 26s/epoch - 416ms/step
Epoch 2/5
63/63 - 24s - loss: 1.1478 - accuracy: 0.6400 - val_loss: 1.0203 - val_accuracy: 0.7170 - 24s/epoch - 382ms/step
Epoch 3/5
63/63 - 22s - loss: 0.7482 - accuracy: 0.7853 - val_loss: 0.9074 - val_accuracy: 0.7250 - 22s/epoch - 348ms/step
Epoch 4/5
63/63 - 21s - loss: 0.5077 - accuracy: 0.8508 - val_loss: 0.9148 - val_accuracy: 0.7550 - 21s/epoch - 339ms/step
Epoch 5/5
63/63 - 21s - loss: 0.3615 - accuracy: 0.8932 - val_loss: 0.9698 - val_accuracy: 0.7520 - 21s/epoch - 332ms/step



# Download the unlabeled dataset from GitHub

In [None]:
# Download the unlabeled dataset from GitHub and save it in the working
# directory as 'dataset_unlabeled_penyisihan_bdc_2024.csv'
!wget -O dataset_unlabeled_penyisihan_bdc_2024.csv https://raw.githubusercontent.com/okasugiarta/SATRIA-DATA/main/data/dataset_unlabeled_penyisihan_bdc_2024.csv


--2024-06-16 04:03:39--  https://raw.githubusercontent.com/okasugiarta/SATRIA-DATA/main/data/dataset_unlabeled_penyisihan_bdc_2024.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 309666 (302K) [text/plain]
Saving to: ‘dataset_unlabeled_penyisihan_bdc_2024.csv’


2024-06-16 04:03:39 (6.19 MB/s) - ‘dataset_unlabeled_penyisihan_bdc_2024.csv’ saved [309666/309666]



# Prediksi pada Unlabeled Dataset
Gunakan model yang telah dilatih untuk memprediksi label pada unlabeled dataset.

In [None]:
import numpy as np

# This cell deliberately reuses objects created by the earlier cells:
#   clean_text        - so unlabeled text is cleaned exactly like the training text
#   tokenizer/max_len - so words map to the same indices the model was trained on
#   model             - the trained classifier
#   label_encoder     - to turn class indices back into label names
# Fitting a fresh Tokenizer on the unlabeled text would assign different indices
# to the words, and the embedding layer would then look up unrelated vectors.

# Load the unlabeled data, which has the columns IDText and Text
unlabeled_df = pd.read_csv('dataset_unlabeled_penyisihan_bdc_2024.csv', delimiter=';')

# Clean the text in the Text column
unlabeled_df['cleaned_text'] = unlabeled_df['Text'].apply(clean_text)

# Convert to index sequences with the already-fitted tokenizer and pad them
X_unlabeled = tokenizer.texts_to_sequences(unlabeled_df['cleaned_text'])
X_unlabeled = pad_sequences(X_unlabeled, maxlen=max_len)

# Predict class probabilities for the unlabeled data with the trained model
predicted_labels = model.predict(X_unlabeled)

# Pick the index with the highest probability and map it back to its label name
predicted_indices = np.argmax(predicted_labels, axis=1)
predicted_classes = label_encoder.inverse_transform(predicted_indices)

# Store the predictions in the DataFrame
unlabeled_df['predicted_label'] = predicted_classes

# Show the predictions
print(unlabeled_df[['IDText', 'Text', 'predicted_label']])

      IDText                                               Text  \
0    TXT0001  Lu mau org2 pro-demokrasi di negara ini bisa p...   
1    TXT0002  Prabowo ditanya soal hutang luar negeri dia me...   
2    TXT0003  kiki_daliyo  Ganjar Pranowo itulah beliau soso...   
3    TXT0004  @kumparan Prabowo Gibran yang bisa melakukan i...   
4    TXT0005  @sniperruben45 @uda_zulhendra @ainunnajib Lah ...   
..       ...                                                ...   
995  TXT0996  Bikin bangga deh, Ganjar-Mahfud mau alokasikan...   
996  TXT0997  Pak Jokowi sebelum pilpres 2024 berbesar hati ...   
997  TXT0998  @datuakrajoangek Sbaiknya si gemot nga usah ik...   
998  TXT0999  kebiasaan merembuk atau bermusyawarah jadi gay...   
999  TXT1000  Mirage Ditolak Juwono, Dibeli Prabowo, Jubir T...   

     predicted_label  
0                  4  
1                  5  
2                  5  
3                  5  
4                  5  
..               ...  
995                5  
996        

In [None]:
from sklearn.preprocessing import LabelEncoder

# Load the labeled dataset. The download cell saves it as 'dataset.csv', which is
# the only copy guaranteed to exist after a fresh "Run All".
df_train = pd.read_csv('dataset.csv', delimiter=';')

# Encode labels in the training dataset
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(df_train['label'])

# `unlabeled_df['predicted_label']` comes from the prediction cell. Depending on
# how that cell was run it holds either integer class ids or already-decoded
# label names; only integer ids can be passed to inverse_transform.
predictions = unlabeled_df['predicted_label']
if predictions.dtype.kind in ('i', 'u'):
    # Decode predicted class ids back to original text labels
    predicted_labels_text = label_encoder.inverse_transform(predictions)
else:
    # Already text labels: use them as they are
    predicted_labels_text = predictions.to_numpy()

# Insert the predicted labels into the DataFrame
unlabeled_df['predicted_label_text'] = predicted_labels_text

# Display or save the relevant columns
print(unlabeled_df[['IDText', 'Text', 'predicted_label_text']])


      IDText                                               Text  \
0    TXT0001  Lu mau org2 pro-demokrasi di negara ini bisa p...   
1    TXT0002  Prabowo ditanya soal hutang luar negeri dia me...   
2    TXT0003  kiki_daliyo  Ganjar Pranowo itulah beliau soso...   
3    TXT0004  @kumparan Prabowo Gibran yang bisa melakukan i...   
4    TXT0005  @sniperruben45 @uda_zulhendra @ainunnajib Lah ...   
..       ...                                                ...   
995  TXT0996  Bikin bangga deh, Ganjar-Mahfud mau alokasikan...   
996  TXT0997  Pak Jokowi sebelum pilpres 2024 berbesar hati ...   
997  TXT0998  @datuakrajoangek Sbaiknya si gemot nga usah ik...   
998  TXT0999  kebiasaan merembuk atau bermusyawarah jadi gay...   
999  TXT1000  Mirage Ditolak Juwono, Dibeli Prabowo, Jubir T...   

        predicted_label_text  
0    Pertahanan dan Keamanan  
1                    Politik  
2                    Politik  
3                    Politik  
4                    Politik  
..       

# SAVE TO CSV

In [None]:
# File name of the CSV file the results are written to
output_file = 'SD2024040000423.csv'

# Write every column of unlabeled_df as comma-separated values, without the index column
unlabeled_df.to_csv(output_file, sep=',', index=False)