In [100]:
# Import dependencies
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import pandas as pd
import joblib

In [101]:
# Load Data

data = pd.read_csv('question_list.csv')
print(data)

                                          Question List
0                            completionHistory_question
1                                   "Internet mati nih"
2                                "Harga STB berapa ya?"
3                         "Internet saya putus sambung"
4     "Mengapa saya tidak dapat mengisi formulir rel...
...                                                 ...
1995                              "harga modem berapa?"
1996                  "kapan mybiznet ada di appstore?"
1997  "Saya tidak bisa melakukan pembayaran melalui ...
1998            "untuk promo biznet home ada apa saja?"
1999  "Min internet saya putus sambung dari sore tad...

[2000 rows x 1 columns]


In [102]:
# Overview Data

data.describe()

Unnamed: 0,Question List
count,2000
unique,1761
top,"""mau beli quota"""
freq,16


In [103]:
# Cek data yang duplikat

data.duplicated(subset='Question List').sum()

239

In [104]:
# Hapus data duplikat

data_unique = data.drop_duplicates(subset='Question List').reset_index(drop=True)
data_unique.describe()

Unnamed: 0,Question List
count,1761
unique,1761
top,completionHistory_question
freq,1


In [105]:
# Cek data

data_unique.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1761 entries, 0 to 1760
Data columns (total 1 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Question List  1761 non-null   object
dtypes: object(1)
memory usage: 13.9+ KB


In [106]:
# Hapus kolom yang tidak berupa pesan

data_filter = data_unique[data_unique['Question List'].str.contains('"')]
data_filter

Unnamed: 0,Question List
1,"""Internet mati nih"""
2,"""Harga STB berapa ya?"""
3,"""Internet saya putus sambung"""
4,"""Mengapa saya tidak dapat mengisi formulir rel..."
5,"""mengapa ketika ingin melakukan pembayaran tid..."
...,...
1756,"""harga modem berapa?"""
1757,"""kapan mybiznet ada di appstore?"""
1758,"""Saya tidak bisa melakukan pembayaran melalui ..."
1759,"""untuk promo biznet home ada apa saja?"""


In [107]:
# Data Labeling secara manual

# Ganti nama kolom
data_filter['question_list'] = data_filter['Question List'].str.replace('"', '', regex=False)

# Fungsi data labeling
def auto_label(text):
    text = text.lower()
    
    info_keywords = ['harga', 'biaya', 'bayar', 'pembayaran', 'promo', 'paket apa', 'tagihan', 'produk', 'registrasi', 'cara', 'informasi', 'info', 'change']
    request_keywords = ['mohon', 'tolong', 'minta', 'ganti', 'kirim', 'relokasi', 'terminasi', 'reset', 'cek area', 'instalasi', 'butuh', 'ingin', 'request', 'bisa dibantu']
    problem_keywords = ['lemot', 'lambat', 'putus', 'tidak bisa', 'masalah', 'error', 'modem', 'router', 'kuota', 'mati', 'gangguan', 'trouble', 'los']

    if any(kw in text for kw in info_keywords):
        return 'Information'
    elif any(kw in text for kw in request_keywords):
        return 'Request'
    elif any(kw in text for kw in problem_keywords):
        return 'Problem'
    else:
        return 'Unknown'

# Terapkan label
data_filter['label'] = data_filter['question_list'].apply(auto_label)

# Simpan ke file baru
output_path = "dataset_labeled.csv"
data_filter[['question_list', 'label']].to_csv(output_path, index=False)

output_path

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filter['question_list'] = data_filter['Question List'].str.replace('"', '', regex=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filter['label'] = data_filter['question_list'].apply(auto_label)


'dataset_labeled.csv'

In [108]:
# Cek jumlah data berdasarkan labelnya

data_labeled = pd.read_csv('dataset_labeled.csv')
label_counts_clean = data_labeled['label'].value_counts()
label_counts_clean

Unknown        721
Information    627
Problem        247
Request        165
Name: label, dtype: int64

In [109]:
# Hapus data dengan label Unknown
data_clean = data_labeled[data_labeled['label'] != 'Unknown'].copy()

# Reset indexnya supaya rapi
data_clean.reset_index(drop=True, inplace=True)

# Simpan file tanpa Unknown
output_path_clean = "dataset_labeled_clean.csv"
data_clean.to_csv(output_path_clean, index=False)

# Cek jumlah data berdasarkan labelnya
label_counts_clean = data_clean['label'].value_counts()
label_counts_clean

Information    627
Problem        247
Request        165
Name: label, dtype: int64

In [110]:
# Pembobotan data dengan TF-IDF dan stopwords

# Siapkan fitur dan label
X = data_clean['question_list']
y = data_clean['label']

# Inisialisasi dan transformasi TF-IDF dengan stopwords
stop_words = ['yang', 'dan', 'di', 'ke', 'untuk', 'saya', 'apa', 'itu']
vectorizer = TfidfVectorizer(stop_words=stop_words)
X_vec = vectorizer.fit_transform(X)

# Tampilkan bentuk hasil vektorisasi
X_vec.shape

(1039, 1026)

In [111]:
# Cek kamus kata dalam data

vocab_list = vectorizer.get_feature_names_out()
vocab_preview = vocab_list[:900]

vocab_preview, len(vocab_list)

(array(['000', '0d', '0ds', '0dw', '10', '1000', '10003', '1000308564',
        '10mbps', '12', '15', '150', '150mbps', '1d', '1dn', '1ds', '1dw',
        '1g', '2024', '2025', '25', '2d', '2dn', '2ds', '2dw', '30', '3d',
        '3dn', '3ds', '3dw', '45', '4ghz', '525', '5g', '5ghz', '6g',
        'abis', 'account', 'ada', 'adanya', 'adaptor', 'add', 'address',
        'adik', 'aga', 'agar', 'agent', 'aja', 'ajaa', 'ajuin', 'ajukan',
        'akan', 'akhir', 'akibat', 'aklau', 'akses', 'aktif', 'aktifnya',
        'aktit', 'aktivasi', 'aku', 'akun', 'akunnya', 'alamat', 'alatnya',
        'alkun', 'aman', 'ambah', 'ambil', 'an', 'analyzer', 'anda',
        'another', 'apaan', 'apabila', 'apakah', 'aplikasi', 'app', 'apps',
        'area', 'ask', 'assisten', 'atas', 'atau', 'atik', 'auto',
        'autodebet', 'available', 'awal', 'ayodance', 'bagai',
        'bagaiamana', 'bagaimana', 'bagainmana', 'bagiamana', 'bagimana',
        'baik', 'baiknya', 'bakal', 'bali', 'bandara', 'bandun

In [112]:
# Inisialisasi StratifiedKFold (agar pendistribusian merata dan memiliki 5 fold)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Loop dan simpan index train/test tiap fold
folds_info = []
for fold, (train_idx, test_idx) in enumerate(skf.split(X_vec, y), 1):
    folds_info.append({
        'Fold': fold,
        'Train size': len(train_idx),
        'Test size': len(test_idx),
        'Label distribusi train': y.iloc[train_idx].value_counts().to_dict(),
        'Label distribusi test': y.iloc[test_idx].value_counts().to_dict()
    })

folds_info

[{'Fold': 1,
  'Train size': 831,
  'Test size': 208,
  'Label distribusi train': {'Information': 502,
   'Problem': 197,
   'Request': 132},
  'Label distribusi test': {'Information': 125, 'Problem': 50, 'Request': 33}},
 {'Fold': 2,
  'Train size': 831,
  'Test size': 208,
  'Label distribusi train': {'Information': 502,
   'Problem': 197,
   'Request': 132},
  'Label distribusi test': {'Information': 125, 'Problem': 50, 'Request': 33}},
 {'Fold': 3,
  'Train size': 831,
  'Test size': 208,
  'Label distribusi train': {'Information': 501,
   'Problem': 198,
   'Request': 132},
  'Label distribusi test': {'Information': 126, 'Problem': 49, 'Request': 33}},
 {'Fold': 4,
  'Train size': 831,
  'Test size': 208,
  'Label distribusi train': {'Information': 501,
   'Problem': 198,
   'Request': 132},
  'Label distribusi test': {'Information': 126, 'Problem': 49, 'Request': 33}},
 {'Fold': 5,
  'Train size': 832,
  'Test size': 207,
  'Label distribusi train': {'Information': 502,
   'Probl

In [113]:
# Latih model dan evaluasi tiap fold
results = []
for fold, (train_idx, test_idx) in enumerate(skf.split(X_vec, y), 1):
    X_train, X_test = X_vec[train_idx], X_vec[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    model = LinearSVC()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    results.append({
        'Fold': fold,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision (macro)': precision_score(y_test, y_pred, average='macro'),
        'Recall (macro)': recall_score(y_test, y_pred, average='macro')
    })

# Konversi ke DataFrame
results_final = pd.DataFrame(results)

# Tambahkan baris average
average_row = {
    'Fold': 'Average',
    'Accuracy': results_final['Accuracy'].mean(),
    'Precision (macro)': results_final['Precision (macro)'].mean(),
    'Recall (macro)': results_final['Recall (macro)'].mean()
}

results_final = pd.concat([results_final, pd.DataFrame([average_row])], ignore_index=True)

results_final


Unnamed: 0,Fold,Accuracy,Precision (macro),Recall (macro)
0,1,0.966346,0.963845,0.940162
1,2,0.9375,0.919865,0.920162
2,3,0.932692,0.910503,0.878376
3,4,0.942308,0.916315,0.919364
4,5,0.937198,0.92021,0.904183
5,Average,0.943209,0.926148,0.912449


In [114]:
# Buat pipeline yang gabungkan TF-IDF dan model klasifikasi
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', LinearSVC())
])

# Latih pipeline
text_clf.fit(X, y)

# Simpan pipeline ke file tunggal
joblib.dump(text_clf, 'text_classifier_pipeline.pkl')

['text_classifier_pipeline.pkl']