## Masukan library yang digunakan

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [2]:
import nltk
nltk.download('stopwords')

#library untuk text analasis

[nltk_data] Downloading package stopwords to C:\Users\Alifia
[nltk_data]     Nurhasanah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## load dataset

In [3]:
#memanggil dataset sms_spam_new. dan akan muncul 5 baris awal dari dataset
data = pd.read_csv('dataset_sms_new.csv')
data.head()
#terdapat teks dan label. di mana label terdiri dari 3 jenis yaitu 2 = sms promo, 1 = sms penipu, 0 = sms biasa/normal

Unnamed: 0,teks,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2


## Text Preprocessing

Tahap preprocessing data berguna untuk membersihkan dataset sebelum nantinya diolah

### 1. Case Folding

Case folding: tahap pembersihan data. Contohnya menghilangkan url, angka, tanda baca, dan merubah menjadi huruf kecil

In [4]:
import re

# membuat fungsi untuk case folding
def casefolding(text):
    text = str(text)
    text = text.lower()                                   #fungsi merubah teks menjadi huruf kecil                           # Merubah kalimat menjadi huruf kecil
    text = re.sub(r'https?://\s+|www.\.\s+', '', text)    #fungsi menghapus url dari teks
    text = re.sub(r'[-+]?[0-9]+', '', text)               #fungsi menghapus angka dari teks
    text = re.sub(r'[^\w\s]', '', text)                   #fungsi menghapus simbol tanda baca dari teks
    text = text.strip()
    return text

In [5]:
# Membandingkan before after case folding
raw_sample = data['teks'].iloc[2]           #mengambil dari dataset kolom teks dan baris ke-2
case_folding = casefolding(raw_sample)

print('Raw data\t : ', raw_sample)          #memanggil data kondisi awal
print('case Folding\t :', case_folding)     #memanggil data case folding

Raw data	 :  2016-07-08 11:47:11.Plg Yth, sisa kuota Flash Anda 478KB. Download MyTelkomsel apps di http://tsel.me/tsel utk cek kuota&beli paket Flash atau hub *363#
case Folding	 : plg yth sisa kuota flash anda kb download mytelkomsel apps di httptselmetsel utk cek kuotabeli paket flash atau hub


Data setelah di-case folding, angka dan link akan hilang. Teks pun menjadi huruf kecil semua.

### 2. Word Normalization

Word Normalization: Membuat konversi untuk menormalisasi data. Word Normalization diambil dari data key_norm yang berisi kata yang disingkat menjadi teks normal untuk merapihkan pada dataset

In [6]:
key_norm = pd.read_csv('key_norm.csv')      #fungsi panggil data key_norm untuk melakukan konfersi

def text_normalize(text):                   #fungsi untuk normalize teks
    text = ' '.join([key_norm[key_norm['singkat'] == word]['hasil'].values[0]   #mengkonfersi kata singkat ke hasil
                     if (key_norm['singkat'] == word).any()                     #membuat kondisi melakukan perubahan jika kata dalam dataset singkat akan menjadi hasil
                     else word for word in text.split()
                     ])
    
    text = str.lower(text)
    return text


In [7]:
# Membandingkan before after word normalization

raw_data = data['teks'].iloc[696]           #raw_data = dataset yang belum dinormalisasi
word_normal = text_normalize(case_folding)  #world_nomal = dataset yang sudah di-casefolding sebelumnya. Jadi sebelum data dinormalisasi, harus di-case folding terlebih dahulu

print('raw_data\t :', raw_data)
print('word Normalize\t :', word_normal)

raw_data	 : Ngenet hemat anti lelet pakai IM3, khusus kamu 1GB Cuma SECENG. Yes! 1rb ajahh berlaku 1 hari, mau? Balas SMS ini ketik SECENG
word Normalize	 : pelanggan yang terhormat sisa kuota flash anda kb download mytelkomsel apps di httptselmetsel untuk cek kuotabeli paket flash atau hub


Raw data akan muncul dari data kolom 696
Sedangkan data yang sudah dinormalisasi akan berubah sesuai dengan dataset dari key_norm.csv. Seperti plg yth menjadi pelanggan yang terhormat.

### 3. Filtering (Stopword Removal)

In [8]:
from nltk.tokenize import sent_tokenize, word_tokenize #panggil library nltk
from nltk.corpus import stopwords

stopwords_ind = stopwords.words('indonesian')           #gunakan stopwords indonesia

In [9]:
len(stopwords_ind)

758

In [10]:
# melihat daftar stopword dari nltk
stopwords_ind

['ada',
 'adalah',
 'adanya',
 'adapun',
 'agak',
 'agaknya',
 'agar',
 'akan',
 'akankah',
 'akhir',
 'akhiri',
 'akhirnya',
 'aku',
 'akulah',
 'amat',
 'amatlah',
 'anda',
 'andalah',
 'antar',
 'antara',
 'antaranya',
 'apa',
 'apaan',
 'apabila',
 'apakah',
 'apalagi',
 'apatah',
 'artinya',
 'asal',
 'asalkan',
 'atas',
 'atau',
 'ataukah',
 'ataupun',
 'awal',
 'awalnya',
 'bagai',
 'bagaikan',
 'bagaimana',
 'bagaimanakah',
 'bagaimanapun',
 'bagi',
 'bagian',
 'bahkan',
 'bahwa',
 'bahwasanya',
 'baik',
 'bakal',
 'bakalan',
 'balik',
 'banyak',
 'bapak',
 'baru',
 'bawah',
 'beberapa',
 'begini',
 'beginian',
 'beginikah',
 'beginilah',
 'begitu',
 'begitukah',
 'begitulah',
 'begitupun',
 'bekerja',
 'belakang',
 'belakangan',
 'belum',
 'belumlah',
 'benar',
 'benarkah',
 'benarlah',
 'berada',
 'berakhir',
 'berakhirlah',
 'berakhirnya',
 'berapa',
 'berapakah',
 'berapalah',
 'berapapun',
 'berarti',
 'berawal',
 'berbagai',
 'berdatangan',
 'beri',
 'berikan',
 'berikut'

In [11]:
# Membuat fungsi stowprd removal

# kita juga dapat menambahkan kata baru dalam stopword
more_stopword = ['tsel', 'gb', 'rb', 'btw']
stopwords_ind = stopwords_ind + more_stopword

# fungsi untuk stopword removalnya
def remove_stop_word(text):
    clean_words = []            #kasih parameter clean words
    text = text.split()
    for word in text:
        if word not in stopwords_ind:
            clean_words.append(word)
    return " ".join(clean_words)

In [12]:
#membandingkan before dan after dari raw sample, case folding, dan stopword removal

raw_sample = data['teks'].iloc[696]
case_folding = casefolding(raw_sample)
stopword_removal = remove_stop_word(case_folding)
#mengambil data 696 lalu data melalui tahap case folding setelah itu ke tahap stopword removal

print('raw data\t\t :', raw_data)
print('case folding \t\t :', case_folding)
print('stopword removal \t :', stopword_removal)

raw data		 : Ngenet hemat anti lelet pakai IM3, khusus kamu 1GB Cuma SECENG. Yes! 1rb ajahh berlaku 1 hari, mau? Balas SMS ini ketik SECENG
case folding 		 : ngenet hemat anti lelet pakai im khusus kamu gb cuma seceng yes rb ajahh berlaku  hari mau balas sms ini ketik seceng
stopword removal 	 : ngenet hemat anti lelet pakai im khusus seceng yes ajahh berlaku balas sms ketik seceng


Raw data (data mentah) setelah melalui tahap case folding akan terdapat beberapa perubahan (huruf menjadi kecil semua). Setelah melalui tahap stopword removal, akan ada beberapa kata yang terhapus (kata btw) berdasarkan library

### 4. Stemming

Pastikan sudah install library sastrawi untuk melakukan analisis terhadap dokumen yang dimiliki.

In [13]:
!pip -q install sastrawi

In [14]:
# Merubah kata menjadi kata dasar
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory  #memanggil library sastrawi
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Membuat fungsi untung stemming bahasa indonesia
def stemming(text):
    text = stemmer.stem(text)
    return text

In [15]:
#membandingkan before dan after dari raw sample, case folding, stopword removal, dan stemming

raw_sample = data['teks'].iloc[696]
case_folding = casefolding(raw_sample)
stopword_removal = remove_stop_word(case_folding)
text_stemming = stemming(stopword_removal)

print('Raw Data \t\t :', raw_sample)
print('Case Folding \t\t :', case_folding)
print('Stopword Removal \t :', stopword_removal)
print('Stemming \t\t :', text_stemming)

Raw Data 		 : Ngenet hemat anti lelet pakai IM3, khusus kamu 1GB Cuma SECENG. Yes! 1rb ajahh berlaku 1 hari, mau? Balas SMS ini ketik SECENG
Case Folding 		 : ngenet hemat anti lelet pakai im khusus kamu gb cuma seceng yes rb ajahh berlaku  hari mau balas sms ini ketik seceng
Stopword Removal 	 : ngenet hemat anti lelet pakai im khusus seceng yes ajahh berlaku balas sms ketik seceng
Stemming 		 : ngenet hemat anti lelet pakai im khusus seceng yes ajahh laku balas sms ketik seceng


Hasil case folding (kalimat menjadi huruf kecil), hasil stopword removal (beberapa teks ada yang terhapus seperti btw), hasil stemming (melakukan menjadi kata dasar)

## Text Preprocessing Pipeline

Melihat hasil dari keempat tahap preprocessing. Kita akan membuat fungsi tahap preprocessing

In [16]:
#membuat fungsi untuk menggabungkan seluruh langkah text preprocessing
def text_preprocessing_process(text):
    text = casefolding(text)        #panggil setiap tahap 
    text = text_normalize(text)
    text = remove_stop_word(text)
    text = stemming(text)
    return text 

In [25]:
#mengeksekusi tiap tahap preprocessing ke dalam dataset yang kita punya (dataset spam)
#hasilnya akan menghasilkan clean teks
#time digunakan untuk melihat berapa lama proses cleaning teksnya

%%time
data['clean_teks']= data['teks'].apply(text_preprocessing_process)

UsageError: Line magic function `%%time` not found.


In [26]:
%%time
data['clean_teks']= data['teks'].apply(text_preprocessing_process)

CPU times: total: 2min 32s
Wall time: 6min 46s


Waktu kecepatan proses cleaning tergantung laptop masing-masing. Jika sudah melakukan cleaning, kita dapat melihat kembali datanya.

In [27]:
data

Unnamed: 0,teks,label,clean_teks
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2,promo beli paket flash my telkomsel app dpt ex...
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2,rupiah ribu spesial pilih aktif promo sd novem...
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2,langgan hormat sisa kuota flash kb download my...
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2,langgan hormat sisa kuota flash kb download my...
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2,rupiah ribu spesial pilih aktif buru skb
...,...,...,...
1631,"Yooo sama2, oke nanti aku umumin di grup kelas",0,yooo oke umumin grup kelas
1632,😁 sebelumnya ga ad nulis kerudung. Kirain warn...,0,ga nulis kerudung kirain warna jins
1633,Mba mau kirim 300 ya,0,mbak kirim ya
1634,nama1 beaok bwrangkat pagi...mau cas atay tra...,0,nama beaok bwrangkat pagimau cas atay tranfer


Hasilnya akan ada teks dataset mentah dan juga dataset yang sudah melalui proses preprocessing/cleaning

In [28]:
# Simpan data yang sudah di-Preprocessing ke dalam file csv

data.to_csv('clean_data.csv')

Jika sudah, nantinya data yang sudah clean akan otomatis tersimpan di folder

## Feature Engineering

In [29]:
# Pisahkan kolom feature dan target
x = data['clean_teks']
y = data['label']

In [30]:
x
#menampilkan data cleanning

0       promo beli paket flash my telkomsel app dpt ex...
1       rupiah ribu spesial pilih aktif promo sd novem...
2       langgan hormat sisa kuota flash kb download my...
3       langgan hormat sisa kuota flash kb download my...
4                rupiah ribu spesial pilih aktif buru skb
                              ...                        
1631                           yooo oke umumin grup kelas
1632                  ga nulis kerudung kirain warna jins
1633                                        mbak kirim ya
1634        nama beaok bwrangkat pagimau cas atay tranfer
1635                                       nomor bri nama
Name: clean_teks, Length: 1636, dtype: object

In [31]:
y
#menampilkan label

0       2
1       2
2       2
3       2
4       2
       ..
1631    0
1632    0
1633    0
1634    0
1635    0
Name: label, Length: 1636, dtype: int64

##  Feature extraction (TF-IDF dan N-Gram)

In [32]:
# save model dan butuh library pickle
import pickle

#Mengambil library TF-IDF
#TF-IDF ada di feature extractionnya sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

#Unigram
vec_TF_IDF = TfidfVectorizer(ngram_range=(1,1))
vec_TF_IDF.fit(x)       #mengambil variable feature (x) untuk dibobotkan

#definisikan
x_tf_idf = vec_TF_IDF.transform(x)

#set modelnya. set feature tf idf ke SAV
pickle.dump(vec_TF_IDF.vocabulary_, open("feature_tf-idf.sav", "wb"))

Nantinya file feature_tf-idf.sav akan tersimpan yang merupakan hasil pickling dari model tf-idf

In [33]:
# menamilpilkan vocabulary dari TF-IDF
vec_TF_IDF.vocabulary_

{'promo': 3201,
 'beli': 380,
 'paket': 2944,
 'flash': 1174,
 'my': 2657,
 'telkomsel': 3921,
 'app': 195,
 'dpt': 1015,
 'extra': 1133,
 'kuota': 2219,
 'lte': 2365,
 'telpon': 3925,
 'mnthr': 2598,
 'buru': 639,
 'cek': 688,
 'tselmemytsel': 4114,
 'sk': 3685,
 'rupiah': 3462,
 'ribu': 3394,
 'spesial': 3757,
 'pilih': 3058,
 'aktif': 81,
 'sd': 3527,
 'november': 2822,
 'langgan': 2257,
 'hormat': 1513,
 'sisa': 3678,
 'kb': 2014,
 'download': 1010,
 'mytelkomsel': 2694,
 'apps': 201,
 'httptselmetsel': 1647,
 'kuotabeli': 2220,
 'hub': 1651,
 'skb': 3686,
 'ekstra': 1082,
 'pulsa': 3246,
 'dg': 898,
 'internet': 1770,
 'bulan': 622,
 'sjk': 3684,
 'augsept': 259,
 'detail': 892,
 'iring': 1796,
 'dgn': 899,
 'tarif': 3882,
 'hr': 1526,
 'panjang': 2959,
 'hits': 1500,
 'armada': 216,
 'curi': 804,
 'hati': 1449,
 'tekan': 3914,
 'okcall': 2883,
 'info': 1727,
 'eks': 1077,
 'loh': 2348,
 'internetan': 1771,
 'pakai': 2942,
 'volume': 4267,
 'ultima': 4181,
 'mbhr': 2477,
 'hrga': 

In [34]:
# Melihat Jumlah Feature
print(len(vec_TF_IDF.get_feature_names_out()))

4556


Akan ada 3253 feature dalam data clean yang sudah kita load

In [35]:
# Melihat Jumlah feature apa saja yang ada di dalam corpus
print(vec_TF_IDF.get_feature_names_out())


['aa' 'aamiiiin' 'aamiin' ... 'zjt' 'zona' 'ztkm']


In [36]:
#membuat dan melihat data tabular menggunakan tfidf. kata-kata feature akan dibuat dalam bentuk tabular
x1 = vec_TF_IDF.transform(x).toarray()
data_tabular_tf_idf =pd.DataFrame(x1,columns=vec_TF_IDF.get_feature_names_out())
data_tabular_tf_idf

Unnamed: 0,aa,aamiiiin,aamiin,ab,abadi,abai,abal,abbee,abdul,abdullah,...,yuni,yunit,yup,zahra,zalora,zara,zarkasi,zjt,zona,ztkm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1633,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
#mencari data yang ada bobotnya jika di datasetnya itu ada kata-katanya
data_tabular_tf_idf.iloc[10:20,60:70]

Unnamed: 0,airpods,aja,ajaa,ajaaa,ajabri,ajahh,ajak,ajakin,ajaliat,ajar
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature Selection

Feature Selection menggunakan c-square. Mengubah data tabular di tf-idf dari dataframe menjadi sebuah array yang dapat dijalankan pada feature selection

In [38]:
#pendeteksian array
x_train = np.array(data_tabular_tf_idf)
y_train = np.array(y)

In [39]:
#panggil library c-square
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

#ambil nilai chi tertinggi
chi2_features = SelectKBest(chi2, k=3000)
x_kbest_features = chi2_features.fit_transform(x_train, y_train)

#untuk reduced featured
print('Original Feature Number', x_train.shape[1])
print('Reduced Feature Number', x_kbest_features.shape[1])

Original Feature Number 4556
Reduced Feature Number 3000


Feature sebelumnya 3253 dan setelah di-reduced menggunakan nilai chi yang paling bagus maka berubah menjadi 3000

Selanjutnya melihat chi2 feature akan coba score merupakan chi yang paling tinggi. Jadi semakin tinggi nilai chinya maka semakin baiik featurenya.

In [40]:
Data = pd.DataFrame(chi2_features.scores_,columns=['Nilai'])
Data

Unnamed: 0,Nilai
0,50.365104
1,0.661634
2,2.868013
3,1.116807
4,1.234403
...,...
4551,0.343228
4552,0.804937
4553,1.116807
4554,0.967605


Maka akan muncul 3253 nilai, karena kita reduce menjadi 3000

In [41]:
# Menampilkan data feature beserta nilainya

feature = vec_TF_IDF.get_feature_names_out()
feature

#panggil datanya
Data['Feature'] = feature
Data

Unnamed: 0,Nilai,Feature
0,50.365104,aa
1,0.661634,aamiiiin
2,2.868013,aamiin
3,1.116807,ab
4,1.234403,abadi
...,...,...
4551,0.343228,zara
4552,0.804937,zarkasi
4553,1.116807,zjt
4554,0.967605,zona


In [42]:
# Mengurutkan/Sort Nilai Feature Terbaik
Data.sort_values(by='Nilai', ascending=False)

Unnamed: 0,Nilai,Feature
3429,573.263163,rpjutadr
3282,573.263163,raffimbak
3004,573.263163,pegurusannya
2705,573.263163,nagita
2491,573.263163,medpt
...,...,...
3597,0.033050,september
2045,0.023946,keluarga
203,0.023578,april
3347,0.021787,registrasi


Hasilnya feature yang paling tinggi yaitu kata paket.Ini jika kita urutkan berdasarkan nilai feature terbaik

In [43]:
#menampilkan mask pada feature yang diseleksi
mask = chi2_features.get_support()
mask

array([ True,  True,  True, ...,  True,  True,  True])

In [44]:
# Menampilkan Feature yang terpilih berdasarkan Nilai mask atau tertinggi yang sudah di tetapkan pada chi square

new_feature=[]
for bool, f in zip(mask, feature):
    if bool:
        new_feature.append(f)
        selected_feature=new_feature
selected_feature

['aa',
 'aamiiiin',
 'aamiin',
 'ab',
 'abadi',
 'abai',
 'abbee',
 'abdul',
 'acaratks',
 'account',
 'ada',
 'adi',
 'adik',
 'adison',
 'admin',
 'administrasi',
 'adminlte',
 'adrian',
 'aduh',
 'advertising',
 'aea',
 'afbe',
 'affc',
 'affiliates',
 'afr',
 'afrika',
 'agam',
 'agen',
 'agendain',
 'agenpulsa',
 'agst',
 'agt',
 'agua',
 'agun',
 'agus',
 'ahaha',
 'ahub',
 'aidzin',
 'aigoo',
 'air',
 'airpods',
 'ajaa',
 'ajabri',
 'ajahh',
 'ajak',
 'ajeng',
 'aju',
 'akang',
 'akangteteh',
 'akbar',
 'akreditasi',
 'akses',
 'aksi',
 'aktfkan',
 'aktif',
 'aktifasi',
 'aktivasi',
 'akucintaislam',
 'akun',
 'akurasi',
 'akurat',
 'alaikum',
 'alaikumsaya',
 'alaiqum',
 'alam',
 'alamat',
 'alamsyah',
 'alas',
 'alat',
 'alesannya',
 'algoritma',
 'alhamdulillah',
 'alhuda',
 'ali',
 'all',
 'allah',
 'allahaamiin',
 'alphard',
 'alur',
 'alwayson',
 'aman',
 'amanda',
 'amazon',
 'ambil',
 'amin',
 'amoled',
 'ampuun',
 'an',
 'ana',
 'anabdullah',
 'ananda',
 'anda',
 'andaa

In [45]:
#membuat vocabulary baru berdasarkan fitur yang terseleksi
#untuk men-generate fitur yang ada di tf-idf pada proses deployment

new_selected_feature = {}

#kasih kondisinya
for (k,v) in vec_TF_IDF.vocabulary_.items():
    if k in selected_feature:
        new_selected_feature[k]=v

new_selected_feature

{'promo': 3201,
 'beli': 380,
 'paket': 2944,
 'flash': 1174,
 'my': 2657,
 'telkomsel': 3921,
 'app': 195,
 'dpt': 1015,
 'extra': 1133,
 'kuota': 2219,
 'lte': 2365,
 'telpon': 3925,
 'mnthr': 2598,
 'buru': 639,
 'cek': 688,
 'tselmemytsel': 4114,
 'sk': 3685,
 'rupiah': 3462,
 'ribu': 3394,
 'spesial': 3757,
 'pilih': 3058,
 'aktif': 81,
 'sd': 3527,
 'november': 2822,
 'langgan': 2257,
 'hormat': 1513,
 'sisa': 3678,
 'kb': 2014,
 'download': 1010,
 'mytelkomsel': 2694,
 'apps': 201,
 'httptselmetsel': 1647,
 'kuotabeli': 2220,
 'hub': 1651,
 'skb': 3686,
 'ekstra': 1082,
 'pulsa': 3246,
 'dg': 898,
 'internet': 1770,
 'bulan': 622,
 'sjk': 3684,
 'detail': 892,
 'iring': 1796,
 'dgn': 899,
 'tarif': 3882,
 'hr': 1526,
 'panjang': 2959,
 'hits': 1500,
 'tekan': 3914,
 'info': 1727,
 'loh': 2348,
 'internetan': 1771,
 'pakai': 2942,
 'volume': 4267,
 'ultima': 4181,
 'mbhr': 2477,
 'hrga': 1529,
 'tariflokasi': 3884,
 'tselmefl': 4109,
 'baru': 335,
 'nya': 2843,
 'hp': 1521,
 'ket

In [46]:
#melihat feature terbaru

len(new_selected_feature)

3000

In [47]:
#selection feature akan di-save ke dalam pickle yang nantinya file sav selection feature terbaru sudah tersimpan

pickle.dump(new_selected_feature, open("new_selected_feature_tf-idf.sav" , "wb"))

In [48]:
# Menampilkan Feature yang sudah diseleksi

data_selected_feature = pd.DataFrame(x_kbest_features, columns=selected_feature)
data_selected_feature

Unnamed: 0,aa,aamiiiin,aamiin,ab,abadi,abai,abbee,abdul,acaratks,account,...,yu,yudisium,yuk,yunit,yup,zahra,zarkasi,zjt,zona,ztkm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1633,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Kita sudah berhasil membuat feature selection yang baru. Datanya sudah dalam bentuk tabular, jadi bisa langsung dimodelkan menggunakan algoritma menggunakan naive bayes

## Modeling

Modeling menggunakan naive bayes

In [49]:
#Karena menggunakan data yang bersifat supervised learning, otomatis menggunakan data learning dan data testing
#definisikan x dan y
selected_x = x_kbest_features   #x_kbest_feature yang sudah diseleksi
selected_x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [50]:
# import library untuk pemodelan text mining
import random
from sklearn.model_selection import train_test_split

# import library naive bayes. karena menggunakan algoritma Naive bayes
from sklearn.naive_bayes import MultinomialNB

In [51]:
#mengecek x dan y untuk data training untuk feature dan labelnya
#inisiasi x dan y
x = selected_x      
y = data.label

#setelah menentukan x dan y, baru pecah training dan testingnya
#ukurannya 20% testing 80% data training
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [52]:
# Menampilkan jumlah data training dan data testing
print( 'Banyaknya X_train   : ', len(x_train))
print( 'Banyaknya X_test    : ', len (x_test))
print( 'Banyaknya Y_train   : ', len (y_train))
print( 'Banyaknya Y_test    : ', len(y_test))

Banyaknya X_train   :  1308
Banyaknya X_test    :  328
Banyaknya Y_train   :  1308
Banyaknya Y_test    :  328


Hasilnya dapat diliat, di mana 20% untuk testing dan 80% untuk training

In [53]:
# Memasukkan algoritmanya
# Proses training menggunakan Naive Bayes
text_algorithm = MultinomialNB()

In [54]:
#memasukkan modelnya
model = text_algorithm.fit(x_train, y_train)

In [55]:
# Membuat model prediksi

#masukkan sample data dari dataset untuk diimplementasikan untuk menentukan jenis sms (normal, promosi, penipuan)
data_input = ("promo beli paket flash my telkomsel app dpt extra kuota g lte extra telpon mnthr buru cek tselmemytsel sk")
data_input = text_preprocessing_process(data_input)

# Load untuk modelnya
tfidf = TfidfVectorizer

# Panggil data sav 
loaded_vec = TfidfVectorizer(decode_error="replace", vocabulary=set(pickle.load(open("new_selected_feature_tf-idf.sav", "rb"))))

hasil = model.predict(loaded_vec.fit_transform([data_input]))

#sampai sini, proses data input sudah masukkan akan diproses tfidfvectorizer dan akan dicek oleh new_selected_feature yang sudah dibuat

#selanjutnya kita kasih kondisi
if(hasil==0):
    s = "SMS Normal"
elif(hasil==1):
    s = "SMS Fraud"
else:
    s = "SMS Promo"

print("Hasil Prediksi : \n", s)

Hasil Prediksi : 
 SMS Promo


Hasilnya, kita sudah berhasil membuat model klasifikasi prediksi untuk mendeteksi sms penipuan menggunakan algoritma naive bayes

In [56]:
# Membuat model prediksi

data_input = ("tolong belikan mama pulsa nomor as mama teman mama celaka kluarganya hrus hubung mama ganti uangnyapenting")
data_input = text_preprocessing_process(data_input)

# Load
tfidf = TfidfVectorizer

loaded_vec = TfidfVectorizer(decode_error="replace", vocabulary=set(pickle.load(open("new_selected_feature_tf-idf.sav", "rb"))))

hasil = model.predict(loaded_vec.fit_transform([data_input]))

if(hasil==0):
    s = "SMS Normal"
elif(hasil==1):
    s = "SMS Fraud"
else:
    s = "SMS Promo"

print("Hasil Prediksi : \n", s)

Hasil Prediksi : 
 SMS Fraud


In [57]:
# Membuat model prediksi

data_input = ("delete ya nama pull sync masukin nama push")
data_input = text_preprocessing_process(data_input)

# Load
tfidf = TfidfVectorizer

loaded_vec = TfidfVectorizer(decode_error="replace", vocabulary=set(pickle.load(open("new_selected_feature_tf-idf.sav", "rb"))))

hasil = model.predict(loaded_vec.fit_transform([data_input]))

if(hasil==0):
    s = "SMS Normal"
elif(hasil==1):
    s = "SMS Fraud"
else:
    s = "SMS Promo"

print("Hasil Prediksi : \n", s)

Hasil Prediksi : 
 SMS Normal


## Evaluasi Model

Evaluasi model menggunakan confusion matrix

In [58]:
# Masukan Library yang dibutuhkan
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#panggil modelnya
predicted = model.predict(x_test)

CM = confusion_matrix(y_test, predicted)

#menampilkan data akurasi dari evaluasi modelnya
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.99      0.80      0.89       122
           1       0.89      0.71      0.79        66
           2       0.79      1.00      0.88       139
           3       0.00      0.00      0.00         1

    accuracy                           0.87       328
   macro avg       0.67      0.63      0.64       328
weighted avg       0.88      0.87      0.86       328



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Hasilnya terdapat precision dari ketiga label. Untuk akurasi ada di 0.67

In [59]:
# Menyimpan Model
pickle.dump(model, open("model_fraud.sav", "wb"))

Maka file sav akan tersimpan. Model ini yang nantinya akan dipanggil ke aplikasi streamlit untuk melengkapi production