## Klasifikasi Text - Tugas 3 Pengenalan Pola

## Baskara - 16/398499/PA/17460

### Import Library

In [1]:
import pandas as pd
import numpy as np
import nltk
import math
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

### Data Preparation (Pembagian Menjadi Training & Test)

In [2]:
df = pd.read_csv('data_penyakit.csv', names=['tanda_gejala','diagnosis_penyakit'])
training = df[df['diagnosis_penyakit'].notnull()]
test = df[df['diagnosis_penyakit'].isnull()]

In [3]:
training.reset_index(inplace=True, drop=True)
training.head()

Unnamed: 0,tanda_gejala,diagnosis_penyakit
0,"menggigil, demam, sakit kepala",Malaria (bentuk benigma)
1,"Kaku kuduk, penurunan kesadaran, muntah proyek...",Meningitis + perdarahan subarachnoid
2,"Pipi bengkak, nyeri saat mengunyah, nyeri testis",Parotitis
3,"Sakit gigi, gigi sensitif pada makanan dingin ...",Karies dentis
4,"Hidung tersumbat, bersin, batuk, sakit tenggor...",Common cold


In [4]:
test.reset_index(inplace=True, drop=True)
test.head()

Unnamed: 0,tanda_gejala,diagnosis_penyakit
0,"Mata lengket, mata berair, pandangan sedikit k...",
1,"Gusi bengkak, gusi kemerahan, gusi berdarah",
2,"Batuk lebih dari tiga minggu, sesak napas atau...",
3,"Demam, menggigil, suhu tubuh meningkat, batuk ...",
4,"Demam, muntah, diare cair, ampas sedikit seper...",


### Tokenization & Stemming Data Training

In [5]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()
tokenizer = nltk.RegexpTokenizer(r'\w+')

for index, row in training.iterrows():
    # Stemming
    stemmed = stemmer.stem(row[0])
    #Tokenization
    tokens = tokenizer.tokenize(row[0])
    #Case Folding
    words = [w.lower() for w in tokens]
    training.at[index, 'tanda_gejala'] = words

In [6]:
training.head()

Unnamed: 0,tanda_gejala,diagnosis_penyakit
0,"[menggigil, demam, sakit, kepala]",Malaria (bentuk benigma)
1,"[kaku, kuduk, penurunan, kesadaran, muntah, pr...",Meningitis + perdarahan subarachnoid
2,"[pipi, bengkak, nyeri, saat, mengunyah, nyeri,...",Parotitis
3,"[sakit, gigi, gigi, sensitif, pada, makanan, d...",Karies dentis
4,"[hidung, tersumbat, bersin, batuk, sakit, teng...",Common cold


### Membuat Kolom Untuk Setiap Kata

In [7]:
columnlist = []
for index, row in training.iterrows():
    columnlist = np.concatenate((columnlist, row[0]))
columnlist = np.unique(columnlist)

In [8]:
for index in range(len(columnlist)):
    training.insert(2, str(columnlist[index]), 0)

In [9]:
training.head()

Unnamed: 0,tanda_gejala,diagnosis_penyakit,yang,warna,wajah,vulva,volume,vesikuler,vesikul,vesikel,...,ampas,amis,amandel,alis,aksila,akibat,akan,agak,ada,abdomen
0,"[menggigil, demam, sakit, kepala]",Malaria (bentuk benigma),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[kaku, kuduk, penurunan, kesadaran, muntah, pr...",Meningitis + perdarahan subarachnoid,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"[pipi, bengkak, nyeri, saat, mengunyah, nyeri,...",Parotitis,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"[sakit, gigi, gigi, sensitif, pada, makanan, d...",Karies dentis,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"[hidung, tersumbat, bersin, batuk, sakit, teng...",Common cold,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Menghitung Jumlah Frekuensi Setiap Kata

In [10]:
for index, row in training.iterrows():
    for columnindex in range(len(columnlist)):
        training.at[index, columnlist[columnindex]] = row[0].count(str(columnlist[columnindex]))

In [11]:
training.head()

Unnamed: 0,tanda_gejala,diagnosis_penyakit,yang,warna,wajah,vulva,volume,vesikuler,vesikul,vesikel,...,ampas,amis,amandel,alis,aksila,akibat,akan,agak,ada,abdomen
0,"[menggigil, demam, sakit, kepala]",Malaria (bentuk benigma),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[kaku, kuduk, penurunan, kesadaran, muntah, pr...",Meningitis + perdarahan subarachnoid,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"[pipi, bengkak, nyeri, saat, mengunyah, nyeri,...",Parotitis,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"[sakit, gigi, gigi, sensitif, pada, makanan, d...",Karies dentis,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"[hidung, tersumbat, bersin, batuk, sakit, teng...",Common cold,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Prepare Test Data

In [12]:
test.insert(2, 'jarak', 0.0)
test.head()

Unnamed: 0,tanda_gejala,diagnosis_penyakit,jarak
0,"Mata lengket, mata berair, pandangan sedikit k...",,0.0
1,"Gusi bengkak, gusi kemerahan, gusi berdarah",,0.0
2,"Batuk lebih dari tiga minggu, sesak napas atau...",,0.0
3,"Demam, menggigil, suhu tubuh meningkat, batuk ...",,0.0
4,"Demam, muntah, diare cair, ampas sedikit seper...",,0.0


### Preprocessing dan Penghitungan Jumlah Frekuensi Setiap Kata

In [13]:
for index, row in test.iterrows():
    # Stemming
    stemmed = stemmer.stem(row[0])
    #Tokenization
    tokens = tokenizer.tokenize(row[0])
    #Case Folding
    words = [w.lower() for w in tokens]
    test.at[index, 'tanda_gejala'] = words
for index in range(len(columnlist)):
    test.insert(3, str(columnlist[index]), 0)
for index, row in test.iterrows():
    for columnindex in range(len(columnlist)):
        test.at[index, columnlist[columnindex]] = row[0].count(str(columnlist[columnindex]))

### Penghitungan Jarak (Menggunakan Cosine similarity)

In [14]:
for test_index, test_row in test.iterrows():
    distance = []
    for train_index, train_row in training.iterrows():
        temp_a = 0
        temp_b = 0
        temp_c = 0
        for columnindex in range(len(columnlist)):
            temp_a += test_row[3+columnindex] * train_row[2+columnindex]
            temp_b += test_row[3+columnindex]**2
            temp_c += train_row[2+columnindex]**2
        distance += [((temp_a)/(math.sqrt(temp_b)*math.sqrt(temp_c)))]
    test.at[test_index, 'jarak'] = (np.min(distance))
    test.at[test_index, 'diagnosis_penyakit'] = training.at[np.argmin(distance), 'diagnosis_penyakit']

### Hasil Prediksi

In [15]:
test.iloc[:, : 3]

Unnamed: 0,tanda_gejala,diagnosis_penyakit,jarak
0,"[mata, lengket, mata, berair, pandangan, sedik...",Malaria (bentuk benigma),0.0
1,"[gusi, bengkak, gusi, kemerahan, gusi, berdarah]",Malaria (bentuk benigma),0.0
2,"[batuk, lebih, dari, tiga, minggu, sesak, napa...",Malaria (bentuk benigma),0.0
3,"[demam, menggigil, suhu, tubuh, meningkat, bat...",Meningitis + perdarahan subarachnoid,0.0
4,"[demam, muntah, diare, cair, ampas, sedikit, s...",Parotitis,0.0
5,"[nyeri, kolik, daerah, pinggang, malaise, mual...",Malaria (bentuk benigma),0.0
6,"[ruam, yang, gatal, terdri, dari, macula, maku...",Meningitis + perdarahan subarachnoid,0.0
