# Supervised Learning for Text

## Bagian 1: Machine Learning dengan Scikit-Learn (review)

In [1]:
# load dataset iris
from sklearn.datasets import load_iris
iris = load_iris()

In [2]:
# simpan matriks fitur X dan target y
X = iris.data
y = iris.target

**"Fitur"** sering disebut atribut, prediktor atau input.**"target"** sering disebut dengan label

In [3]:
# lihat ukuran X dan y
print(X.shape)
print(y.shape)

(150, 4)
(150,)


**"Observasi"** juga sering disebut jumlah sampel

In [4]:
# lihat 5 fitur pertama
import pandas as pd
pd.DataFrame(X, columns=iris.feature_names).head(10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1


In [5]:
# lihat vektor label
print(y)
iris.target_names

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

Untuk  **membangun model**, fitur harus berbentuk **numerik**, dan setiap sampel harus memiliki **fitur yang sama dengan urutan yang sama**.

In [6]:
# import pustaka
from sklearn.neighbors import KNeighborsClassifier

# inisiasi model dengan parameter default
knn = KNeighborsClassifier()

# latih model
knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

Untuk **membuat prediksi**, observasi harus memiliki **fitur yang sama seperti data training**, dari jumlah dan maknanya.

In [7]:
# prediksi hasil
knn.predict([[1, 1, 1, 1]])

array([0])

## Bagian 2: Pemrosesan Teks dan Ekstraksi Fitur

Pada bagian ini, buka data teks lalu lakukan pemrosesan teks (cleansing, normalisasi, stemming dll.) dan lakukan ekstraksi fitur pada data teks yang sudah diproses

In [16]:
import pandas as pd
# Baca data
path = 'data/sms.tsv'
sms = pd.read_table(path, header=None, names=['label','sms'])

In [17]:
# lihat ukuran
sms.shape

(5572, 2)

In [18]:
# lihat data
sms.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
# lihat distribusi kelas
sms.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [20]:
sms.label = sms.label.map({'ham':0, 'spam':1})

In [21]:
sms.head()

Unnamed: 0,label,sms
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [24]:
# Split data menjadi data train dan test
from sklearn.model_selection import train_test_split
X = sms.sms
y = sms.label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [25]:
# lihat ukuran train & test
print(X_train.shape)
print(X_test.shape)

(4179,)
(1393,)


In [61]:
import os
import nltk
nltk.data.path.append("/Users/afifai/Documents/evolve/NLP_github/nltk_data")

In [65]:
# Inisiasi vectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
vect = CountVectorizer(stop_words=stopwords.\
                       words("english"), min_df=0.005, max_df=0.995)

In [66]:
# Pelajari vocabulary dan ubah data train menjadi matriks
X_train_vec = vect.fit_transform(X_train)

In [67]:
# lihat vektor fitur
X_train_vec

<4179x323 sparse matrix of type '<class 'numpy.int64'>'
	with 16878 stored elements in Compressed Sparse Row format>

In [68]:
# lakukan hal yang sama dengan data testing
X_test_vec = vect.transform(X_test)

In [69]:
X_test_vec

<1393x323 sparse matrix of type '<class 'numpy.int64'>'
	with 5443 stored elements in Compressed Sparse Row format>

## Bagian 3: Pemodelan

Misal kita gunakan [multinomial Naive Bayes](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html):

> The multinomial Naive Bayes classifier is suitable for classification with **discrete features** (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [70]:
# import
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [71]:
# latih algoritma
nb.fit(X_train_vec, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [72]:
# buat prediksi
y_pred = nb.predict(X_test_vec)

In [73]:
# hitung akurasi
from sklearn.metrics import accuracy_score, confusion_matrix
accuracy_score(y_pred, y_test)

0.9798994974874372

In [36]:
# gunakan confussion matrix
confusion_matrix(y_pred, y_test)

array([[1191,   14],
       [   7,  181]])

In [38]:
# hitung probabilitas
y_pred_prob = nb.predict_proba(X_test_vec)

In [41]:
y_pred_prob[:5]

[[9.99999999e-01 6.68220329e-10]
 [9.97669607e-01 2.33039317e-03]
 [1.00000000e+00 4.44192286e-20]
 [9.99994508e-01 5.49182980e-06]
 [4.94539129e-11 1.00000000e+00]]


In [43]:
y_pred[:5]

array([0, 0, 0, 0, 1])

In [44]:
X_test[:5]

2469    Lol well don't do it without me. We could have...
1908                                      ELLO BABE U OK?
3636    It's not that you make me cry. It's just that ...
1886    I think asking for a gym is the excuse for laz...
420     Send a logo 2 ur lover - 2 names joined by a h...
Name: sms, dtype: object

## Bagian 4: Inferensia

In [46]:
new_sms =['how are you?', 'free entry!!!']
new_sms_vect = vect.transform(new_sms)

In [49]:
nb.predict_proba(new_sms_vect)

array([[0.99753144, 0.00246856],
       [0.01362368, 0.98637632]])

## Bagian 5: Buat Pipeline

In [74]:
from sklearn.pipeline import Pipeline

# definisi Pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words=stopwords.words("english"), min_df=0.005, max_df=0.995)),
    ('clf', MultinomialNB())
])

In [75]:
# melatih data melalui pipeline
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.995, max_features=None, min_df=0.005,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['i',...enizer=None, vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [55]:
# prediksi melalui pipeline
y_pred = pipeline.predict(X_test)

In [56]:
# hitung akurasi
accuracy_score(y_pred, y_test)

0.9849246231155779

In [57]:
# Inferensia melalui pipeline
pipeline.predict(new_sms)

array([0, 1])

In [58]:
pipeline.predict_proba(new_sms)

array([[0.99753144, 0.00246856],
       [0.01362368, 0.98637632]])

## Bagian 6: Simpan model kedalam file

In [76]:
#simpan
import pickle
outfile = open("model.pkl","wb")
pickle.dump(pipeline, outfile)
outfile.close()

In [None]:
#import pickle
infile = open('model.pkl', 'rb')
pipeline = pickle.load(infile)
infile.close()

## http://nlp.yuliadi.pro/dataset