In [16]:
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import conlltags2tree, tree2conlltags
from IPython.display import clear_output
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import numpy as np
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Read the review table from disk and keep an independent copy of its
# 'text' column so later processing never touches the raw frame.
raw_data = pd.read_csv('reviews.csv')
reviews = raw_data.loc[:, 'text'].copy()
reviews

0        Super good, don't get me wrong. But I came for...
1        I decided to try it out although I’m not a hug...
2        My caramel core begins to disappear about half...
3        Why are people complaining about the blonde br...
4        This ice cream is worst ice cream I’ve ever ta...
                               ...                        
21669    There was no chocolate ice cream in this at al...
21670    This ice cream has no flavor at all. No one in...
21671    Absolutely love this flavor! The only thing th...
21672    Brilliant combo - love the cheesecake and brow...
21673    Has a delicious taste with all natural ingredi...
Name: text, Length: 21674, dtype: object

# Generate IOB Tag
Untuk setiap kalimat *review* akan dibangkitkan pohon *named entity chunk*, kemudian untuk setiap katanya dicari tag IOB-nya.

In [3]:
# Build one (Word, POS, Tag) row per token across all reviews.
# Each review is tokenized, POS-tagged and passed through NLTK's named-entity
# chunker; the resulting tree is flattened into (word, pos, IOB tag) triples.
data_with_tags = []
total_reviews = len(reviews)  # loop-invariant, so computed once

for i, review in enumerate(reviews, start=1):
    # Replace the previous progress line instead of stacking a new one.
    clear_output(wait=True)
    print('Generating:', i, 'of', total_reviews)

    tree = ne_chunk(pos_tag(word_tokenize(review)))
    iob_tags = tree2conlltags(tree)

    # Add all triples of this review in a single call.
    data_with_tags.extend(iob_tags)

# Remove the last progress line before the table preview is shown.
clear_output(wait=True)
df = pd.DataFrame(data_with_tags, columns=['Word', 'POS', 'Tag'])
df.head()

Unnamed: 0,Word,POS,Tag
0,Super,NNP,B-GPE
1,good,NN,O
2,",",",",O
3,do,VBP,O
4,n't,RB,O


Menampilkan jumlah nilai setiap Tag dan POS

In [4]:
# Number of tokens per IOB tag, shown as a two-column (Tag, counts) table.
df.groupby('Tag').size().rename('counts').reset_index()

Unnamed: 0,Tag,counts
0,B-FACILITY,67
1,B-GPE,4535
2,B-GSP,35
3,B-LOCATION,57
4,B-ORGANIZATION,6084
5,B-PERSON,10186
6,I-FACILITY,107
7,I-GPE,122
8,I-GSP,2
9,I-LOCATION,30


In [5]:
# Number of tokens per POS tag, shown as a two-column (POS, counts) table.
df.groupby('POS').size().rename('counts').reset_index()

Unnamed: 0,POS,counts
0,#,218
1,$,240
2,'',900
3,(,1502
4,),1581
5,",",21810
6,.,80634
7,:,3603
8,CC,39829
9,CD,9121


# Convert POS Tag
Mengubah nilai pos tag setiap kata dengan sistem One Hot Encoding

In [6]:
# Expand the POS column into one indicator column per distinct POS value
# (named 'POS_<value>'); the Word and Tag columns are carried over unchanged.
one_hot_enc_POS = pd.get_dummies(df, prefix='POS', columns=['POS'])
one_hot_enc_POS.head()

Unnamed: 0,Word,Tag,POS_#,POS_$,POS_'',POS_(,POS_),"POS_,",POS_.,POS_:,...,POS_VBD,POS_VBG,POS_VBN,POS_VBP,POS_VBZ,POS_WDT,POS_WP,POS_WP$,POS_WRB,POS_``
0,Super,B-GPE,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,good,O,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,",",O,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,do,O,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,n't,O,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# NOT USED (kept for reference only)
#bag of words
# word2count = {}
# i = 1
# for word in df['Word']:
#     clear_output(wait=True)
#     if word not in word2count.keys():
#         word2count[word] = 1
#     else:
#         word2count[word] += 1
#     print('feature extraction :', round(100*i/len(df['Word']),2),'%')
#     i+=1

# Feature Extraction
Mengubah setiap kata menjadi fitur numerik dengan metode TF-IDF.

In [8]:
# TF-IDF feature extraction
def dummy(doc):
    """Return ``doc`` unchanged; used as a pass-through tokenizer and preprocessor."""
    return doc


# Preprocessing and tokenizing are both replaced by the identity function, so
# the vectorizer receives every entry of df['Word'] exactly as stored.
# NOTE(review): each entry is a single string, and an identity tokenizer hands
# that string back as the "token sequence", so the vocabulary is presumably
# built from individual characters rather than whole words (the small number
# of resulting feature columns points the same way) - confirm this is intended.
tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
)

# Fit on the token column, then transform that same column into a dense array.
words = df['Word'].tolist()
model = tfidf.fit(words)
training_feature = model.transform(words).toarray()

# Name the feature columns f-1 ... f-N, one per vocabulary entry.
feature_names = ['f-' + str(k) for k in range(1, training_feature.shape[1] + 1)]
word_numerical = pd.DataFrame(training_feature, columns=feature_names)
word_numerical.head()

Unnamed: 0,f-1,f-2,f-3,f-4,f-5,f-6,f-7,f-8,f-9,f-10,...,f-125,f-126,f-127,f-128,f-129,f-130,f-131,f-132,f-133,f-134
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.835641,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Mempersiapkan Input
Menggabungkan fitur numerik dari setiap kata dengan *one-hot encoding* POS tag-nya.

In [9]:
# Keep only the POS indicator columns from the one-hot encoded frame.
pos_feature = one_hot_enc_POS.drop(columns=['Word', 'Tag'])

# Model input: TF-IDF columns followed by the POS indicator columns,
# placed side by side and aligned on the row index.
X = pd.concat([word_numerical, pos_feature], axis='columns')
X.head()


Unnamed: 0,f-1,f-2,f-3,f-4,f-5,f-6,f-7,f-8,f-9,f-10,...,POS_VBD,POS_VBG,POS_VBN,POS_VBP,POS_VBZ,POS_WDT,POS_WP,POS_WP$,POS_WRB,POS_``
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.835641,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


# Mempersiapkan Label
Mengambil kolom Tag sebagai label

In [13]:
# NOT USED (kept for reference only)
# df["Tag"] = df["Tag"].astype('category')
# Y = pd.DataFrame(df["Tag"].cat.codes)

# The IOB tag strings themselves serve as the target labels.
Y = df.loc[:, 'Tag']
Y


0          B-GPE
1              O
2              O
3              O
4              O
           ...  
1069170        O
1069171        O
1069172        O
1069173        O
1069174        O
Name: Tag, Length: 1069175, dtype: object

In [14]:
# Distinct label values, in sorted order, as a plain Python list.
classes = np.unique(Y).tolist()
classes

['B-FACILITY',
 'B-GPE',
 'B-GSP',
 'B-LOCATION',
 'B-ORGANIZATION',
 'B-PERSON',
 'I-FACILITY',
 'I-GPE',
 'I-GSP',
 'I-LOCATION',
 'I-ORGANIZATION',
 'I-PERSON',
 'O']

# Split Data

In [11]:
# Hold out 10% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=0,
)

print('Training data :', len(X_train))
print('Testing data :', len(X_test))

Training data : 962257
Testing data : 106918


# Model Training
Menggunakan algoritma Multinomial Naive Bayes

In [17]:
# Multinomial Naive Bayes with a small smoothing value (alpha).
nb = MultinomialNB(alpha=0.01)

# The complete label list is supplied explicitly, so the model knows every
# class in `classes` regardless of which ones occur in the training split.
nb.partial_fit(X_train, y_train, classes=classes)

MultinomialNB(alpha=0.01)

# Model Evaluation

In [18]:
# Leave out the 'O' label for evaluation: it is by far the most frequent
# (majority) class and would dominate the reported metrics.
# The label is filtered by value instead of popping the last list element, so
# the result does not depend on 'O' happening to sort last in `classes`.
new_classes = [label for label in classes if label != 'O']
new_classes

['B-FACILITY',
 'B-GPE',
 'B-GSP',
 'B-LOCATION',
 'B-ORGANIZATION',
 'B-PERSON',
 'I-FACILITY',
 'I-GPE',
 'I-GSP',
 'I-LOCATION',
 'I-ORGANIZATION',
 'I-PERSON']

In [19]:
# Predict once and keep the result in a named variable for the report.
y_pred = nb.predict(X_test)

# Only the entity labels in `new_classes` are reported.
# Some of those labels are never predicted (or have no test samples), which
# leaves precision/recall undefined for them. zero_division=0 reports such
# scores as 0.0 - the same value the default setting prints - without emitting
# a warning for each one. (Requires scikit-learn >= 0.22.)
print(classification_report(y_true=y_test,
                            y_pred=y_pred,
                            labels=new_classes,
                            zero_division=0))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                precision    recall  f1-score   support

    B-FACILITY       0.00      0.00      0.00         7
         B-GPE       0.34      0.09      0.15       417
         B-GSP       0.00      0.00      0.00         2
    B-LOCATION       0.00      0.00      0.00         6
B-ORGANIZATION       0.38      0.45      0.41       610
      B-PERSON       0.37      0.76      0.50      1002
    I-FACILITY       0.00      0.00      0.00        11
         I-GPE       0.00      0.00      0.00        14
         I-GSP       0.00      0.00      0.00         0
    I-LOCATION       0.00      0.00      0.00         2
I-ORGANIZATION       0.12      0.01      0.01       186
      I-PERSON       0.26      0.27      0.26       353

     micro avg       0.36      0.45      0.40      2610
     macro avg       0.12      0.13      0.11      2610
  weighted avg       0.33      0.45      0.35      2610

