In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import resample
from sklearn.metrics import *
from sklearn import model_selection
import matplotlib.pyplot as plt
import scikitplot as skplt
import warnings
warnings.filterwarnings('ignore')

In [2]:
#importdata
data=pd.read_excel('data6000.xlsx',sep=';', error_bad_lines = False)
data = data.dropna()
data['kelas'] = pd.to_numeric(data['kelas'], downcast='integer')
data.head()

Unnamed: 0,ulasan,kelas
0,"Parah si ini ceker terparah yg pernah d makan,...",0
1,Isi cekernya banyak. Pedasnya juara. Cekernya ...,1
2,Pas buat tempat meetup sama temen-temen becaus...,0
3,1. Lokasinya mudah ditemukan krn strategis pas...,1
4,Food: Tidak seperti yang digembar-gemborkan da...,0


In [3]:
#infodata
new_data = pd.DataFrame()
new_data['ulasan']=data['ulasan']
new_data['kelas']=data['kelas']

In [4]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6004 entries, 0 to 6003
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ulasan  6004 non-null   object
 1   kelas   6004 non-null   int8  
dtypes: int8(1), object(1)
memory usage: 99.7+ KB


In [5]:
#lowercase
new_data['ulasan'] = new_data['ulasan'].str.lower()
new_data['ulasan'].head()

0    parah si ini ceker terparah yg pernah d makan,...
1    isi cekernya banyak. pedasnya juara. cekernya ...
2    pas buat tempat meetup sama temen-temen becaus...
3    1. lokasinya mudah ditemukan krn strategis pas...
4    food: tidak seperti yang digembar-gemborkan da...
Name: ulasan, dtype: object

In [6]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [7]:
#Stemmer
words = []
for i, kalimat in enumerate (new_data['ulasan']):
    stop = stemmer.stem(kalimat)
    words.append(stop)

In [8]:
new = pd.DataFrame()
new['ulasan']=pd.Series(words)

In [9]:
new['ulasan'].head()

0    parah si ini ceker parah yg pernah d makan ga ...
1    isi ceker banyak pedas juara ceker lembut buat...
2    pas buat tempat meetup sama temen-temen becaus...
3    1 lokasi mudah temu krn strategis pas pengkol ...
4    food tidak seperti yang digembar-gemborkan dan...
Name: ulasan, dtype: object

In [10]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()

In [11]:
#stopword
wordss = []
for i, kalimat in enumerate (new['ulasan']):
    stops = stopword.remove(kalimat)
    wordss.append(stops)

In [12]:
news = pd.DataFrame()
news['ulasan']=pd.Series(wordss)

In [13]:
news['ulasan'].head()

0    parah si ceker parah yg pernah d makan ga sesu...
1    isi ceker banyak pedas juara ceker lembut buat...
2    pas buat tempat meetup sama temen-temen becaus...
3    1 lokasi mudah temu krn strategis pas pengkol ...
4    food digembar-gemborkan sesuai ekspektasi turu...
Name: ulasan, dtype: object

In [14]:
#TOKENIZING
tokenized=[]
for x in news['ulasan']:
    temporary=word_tokenize(x)
    #print(temporary)
    temporary=[word for word in temporary if word.isalpha()]
    tokenized.append(temporary)

In [15]:
tokenized

[['parah',
  'si',
  'ceker',
  'parah',
  'yg',
  'pernah',
  'd',
  'makan',
  'ga',
  'sesuai',
  'advertising',
  'banget',
  'laper',
  'uda',
  'nunggu',
  'g',
  'k',
  'pas',
  'buka',
  'ceker',
  'begini',
  'yg',
  'dateng',
  'sedih',
  'banget',
  'yaa',
  'hitam',
  'selera',
  'makan',
  'hilang',
  'trs',
  'yg',
  'lebih',
  'parah',
  'kuah',
  'loh',
  'rasa',
  'persis',
  'sambel',
  'bangkok',
  'tp',
  'versi',
  'kuah',
  'and',
  'asin',
  'banget',
  'bener',
  'kecewa',
  'sih',
  'n',
  'ga',
  'makan',
  'bener',
  'ilfeel',
  'ga',
  'beli',
  'sihhhh',
  'cukup',
  'tau',
  'aja',
  'yaa'],
 ['isi',
  'ceker',
  'banyak',
  'pedas',
  'juara',
  'ceker',
  'lembut',
  'buat',
  'gue',
  'bingung',
  'gimana',
  'makan',
  'ceker',
  'alas',
  'gue',
  'mau',
  'makan',
  'ceker',
  'samping',
  'diajarin',
  'sih',
  'sama',
  'cowo',
  'gue',
  'gimana',
  'cara',
  'makan',
  'ceker',
  'nya',
  'buat',
  'minum',
  'halau',
  'galau',
  'nya',
  'enak'

In [16]:
def tokenizerr(x):
       return x.split(' ')

In [17]:
tokeneddata=[]
for i,x in enumerate(tokenized):
    tokeneddata.append(" ".join(map(str, x)))
tfidf= TfidfVectorizer(tokenizer=tokenizerr,use_idf=False, smooth_idf=False, sublinear_tf=False)
tfidf.fit_transform([tokeneddata[0]])
print(tfidf.get_feature_names())


['advertising', 'aja', 'and', 'asin', 'banget', 'bangkok', 'begini', 'beli', 'bener', 'buka', 'ceker', 'cukup', 'd', 'dateng', 'g', 'ga', 'hilang', 'hitam', 'ilfeel', 'k', 'kecewa', 'kuah', 'laper', 'lebih', 'loh', 'makan', 'n', 'nunggu', 'parah', 'pas', 'pernah', 'persis', 'rasa', 'sambel', 'sedih', 'selera', 'sesuai', 'si', 'sih', 'sihhhh', 'tau', 'tp', 'trs', 'uda', 'versi', 'yaa', 'yg']


In [18]:
tfidf_matrix = tfidf.fit_transform([tokeneddata[0]]) #PERHITUNGANTFIDF
print(tfidf_matrix.toarray())

[[0.10050378 0.10050378 0.10050378 0.10050378 0.30151134 0.10050378
  0.10050378 0.10050378 0.20100756 0.10050378 0.20100756 0.10050378
  0.10050378 0.10050378 0.10050378 0.30151134 0.10050378 0.10050378
  0.10050378 0.10050378 0.10050378 0.20100756 0.10050378 0.10050378
  0.10050378 0.30151134 0.10050378 0.10050378 0.30151134 0.10050378
  0.10050378 0.10050378 0.10050378 0.10050378 0.10050378 0.10050378
  0.10050378 0.10050378 0.10050378 0.10050378 0.10050378 0.10050378
  0.10050378 0.10050378 0.10050378 0.20100756 0.30151134]]


In [19]:
newss = pd.DataFrame()
newss['ulasan']=pd.Series(tokeneddata)

In [20]:
newss['ulasan'].head()

0    parah si ceker parah yg pernah d makan ga sesu...
1    isi ceker banyak pedas juara ceker lembut buat...
2    pas buat tempat meetup sama because they provi...
3    lokasi mudah temu krn strategis pas pengkol se...
4    food sesuai ekspektasi turut teman rasa biasa ...
Name: ulasan, dtype: object

In [21]:
X = newss['ulasan'].values
y = new_data['kelas'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [22]:
#vectorizer = CountVectorizer(ngram_range=(2,2))
#vectorizer.fit(X_train)
#vectorizer = CountVectorizer(ngram_range =(2, 2)) 
#X1 = vectorizer.fit_transform(X_train)  
#features = (vectorizer.get_feature_names()) 
#print("\n\nX1 : \n", X1.toarray()) 

In [23]:
# Applying TFIDF 
# You can still get n-grams here 
vectorizerr = TfidfVectorizer(ngram_range = (2, 2),use_idf=False, smooth_idf=False, sublinear_tf=False) 
X2 = vectorizerr.fit_transform(X_train) 
scores = (X2.toarray()) 
print("\n\nScores : \n", scores) 



Scores : 
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [24]:
features = (vectorizerr.get_feature_names()) 
print("\n\n N-grams : \n", features) 



 N-grams : 
 ['aa nya', 'aaah enak', 'aama tomat', 'ab promo', 'abang mau', 'abiiiis teruus', 'abis bawa', 'abis bukber', 'abis capek', 'abis danniat', 'abis dh', 'abis enak', 'abis gelas', 'abis gua', 'abis hehe', 'abis hhhhh', 'abis isya', 'abis iya', 'abis kualitas', 'abis lah', 'abis lembang', 'abis makan', 'abis mamam', 'abis mampir', 'abis ngampus', 'abis nyesel', 'abis orang', 'abis pdhl', 'abis porsi', 'abis pulang', 'abis rb', 'abis searching', 'abis semua', 'abis sih', 'abis sisa', 'abis smua', 'abis soto', 'abis stocknya', 'abis taking', 'abis tiap', 'abis trus', 'abis uji', 'abisin kopi', 'abisin ujung', 'about akung', 'about the', 'abuba prabudimuntur', 'abuba soal', 'abuba yg', 'abur rasa', 'abur salmon', 'ac boleh', 'ac hny', 'ac jadi', 'ac kalo', 'ac lebih', 'ac loh', 'ac mana', 'ac nya', 'ac private', 'ac tetep', 'academy cocok', 'acapulco caesar', 'acara bandung', 'acara bukber', 'acara jazz', 'acara meeting', 'acara rating', 'acara sangat', 'acara sebut', 'acceptab

In [25]:
X_train = vectorizerr.transform(X_train)
X_test  = vectorizerr.transform(X_test)

In [26]:
X_train.toarray(), X_test.toarray()

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]))

In [27]:
#jumlah fitur
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4803, 29945), (1201, 29945), (4803,), (1201,))

In [28]:
model= SVC(C=1.95,kernel='poly',degree=2,coef0=1,tol=1e-3,cache_size=4096, probability=True)
model.fit(X_train, y_train)
predict = model.predict(X_test)

In [29]:
#FEATURE SELECTION METHOD
from sklearn.feature_selection import mutual_info_regression, SelectKBest, SelectPercentile

In [30]:
X_train

<4803x29945 sparse matrix of type '<class 'numpy.float64'>'
	with 147221 stored elements in Compressed Sparse Row format>

In [31]:
mi = mutual_info_regression(X_train, y_train)

In [32]:
#mi = pd.Series(mi)
#mi.sort_values(ascending=False)
#mi.sort_values(ascending=False).plot.bar(figsize=(10, 4))

In [33]:
sel = SelectPercentile(mutual_info_regression, percentile=10).fit(X_train, y_train)

In [34]:
sel.get_support()

array([False, False, False, ...,  True, False, False])

In [35]:
import collections
collections.Counter(sel.get_support())

Counter({False: 26951, True: 2994})

In [36]:
X_train = X_train[:,sel.get_support()]
X_test = X_test[:,sel.get_support()]

In [37]:
X_train.shape, X_test.shape

((4803, 2994), (1201, 2994))

In [38]:
#LEARNING METHOD
# model = xgb.XGBClassifier()
model = SVC(probability=True)
model.fit(X_train, y_train)
predict = model.predict(X_test)

In [39]:
print(classification_report(y_test, predict))
print('\n')
cm = confusion_matrix(y_test, predict)
print(cm)
print('\n')
print('Accuracy Score: ', accuracy_score(y_test, predict))

TP = 0
FP = 0
TN = 0
FN = 0
for i in range(len(predict)):
    if y_test[i]==predict[i]==1:
       TP += 1
    if predict[i]==1 and y_test[i]!=predict[i]:
       FP += 1
    if y_test[i]==predict[i]==0:
       TN += 1
    if predict[i]==0 and y_test[i]!=predict[i]:
       FN += 1
print('CONFUSION MATRIX :')
print('ACCURACY :',((TP+TN)/(TP+TN+FP+FN))*100)
print('SENSITIVITY :',(TP/(TP+FN))*100)
print('SPECIFICITY :',(TN/(TN+FP))*100)

              precision    recall  f1-score   support

           0       0.91      0.68      0.78       406
           1       0.86      0.97      0.91       795

    accuracy                           0.87      1201
   macro avg       0.88      0.83      0.85      1201
weighted avg       0.88      0.87      0.87      1201



[[278 128]
 [ 27 768]]


Accuracy Score:  0.8709408825978351
CONFUSION MATRIX :
ACCURACY : 87.0940882597835
SENSITIVITY : 96.60377358490567
SPECIFICITY : 68.4729064039409


In [40]:
mi

array([0.00044872, 0.00764422, 0.00709031, ..., 0.01080464, 0.00387425,
       0.00012422])

In [41]:
collections.Counter(y_test)

Counter({1: 795, 0: 406})

In [42]:
collections.Counter(y_train)

Counter({1: 3279, 0: 1524})

In [None]:
model.fit(X_train.toarray(), y_train)
probas = model.predict_proba(X_test.toarray())
skplt.metrics.plot_precision_recall_curve(y_test, probas)
plt.show()

In [None]:
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
scoring = 'accuracy'
results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
print(results)
print('result mean : {}, result std : {}'.format(results.mean(), results.std()))