In [21]:
import seaborn as sns #library untuk memperindah visualisasi data
import matplotlib.pyplot as plt
import pandas as pd #library untuk mengolah data

#Setup Seaborn
sns.set_style(style='white')
sns.set_context(context='notebook', font_scale=1.3, rc={'figure.figsize':(16,9)})

In [22]:
dataku = pd.read_csv('lemma.csv') #Membaca File CSV
dataku.label.value_counts() #MEnghitung Jumlah record berdasarkan label

label
1    502
2    280
0    240
Name: count, dtype: int64

In [23]:
#proses training
import pickle #library menyimpan model
from sklearn.svm import SVC #library metode yang akan dipakai untuk membuat model
from sklearn.feature_extraction.text import TfidfVectorizer #library untuk merubah data text kedalam bentuk vektor
from sklearn.preprocessing import LabelEncoder #library untuk merubah data label string kedalam bentuk numerik

In [24]:
#Merubah data label
dictio_label ={
    0:"Netral",
    1:"Positif",
    2:"Negatif"
}

dataku["label"]=dataku['label'].map(dictio_label)
dataku.label

0       Negatif
1        Netral
2       Negatif
3        Netral
4       Positif
         ...   
1017    Negatif
1018    Negatif
1019     Netral
1020    Negatif
1021     Netral
Name: label, Length: 1022, dtype: object

In [25]:
dataku = dataku.dropna() #mendrop record yang terdapat data kosong
dataku.isnull().sum() #menghitung data kosong per kolom

x = dataku.lemma #mengambil data pada kolom lemma
y = dataku.label #mengambil data pada kolom label

In [26]:
encoder = LabelEncoder() #Memanggil class encoder
train_y = encoder.fit_transform(y) #encode data kolom label

tfidf_vect = TfidfVectorizer() #memanggil class untuk vektorizer

#Transform data pada kolom lemma kedalam bentuk vektor
tfidf_vect.fit(x) 
train_x_tfidf = tfidf_vect.transform(x)

SVM = SVC(decision_function_shape='ovo') #memanggil metode training
text_pickle =SVM.fit(train_x_tfidf, train_y) #membuat model

#save model
files = open('SVM_classifier_24.pickle','wb')
pickle.dump(text_pickle, files)
files.close()
print("selesai")

selesai


In [27]:
#Loading model yang sudah disimpan
model = open('SVM_classifier_24.pickle', 'rb') 
svm_classifier = pickle.load(model)
svm_classifier

In [28]:
#Ekstrakasi fitur
from imblearn.over_sampling import SMOTE #Library untuk fitting data

vect = TfidfVectorizer(max_features=1000, binary=True) 
X = vect.fit_transform(dataku.lemma)

print(X)


#sm=SMOTE()

  (0, 246)	0.34784398131795813
  (0, 451)	0.4062029592835461
  (0, 496)	0.34784398131795813
  (0, 700)	0.34784398131795813
  (0, 358)	0.3104419984635892
  (0, 920)	0.36496095827980607
  (0, 8)	0.35578904981582404
  (0, 240)	0.06261927703426774
  (0, 850)	0.3345670225054814
  (1, 722)	0.3437109511464866
  (1, 271)	0.2960699284719949
  (1, 608)	0.3558539170049344
  (1, 843)	0.3437109511464866
  (1, 701)	0.37150892277720765
  (1, 280)	0.3437109511464866
  (1, 97)	0.3008048902213212
  (1, 8)	0.3254008953212399
  (1, 240)	0.05727092731456296
  (1, 850)	0.30599145399388017
  (2, 337)	0.18146669153529005
  (2, 983)	0.43560385458339634
  (2, 990)	0.43560385458339634
  (2, 167)	0.3938562218379441
  (2, 927)	0.43560385458339634
  (2, 284)	0.33866885321669116
  :	:
  (1010, 240)	0.08495333121022464
  (1011, 240)	1.0
  (1012, 896)	0.5727166512555294
  (1012, 195)	0.5979120532248099
  (1012, 149)	0.553173579195811
  (1012, 240)	0.09217269260925547
  (1013, 668)	0.8837026596111464
  (1013, 208)	0.44

In [31]:
from sklearn.model_selection import ShuffleSplit #library untuk split data 
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix #library untuk evaluasi


X = dataku.lemma
Y = dataku.label

ss = ShuffleSplit(n_splits=10, test_size=0.2) # mensplit data
sm = SMOTE()

accs = []
fis = []
cms = []

for train_index, test_index in ss.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

    X_train_vect = vect.fit_transform(X_train)
    X_test_vect = vect.transform(X_test)

    X_train_res, y_train_res = sm.fit_resample(X_train_vect, y_train) #FITTING DATA
    
    SVM.fit(X_train_res, y_train_res)
    y_pred = SVM.predict(X_test_vect)#memprediksi 

    accs.append(accuracy_score(y_test,y_pred)) #menyimpan akurasi
    fis.append(f1_score(y_test, y_pred, average='weighted')) #menyimpan nilai f-1
    cms.append(confusion_matrix(y_test,y_pred)) #menyimpan confusion matrix

print(classification_report(y_test,y_pred))
print("\naverage accuracy across fold : {:2f}%".format(sum(accs)/len(accs) *100))
print("\naverage F1-score across fold : {:2f}%".format(sum(fis)/len(fis) *100))
print("\naverage Confusion Matrix across fold : \n {}".format(sum(cms)/len(cms)))

              precision    recall  f1-score   support

     Negatif       0.67      0.62      0.65        53
      Netral       0.67      0.59      0.62        51
     Positif       0.80      0.88      0.84       100

    accuracy                           0.74       204
   macro avg       0.71      0.70      0.70       204
weighted avg       0.73      0.74      0.74       204


average accuracy across fold : 73.529412%

average F1-score across fold : 73.013197%

average Confusion Matrix across fold : 
 [[37.4  8.4 10.1]
 [12.5 24.   9.9]
 [ 9.2  3.9 88.6]]
