In [69]:
import string
import pandas as pd
import regex as re
import numpy as np
from pathlib import Path
from nltk.tokenize import word_tokenize
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

## Membaca Dataset

In [70]:
df = pd.read_csv('Dataset Novel.csv')
# df.to_csv('Dataset Novel.csv', index=False)

In [71]:
df.Genre

0      Fantasi
1      Fantasi
2      Fantasi
3      Fantasi
4      Fantasi
        ...   
206    Sejarah
207    Sejarah
208    Sejarah
209    Sejarah
210    Sejarah
Name: Genre, Length: 211, dtype: object

In [146]:
df.iloc[0]['Sinopsis']

'"... Tiba-tiba dari balik gapura muncul bayangan tiriggi dengan jubah panjang berkibar. Aku terperangah. Sosok itu... raut wajahnya tersamar dalam keremangan, Dia berjalan pelan ke arahku. Lalu perlahan, sinar mentari temaram menyinari setengah wajahnya, mcmbuat garis wajahnya bercahaya, dan sesaat kemudian tampakiah deretan gigi-gigi putih terkuak dari seulas seringai dingin."Pertemuan aneh di gapura Bajang Ratu antara Bintang dan Ben Portman itu ternyata menjadi awal dari peristiwa-peristiwa menegangkan berikutnya. Bintang adalah seorang wartawan lepas yang sedang mengalami konflik batin setelah dirinya sadar banyak basil karyanya yang meraih berderet penghargaan namun didapatkan melalui cara tidak benar. aat kerusuhan Manggarai pecah, Bintang bertemu dengan Jon, seorang preman penguasa iManggarai. Aksi saling menyelamatkan menyatukan mereka pada jalin persahabatan. Kcduanya pun berusaha hijrah dari masa lalu kelam itu. Secara tak sengaja, Bintang dan Jon direkrut oleh Laskar Ababil

## Cek Distribusi Kelas

In [72]:
distribusiKelas = df.value_counts('Genre')
round(distribusiKelas*100/sum(distribusiKelas), 2)

Genre
Sejarah    26.07
Fantasi    25.12
Romance    25.12
Horor      23.70
Name: count, dtype: float64

## Mengambil Fitur dan Kelas

In [46]:
X = df.Sinopsis
y = df.Genre

In [47]:
dataSinopsis = X
genre = y

## Membersihkan Dokumen pada Sinopsis

In [48]:
def clean(x):
   x = x.lower()
   x = re.sub(r'([->]+) *', ' ', x)
   x = re.sub(r'([".?!%-,]+) *', ' ', x)
   x = re.sub(' +', ' ', x)
   x = re.sub(r'\d+','',x)
   x = re.sub(r'(\w)\1(\1+)',r'\1',x)

   return x

dataSinopsis = dataSinopsis.apply(lambda x: clean(x))

In [49]:
dataSinopsis

0       tiba tiba dari balik gapura muncul bayangan t...
1      stella lanchaster seorang gadis masa depan ber...
2      pertemuan panut william dengan nyi blorong tel...
3      tegap berdiri menantang langit namun luruh cep...
4      ini kisah tentang neena manusia yang bisa bern...
                             ...                        
206    demak tak pernah sanggup tidur nyenyak setelah...
207    pada abad pertengahan pusat perkembangan perad...
208    ratusan pasukan pangeran inamaru berhasil mena...
209    buku peraih penghargaan yang awalnya dilarang ...
210    kapan pun di mana pun urusan dan perkara kekua...
Name: Sinopsis, Length: 211, dtype: object

## Stemming Data Sinopsis

In [26]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [27]:
hasil = list()
for text in range(len(dataSinopsis)):
  result = stemmer.stem(dataSinopsis.values[text])
  hasil.append(result)

In [28]:
hasil[0]

'tiba tiba dari balik gapura muncul bayang tiriggi dengan jubah panjang kibar aku perangah sosok itu raut wajah samar dalam remang dia jalan pelan ke arah lalu perlahan sinar mentari temaram sari tengah wajah mcmbuat garis wajah cahaya dan saat kemudian tampakiah deret gigi gigi putih kuak dari ulas seringai dingin temu aneh di gapura bajang ratu antara bintang dan ben portman itu nyata jadi awal dari peristiwa peristiwa tegang ikut bintang adalah orang wartawan lepas yang sedang alami konflik batin telah diri sadar banyak basil karya yang raih deret harga namun dapat lalu cara tidak benar aat rusuh manggarai pecah bintang temu dengan jon orang preman kuasa imanggarai aksi saling selamat satu mereka pada jalin sahabat kcduanya pun usaha hijrah dari masa lalu kelam itu cara tak sengaja bintang dan jon rekrut oleh laskar ababil kelompok bela benar yang perang lawan para puja kelompok usung panji panji gelap laskar ababil harus gagal usaha para puja rebut kitab biru buah kitab sumber raha

## Menghilangkan Stopword 

In [73]:
stop_factory = StopWordRemoverFactory()
more_stopword = list(pd.read_csv('stopwordbahasa.csv').values.squeeze())
data = stop_factory.get_stop_words() + more_stopword
# stopword = stop_factory.create_stop_word_remover()
dictionary = ArrayDictionary(data)
remover = StopWordRemover(dictionary)
cleanText = []

for text in hasil:
    cleaningText = remover.remove(text)
    cleanText.append(cleaningText)

## Membuat TF-IDF Vectorizer

In [74]:
vectorizer = TfidfVectorizer(max_features=5000)

vecSinopsis = vectorizer.fit(cleanText)
xSinopsis = vecSinopsis.transform(cleanText).toarray()
print(vectorizer.get_feature_names_out())
print(xSinopsis)

['aat' 'ababil' 'abad' ... 'zu' 'zulaika' 'zunaira']
[[0.09004221 0.18008441 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.0557328  ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.09575102 ... 0.         0.         0.        ]]


In [31]:
import pickle

pickle.dump(vecSinopsis, open('vectorizer.pkl', 'wb'))

In [68]:
names = vectorizer.get_feature_names_out()
np.where(names == 'target')

(array([], dtype=int64),)

# Save Dataset TFIDF

In [75]:
dataset = pd.DataFrame(xSinopsis, columns=vectorizer.get_feature_names_out())

In [76]:
dataset['label']

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
206    0.0
207    0.0
208    0.0
209    0.0
210    0.0
Name: label, Length: 211, dtype: float64

In [77]:
dataset.to_csv('TFIDFSinopsisNovel.csv', index=False)

## Membuat Dataframe Dataset Model

In [82]:
dataset['target'] = genre.reset_index().drop(labels=['index'], axis=1)

In [85]:
dataset

Unnamed: 0,aat,ababil,abad,abadi,abai,abasiyah,abdi,absurd,absurditas,academy,...,zara,zayni,zeus,ziarah,zodiak,zombie,zu,zulaika,zunaira,target
0,0.090042,0.180084,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,2
1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,2
2,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,2
3,0.000000,0.000000,0.000000,0.103847,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,2
4,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.104276,0.0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,3
207,0.000000,0.000000,0.189295,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,3
208,0.000000,0.000000,0.055733,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,3
209,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.10531,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,3


In [17]:
dd = pd.get_dummies(dataset, dtype=int)
dd

Unnamed: 0,aat,ababil,abad,abadi,abai,abasiyah,abdi,absurd,absurditas,academy,...,ziarah,zodiak,zombie,zu,zulaika,zunaira,genreTarget_Fantasi,genreTarget_Horor,genreTarget_Romance,genreTarget_Sejarah
0,0.090042,0.180084,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,1,0,0,0
1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,1,0,0,0
2,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,1,0,0,0
3,0.000000,0.000000,0.000000,0.103847,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,1,0,0,0
4,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.104276,0.0,0.0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0,0,0,1
207,0.000000,0.000000,0.189295,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0,0,0,1
208,0.000000,0.000000,0.055733,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0,0,0,1
209,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.10531,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0,0,0,1


In [18]:
dd.to_csv('tfidfSinopsis.csv', index=False)

### Label Encoding

In [84]:
dataset['target'] = dataset.target.map({'Romance':0, 'Horor':1, 'Fantasi':2, 'Sejarah':3})

## Split Data Train dan Data Test

In [31]:
dataValue = dataset.drop(labels=['label'], axis=1)
yValue = dataset.label

X_train, X_test, y_train, y_test = train_test_split(dataValue,yValue,test_size = 0.2, random_state = 42)

In [32]:
y_train

127    0
108    0
69     1
84     1
97     1
      ..
106    0
14     2
92     1
179    3
102    1
Name: label, Length: 168, dtype: int64

## Mendefinisikan Model Pelatihan

In [37]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

clf = XGBClassifier(n_estimator=100)
clf.fit(X_train, y_train)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
Parameters: { "n_estimator" } are not used.



In [38]:
y_pred = clf.predict(X_test)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [39]:
from sklearn.metrics import accuracy_score
print('Model accuracy score with default hyperparameters: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

Model accuracy score with default hyperparameters: 0.5581


## Test Model

In [98]:
test_data = 'Angga Terbang Ke Angkasa mengikuti alien yang menggunakan pesawat terbangnya yang canggih, sampailah pada sebuah pulau yang tak berpenghuni, angga melepaskan helm tempurnya untuk beristirahat sejenak'

In [99]:
test_data = pd.Series(test_data)
test_data = test_data.str.encode('ascii', 'ignore').str.decode('ascii')
x_dtrain1 = test_data.apply(lambda x: clean(x))

In [100]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

hasil1 = list()
for text in range(len(x_dtrain1)):
  result = stemmer.stem(x_dtrain1.values[text])
  hasil1.append(result)

factory_sw = StopWordRemoverFactory()
stopword = factory_sw.create_stop_word_remover()
more_stopword = ['di']
stop_factory = StopWordRemoverFactory().get_stop_words()
data1 = stop_factory + more_stopword
dictionary = ArrayDictionary(data1)
co = StopWordRemover(dictionary)
stop1 = list()

for index in range(len(hasil1)):
  stopw = co.remove(hasil1[index])
  stop1.append(stopw)
# tokens = nltk.tokenize.word_tokenize(stop1[-1])
# print(tokens)

In [101]:
X = vecSinopsis.transform(stop1)
x_data = X.toarray()

In [102]:
predik = clf.predict(x_data.reshape(1,-1))
predik

array([2], dtype=int64)

## Classification - Novel Genre Based on Sinopsis

In [9]:
clf = pickle.load(open('NovelGenreClassifier.pkl', 'rb'))
vector = pickle.load(open('vectorizer.pkl', 'rb'))

predictor = NovelGenreClassifier(model=clf, vectorizer=vector)

In [3]:
test_data = ['Cinta ini takkan berbalas sayang, kupastikan melayang pedih, ku saat merasa indah, semua hilang dan usai cinta']
# predictor.predict(test_data)

In [58]:
import os
from Classifier import Classifier
from sklearn.linear_model import LogisticRegression
from pathlib import Path

sourcePath = Path('Dataset Novel.csv')
stopwordPath = Path('stopwordbahasa.csv')
model = LogisticRegression()
vectorPath = Path(os.getcwd())
model = Classifier(sourcePath, stopwordPath, vectorPath, model)

In [59]:
model.train()

vectorizer is saved in C:\Users\hp\Downloads\PROPOSAL SKRIPSI\ Vectorizer.pkl
Model Saved in C:\Users\hp\Downloads\PROPOSAL SKRIPSI\ NovelGenreClassifier.pkl
Model accuracy score with default hyperparameters: 0.791


In [60]:
test_data = ['Cinta di musim cerry']
model.predict(test_data)

array([2], dtype=int64)

In [61]:
import pickle

m = pickle.load(open('NovelGenreClassifier.pkl', 'rb'))
v = pickle.load(open('vectorizer.pkl', 'rb'))

In [68]:
import numpy as np

test = v.transform(test_data).toarray()
np.round(m.predict_proba(test),1)*100

array([[ 0.,  0., 90.,  0.]])

## Training All Model

### Import all Model

In [86]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle


import warnings
warnings.filterwarnings('ignore')

In [87]:
dataset

Unnamed: 0,aat,ababil,abad,abadi,abai,abasiyah,abdi,absurd,absurditas,academy,...,zara,zayni,zeus,ziarah,zodiak,zombie,zu,zulaika,zunaira,target
0,0.090042,0.180084,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,2
1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,2
2,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,2
3,0.000000,0.000000,0.000000,0.103847,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,2
4,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.104276,0.0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,3
207,0.000000,0.000000,0.189295,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,3
208,0.000000,0.000000,0.055733,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,3
209,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.10531,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,3


## Load TF-IDF Dataset 

In [88]:
tfidf = pd.read_csv('TFIDFSinopsisNovel.csv')

In [89]:
tfidf

Unnamed: 0,aat,ababil,abad,abadi,abai,abasiyah,abdi,absurd,absurditas,academy,...,zaman,zara,zayni,zeus,ziarah,zodiak,zombie,zu,zulaika,zunaira
0,0.090042,0.180084,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.119854,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,0.000000,0.000000,0.000000,0.103847,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.104276,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
207,0.000000,0.000000,0.189295,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
208,0.000000,0.000000,0.055733,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
209,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.10531,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [91]:
X = dataset.drop(labels=['target'], axis=1)
y = dataset['target']

In [143]:
y

0      2
1      2
2      2
3      2
4      2
      ..
206    3
207    3
208    3
209    3
210    3
Name: target, Length: 211, dtype: int64

In [92]:
model_and_params_collection = {
    'SVM': {
        'model': SVC(),
        'params': {
            'C': [10, 100, 1000],
            'kernel': ['rbf', 'poly', 'sigmoid'],
            'gamma': ['scale', 'auto'],
            'probability': [True]
        }
    },
    'LogisticRegression': {
        'model': LogisticRegression(),
        'params': {
            'penalty': ['l1', 'l2', 'elasticnet', None],
            'warm_start': [True, False],            
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [10,100,1000],
            'criterion': ['gini', 'entropy', 'log_loss'],
        }
    },
    'DecisionTree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'splitter': ['best', 'random'],
        }
    },
    'XgBoost': {
        'model': XGBClassifier(),
        'params': {
            'n_estimator': [10, 100, 1000],
#             'gamma': [0, 1, 5],
#             'max_depth': [3, 10], 
            'learning_rate': [0.1,0.01, 0.05]
        }
    },
    'NaiveBayes': {
        'model': MultinomialNB(),
        'params': {
            'alpha': [0.1, 0.5, 1.0, 10.0]
        }
    }
}

In [93]:
for model, data in zip(model_and_params_collection.keys(), model_and_params_collection.values()):
    print(model_and_params_collection[model]['params'])

{'C': [10, 100, 1000], 'kernel': ['rbf', 'poly', 'sigmoid'], 'gamma': ['scale', 'auto'], 'probability': [True]}
{'penalty': ['l1', 'l2', 'elasticnet', None], 'warm_start': [True, False]}
{'n_estimators': [10, 100, 1000], 'criterion': ['gini', 'entropy', 'log_loss']}
{'splitter': ['best', 'random']}
{'n_estimator': [10, 100, 1000], 'learning_rate': [0.1, 0.01, 0.05]}
{'alpha': [0.1, 0.5, 1.0, 10.0]}


In [10]:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

In [97]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [98]:
X

Unnamed: 0,aat,ababil,abad,abadi,abai,abasiyah,abdi,absurd,absurditas,academy,...,zaman,zara,zayni,zeus,ziarah,zodiak,zombie,zu,zulaika,zunaira
0,0.090042,0.180084,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.119854,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,0.000000,0.000000,0.000000,0.103847,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.104276,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
207,0.000000,0.000000,0.189295,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
208,0.000000,0.000000,0.055733,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
209,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.10531,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [99]:
for model in model_and_params_collection.keys():
    param_grid = model_and_params_collection[model]['params']
    base_estimator = model_and_params_collection[model]['model']
    sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5).fit(X_train, y_train)
    print(sh.best_estimator_)
    y_pred = sh.predict(X_test)
    print('Model accuracy score with default hyperparameters: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))
    pickle.dump(sh, open(f'{model}.pkl', 'wb'))

SVC(C=1000, kernel='sigmoid', probability=True)
Model accuracy score with default hyperparameters: 0.7442
LogisticRegression(penalty=None, warm_start=True)
Model accuracy score with default hyperparameters: 0.7907
RandomForestClassifier(criterion='entropy', n_estimators=1000)
Model accuracy score with default hyperparameters: 0.6977
DecisionTreeClassifier()
Model accuracy score with default hyperparameters: 0.6512
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.05, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight

In [151]:
lr = pickle.load(open('SVM.pkl', 'rb'))
vec = pickle.load(open('vectorizer.pkl', 'rb'))

In [152]:
string = ['Tiba-tiba dari balik gapura muncul bayangan tiriggi dengan jubah panjang berkibar. Aku terperangah. Sosok itu... raut wajahnya tersamar dalam keremangan, Dia berjalan pelan ke arahku. Lalu perlahan, sinar mentari temaram menyinari setengah wajahnya, mcmbuat garis wajahnya bercahaya, dan sesaat kemudian tampakiah deretan gigi-gigi putih terkuak dari seulas seringai dingin."Pertemuan aneh di gapura Bajang Ratu antara Bintang dan Ben Portman itu ternyata menjadi awal dari peristiwa-peristiwa menegangkan berikutnya. Bintang adalah seorang wartawan lepas yang sedang mengalami konflik batin setelah dirinya sadar banyak basil karyanya yang meraih berderet penghargaan namun didapatkan melalui cara tidak benar. aat kerusuhan Manggarai pecah, Bintang bertemu dengan Jon, seorang preman penguasa iManggarai. Aksi saling menyelamatkan menyatukan mereka pada jalin persahabatan. Kcduanya pun berusaha hijrah dari masa lalu kelam itu. Secara tak sengaja, Bintang dan Jon direkrut oleh Laskar Ababil, kelompok pembela kebenaran yang berperang melawan Para Pemuja, kelompok pengusung panji-panji kegelapan. Laskar Ababil harus menggagalkan usaha Para Pemuja merebut Kitab Biru, sebuah kitab sumber rahasia kekuatan kegelapan yang diidamkan Ben dan Para Pemuja. Kitab jahat ini tak boleh jatuh di tangan Para Pemuja. Bagaimana pun caranya']
vecstring = vec.transform(string).toarray()
vecstring

array([[0.09569119, 0.19138239, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [153]:
np.argmax(lr.predict_proba(vecstring))

2