In [1]:
import pandas as pd
import re
import numpy as np
import joblib
import pickle
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [2]:
# Load the labelled dataset; the file uses ';' as its field separator.
df = pd.read_csv(
    'labeling.csv',
    sep=';',
)

In [3]:
def convert(polarity):
    """Map a sentiment label to its numeric polarity.

    Args:
        polarity: The label value to encode.

    Returns:
        1 for 'positif', 0 for 'netral', and -1 for every other value
        (including labels that are missing or spelled differently).
    """
    if polarity == 'positif':
        return 1
    return 0 if polarity == 'netral' else -1

In [4]:
# Encode every textual label as a numeric polarity (1 / 0 / -1).
df['polarity'] = df['label'].map(convert)

In [5]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,id,created_at,text,TEXT.1,TOKENISASI,STOP_REMOVAL,label,polarity
0,"1,61655E+18",21/01/2023 04:46,repost divisi humas polri ketua harian kompoln...,Repost Divisi Humas Polri Ketua Harian Kompoln...,"['repost', 'divisi', 'humas', 'polri', 'ketua'...",repost divisi humas polri ketua harian kompoln...,positif,1
1,"1,61647E+18",20/01/2023 23:12,ch chotimah2 aniesbaswedan abu waras rahmania...,ch chotimah2 aniesbaswedan abu waras rahmania...,"['', 'ch', 'chotimah2', 'aniesbaswedan', 'abu'...",ch chotimah2 aniesbaswedan abu waras rahmania...,negatif,-1
2,"1,61645E+18",20/01/2023 21:42,jokowi kinerja kepolisian blm menunjukan pres...,jokowi Kinerja kepolisian blm menunjukan pres...,"['', 'jokowi', 'kinerja', 'kepolisian', 'blm',...",jokowi kinerja kepolisian blm menunjukan pres...,negatif,-1
3,"1,6164E+18",20/01/2023 18:29,kompascom ya itulah tabiat kepolisian di indo...,kompascom Ya Itulah Tabiat kepolisian di Indo...,"['', 'kompascom', 'ya', 'itulah', 'tabiat', 'k...",kompascom ya tabiat kepolisian indonesia mala...,negatif,-1
4,"1,61639E+18",20/01/2023 18:05,rt divhumas polri ketua harian kompolnas irjen...,RT DivHumas Polri Ketua Harian Kompolnas Irjen...,"['rt', 'divhumas', 'polri', 'ketua', 'harian',...",rt divhumas polri ketua harian kompolnas irjen...,positif,1


In [6]:
# Features are the stop-word-filtered text; the target is the numeric polarity.
X, y = df['STOP_REMOVAL'], df['polarity']

In [7]:
# Bag-of-words: learn the vocabulary from the cleaned text and turn every
# document into a sparse vector of token counts.
bow_transformer = CountVectorizer()
print(df['STOP_REMOVAL'].shape)
X = bow_transformer.fit_transform(df['STOP_REMOVAL'])

print(X.toarray())
print('Shape of Sparse Matrix: ', X.shape)
print('Amount of Non-Zero occurrences: ', X.nnz)

# Persist the fitted vectorizer; the context manager closes the file handle.
filename1 = 'count_vector.pkl'
with open(filename1, 'wb') as pkl_file:
    pickle.dump(bow_transformer, pkl_file)

# Term-frequency scaling only: use_idf=False turns off the IDF reweighting.
tf_transform = TfidfTransformer(use_idf=False).fit(X)
X = tf_transform.transform(X)
print(X.shape)

# Persist the fitted TF transformer. The original dumped `bow_transformer`
# here, so 'tfid_transform.pkl' held a second copy of the CountVectorizer.
filename1 = 'tfid_transform.pkl'
with open(filename1, 'wb') as pkl_file:
    pickle.dump(tf_transform, pkl_file)

(88,)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]]
Shape of Sparse Matrix:  (88, 865)
Amount of Non-Zero occurrences:  1660
(88, 865)


In [8]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so
# the split (and every metric computed from it) is the same on each
# Restart & Run All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [9]:
# Fit a multinomial Naive Bayes classifier on the training split (fit()
# returns the estimator itself) and predict the held-out rows.
nb = MultinomialNB().fit(X_train, y_train)
pred = nb.predict(X_test)

In [10]:
# Per-class precision / recall / F1 for the Naive Bayes predictions.
print(classification_report(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

          -1       1.00      0.25      0.40         8
           1       0.62      1.00      0.77        10

    accuracy                           0.67        18
   macro avg       0.81      0.62      0.58        18
weighted avg       0.79      0.67      0.61        18



In [11]:
# 10-fold cross-validation of the Naive Bayes model over the full matrix.
scores = cross_val_score(estimator=nb, X=X, y=y, cv=10)
print(scores)

[0.66666667 0.77777778 1.         0.77777778 0.66666667 0.77777778
 0.77777778 0.55555556 0.75       0.625     ]


In [21]:
# Average score across the cross-validation folds.
print(np.mean(scores))

0.7375


In [12]:
from io import StringIO

# Save the text report produced for the Naive Bayes predictions.
# Note: despite the '.csv' extension, the file holds the plain-text report.
classification = classification_report(y_test, pred)
s = StringIO(classification)
with open('classification.csv', 'w') as f:
    # Iterating the StringIO yields the report line by line.
    f.writelines(s)

In [13]:
# Fraction of held-out rows the Naive Bayes model labelled correctly.
print(accuracy_score(y_true=y_test, y_pred=pred))

0.6666666666666666


In [14]:
# Write the single accuracy value to disk as a one-element array.
accuracy = accuracy_score(y_true=y_test, y_pred=pred)
a = np.array([accuracy])
np.savetxt(fname="accuracy.csv", X=a, fmt='%s', delimiter=",")

In [15]:
# Train a logistic-regression classifier on the same training split.
# NOTE(review): the report and accuracy written to disk come from `nb`, while
# this `model` is the estimator that gets pickled -- confirm that is intended.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [16]:
# Persist the trained logistic-regression model. The context manager makes
# sure the file is flushed and closed once the dump completes.
filename = 'model_analisis.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [17]:
# Round-trip check: reload the pickled model and score it on the held-out
# split. Only unpickle files you trust -- pickle.load can execute arbitrary
# code; `filename` is the path assigned when the model was dumped.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test, y_test)
print(result)

0.6111111111111112


In [18]:
# Show the class predicted by the reloaded model for each held-out row.
loaded_model.predict(X=X_test)

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,
        1], dtype=int64)