<a href="https://colab.research.google.com/github/abuwildanm/Text-Mining/blob/master/Sentiment_Analysis_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Persiapan

In [320]:
# Import Library Standard
import numpy as np
import pandas as pd

# Import Library Sklearn
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Import Library untuk Stemming
!pip install Sastrawi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory



In [321]:
# Load the MRT Jakarta comment spreadsheet directly from the GitHub repository.
# Offline alternative: pd.read_excel('Dataset MRT Jakarta.xlsx')
DATASET_URL = 'https://github.com/abuwildanm/Text-Mining/blob/master/Dataset%20MRT%20Jakarta.xlsx?raw=true'
data = pd.read_excel(DATASET_URL)
data.head()

Unnamed: 0,Komentar,Sumber,Pakar 1 (Firda),Pakar 2 (Adel),Pakar 3 (Faiz),Pakar 4 (Marsha),Positif,Negatif,Hasill akhir
0,"tidak perlu menunggu lama utk menunggu MRT,krn...",Facebook,Positif,Positif,Positif,Positif,4,0,Positif
1,"alhamdulillah baru sempet naik MRt. nyaman, be...",Facebook,Positif,Positif,Positif,Positif,4,0,Positif
2,"Betul, petugas MRT Jakarta sangat ramah terhad...",Twitter,Positif,Positif,Positif,Positif,4,0,Positif
3,Hidup gw jadi lebih nyaman karena gak mikir ma...,Twitter,Positif,Positif,Positif,Positif,4,0,Positif
4,MRT adalah moda raya transportasi yang cepat d...,Twitter,Positif,Positif,Positif,Positif,4,0,Positif


In [322]:
# Keep only the comment text and the 'Hasill akhir' column (spelled as in the
# spreadsheet), exposing them under the shorter names 'komentar' and 'label'.
# rename() returns a new frame, so `data` itself is left untouched.
column_names = {'Komentar': 'komentar', 'Hasill akhir': 'label'}
df = data[list(column_names)].rename(columns=column_names)
df.head()

Unnamed: 0,komentar,label
0,"tidak perlu menunggu lama utk menunggu MRT,krn...",Positif
1,"alhamdulillah baru sempet naik MRt. nyaman, be...",Positif
2,"Betul, petugas MRT Jakarta sangat ramah terhad...",Positif
3,Hidup gw jadi lebih nyaman karena gak mikir ma...,Positif
4,MRT adalah moda raya transportasi yang cepat d...,Positif


In [323]:
# Number of documents (rows) in the working frame.
n_document = len(df)
print('Ukuran Dataset: ', df.shape)

Ukuran Dataset:  (30, 2)


## Preprocessing

### Spell Correction

### Stemming

In [0]:
# Stopword list provided by Sastrawi
stopWordRemoverFactory = StopWordRemoverFactory()
stopwords = stopWordRemoverFactory.get_stop_words()

# Stemmer instance provided by Sastrawi
stemmerFactory = StemmerFactory()
stemmer = stemmerFactory.create_stemmer()

In [0]:
# Stem every comment.
# Series.apply visits each row regardless of the index labels, whereas the
# previous `df.loc[row, ...]` loop over range(n_document) only worked when the
# index was exactly 0..n_document-1.
df['komentar'] = df['komentar'].apply(stemmer.stem)

In [326]:
# Preview the first rows of the frame after stemming.
df.head()

Unnamed: 0,komentar,label
0,tidak perlu tunggu lama utk tunggu mrt krna ad...,Positif
1,alhamdulillah baru sempet naik mrt nyaman bers...,Positif
2,betul tugas mrt jakarta sangat ramah hadap san...,Positif
3,hidup gw jadi lebih nyaman karena gak mikir ma...,Positif
4,mrt adalah moda raya transportasi yang cepat d...,Positif


## Pembagian Data

In [0]:
# Split the data into training and test sets.
# - test_size=10 holds out exactly ten comments. The previous float fraction
#   (10 / n_document) is multiplied back by the sample count and rounded up
#   inside scikit-learn, so floating-point round-off could yield 11 for some
#   dataset sizes.
# - stratify keeps the Positif/Negatif proportions equal in both splits.
# - random_state pins the shuffle so Restart & Run All reproduces the same split.
x_train, x_test, y_train, y_test = train_test_split(
  df['komentar'],
  df['label'],
  test_size=10,
  stratify=df['label'],
  random_state=42,
)
train_data = pd.DataFrame({'komentar':x_train, 'label':y_train})
test_data = pd.DataFrame({'komentar':x_test, 'label':y_test})

In [328]:
# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,komentar,label
21,tolong pihak mrtjkt sedia tempat duduk banyak ...,Negatif
29,pak mohon baik pd bersih sirkulasi udara di mr...,Negatif
0,tidak perlu tunggu lama utk tunggu mrt krna ad...,Positif
1,alhamdulillah baru sempet naik mrt nyaman bers...,Positif
22,bapak ibu admin tolong di sampai ke pihak kait...,Negatif


In [329]:
# Number of comments per label in the training split.
train_data['label'].value_counts()

Positif    10
Negatif    10
Name: label, dtype: int64

In [330]:
# Preview the first rows of the test split.
test_data.head()

Unnamed: 0,komentar,label
16,saran min buat lift prioritas kalo bisa di kas...,Negatif
26,min usul nih kalo bisa tugas bersih di bekal j...,Negatif
18,min boleh kasih masuk klo boleh di tiap stasiu...,Negatif
12,kerja bagus mrt terus tahan prestasi,Positif
5,nyobain moda transportasi baru yaitu mrtjkt te...,Positif


In [331]:
# Number of comments per label in the test split.
test_data['label'].value_counts()

Positif    5
Negatif    5
Name: label, dtype: int64

In [332]:
# Shapes of the training and test frames
print('Ukuran data train:', train_data.shape)
print('Ukuran data test:', test_data.shape)

Ukuran data train: (20, 2)
Ukuran data test: (10, 2)


## Perhitungan Bobot

In [333]:
# Term-count matrix for the whole corpus (stopwords removed).
vectorizer = CountVectorizer(stop_words=stopwords)
# - get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
#   is its replacement.
# - The frame carries df's own index because later cells select rows with
#   tf.loc[train_data.index] / tf.loc[test_data.index].
tf = pd.DataFrame(
  vectorizer.fit_transform(df['komentar']).toarray(),
  columns=vectorizer.get_feature_names_out(),
  index=df.index,
)
tf

Unnamed: 0,10,12,20an,2x,40,ac,admin,ain,aja,akhir,akses,alhamdulillah,alias,anak,anti,antri,apa,arah,aroma,asik,atas,atm,atur,baca,badan,bagus,baharu,bahasa,baik,balik,banget,bangun,bank,banyak,bapak,barat,baru,bau,bawa,bayar,...,tau,tdk,teladan,telat,tempat,terima,terimakasih,terus,tetap,tgl,the,thx,tiap,toilet,topup,transportasi,tugas,tuh,tuju,tunggu,tutup,twitter,uang,udara,udh,ulang,untul,up,usul,utk,waktu,warga,warna,wc,wkwk,work,wujud,yaa,yah,yg
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
6,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0


In [334]:
# Term-count matrix for the training split (vocabulary learned from training comments only).
vectorizer_train = CountVectorizer(stop_words=stopwords)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. Rows keep a fresh 0..n-1 index (positional order of train_data).
tf_train = pd.DataFrame(
  vectorizer_train.fit_transform(train_data['komentar']).toarray(),
  columns=vectorizer_train.get_feature_names_out(),
)
tf_train

Unnamed: 0,10,12,20an,2x,ac,admin,ain,aja,akhir,akses,alhamdulillah,alias,anti,antri,aroma,asik,atas,bagus,baharu,baik,balik,banget,bangun,bank,banyak,bapak,baru,bau,bayar,bbrp,bebas,berangkat,berapa,bersih,biar,bikin,blok,bnr2,brp,buat,...,skrng,smp,smt,sponsor,stasiun,susah,sy,tadi,tanya,tap,tdk,telat,tempat,terima,terimakasih,terus,tetap,tgl,the,thx,tiap,toilet,transportasi,tuh,tuju,tunggu,tutup,udara,udh,ulang,untul,up,utk,warna,wc,wkwk,work,yaa,yah,yg
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,2,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,2
5,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
6,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
7,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0


In [335]:
# Term-count matrix for the test split.
# A separate vocabulary is fitted here; the from-scratch classifier copes with
# that because calculatePosterior only uses terms that also appear among the
# learned likelihood columns.
vectorizer_test = CountVectorizer(stop_words=stopwords)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. Rows keep a fresh 0..n-1 index (positional order of test_data).
tf_test = pd.DataFrame(
  vectorizer_test.fit_transform(test_data['komentar']).toarray(),
  columns=vectorizer_test.get_feature_names_out(),
)
tf_test

Unnamed: 0,10,40,admin,anak,antri,apa,arah,aroma,atas,atm,atur,baca,badan,bagus,bahasa,baik,banget,bangun,banyak,barat,baru,bau,bawa,bekal,bendung,bersih,betul,biar,bijak,blok,bni,buat,bugar,cari,cek,celaka,contoh,disabilitas,dong,dukung,...,sampe,sandang,sangat,santun,saran,sarung,segera,sehat,semua,sender,sering,sesuai,sih,soal,sopan,square,stasiun,tadi,tahan,tahun,tambah,tangan,tapcash,tau,teladan,terima,terimakasih,terus,tiap,topup,transportasi,tugas,twitter,uang,ulang,usul,waktu,warga,wujud,yg
0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,2,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,1,1,0,0,0,0,1,1,0,1,0,2,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,...,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
7,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,1,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


## Klasifikasi

### Scratch

In [0]:
class MultinomialNaiveBayes():
  """Naive Bayes text classifier operating on pandas term-count tables.

  Training estimates a prior per class and an additively smoothed likelihood
  per (class, term). Prediction multiplies the prior by the likelihoods of the
  terms that are present in a document and known from training, then picks
  the class with the largest product.

  Notes:
    * Scoring uses term *presence* (count != 0); a term occurring several
      times in a document contributes its likelihood once.
    * Scores are raw probability products (no log space), so they can become
      very small for long documents.

  Attributes:
    prior: Series of class priors, set by fit().
    likelihood: DataFrame (classes x terms) of smoothed likelihoods, set by fit().
    posterior: DataFrame (documents x classes) of scores from the latest predict().
    alpha: additive smoothing strength.
  """

  def __init__(self, alpha=1.0):
    """Create an unfitted model.

    Args:
      alpha: additive (Laplace/Lidstone) smoothing parameter.
    """
    self.prior = []
    self.likelihood = []
    self.posterior = []
    self.alpha = alpha

  def calculatePrior(self, label):
    """Return the relative frequency of every class in `label` (a Series)."""
    prior = label.value_counts() / label.size
    return prior

  def calculateLikelihood(self, data, label):
    """Estimate smoothed P(term | class).

    Args:
      data: DataFrame of term counts, one row per document, one column per term.
      label: class label of each row of `data`, in the same order.

    Returns:
      DataFrame indexed by class (index name 'label') with one column per term.
    """
    # Group by the raw label values so rows and labels are paired purely by
    # position: this works for any index on `data`, any name on `label`, and
    # cannot collide with a vocabulary term that happens to be called 'label'.
    countWordLabel = data.groupby(np.asarray(label)).sum().rename_axis('label')
    countAllWordLabel = countWordLabel.sum(axis=1)
    # The denominator must add alpha once per vocabulary term; otherwise the
    # likelihoods of a class only sum to 1 when alpha == 1.
    smoothing_mass = self.alpha * data.shape[1]
    likelihood = (countWordLabel + self.alpha).div(countAllWordLabel + smoothing_mass, axis=0)
    return likelihood

  def calculatePosterior(self, row_data):
    """Score one document against every class.

    Args:
      row_data: Series of term counts for a single document (index = terms).

    Returns:
      Series indexed by class: prior times the product of the likelihoods of
      the terms present in the document. Terms unseen during training are
      ignored; with no usable term the score equals the prior.
    """
    present_terms = row_data.index[row_data != 0]
    known_terms = np.intersect1d(present_terms, self.likelihood.columns)
    posterior = self.prior * self.likelihood[known_terms].prod(axis=1)
    return posterior

  def fit(self, x, y):
    """Learn priors and likelihoods from term counts `x` and labels `y`."""
    # Compute the class priors
    self.prior = self.calculatePrior(y)
    # Compute the per-class term likelihoods
    self.likelihood = self.calculateLikelihood(x, y)

  def predict(self, x):
    """Return the highest-scoring class for every row of `x`.

    The per-class scores are kept in `self.posterior`.
    """
    # Compute the per-class scores, one row per document
    self.posterior = x.apply(self.calculatePosterior, axis=1)
    # Pick the class with the largest score
    pred = self.posterior.idxmax(axis=1)

    return pred

  def score(self, x, y):
    """Return the accuracy of predict(x) against the true labels `y`."""
    # Predict the labels
    pred = self.predict(x)
    # Compute the accuracy
    accuracy = accuracy_score(y, pred)

    return accuracy


In [337]:
# Train the from-scratch classifier on the training counts, then predict the test comments.
mnb = MultinomialNaiveBayes()
mnb.fit(tf_train, train_data['label'])
mnb.predict(tf_test)

0    Negatif
1    Positif
2    Negatif
3    Positif
4    Positif
5    Positif
6    Positif
7    Negatif
8    Negatif
9    Positif
dtype: object

In [338]:
# Accuracy of the from-scratch classifier on the test split.
mnb.score(tf_test, test_data['label'])

0.7

In [339]:
# Class priors estimated from the training labels.
mnb.prior

Positif    0.5
Negatif    0.5
Name: label, dtype: float64

In [340]:
# Smoothed per-class term likelihoods learned from the training counts.
mnb.likelihood

Unnamed: 0_level_0,10,12,20an,2x,ac,admin,ain,aja,akhir,akses,alhamdulillah,alias,anti,antri,aroma,asik,atas,bagus,baharu,baik,balik,banget,bangun,bank,banyak,bapak,baru,bau,bayar,bbrp,bebas,berangkat,berapa,bersih,biar,bikin,blok,bnr2,brp,buat,...,skrng,smp,smt,sponsor,stasiun,susah,sy,tadi,tanya,tap,tdk,telat,tempat,terima,terimakasih,terus,tetap,tgl,the,thx,tiap,toilet,transportasi,tuh,tuju,tunggu,tutup,udara,udh,ulang,untul,up,utk,warna,wc,wkwk,work,yaa,yah,yg
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
Negatif,0.002075,0.004149,0.004149,0.004149,0.004149,0.004149,0.004149,0.002075,0.004149,0.004149,0.002075,0.004149,0.002075,0.004149,0.004149,0.002075,0.008299,0.002075,0.002075,0.004149,0.004149,0.004149,0.004149,0.004149,0.010373,0.004149,0.002075,0.006224,0.006224,0.006224,0.002075,0.004149,0.004149,0.004149,0.004149,0.002075,0.004149,0.002075,0.004149,0.004149,...,0.004149,0.002075,0.002075,0.002075,0.012448,0.004149,0.004149,0.002075,0.004149,0.004149,0.006224,0.004149,0.010373,0.004149,0.004149,0.004149,0.002075,0.004149,0.002075,0.004149,0.006224,0.006224,0.002075,0.004149,0.002075,0.002075,0.004149,0.004149,0.004149,0.006224,0.004149,0.002075,0.002075,0.002075,0.004149,0.002075,0.002075,0.004149,0.004149,0.006224
Positif,0.004706,0.002353,0.002353,0.002353,0.002353,0.002353,0.002353,0.004706,0.009412,0.002353,0.004706,0.002353,0.004706,0.002353,0.002353,0.004706,0.002353,0.004706,0.004706,0.002353,0.002353,0.004706,0.002353,0.002353,0.007059,0.002353,0.004706,0.002353,0.002353,0.002353,0.007059,0.004706,0.002353,0.007059,0.002353,0.004706,0.004706,0.004706,0.002353,0.004706,...,0.002353,0.004706,0.004706,0.004706,0.004706,0.002353,0.002353,0.004706,0.002353,0.002353,0.002353,0.002353,0.002353,0.004706,0.004706,0.002353,0.007059,0.002353,0.004706,0.002353,0.004706,0.002353,0.004706,0.002353,0.004706,0.007059,0.002353,0.002353,0.002353,0.002353,0.002353,0.004706,0.007059,0.004706,0.002353,0.004706,0.004706,0.002353,0.002353,0.007059


In [341]:
# Unnormalised per-class scores of each test comment from the latest predict()/score() call.
mnb.posterior

Unnamed: 0,Negatif,Positif
0,1.570955e-44,6.705449e-47
1,1.111637e-10,1.379294e-10
2,5.563452e-28,9.214734e-31
3,1.111637e-10,5.517175e-10
4,8.182056999999999e-48,4.238506e-46
5,8.612744e-15,3.054492e-14
6,3.418361e-23,1.591593e-21
7,2.96577e-19,3.382136e-20
8,4.1039750000000006e-31,7.317583000000001e-33
9,9.225206e-13,9.736192e-13


### Benchmark

In [342]:
# Reference run with scikit-learn's MultinomialNB on the same train/test rows.
# `tf` holds the counts of the whole corpus, so each split is selected through
# its original index labels.
# NOTE(review): `tf`'s vocabulary was fitted on all comments, test ones included,
# unlike the from-scratch run, which learns its vocabulary from the training split only.
# NOTE(review): this rebinds `mnb`; re-running the scratch-model inspection cells
# (mnb.prior, mnb.likelihood, mnb.posterior) after this cell will fail.
mnb = MultinomialNB()
model = mnb.fit(tf.loc[train_data.index], train_data['label'])
model.predict(tf.loc[test_data.index])

array(['Negatif', 'Positif', 'Negatif', 'Positif', 'Positif', 'Positif',
       'Positif', 'Negatif', 'Negatif', 'Positif'], dtype='<U7')

In [343]:
# Accuracy of the scikit-learn benchmark on the same test rows.
mnb.score(tf.loc[test_data.index], test_data['label'])

0.7