# Naivni Bajasov Klasifikator

## Ucitavanje paketa

In [2]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## Priprema podataka

In [19]:
corpus = [
    'Chinese Beijing Chinese',
    'Chinese Chinese Shanghai',
    'Chinese Macao',
    'Tokyo Japan Chinese'
]

classes = ['yes', 'yes', 'yes', 'no']

target_document = 'Chinese Chinese Chinese Tokyo Japan'

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(corpus)
X_test = vectorizer.transform([target_document])

feature_names = vectorizer.get_feature_names()

pd.DataFrame(X_train.toarray(), index=corpus, columns=feature_names)

Unnamed: 0,beijing,chinese,japan,macao,shanghai,tokyo
Chinese Beijing Chinese,1,2,0,0,0,0
Chinese Chinese Shanghai,0,2,0,0,1,0
Chinese Macao,0,1,0,1,0,0
Tokyo Japan Chinese,0,1,1,0,0,1


## Treniranje i Evaluacija Modela

In [17]:
clf = MultinomialNB()
clf.fit(X_train, classes)

pd.DataFrame(clf.feature_count_, index=clf.classes_, columns=feature_names)

[1. 3.]


Unnamed: 0,beijing,chinese,japan,macao,shanghai,tokyo
no,0.0,1.0,1.0,0.0,0.0,1.0
yes,1.0,5.0,0.0,1.0,1.0,0.0


In [20]:
print(clf.predict(X_test))

['yes']


# Ebrat skup podataka

## Ucitavanje potrebnih paketa

In [46]:
import numpy as np
import pandas as pd

from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
 
import os

## Preprocesiranje podataka

In [42]:
corpus = []
classes = []

for class_name in os.listdir('Data/VektoriEbart-5/Trening/'):
    dir_path = os.path.join('Data/VektoriEbart-5/Trening/', class_name)
    for file_name in os.listdir(dir_path):
        with open(os.path.join(dir_path, file_name)) as f:
            file_words = {}
            for line in f.readlines():
                [word, count] = line.split()
                file_words[word] = int(count)
        classes.append(class_name)
        corpus.append(file_words)
        
vectorizer = DictVectorizer()
X = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names()

df = pd.DataFrame(X.toarray(), columns=feature_names)

X_train, X_test, y_train, y_test = train_test_split(df, classes, test_size=0.33, stratify=classes)
print(X_train.shape, X_test.shape)
print(len(y_train), len(y_test))

(2339, 36830) (1153, 36830)
2339 1153


## Treniranje i Evaluacija Modela

In [58]:
def train_and_evaluate(clf, X_train, X_test, y_train, y_test, cv=False):
    clf.fit(X_train, y_train)
    
    if cv:
        print(f'Najbolji parametri: {clf.best_params_}')
        
    print(f'Tacnost modela:\n{accuracy_score(y_train, clf_mnb.predict(X_train))} (Train)\n{accuracy_score(y_test, clf_mnb.predict(X_test))} (Test)\n')
    print(f'Matrica konfuzije:\n{confusion_matrix(y_train, clf_mnb.predict(X_train))} (Train)\n{confusion_matrix(y_test, clf_mnb.predict(X_test))} (Test)\n')
    print(f'Izvestaj klasifikacije:\n{classification_report(y_train, clf_mnb.predict(X_train))} (Train)\n{classification_report(y_test, clf_mnb.predict(X_test))} (Test)\n')


### Naivni Bajasov Klasifikator

In [59]:
print('Multinomijalni Naivni Bajasov Klasifikator!\n')

clf_mnb = MultinomialNB()
train_and_evaluate(clf_mnb, X_train, X_test, y_train, y_test)

Multinomijalni Naivni Bajasov Klasifikator!

Tacnost modela:
0.9478409576742197 (Train)
0.8907198612315698 (Test)

Matrica konfuzije:
[[210   1   2  10   0]
 [ 10 341   2  62   0]
 [  0   2 417   1   0]
 [  8  12   4 598   4]
 [  1   2   0   1 651]] (Train)
[[ 94   6   1   9   0]
 [  2 155   4  42   2]
 [  3   2 191   8   3]
 [  2  24   3 278   2]
 [  1   5   2   5 309]] (Test)

Izvestaj klasifikacije:
                 precision    recall  f1-score   support

      Ekonomija       0.92      0.94      0.93       223
HronikaKriminal       0.95      0.82      0.88       415
  KulturaZabava       0.98      0.99      0.99       420
       Politika       0.89      0.96      0.92       626
          Sport       0.99      0.99      0.99       655

       accuracy                           0.95      2339
      macro avg       0.95      0.94      0.94      2339
   weighted avg       0.95      0.95      0.95      2339
 (Train)
                 precision    recall  f1-score   support

      Ekonom

### K-Najblizih Suseda

In [62]:
print('K-Najblizih Suseda!')

params_knn = {
    'n_neighbors': [3, 5, 7],
    'p': [2],
    'weights': ['uniform', 'distance']
}

clf_knn = GridSearchCV(KNeighborsClassifier(), param_grid=params_knn, cv=5)
train_and_evaluate(clf_knn, X_train, X_test, y_train, y_test, cv=True)

K-Najblizih Suseda!
Najbolji parametri: {'n_neighbors': 7, 'p': 2, 'weights': 'distance'}
Tacnost modela:
0.9478409576742197 (Train)
0.8907198612315698 (Test)

Matrica konfuzije:
[[210   1   2  10   0]
 [ 10 341   2  62   0]
 [  0   2 417   1   0]
 [  8  12   4 598   4]
 [  1   2   0   1 651]] (Train)
[[ 94   6   1   9   0]
 [  2 155   4  42   2]
 [  3   2 191   8   3]
 [  2  24   3 278   2]
 [  1   5   2   5 309]] (Test)

Izvestaj klasifikacije:
                 precision    recall  f1-score   support

      Ekonomija       0.92      0.94      0.93       223
HronikaKriminal       0.95      0.82      0.88       415
  KulturaZabava       0.98      0.99      0.99       420
       Politika       0.89      0.96      0.92       626
          Sport       0.99      0.99      0.99       655

       accuracy                           0.95      2339
      macro avg       0.95      0.94      0.94      2339
   weighted avg       0.95      0.95      0.95      2339
 (Train)
                 precisio

### Drveta Odlucivanja

In [61]:
print('Drveta Odlucivanja!')

params_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [15, 25, 5]
}

clf_dt = GridSearchCV(DecisionTreeClassifier(), param_grid=params_dt, cv=5)
train_and_evaluate(clf_dt, X_train, X_test, y_train, y_test, cv=True)

Drveta Odlucivanja!
Najbolji parametri: {'criterion': 'gini', 'max_depth': 25}
Tacnost modela:
0.9478409576742197 (Train)
0.8907198612315698 (Test)

Matrica konfuzije:
[[210   1   2  10   0]
 [ 10 341   2  62   0]
 [  0   2 417   1   0]
 [  8  12   4 598   4]
 [  1   2   0   1 651]] (Train)
[[ 94   6   1   9   0]
 [  2 155   4  42   2]
 [  3   2 191   8   3]
 [  2  24   3 278   2]
 [  1   5   2   5 309]] (Test)

Izvestaj klasifikacije:
                 precision    recall  f1-score   support

      Ekonomija       0.92      0.94      0.93       223
HronikaKriminal       0.95      0.82      0.88       415
  KulturaZabava       0.98      0.99      0.99       420
       Politika       0.89      0.96      0.92       626
          Sport       0.99      0.99      0.99       655

       accuracy                           0.95      2339
      macro avg       0.95      0.94      0.94      2339
   weighted avg       0.95      0.95      0.95      2339
 (Train)
                 precision    recall

# Iris skup podataka

## Ucitavanje paketa

In [63]:
import numpy as np
import pandas as pd

from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

## Preprocesiranje podataka

In [71]:
df = pd.read_csv('Data/iris.csv')

feature_names = df.columns[:4].tolist()
target_name = df.columns[4]

X = df[feature_names]
y = df[target_name]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(100, 4) (50, 4)
(100,) (50,)


## Treniranje i Evaluacija Modela

In [73]:
clf = GaussianNB()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(f'Tacnost: {accuracy_score(y_test, y_pred)}')
print(f'Matrica konfuzije:\n {confusion_matrix(y_test, y_pred)}')
print(f'Izvestaj klasifikacije:\n {classification_report(y_test, y_pred)}')

Tacnost: 0.96
Matrica konfuzije:
 [[17  0  0]
 [ 0 16  0]
 [ 0  2 15]]
Izvestaj klasifikacije:
               precision    recall  f1-score   support

      Setosa       1.00      1.00      1.00        17
  Versicolor       0.89      1.00      0.94        16
   Virginica       1.00      0.88      0.94        17

    accuracy                           0.96        50
   macro avg       0.96      0.96      0.96        50
weighted avg       0.96      0.96      0.96        50

