# Natural Language Processing

### List 2

In [1]:
import os
from itertools import islice
from typing import Tuple, List

from sklearn.metrics import accuracy_score

from src.parsers.text_parser import TextParser
from src.naive_bayes_text_classificator import NaiveBayesText

## Create required objects, structures etc.

In [2]:
# Single parser instance shared by the whole notebook; the helper functions
# defined in later cells (create_feature, check_naive_bayes) read it as a global.
text_parser = TextParser()

In [3]:
# Parse each author's corpus with the shared parser, passing the author's name
# as ``text_name``. The results are unpacked in the same order as the table.
orzeszkowa_unigrams, prus_unigrams, sienkiewicz_unigrams = [
    text_parser.parse_text_file(corpus_file, text_name=author)
    for corpus_file, author in (
        ('korpus_orzeszkowej.txt', 'Orzeszkowa'),
        ('korpus_prusa.txt', 'Prus'),
        ('korpus_sienkiewicza.txt', 'Sienkiewicz'),
    )
]

In [4]:
# Peek at the first two (word, count) entries of Orzeszkowa's unigrams without
# first copying every item of the mapping into a list.
dict(islice(orzeszkowa_unigrams.unigrams.items(), 2))

{'Jędza': 2, 'powieść': 2}

In [5]:
# Per-author parse results in the order Orzeszkowa, Prus, Sienkiewicz.
# NOTE(review): create_feature lists its counts in this same author order;
# presumably NaiveBayesText relies on the two lining up — confirm.
all_targets = (orzeszkowa_unigrams, prus_unigrams, sienkiewicz_unigrams)

In [6]:
# Classifier built over the three author targets; it is fitted in a later cell.
naive_bayes = NaiveBayesText(all_targets)

### Add optional features to the model

In [7]:
def create_feature(feature_name: str, mark: str,
                   corpus_files: Tuple[str, ...] = ('korpus_orzeszkowej.txt',
                                                    'korpus_prusa.txt',
                                                    'korpus_sienkiewicza.txt')
                   ) -> Tuple:
    """Build one named feature from per-corpus counts of a character.

    Relies on the notebook-level ``text_parser`` instance.

    Args:
        feature_name: Label stored alongside the counts.
        mark: Character (or string) handed to ``text_parser.count_mark``.
        corpus_files: Corpus files to count in, in the order the counts should
            appear. Defaults to the Orzeszkowa, Prus and Sienkiewicz corpora.

    Returns:
        A ``(feature_name, feature_values)`` pair, where ``feature_values`` is
        a tuple holding one ``count_mark`` result per entry of ``corpus_files``.
    """
    feature_values = tuple(text_parser.count_mark(corpus_file, mark)
                           for corpus_file in corpus_files)

    return (feature_name, feature_values)

In [8]:
# (feature name, character to count) pairs for the optional model features.
features_meta = (
    ('dash', '—'),
    ('space', ' '),
    ('exclamation_mark', '!'),
    ('question_mark', '?'),
    ('dot', '.'),
)
# One create_feature result per pair, kept in the same order.
additional_features = tuple(create_feature(*meta) for meta in features_meta)

In [9]:
# Show the (feature name, counts) pairs produced by create_feature.
additional_features

(('dash', (5722, 8652, 2078)),
 ('space', (223428, 169477, 82381)),
 ('exclamation_mark', (4060, 2500, 847)),
 ('question_mark', (1760, 2062, 631)),
 ('dot', (12812, 10643, 4793)))

### Fit the model

In [10]:
# Fit the classifier, passing the extra character-count features as new_features.
# The returned object is kept as `model` and handed to naive_bayes.predict later.
model = naive_bayes.fit(new_features=additional_features)

In [11]:
# Preview the first rows of the fitted model
# (the recorded output has one column per author).
model.head()

Unnamed: 0,Orzeszkowa,Prus,Sienkiewicz
puch,7.066354e-07,0.0,0.0
trzymającem,7.066354e-07,0.0,0.0
nieprzystojnego,7.066354e-07,0.0,0.0
synowca,1.413271e-06,0.0,2e-06
tarcza,2.119906e-06,0.0,2e-06


In [12]:
# Print the sums of the model values along axis 0 (one total per column).
# This only displays the sums; nothing is asserted. In the recorded output each
# author column sums to 1/3, i.e. the table sums to 1 overall, not per author.
print(model.values.sum(0))

[0.33333333 0.33333333 0.33333333]


## Test our Naive Bayes Classifier

In [13]:
# Sample files of the validation and test splits.
def _list_sample_files(split_dir: str) -> List[str]:
    """Return the sample file names of one split, sorted by name.

    The split is looked up under <cwd>/data/dane_pozytywistyczne/testy1.
    ``os.listdir`` returns entries in arbitrary order and also reports hidden
    entries and sub-directories (e.g. ``.ipynb_checkpoints``), so the listing
    is sorted and restricted to visible regular files to keep runs repeatable.

    Args:
        split_dir: Name of the split directory, e.g. 'validation_data'.

    Returns:
        Sorted list of file names (not full paths) found in the split.
    """
    split_path = os.path.join(os.getcwd(), 'data', 'dane_pozytywistyczne',
                              'testy1', split_dir)
    return sorted(name for name in os.listdir(split_path)
                  if not name.startswith('.')
                  and os.path.isfile(os.path.join(split_path, name)))


validation_data = _list_sample_files('validation_data')

test_data = _list_sample_files('test_data')

In [14]:
# Check classifier on the data
def check_naive_bayes(test_paths: List, test_parent_dir: str) -> Tuple:
    """Classify every sample file and collect true vs. predicted authors.

    The true author is inferred from the file name: names containing
    'orzeszkowej' map to 'Orzeszkowa', names containing 'prus' map to 'Prus',
    and every other name is treated as 'Sienkiewicz'.

    Relies on the notebook-level ``text_parser``, ``naive_bayes`` and ``model``.

    Args:
        test_paths: File names of the samples to classify.
        test_parent_dir: Directory under 'testy1' that holds those files.

    Returns:
        A ``(y_true, y_pred)`` pair of lists, index-aligned with ``test_paths``.
    """
    filename_markers = (('orzeszkowej', 'Orzeszkowa'), ('prus', 'Prus'))
    expected_authors, predicted_authors = [], []

    for file_name in test_paths:
        # First matching marker wins; unmatched names fall back to Sienkiewicz.
        author = next((label for marker, label in filename_markers
                       if marker in file_name), 'Sienkiewicz')
        expected_authors.append(author)

        sample_path = os.path.join('testy1', test_parent_dir, file_name)
        sample_unigrams = text_parser.parse_text_file(sample_path)
        predicted_authors.append(naive_bayes.predict(model, sample_unigrams))

    return expected_authors, predicted_authors

### Experiment on validation data

In [15]:
# (y_true, y_pred) label lists for the validation split.
validation_targets = check_naive_bayes(validation_data, 'validation_data')

In [16]:
# Unpack (y_true, y_pred) into accuracy_score to get the share of validation
# files whose predicted author equals the true one.
accuracy_score(*validation_targets)

0.6666666666666666

### Experiment on test data

In [17]:
# (y_true, y_pred) label lists for the test split.
test_targets = check_naive_bayes(test_data, 'test_data')

In [18]:
# Unpack (y_true, y_pred) into accuracy_score to get the share of test files
# whose predicted author equals the true one.
accuracy_score(*test_targets)

0.8461538461538461