### Load Tokenizer

In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Youski\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from tokenizers import Tokenizer
from nltk.tokenize import word_tokenize


tokenizer = Tokenizer.from_file("unigram_tokenizer.json")
tokenizer

<tokenizers.Tokenizer at 0x1df2672fd10>

### build training and testing datasets

In [3]:
# this cell is completely copied from Chat GPT
# for more clarification please inbox 
import os
import shutil
import random

def split_data(source_folder, destination_folder, test_ratio=0.1):
    # Create destination folders if they don't exist
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)

    # Iterate through each subfolder in the source folder
    for root, dirs, files in os.walk(source_folder):
        for subdir in dirs:
            source_subfolder = os.path.join(root, subdir)
            dest_subfolder = os.path.join(destination_folder, subdir)

            # Create destination subfolder if it doesn't exist
            if not os.path.exists(dest_subfolder):
                os.makedirs(dest_subfolder)

            # List all files in the source subfolder
            files_list = os.listdir(source_subfolder)
            # Shuffle the files list to randomly select files for testing
            random.shuffle(files_list)

            # Calculate the number of files to move to testing folder
            num_files_to_move = int(len(files_list) * test_ratio)

            # Move files to the testing folder
            for file_name in files_list[:num_files_to_move]:
                source_file_path = os.path.join(source_subfolder, file_name)
                dest_file_path = os.path.join(dest_subfolder, file_name)
                shutil.move(source_file_path, dest_file_path)
                print(f"Moved {file_name} to {dest_subfolder}")

                
# split_data("training", "testing", test_ratio=0.1)

In [4]:
from nltk.corpus import stopwords
 
# nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [5]:
from math import log

def log_transformer(x: float) -> float|None:
    return log(x) if x > 0 else None

In [6]:
import re

def clean_file_content(content: str) -> str:
#     replacements = [
#         # regex, fill_value
#         (' +', ' '),
#         (' \d*[.,]?\d* ', ''),
#         ('[.,-<>]', ''),
#         ("""[()'"]""", '')
#     ]
#     for regex, fill_value in replacements:
#         content = re.sub(regex, fill_value, content)
    regex = r'[^a-zA-Z]'
    content = re.sub(regex, ' ', content)
    content = re.sub(' +', ' ', content)
    content = content.strip()
    
    content = content.replace('\n', '')
    content = content.lower()
    
    return content


In [7]:
clean_file_content("hi123")

'hi'

In [8]:
def drop_stopwords(words: str) -> str:
    return [word for word in words if word not in stopwords.words('english')]
    

In [9]:
def load_files_texts(base_relative_path: str, files: list[str]) -> list[str]:
    docs_content = []
    for file in files:
        with open(f"{base_relative_path}/{file}", 'r') as f:
            content = clean_file_content(f.read())
            
            docs_content.append(content)
            
    return docs_content
    

In [10]:
from typing import Tuple

def split_into_twe_sets(files: list, splitting_rate: float = 0.9) -> Tuple[list, list]:
    # selected_files = random.sample(files, int(len(files) * splitting_rate))
    index = int(len(files) * splitting_rate)
    return files[:index], files[index:]


In [11]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
 
ps = PorterStemmer()

def process_loaded_file(file_content: str) -> list:
        file_content = clean_file_content(file_content)
        words = word_tokenize(file_content)
        words = [ps.stem(word) for word in words]
        words = drop_stopwords(words)
        return words


In [12]:
def load_batch(base_folder: str, batch_size: int=10):
    
    files_words = []
    counter = 0
    folders_count = 0
    
    for folder in os.listdir(base_folder):            
        print(folder)
        for file in os.listdir(f"{base_folder}/{folder}"):
            if counter < batch_size:
                with open(f"{base_folder}/{folder}/{file}") as f:
                    file_content = f.read()
                    words = process_loaded_file(file_content)
                    files_words.extend(words)
                    counter += 1
            else:
                yield folder, files_words
                files_words.clear()
                counter = 0
                
        if len(files_words):
            yield folder, files_words

next(load_batch('training'))

acq


('acq',
 ['comput',
  'termin',
  'system',
  'cpml',
  'complet',
  'sale',
  'commack',
  'n',
  'feb',
  'comput',
  'termin',
  'system',
  'inc',
  'said',
  'ha',
  'complet',
  'sale',
  'share',
  'common',
  'stock',
  'warrant',
  'acquir',
  'addit',
  'one',
  'mln',
  'share',
  'sedio',
  'n',
  'v',
  'lugano',
  'switzerland',
  'dlr',
  'compani',
  'said',
  'warrant',
  'exercis',
  'five',
  'year',
  'purchas',
  'price',
  'dlr',
  'per',
  'share',
  'comput',
  'termin',
  'said',
  'sedio',
  'also',
  'ha',
  'right',
  'buy',
  'addit',
  'share',
  'increas',
  'total',
  'hold',
  'pct',
  'comput',
  'termin',
  'outstand',
  'common',
  'stock',
  'certain',
  'circumst',
  'involv',
  'chang',
  'control',
  'compani',
  'compani',
  'said',
  'condit',
  'occur',
  'warrant',
  'would',
  'exercis',
  'price',
  'equal',
  'pct',
  'common',
  'stock',
  'market',
  'price',
  'time',
  'exceed',
  'dlr',
  'per',
  'share',
  'comput',
  'termin',
  'a

### For step 3 ( Vocabulay extraction )
#### I will drop stopwords in english because they are meaning less in our case
**Another phase later on**

### Modeling

*model will built on pandas df*

model_row_architecture = {
    "category": "c",
    "word": "test",
    "frequency": 10
}

In [13]:
import pandas as pd

In [14]:
%%time
df = pd.DataFrame({'category': pd.Series(dtype='str'),
                   'word': pd.Series(dtype='str'),
                   'frequency': pd.Series(dtype='int')})

df.dtypes

CPU times: total: 0 ns
Wall time: 3.69 ms


category     object
word         object
frequency     int32
dtype: object

In [15]:
%%time
gener = load_batch('training', batch_size=1000)
for category, words in gener:
    temp_df = pd.DataFrame(words, columns=['word'])
    temp_df['category'] = category
    temp_df['frequency'] = 1
    
    df = pd.concat([df, temp_df], ignore_index=True)
    

acq
alum
barley
bop
carcass
castor-oil
cocoa
coconut
coconut-oil
coffee
copper
copra-cake
corn
cotton
cotton-oil
cpi
cpu
crude
dfl
dlr
dmk
earn
fuel
gas
gnp
gold
grain
groundnut
groundnut-oil
heat
hog
housing
income
instal-debt
interest
ipi
iron-steel
jet
jobs
l-cattle
lead
lei
lin-oil
livestock
lumber
meal-feed
money-fx
money-supply
naphtha
nat-gas
nickel
nkr
nzdlr
oat
oilseed
orange
palladium
palm-oil
palmkernel
pet-chem
platinum
potato
propane
rand
rape-oil
rapeseed
reserves
retail
rice
rubber
rye
ship
silver
sorghum
soy-meal
soy-oil
soybean
strategic-metal
sugar
sun-meal
sun-oil
sunseed
tea
tin
trade
unknown
veg-oil
wheat
wpi
yen
zinc
CPU times: total: 1min 16s
Wall time: 3min 9s


In [20]:
df

Unnamed: 0,category,word,frequency
0,acq,comput,1
1,acq,termin,1
2,acq,system,1
3,acq,cpml,1
4,acq,complet,1
...,...,...,...
5875895,zinc,pound,1
5875896,zinc,lead,1
5875897,zinc,compani,1
5875898,zinc,spokesman,1


In [21]:
df

Unnamed: 0,category,word,frequency
0,acq,comput,1
1,acq,termin,1
2,acq,system,1
3,acq,cpml,1
4,acq,complet,1
...,...,...,...
5875895,zinc,pound,1
5875896,zinc,lead,1
5875897,zinc,compani,1
5875898,zinc,spokesman,1


In [22]:
df['word'].value_counts()

word
said         170146
mln           85130
pct           69945
dlr           57777
u             54951
              ...  
pittway           1
petrolit          1
plit              1
basix             1
eindhoven         1
Name: count, Length: 20541, dtype: int64

In [23]:
df['category'].unique()

array(['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa',
       'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn',
       'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk',
       'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut',
       'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt',
       'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead',
       'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx',
       'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr',
       'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel',
       'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil',
       'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship',
       'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean',
       'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed',
       'tea', 'tin', 'trade', 'unknown', 'veg-oil', 'wheat', 'wpi

#### Drop shorts meaningless words and symbols

#### I will drop strings of length one, because some words of length two might be abbreviation

In [24]:
df[df['word'].astype(str).apply(len) <= 2]['word'].unique()

array(['n', 'ha', 'v', 'st', 'co', 'ct', 'u', 'j', 'go', 'wa', 'l', 'p',
       'ga', 'ad', 'e', 'f', 'hi', 'lo', 'c', 'li', 'ka', 'kk', 'ic',
       'ab', 'b', 'r', 'bk', 'sm', 'md', 'jr', 'pc', 'mh', 'vm', 'la',
       'us', 'sb', 'dr', 'lp', 'vy', 'mi', 'sa', 'uk', 'h', 'w', 'nt',
       'ag', 'cz', 'bh', 'ov', 'cb', 'k', 'pa', 'mo', 'sh', 'wi', 'ta',
       'hk', 'tv', 'un', 'g', 'vt', 'bi', 'gt', 'mm', 'om', 'va', 'hr',
       'mx', 'x', 'ah', 'ac', 'ht', 'mr', 'de', 'nv', 'du', 'lm', 'oj',
       'th', 'cc', 'ec', 'ek', 'ky', 'ex', 'fe', 'oy', 'dm', 'pl', 'el',
       'ba', 'lk', 'su', 'dt', 'fm', 'fa', 'mt', 'gy', 'di', 'wx', 'ui',
       'ti', 'dd', 'ai', 'ii', 'ps', 'nw', 'le', 'nc', 'ml', 'fd', 'ko',
       'ge', 'z', 'gu', 'iv', 'ph', 'bv', 'cd', 'cq', 'ok', 'rd', 'rt',
       'sk', 'et', 'fb', 'oi', 'tf', 'gs', 'tm', 'gw', 'km', 'gr', 'vw',
       'xp', 'il', 'gd', 'em', 'cp', 'bp', 'bu', 'fi', 'aa', 'ms', 'gm',
       'fc', 'jo', 'br', 'er', 'pr', 'tl', 'bl', 'hl', 'ch', '

In [24]:
indicies = df[df['word'].astype(str).apply(len) < 2].index
df.drop(indicies, inplace=True)

In [25]:
df.groupby(['category', 'word']).sum('frequency')

Unnamed: 0_level_0,Unnamed: 1_level_0,frequency
category,word,Unnamed: 2_level_1
acq,aa,1
acq,aaa,1
acq,aac,2
acq,aaix,1
acq,aar,2
...,...,...
zinc,zero,12
zinc,zheziang,1
zinc,zimbabw,1
zinc,zinc,65


In [26]:
df[df['word'].astype(str).apply(len) == 2]['word'].unique()

array(['ha', 'st', 'co', 'ct', 'go', 'wa', 'ga', 'ad', 'hi', 'lo', 'li',
       'ka', 'kk', 'ic', 'ab', 'bk', 'sm', 'md', 'jr', 'pc', 'mh', 'vm',
       'la', 'us', 'sb', 'dr', 'lp', 'vy', 'mi', 'sa', 'uk', 'nt', 'ag',
       'cz', 'bh', 'ov', 'cb', 'pa', 'mo', 'sh', 'wi', 'ta', 'hk', 'tv',
       'un', 'vt', 'bi', 'gt', 'mm', 'om', 'va', 'hr', 'mx', 'ah', 'ac',
       'ht', 'mr', 'de', 'nv', 'du', 'lm', 'oj', 'th', 'cc', 'ec', 'ek',
       'ky', 'ex', 'fe', 'oy', 'dm', 'pl', 'el', 'ba', 'lk', 'su', 'dt',
       'fm', 'fa', 'mt', 'gy', 'di', 'wx', 'ui', 'ti', 'dd', 'ai', 'ii',
       'ps', 'nw', 'le', 'nc', 'ml', 'fd', 'ko', 'ge', 'gu', 'iv', 'ph',
       'bv', 'cd', 'cq', 'ok', 'rd', 'rt', 'sk', 'et', 'fb', 'oi', 'tf',
       'gs', 'tm', 'gw', 'km', 'gr', 'vw', 'xp', 'il', 'gd', 'em', 'cp',
       'bp', 'bu', 'fi', 'aa', 'ms', 'gm', 'fc', 'jo', 'br', 'er', 'pr',
       'tl', 'bl', 'hl', 'ch', 'sg', 'ri', 'rb', 'um', 'na', 'ey', 'sc',
       'mp', 'al', 'nz', 'ra', 'rs', 'kb', 'hc', 'b

In [25]:
prior = df.groupby('category').sum('frequency') / df.groupby('category').sum('frequency').sum()
prior['frequency'] = prior['frequency'].apply(log_transformer)
prior

Unnamed: 0_level_0,frequency
category,Unnamed: 1_level_1
acq,-3.770056
alum,-4.660629
barley,-4.611624
bop,-4.437230
carcass,-4.352814
...,...
veg-oil,-6.282084
wheat,-5.131529
wpi,-5.076029
yen,-4.907925


In [26]:
df2 = df.copy()
model = df.pivot_table(index='category', columns='word', values='frequency', aggfunc='sum')
model.fillna(0, inplace=True)
model

word,aa,aaa,aac,aachen,aagiy,aaix,aam,aancor,aap,aar,...,zuccherifici,zuckerman,zuercher,zulia,zur,zurich,zuyuan,zverev,zy,zzzz
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
acq,1.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,...,3.0,1.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0
alum,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,3.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
barley,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,3.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
bop,4.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,3.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
carcass,4.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,3.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
veg-oil,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wheat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wpi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
yen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Add Unknown token

In [27]:
model['UNKNOWN'] = 0
model

word,aa,aaa,aac,aachen,aagiy,aaix,aam,aancor,aap,aar,...,zuckerman,zuercher,zulia,zur,zurich,zuyuan,zverev,zy,zzzz,UNKNOWN
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
acq,1.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,...,1.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0
alum,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0
barley,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0
bop,4.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0
carcass,4.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
veg-oil,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
wheat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
wpi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
yen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


### Add 1 smoothing

In [28]:
model = model + 1
model

word,aa,aaa,aac,aachen,aagiy,aaix,aam,aancor,aap,aar,...,zuckerman,zuercher,zulia,zur,zurich,zuyuan,zverev,zy,zzzz,UNKNOWN
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
acq,2.0,2.0,3.0,1.0,1.0,2.0,1.0,1.0,1.0,3.0,...,2.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,1.0,1
alum,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,...,1.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0,1.0,1
barley,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,...,1.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0,1.0,1
bop,5.0,3.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,...,1.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0,1.0,1
carcass,5.0,3.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,...,1.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
veg-oil,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
wheat,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
wpi,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
yen,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1


### Calculate prob. for each word given category

In [29]:
classes_summations = model.sum(axis=1)
classes_summations

category
acq        155986.0
alum        76131.0
barley      78923.0
bop         90046.0
carcass     96168.0
             ...   
veg-oil     31527.0
wheat       55254.0
wpi         57235.0
yen         63952.0
zinc        65907.0
Length: 91, dtype: float64

In [30]:
model = model.astype('Float64').div(classes_summations, axis=0)
model

word,aa,aaa,aac,aachen,aagiy,aaix,aam,aancor,aap,aar,...,zuckerman,zuercher,zulia,zur,zurich,zuyuan,zverev,zy,zzzz,UNKNOWN
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
acq,0.000013,0.000013,0.000019,0.000006,0.000006,0.000013,0.000006,0.000006,0.000006,0.000019,...,0.000013,0.000006,0.000006,0.000006,0.000051,0.000006,0.000006,0.000006,0.000006,0.000006
alum,0.000026,0.000013,0.000013,0.000013,0.000013,0.000026,0.000013,0.000013,0.000013,0.000026,...,0.000013,0.000013,0.000013,0.000013,0.000066,0.000013,0.000013,0.000013,0.000013,0.000013
barley,0.000025,0.000013,0.000013,0.000013,0.000013,0.000025,0.000013,0.000013,0.000013,0.000025,...,0.000013,0.000013,0.000013,0.000013,0.000063,0.000013,0.000013,0.000013,0.000013,0.000013
bop,0.000056,0.000033,0.000011,0.000011,0.000011,0.000022,0.000011,0.000011,0.000011,0.000022,...,0.000011,0.000011,0.000011,0.000011,0.000056,0.000011,0.000011,0.000011,0.000011,0.000011
carcass,0.000052,0.000031,0.00001,0.00001,0.00001,0.000021,0.00001,0.00001,0.00001,0.000021,...,0.00001,0.00001,0.00001,0.00001,0.000052,0.00001,0.00001,0.00001,0.00001,0.00001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
veg-oil,0.000032,0.000032,0.000032,0.000032,0.000032,0.000032,0.000032,0.000032,0.000032,0.000032,...,0.000032,0.000032,0.000032,0.000032,0.000032,0.000032,0.000032,0.000032,0.000032,0.000032
wheat,0.000018,0.000018,0.000018,0.000018,0.000018,0.000018,0.000018,0.000018,0.000018,0.000018,...,0.000018,0.000018,0.000018,0.000018,0.000018,0.000018,0.000018,0.000018,0.000018,0.000018
wpi,0.000017,0.000017,0.000017,0.000017,0.000017,0.000017,0.000017,0.000017,0.000017,0.000017,...,0.000017,0.000017,0.000017,0.000017,0.000017,0.000017,0.000017,0.000017,0.000017,0.000017
yen,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,...,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016


### check for zero prob.

In [31]:
(model == 0).any().any()

False

### Classification

In [32]:
from math import log


def predict(file_content: str) -> str:
    processed_content = process_loaded_file(file_content)
    valid_words = [word for word in processed_content if word in model.columns]
    invalid_words = [word  for word in processed_content if word not in model.columns]
    valid_words.extend(['UNKNOWN'] * len(invalid_words))
    
    predictor = model[valid_words]
    predictor = predictor.applymap(log_transformer)
    return (predictor.sum(axis=1) + prior['frequency']).idxmax()


In [33]:
prior

Unnamed: 0_level_0,frequency
category,Unnamed: 1_level_1
acq,-3.770056
alum,-4.660629
barley,-4.611624
bop,-4.437230
carcass,-4.352814
...,...
veg-oil,-6.282084
wheat,-5.131529
wpi,-5.076029
yen,-4.907925


In [34]:
pd.DataFrame({'actual_category': pd.Series(dtype='str'),
                           'file_content': pd.Series(dtype='str'),
                           })

Unnamed: 0,actual_category,file_content


In [35]:
from sklearn.metrics import f1_score


def test_data(folder: str):
    test_df = pd.DataFrame({'actual_category': pd.Series(dtype='str'),
                           'file_content': pd.Series(dtype='str'),
                           })

    for category in os.listdir(folder):
        for file in os.listdir(f"{folder}/{category}"):
            with open(f"{folder}/{category}/{file}") as f:
                test_df.loc[len(test_df)] = [category, clean_file_content(f.read())]
                
    test_df["predicted"] = test_df["file_content"].apply(predict)
    
    return f1_score(test_df['actual_category'], test_df['predicted'], average='macro')
            
test_data("test")

0.08434828930038325

### Old model accurace `0.06671500212397337`

In [38]:
test_df["predicted"] = test_df["file_content"].apply(predict)
test_df

NameError: name 'test_df' is not defined

In [None]:
x = """

GREYHOUND CORP <G> COMPLETES BUS LINE SALE

    PHOENIX, Ariz., March 18 - Greyhound corp said it completed
the sale of its Greyhound Lines unit to Dallas-based GLI
Holdings Inc for 350 mln dlrs in cash, securities, royalties
and other considerations.
    Greyhound said GLI is authorized to continue using the
familiar running dog logo on a red, white and blue shield,
while Greyhound Corp will continue to use the running dog alone
as its symbol.

"""

In [None]:
predict(x)

### Features that can advance model accuracy 

- Take tf-idf into considration, since it can add more advatnage for more informative words ( more related words, they are more informative ), but you have to get absolute value of its prob then add the tf-idf score to the result.
- Add more data
- Build more than one model and take diff approaches in each one, and try ensemble these methods ( model care about order, model does not care about order )
- Add more score for model if desired class is in the document.

In [None]:
def save_model(prior: pd.DataFrame, words_props: pd.DataFrame) -> None:
    prior.to_csv('prior.csv')
    words_props.to_csv('words_props.csv')

In [None]:
save_model(prior, model)

In [None]:
pd.read_csv('words_props.csv')

In [None]:
for category in os.listdir('training'):
    print(category, len(os.listdir(f"training/{category}")))

In [38]:
nltk.__version__

'3.8.1'

In [41]:
word_tokenize("hihi")

['hihi']

In [44]:
from typing import List
def __drop_stop_words(text: str) -> str:
        return ""

def __tokinize(text) -> List[str]:
    return word_tokenize(text)

In [45]:
X

NameError: name 'X' is not defined