### Important: Set up directories
Set `WORK_DIR` to the path to the repo in the cell below:

In [1]:
import os
WORK_DIR = os.path.join(os.getenv("HOME"), 'text-gnn')
os.chdir(WORK_DIR)

In [2]:
import pandas as pd
%load_ext autoreload
%autoreload 2
import sys
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
import json
import jsonlines

sys.path.append('src')
from src.shared.utils import tokenize_prune_stem

## Helsinki Swahili Corpus
Set the `DATASET_DIR_NAME` and `VOCAB_DIR_NAME` variables to the directory names

In [3]:
# The results from the `create_dataset.py` script
DATASET_DIR_NAME = 'swahili-processed-v1'
# Results from running the `download_stemming.py` script
VOCAB_DIR_NAME = 'hsc-dictionary'

### Documents
Explore document stats

In [4]:
dataset_dir = os.path.join(WORK_DIR, 'results', DATASET_DIR_NAME)
df = pd.read_csv(f'{dataset_dir}/dataset.csv',sep=';')

In [5]:
n_classes = df.document_type.nunique()
print(f'{len(df)} total documents with {n_classes} classes')

457 total documents with 3 classes


In [6]:
df.document_type.value_counts(dropna=False)

news     221
bunge    199
books     37
Name: document_type, dtype: int64

In [7]:
df['n_words'] = df.document_content.apply(lambda x:len(x.split()))

In [8]:
mean_words = round(df.n_words.mean(),1)
median_words = round(df.n_words.median(),1)

In [9]:
print(f'Mean words per document of {mean_words}')

Mean words per document of 52887.8


In [10]:
print(f'Median words per document of {median_words}')

Median words per document of 38732.0


In [11]:
total_words = df.n_words.sum()
print(f'{total_words/int(1e6):.2f} million words total')

24.17 million words total


In [12]:
## File size is 160.1 MB

### Vocab
Explore vocabulary stats

In [13]:
stemming_dir = os.path.join(WORK_DIR, 'results', VOCAB_DIR_NAME, 'stemming')

In [14]:
def get_words_in_vocab(path: str, count_threshold: int):
    with open(path,'r') as f:
        vocab_counts = json.load(f)
        return [word for word, count in vocab_counts.items() if count >= count_threshold]

In [15]:
unstemmed_vocab_path = os.path.join(stemming_dir, 'vocab_counts.json')
unstemmed_vocab = get_words_in_vocab(unstemmed_vocab_path, count_threshold=1)
print(f'{len(unstemmed_vocab)} unique words in the unstemmed vocabulary')
unstemmed_vocab = get_words_in_vocab(unstemmed_vocab_path, count_threshold=2)
print(f'{len(unstemmed_vocab)} unique words in the unstemmed vocabulary which occur at least twice')

434449 unique words in the unstemmed vocabulary
212920 unique words in the unstemmed vocabulary which occur at least twice


In [19]:
# TODO: Still need to generate the cleaned HSC vocab and stemming map
stemmed_vocab_path = os.path.join(stemming_dir, 'cleaned_vocab_counts.json')
stemmed_vocab = get_words_in_vocab(stemmed_vocab_path, count_threshold=1)
print(f'{len(stemmed_vocab)} unique words in the stemmed vocabulary')
stemmed_vocab_2 = get_words_in_vocab(stemmed_vocab_path, count_threshold=2)
print(f'{len(stemmed_vocab)} unique words in the unstemmed vocabulary which occur at least twice')

Which we should be able to check is the same as applying this method of finding the number of words in the vocab after applying stemming and cleaning.

In [17]:
# TODO: Requires cell above
stemmer_path = os.path.join(stemming_dir, 'stemming_cleaned.json')
with open(stemmer_path,'r') as f:
    stemming_map = json.load(f)

stemmed_words = []
for word in stemmed_vocab_2:
    stemmed_words.extend(tokenize_prune_stem(word, stemming_map))
print(f'{len(set(stemmed_words))} words in vocab after applying stemming and pruning')

In [18]:
# stemmer_path = os.path.join(stemming_dir, 'stemming_cleaned.json')
# with open(stemmer_path,'r') as f:
#     stemming_map = json.load(f)

In [19]:
# stemmed_words = [stemming_map[word] for word in stemmed_vocab]
# print(f'{len(set(stemmed_words))} words in vocab after applying stemming')

## Zenodo Swahili News Corpus
Set the `DATASET_DIR_NAME` and `VOCAB_DIR_NAME` variables to the directory names

In [20]:
# The results from the `create_dataset.py` script
DATASET_DIR_NAME = 'zenodo-processed-v4'
# Results from running the `download_stemming.py` script
VOCAB_DIR_NAME = 'z-news-dictionary-v3'

### Documents
Explore document stats

In [21]:
# dataset_dir = f'{WORK_DIR}/results/zen_data'
dataset_dir = os.path.join(WORK_DIR, 'results', DATASET_DIR_NAME)
df = pd.read_csv(f'{dataset_dir}/dataset.csv',sep=';')

In [22]:
n_classes = df.document_type.nunique()
print(f'{len(df)} total documents with {n_classes} classes')

23267 total documents with 6 classes


In [23]:
df.document_type.value_counts(dropna=False)

kitaifa      10242
michezo       6004
burudani      2229
uchumi        2027
kimataifa     1906
afya           859
Name: document_type, dtype: int64

In [24]:
df['n_words'] = df.document_content.apply(lambda x:len(x.split()))
mean_words = round(df.n_words.mean(),1)
median_words = round(df.n_words.median(),1)

In [25]:
print(f'Mean words per document of {mean_words}')

Mean words per document of 332.2


In [26]:
print(f'Median words per document of {median_words}')

Median words per document of 275.0


In [27]:
total_words = df.n_words.sum()
print(f'{total_words/int(1e6):.2f} million words total')

7.73 million words total


In [28]:
## File size is 52.3 MB

### Vocab
Explore vocabulary stats


In [29]:
stemming_dir = os.path.join(WORK_DIR, 'results', VOCAB_DIR_NAME, 'stemming')

In [30]:
unstemmed_vocab_path = os.path.join(stemming_dir, 'vocab_counts.json')
unstemmed_vocab = get_words_in_vocab(unstemmed_vocab_path, count_threshold=1)
print(f'{len(unstemmed_vocab)} unique words in the unstemmed vocabulary')
unstemmed_vocab = get_words_in_vocab(unstemmed_vocab_path, count_threshold=2)
print(f'{len(unstemmed_vocab)} unique words in the unstemmed vocabulary which occur at least twice')

197619 unique words in the unstemmed vocabulary
95504 unique words in the unstemmed vocabulary which occur at least twice


In [31]:
# TODO: Still need to generate the cleaned HSC vocab and stemming map
stemmed_vocab_path = os.path.join(stemming_dir, 'cleaned_vocab_counts.json')
stemmed_vocab = get_words_in_vocab(stemmed_vocab_path, count_threshold=1)
print(f'{len(stemmed_vocab)} unique words in the stemmed vocabulary')
stemmed_vocab_2 = get_words_in_vocab(stemmed_vocab_path, count_threshold=2)
print(f'{len(stemmed_vocab)} unique words in the unstemmed vocabulary which occur at least twice')

39746 unique words in the stemmed vocabulary
39746 unique words in the unstemmed vocabulary which occur at least twice


We now consider the remaining words after removing non-alpha's, single character words, and stopwords 

In [32]:
stemmer_path = os.path.join(stemming_dir, 'stemming_cleaned.json')
with open(stemmer_path,'r') as f:
    stemming_map = json.load(f)

stemmed_words = []
for word in stemmed_vocab_2:
    stemmed_words.extend(tokenize_prune_stem(word, stemming_map))
print(f'{len(set(stemmed_words))} words in vocab after applying stemming and pruning')

36574 words in vocab after applying stemming and pruning
