In [9]:
import os.path
import pandas as pd

from os import path
from tqdm import tqdm

from lib.training_preparation import limit_by_type
from lib.training_preparation import count_distinct_words
from lib.training_preparation import get_tt_stratified_split


# Register tqdm's pandas integration so that `Series.progress_apply`
# (used when counting distinct words below) is available and shows a progress bar.
tqdm.pandas()

In [3]:
# Root directory of the dataset variant processed by this notebook.
dset_path = 'data/limit_5K_per_type_order_by_id_desc'

# Pre-processed corpus variants. Each file is read from `<dset_path>/01_processed/`
# and its train/test folds are written under `<dset_path>/10_tt_split/`.
df_files = ['cleaned.csv',
            'stemmed.csv',
            'lemmatized.csv']

# Number of train/test folds requested from get_tt_stratified_split.
folds_num = 5
# NOTE(review): not referenced by any visible cell - confirm it is still needed.
min_ratio = 0.1
# Passed to limit_by_type as the limit argument.
# NOTE(review): the dataset path says "5K per type" while this is 1000 - confirm the intended cap.
limit = 1000

# makedirs(exist_ok=True) creates missing parent directories as well and does not
# fail when the directory already exists, unlike an exists()-then-mkdir() pair.
os.makedirs(f'{dset_path}/10_tt_split', exist_ok=True)

In [11]:
# Seed for the per-fold row shuffles, so that re-running the notebook
# rewrites byte-identical train/test files.
shuffle_seed = 42


def _fold_positions(fold_column):
    """Return the row positions stored in one fold column as a list of ints.

    Missing entries are dropped before the integer cast.
    NOTE(review): presumably shorter fold columns are NaN-padded by
    get_tt_stratified_split - confirm against its implementation.
    """
    return fold_column.dropna().astype(int).tolist()


def _save_folds(frame, columns, kfolds, n_folds, target_dir, **csv_kwargs):
    """Write a shuffled train/test CSV pair for every fold into `target_dir`.

    Args:
        frame: source DataFrame; rows are selected positionally (`iloc`).
        columns: list of column names to export.
        kfolds: DataFrame with `train_<k>` / `test_<k>` columns of row positions.
        n_folds: number of folds to export (k = 0 .. n_folds - 1).
        target_dir: existing directory receiving `train_<k>.csv` / `test_<k>.csv`.
        **csv_kwargs: forwarded to `DataFrame.to_csv` (e.g. `sep`, `header`).
    """
    for fold in tqdm(range(n_folds)):
        for part in ('train', 'test'):
            rows = frame[columns].iloc[_fold_positions(kfolds[f'{part}_{fold}'])]
            # Shuffle so the written file is not ordered by the fold positions.
            rows = rows.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)
            rows.to_csv(f'{target_dir}/{part}_{fold}.csv', index=False, **csv_kwargs)


# For every pre-processed corpus variant: load it, cap the rows per type,
# compute the distinct-word count feature, build the train/test folds and
# export each fold twice - as a (codw, type) table and as fastText-style lines.
for f in df_files:
    in_path = f'{dset_path}/01_processed/{f}'
    out_path = f'{dset_path}/10_tt_split/{f}'
    # Strip only the file extension; splitting on the first '.' would truncate
    # the path if any directory component ever contained a dot.
    out_dir = path.splitext(out_path)[0]

    print(f'Loading {in_path}...')
    df = pd.read_csv(in_path, sep=';')

    df = limit_by_type(df, limit, 'id', 'type')

    print('Counting distinct words in documents...')
    df['codw'] = df['text'].progress_apply(count_distinct_words)

    # NOTE(review): oversampling by type and writing the full frame to
    # `out_path` were commented out in the original cell and remain disabled.

    print('Getting folds for training and testing process...')
    df_kfolds = get_tt_stratified_split(df['type'], folds_num)

    print('Count of distinct words')
    dir_name = f'{out_dir}/codw'
    # Creates `out_dir` as well when it is missing.
    os.makedirs(dir_name, exist_ok=True)

    print('Saving...')
    _save_folds(df, ['codw', 'type'], df_kfolds, folds_num, dir_name, sep=';')

    print('Fasttext')
    dir_name = f'{out_dir}/fasttext'
    os.makedirs(dir_name, exist_ok=True)

    # One line per document: "<label> <text>".
    # NOTE(review): fastText's default label prefix is `__label__`; the
    # `__class__` prefix presumably relies on a matching `-label` option
    # at training time - confirm.
    df['id_typ_document_fasttext'] = '__class__' + df['type'].astype(str)
    df['text_fasttext'] = df['id_typ_document_fasttext'] + ' ' + df['text']

    print('Saving...')
    # NOTE(review): to_csv applies CSV quoting, so a line containing a comma or
    # a double quote would be wrapped in quotes - confirm the processed text
    # cannot contain those characters.
    _save_folds(df, ['text_fasttext'], df_kfolds, folds_num, dir_name, header=False)

    print('\n')

Loading data/limit_5K_per_type_order_by_id_desc/01_processed/cleaned.csv...
Counting distinct words in documents...


100%|██████████| 31459/31459 [00:06<00:00, 4836.41it/s]


Getting folds for training and testing process...
Count of distinct words
Saving...


100%|██████████| 5/5 [00:35<00:00,  7.08s/it]


Fasttext
Saving...


100%|██████████| 5/5 [00:19<00:00,  3.90s/it]




Loading data/limit_5K_per_type_order_by_id_desc/01_processed/stemmed.csv...
Counting distinct words in documents...


100%|██████████| 31459/31459 [00:05<00:00, 5488.39it/s]


Getting folds for training and testing process...
Count of distinct words
Saving...


100%|██████████| 5/5 [00:28<00:00,  5.79s/it]


Fasttext
Saving...


100%|██████████| 5/5 [00:19<00:00,  3.89s/it]




Loading data/limit_5K_per_type_order_by_id_desc/01_processed/lemmatized.csv...
Counting distinct words in documents...


100%|██████████| 31459/31459 [00:05<00:00, 5255.91it/s]


Getting folds for training and testing process...
Count of distinct words
Saving...


100%|██████████| 5/5 [00:28<00:00,  5.77s/it]


Fasttext
Saving...


100%|██████████| 5/5 [00:18<00:00,  3.77s/it]






