In [21]:
import os
import torch
from transformers import (
    AutoTokenizer, DataCollatorWithPadding
)
import mlflow as mf
import pandas as pd
from hydra import initialize, compose
import ftzard.utils.mlflow as mf_utils
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import dagstermill as dgm


In [2]:
# Notebook-level locations and names used by the cells below.
# NOTE(review): base_path is an absolute, machine-specific path — confirm it is
# overridden (e.g. by injected notebook parameters) when run elsewhere.
base_path = 'E:/FTzard/ftzard'
# Hydra config directory (no placeholders, so a plain string is enough).
config_path = '../../config/'
# Cleaned CSV that the loading cell reads.
data_path = f"{base_path}/data/cleaned_data.csv"
# Name of the Hydra config to compose.
config_name = 'config'

In [3]:
# Compose the Hydra config and pull out the two MLflow settings used later.
with initialize(version_base=None, config_path=config_path):
    cfg = compose(config_name=config_name)
    tracking_uri = cfg.MLFLOW_TRACKING_URI
    experiment_name = cfg.MLFLOW_EXPERIMENT_NAME
    

In [4]:
# Expose the tracking URI through the environment variable of the same name.
os.environ['MLFLOW_TRACKING_URI'] = tracking_uri

# Settings for this notebook's MLflow run and tokenization.
run_name = 'data_processing'
model_name = cfg['model_name']
max_len = 1024  # passed as max_length when tokenizing

# Echo the effective settings so they are visible in the executed notebook.
for label, value in (
    ('Data Path: ', data_path),
    ('Mlflow Experiment Name: ', experiment_name),
    ('Mlflow Run Name: ', run_name),
    ('Model Name: ', model_name),
):
    print(label, value)

Data Path:  E:/FTzard/ftzard/data/cleaned_data.csv
Mlflow Experiment Name:  senetiment_analysis
Mlflow Run Name:  data_processing
Model Name:  tiiuae/falcon-7b


In [5]:
# Load the cleaned CSV; the file is decoded as latin-1.
data = pd.read_csv(
    data_path,
    encoding='latin-1',
)

In [6]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,text,target
0,user switchfoot httptwitpic.com2y1zl Awww tha...,0
1,is upset that he cant update his Facebook by t...,0
2,user Kenichan I dived many times for the ball....,0
3,my whole body feels itchy and like its on fire,0
4,user nationwideclass no its not behaving at al...,0


In [7]:
# Show column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   text    1600000 non-null  object
 1   target  1600000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [8]:
# Remap the positive class from 4 to 1 so the labels are {0, 1}.
# `replace` is idempotent: re-running this cell leaves 0/1 untouched, whereas
# the previous `x - 3 if x != 0` form turned already-remapped 1s into -2.
data['target'] = data['target'].replace({4: 1})

# Fail loudly if the file ever contains a label other than 0 / 4.
assert data['target'].isin([0, 1]).all(), 'unexpected target values after remapping'

In [9]:
# Preview the first rows again after the target remapping.
data.head()

Unnamed: 0,text,target
0,user switchfoot httptwitpic.com2y1zl Awww tha...,0
1,is upset that he cant update his Facebook by t...,0
2,user Kenichan I dived many times for the ball....,0
3,my whole body feels itchy and like its on fire,0
4,user nationwideclass no its not behaving at al...,0


In [10]:
# Count rows per target value to check the class balance.
data['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

In [11]:
# List the column names available in the frame.
data.columns

Index(['text', 'target'], dtype='object')

In [12]:
# Stratified 90/10 split into train and hold-out, then a 70/30 split of the
# hold-out into validation and test.
# random_state is fixed so the splits are reproducible across kernel restarts;
# 42 matches the 'random seed' value this notebook logs to MLflow.
train_data, validation_data = train_test_split(
    data[['text', 'target']],
    test_size=0.1,
    stratify=data['target'],
    random_state=42,
)
validation_data, test_data = train_test_split(
    validation_data,
    test_size=0.3,
    stratify=validation_data['target'],
    random_state=42,
)

print('Examples in Training data:', len(train_data))
print('Examples in Validation data:', len(validation_data))
print('Examples in Test data:', len(test_data))

Examples in Training data: 1440000
Examples in Validation data: 112000
Examples in Test data: 48000


In [13]:
# Print the per-class row counts of each split (train, validation, test).
for split_frame in (train_data, validation_data, test_data):
    print(split_frame['target'].value_counts())

target
0    720000
1    720000
Name: count, dtype: int64
target
0    56000
1    56000
Name: count, dtype: int64
target
1    24000
0    24000
Name: count, dtype: int64


In [18]:
# Resolve the MLflow experiment and look up an earlier run with this name.
experiment_id = mf_utils.create_experiment(exp_name=experiment_name)
print('Experiment Id: ', experiment_id)
run_id = mf_utils.get_run_id_by_name(run_name=run_name, experiment_ids=[experiment_id])
print('Run Id: ', run_id)

# Same arguments as before: run_id is only passed when an earlier run was found,
# so that run is resumed instead of creating a new one.
start_run_kwargs = {'run_name': run_name, 'experiment_id': experiment_id}
if run_id:
    start_run_kwargs['run_id'] = run_id


def tokenize(examples):
    """Tokenize the 'text' field of a batch, truncating to ``max_len`` tokens.

    Reads the module-level ``tokenizer`` and ``max_len`` at call time.
    """
    return tokenizer(examples['text'], truncation=True, max_length=max_len)


# The context manager ends the run even when a step below raises; a bare
# start_run()/end_run() pair would leave the run active after a failure and
# make the next start_run() in this kernel fail.
with mf.start_run(**start_run_kwargs):
    mf.log_param(key='max_sequence_length', value=max_len)
    mf.log_param(key='random seed', value=42)
    mf.log_param(key='train_split_size', value=0.9)
    mf.log_param(key='test_split_size', value=0.3)

    # Wrap the pandas splits as Hugging Face datasets.
    train_ds = Dataset.from_pandas(train_data)
    val_ds = Dataset.from_pandas(validation_data)
    test_ds = Dataset.from_pandas(test_data)

    # Only the training split is shuffled, with a fixed seed.
    train_ds_shuffled = train_ds.shuffle(seed=42)

    dataset = DatasetDict({
        'train': train_ds_shuffled,
        'val': val_ds,
        'test': test_ds
    })
    # Both names are kept bound to the same object, as before.
    final_ds = dataset

    print(final_ds)

    tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
    # Reuse EOS as the padding token; pad_token_id is derived from pad_token,
    # so a separate id assignment is not needed.
    tokenizer.pad_token = tokenizer.eos_token

    # Columns dropped after tokenization; '__index_level_0__' is the index
    # column that appears in the dataset features printed above.
    rm_cols = ['__index_level_0__', 'text']
    collator = DataCollatorWithPadding(tokenizer=tokenizer)

    tokenized_datasets = dataset.map(tokenize, batched=True, remove_columns=rm_cols)
    tokenized_datasets = tokenized_datasets.rename_column("target", "label")
    tokenized_datasets.set_format("torch")

The provided experiment name senetiment_analysis already exists, the run will be logged in this experiment.
                                 
Experiment Id:  1
Run Id:  59d4296e7f28406da2b49a1dca661136
DatasetDict({
    train: Dataset({
        features: ['text', 'target', '__index_level_0__'],
        num_rows: 1440000
    })
    val: Dataset({
        features: ['text', 'target', '__index_level_0__'],
        num_rows: 112000
    })
    test: Dataset({
        features: ['text', 'target', '__index_level_0__'],
        num_rows: 48000
    })
})


Map: 100%|██████████| 1440000/1440000 [02:07<00:00, 11275.99 examples/s]
Map: 100%|██████████| 112000/112000 [00:07<00:00, 14111.34 examples/s]
Map: 100%|██████████| 48000/48000 [00:03<00:00, 13892.25 examples/s]


In [19]:
# Bundle the tokenized splits with the tokenizer and padding collator so they
# can be handed over as a single result.
output = dict(
    datasets=tokenized_datasets,
    tokenizer=tokenizer,
    collator=collator,
)

In [22]:
# Hand the bundled result to dagstermill under the given output name.
# NOTE(review): presumably this name must match an output declared on the
# Dagster side — confirm against the op/asset definition.
dgm.yield_result(output, output_name='preprocessed_dataset_with_tokenizer')

{'datasets': DatasetDict({
     train: Dataset({
         features: ['label', 'input_ids', 'attention_mask'],
         num_rows: 1440000
     })
     val: Dataset({
         features: ['label', 'input_ids', 'attention_mask'],
         num_rows: 112000
     })
     test: Dataset({
         features: ['label', 'input_ids', 'attention_mask'],
         num_rows: 48000
     })
 }),
 'tokenizer': PreTrainedTokenizerFast(name_or_path='tiiuae/falcon-7b', vocab_size=65024, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['>>TITLE<<', '>>ABSTRACT<<', '>>INTRODUCTION<<', '>>SUMMARY<<', '>>COMMENT<<', '>>ANSWER<<', '>>QUESTION<<', '>>DOMAIN<<', '>>PREFIX<<', '>>SUFFIX<<', '>>MIDDLE<<']}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
 	0: AddedToken(">>TITLE<<", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 	1: 