In [2]:
import os
import torch
from transformers import (
    AutoTokenizer, DataCollatorWithPadding
)
import mlflow as mf
import pandas as pd
from hydra import initialize, compose
import ftzard.utils.mlflow as mf_utils
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import joblib
import dagstermill as dgm


In [3]:
# Locations of the project config and the cleaned CSV inside the container.
base_path = '/app/ftzard'
config_path = f'{base_path}/config/'

# Expose the config directory under a local name ("config_link") in the current
# working directory; the Hydra initialisation below loads the config through it.
try:
    os.symlink(config_path, "config_link")
except FileExistsError:
    # Only the "link is already there" case is expected on a re-run. Any other
    # failure (permissions, read-only filesystem, ...) must surface here instead
    # of being reported as "already created" and failing later inside Hydra.
    print("Symlink already created...")

data_path = f"{base_path}/data/cleaned_data.csv"
config_name = 'config'

In [4]:
# Compose the Hydra config via the local "config_link" directory and pull out
# the two MLflow settings used by the rest of the notebook.
with initialize(config_path="config_link", version_base=None):
    cfg = compose(config_name=config_name)
    tracking_uri = cfg.MLFLOW_TRACKING_URI
    experiment_name = cfg.MLFLOW_EXPERIMENT_NAME
    

In [5]:
# Settings for this data-processing step.
run_name = 'data_processing'
model_name = cfg['model_name']
max_len = 1024  # upper bound on tokens per example; used as max_length when tokenizing

# Publish the tracking URI from the config through the environment variable.
os.environ['MLFLOW_TRACKING_URI'] = tracking_uri

# Echo the effective settings so they are recorded in the notebook output.
for _label, _value in (
    ('Data Path: ', data_path),
    ('Mlflow Experiment Name: ', experiment_name),
    ('Mlflow Run Name: ', run_name),
    ('Model Name: ', model_name),
):
    print(_label, _value)

Data Path:  ../../data/cleaned_data.csv
Mlflow Experiment Name:  senetiment_analysis
Mlflow Run Name:  data_processing
Model Name:  tiiuae/falcon-7b


In [6]:
# Load the cleaned CSV into a DataFrame; the file is decoded as latin-1.
data_cleaned = pd.read_csv(data_path, encoding='latin-1')

In [7]:
# Preview the first rows of the loaded data (text plus raw target label).
data_cleaned.head()

Unnamed: 0,text,target
0,user switchfoot httptwitpic.com2y1zl Awww tha...,0
1,is upset that he cant update his Facebook by t...,0
2,user Kenichan I dived many times for the ball....,0
3,my whole body feels itchy and like its on fire,0
4,user nationwideclass no its not behaving at al...,0


In [8]:
# Check row count, column dtypes and non-null counts before transforming labels.
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   text    1600000 non-null  object
 1   target  1600000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [9]:
# Relabel the positive class from 4 to 1 so targets are {0, 1}.
# `replace` is vectorised and idempotent: re-running this cell leaves labels
# that are already 0/1 untouched (subtracting 3 a second time would turn 1 into -2).
data_cleaned['target'] = data_cleaned['target'].replace({4: 1})

In [10]:
# Preview the frame after relabelling. The first rows are all class 0, so the
# value_counts() cell that follows is the real check of the new labels.
data_cleaned.head()

Unnamed: 0,text,target
0,user switchfoot httptwitpic.com2y1zl Awww tha...,0
1,is upset that he cant update his Facebook by t...,0
2,user Kenichan I dived many times for the ball....,0
3,my whole body feels itchy and like its on fire,0
4,user nationwideclass no its not behaving at al...,0


In [11]:
# Confirm the class balance and that only the expected label values remain.
data_cleaned['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

In [12]:
# List the available columns; the cells below select 'text' and 'target'.
data_cleaned.columns

Index(['text', 'target'], dtype='object')

In [13]:
# Keep a stratified 1% sample of the full data to make tokenization and
# fine-tuning tractable. The split is seeded with 42 (the same value this
# notebook logs to MLflow as 'random seed') so the sample is reproducible.
# The discarded remainder is bound to a regular name rather than `_`, which
# IPython uses for the last cell output.
data_reduced, _unused_remainder = train_test_split(
    data_cleaned[['text', 'target']],
    train_size=0.01,
    stratify=data_cleaned['target'],
    random_state=42,
)

In [14]:
# Verify the stratified sample kept both classes balanced.
data_reduced['target'].value_counts()

target
1    8000
0    8000
Name: count, dtype: int64

In [15]:
# Split the reduced sample into train (90%) and a 10% holdout, then split the
# holdout 70/30 into validation and test. Both splits are stratified on the
# label and seeded with 42 (the value logged to MLflow as 'random seed') so the
# three partitions are identical on every Restart & Run All.
train_data, holdout_data = train_test_split(
    data_reduced[['text', 'target']],
    test_size=0.1,
    stratify=data_reduced['target'],
    random_state=42,
)
validation_data, test_data = train_test_split(
    holdout_data,
    test_size=0.3,
    stratify=holdout_data['target'],
    random_state=42,
)

print('Examples in Training data:', len(train_data))
print('Examples in Validation data:', len(validation_data))
print('Examples in Test data:', len(test_data))

Examples in Training data: 14400
Examples in Validation data: 1120
Examples in Test data: 480


In [16]:
# Show the class balance of each partition, in train / validation / test order.
for _split in (train_data, validation_data, test_data):
    print(_split['target'].value_counts())

target
1    7200
0    7200
Name: count, dtype: int64
target
0    560
1    560
Name: count, dtype: int64
target
0    240
1    240
Name: count, dtype: int64


In [17]:
# Resolve the MLflow experiment and look for an earlier run with this name, so
# that re-executing the notebook logs into that run instead of creating a new one.
experiment_id = mf_utils.create_experiment(exp_name=experiment_name)
print('Experiment Id: ', experiment_id)
run_id = mf_utils.get_run_id_by_name(run_name=run_name, experiment_ids=[experiment_id])
print('Run Id: ', run_id)

# Columns dropped after tokenization: the raw text and the pandas index column
# that Dataset.from_pandas adds for these (non-default-indexed) frames.
rm_cols = ['__index_level_0__', 'text']

# Using the run as a context manager guarantees it is closed even when dataset
# construction or tokenization raises; a bare start_run()/end_run() pair would
# leave the run active in the kernel after a failure.
# `run_id or None` resumes the earlier run when one was found and starts a new
# run otherwise.
with mf.start_run(run_id=run_id or None, run_name=run_name, experiment_id=experiment_id) as active_run:
    # Keep the id of the run actually in use, so later cells do not need to look
    # it up by name again when the run was newly created.
    run_id = active_run.info.run_id

    mf.log_param(key='max_sequence_length', value=max_len)
    mf.log_param(key='random seed', value=42)
    # Fraction of the reduced sample used for training.
    mf.log_param(key='train_split_size', value =0.9)
    # Fraction of the 10% holdout used for the test set (the rest is validation).
    mf.log_param(key='test_split_size', value =0.3)

    # Convert the pandas splits to Hugging Face datasets.
    train_ds = Dataset.from_pandas(train_data)
    val_ds = Dataset.from_pandas(validation_data)
    test_ds = Dataset.from_pandas(test_data)

    # Only the training split is shuffled, with a fixed seed.
    train_ds_shuffled = train_ds.shuffle(seed=42)

    final_ds = DatasetDict({
        'train': train_ds_shuffled,
        'val': val_ds,
        'test': test_ds
    })
    # Second name for the same object, kept because both names were defined by
    # this cell originally.
    dataset = final_ds

    print(final_ds)

    tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
    # Reuse the end-of-sequence token for padding.
    # NOTE(review): presumably the model's tokenizer defines no pad token of its own - confirm.
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token

    def tokenize(examples):
        """Tokenize a batch of examples' 'text' field, truncating to max_len tokens.

        No padding is applied here; sequences keep their own length.
        """
        return tokenizer(examples['text'], truncation=True, max_length=max_len)

    # Padding collator built from the tokenizer; it is not used by any cell
    # visible in this notebook.
    collator = DataCollatorWithPadding(tokenizer=tokenizer)

    tokenized_datasets = dataset.map(tokenize, batched=True, remove_columns=rm_cols)
    tokenized_datasets = tokenized_datasets.rename_column("target", "label")
    # Return torch tensors when the datasets are indexed.
    tokenized_datasets.set_format("torch")

The provided experiment name senetiment_analysis already exists, the run will be logged in this experiment.
                                 
Experiment Id:  1


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Run Id:  a1165194420b41728de8f833adb5e493
DatasetDict({
    train: Dataset({
        features: ['text', 'target', '__index_level_0__'],
        num_rows: 14400
    })
    val: Dataset({
        features: ['text', 'target', '__index_level_0__'],
        num_rows: 1120
    })
    test: Dataset({
        features: ['text', 'target', '__index_level_0__'],
        num_rows: 480
    })
})


Map: 100%|████████████████████████████████████████████████| 14400/14400 [00:00<00:00, 43431.41 examples/s]
Map: 100%|██████████████████████████████████████████████████| 1120/1120 [00:00<00:00, 57750.77 examples/s]
Map: 100%|████████████████████████████████████████████████████| 480/480 [00:00<00:00, 48636.66 examples/s]


In [18]:
# Payload for the downstream step: the tokenized splits under a single key.
output = dict(datasets=tokenized_datasets)

In [None]:
# Disabled alternative: persist the payload to disk with joblib.
# with open(f"{base_path}/data/tokenized_dataset.joblib", "wb") as file:
#     joblib.dump(output, file)

# Run metadata passed to later steps. When no run id was found before the run
# was started, look it up again by name now that the run exists.
metadata = dict(
    run_name=run_name,
    run_id=run_id or mf_utils.get_run_id_by_name(run_name=run_name, experiment_ids=[experiment_id]),
)

In [19]:
# Hand both results to dagstermill under the named outputs 'tokenized_dataset'
# and 'step2_run_metadata'.
# NOTE(review): these names presumably have to match the outputs declared on the
# Dagster side - confirm against the op/asset definition.
dgm.yield_result(output, output_name='tokenized_dataset')
dgm.yield_result(metadata, output_name='step2_run_metadata')

{'datasets': DatasetDict({
     train: Dataset({
         features: ['label', 'input_ids', 'attention_mask'],
         num_rows: 14400
     })
     val: Dataset({
         features: ['label', 'input_ids', 'attention_mask'],
         num_rows: 1120
     })
     test: Dataset({
         features: ['label', 'input_ids', 'attention_mask'],
         num_rows: 480
     })
 })}