In [23]:
import os
import torch
from transformers import (
    AutoTokenizer, DataCollatorWithPadding
)
import re
import mlflow as mf
import pandas as pd
from hydra import initialize, compose
import ftzard.utils.mlflow as mf_utils
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from sklearn.model_selection import train_test_split
from ftzard.utils.dvc import get_current_date_time
import joblib
import dagstermill as dgm


In [10]:
# Filesystem locations used by this step. The config directory is exposed
# through a local "config_link" symlink, which the Hydra initialize() call
# later in the notebook uses as its config_path.
base_path = '/app/ftzard'
config_path = f'{base_path}/config/'
try:
    os.symlink(config_path, "config_link")
except FileExistsError:
    # Expected on re-runs: the link from a previous execution is still present.
    # Any other OSError (permissions, invalid path) is a real problem and is
    # allowed to propagate instead of being reported as "already created".
    print("Symlink already created...")
data_path = f"{base_path}/data/training.csv"
config_name = 'config'

Symlink already created...


In [3]:
# Compose the Hydra configuration via the local symlink and read the MLflow
# tracking settings out of it.
with initialize(version_base=None, config_path="config_link"):
    cfg = compose(config_name=config_name)
    tracking_uri = cfg.MLFLOW.TRACKING.URI
    experiment_name = cfg.MLFLOW.EXPERIMENT.NAME
    

In [11]:
# Run-level settings: MLflow endpoint, run names, model name and token limit.
os.environ['MLFLOW_TRACKING_URI'] = tracking_uri
# NOTE: the spelling of the base run name is kept as-is because the name is
# used further down to look up an existing MLflow run.
base_run_name = "INFERNECE DATA PROCESSING"
run_name = get_current_date_time()
model_name = cfg.HUGGINGFACE.MODEL.NAME
max_len = 1024

# Echo the effective settings so they are recorded in the executed notebook.
for label, value in (
    ("Base Run Name: ", base_run_name),
    ('Data Path: ', data_path),
    ('Mlflow Experiment Name: ', experiment_name),
    ('Mlflow Run Name: ', run_name),
    ('Model Name: ', model_name),
):
    print(label, value)

Base Run Name:  INFERNECE DATA PROCESSING
Data Path:  /app/ftzard/data/training.csv
Mlflow Experiment Name:  senetiment_analysis
Mlflow Run Name:  2024-07-03_6:52
Model Name:  tiiuae/falcon-7b


In [17]:
# Load the raw CSV, decoding it explicitly as latin-1.
data = pd.read_csv(filepath_or_buffer=data_path, encoding="latin-1")

In [18]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0.1,Unnamed: 0,target,id,date,no_query,user,text
0,0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [19]:
# Keep only the "text" column; the remaining columns are not used below.
data = data.loc[:, ["text"]]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 1 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   text    1600000 non-null  object
dtypes: object(1)
memory usage: 12.2+ MB


In [20]:
# Cast every entry to a string before the text-cleaning helpers are applied.
data["text"] = data["text"].astype(str)

In [21]:
def remove_at_the_rate(string):
    """Rewrite every ``@handle`` mention in ``string`` as ``user handle``.

    A mention is an ``@`` immediately followed by one or more word characters
    (``\\w+``). The substitution is done in a single pass over the input, so
    each mention is rewritten exactly once and text produced by one
    replacement is never picked up and rewritten again by a later one.

    Args:
        string: The text to process.

    Returns:
        The text with each ``@handle`` replaced by ``user handle``; input
        without mentions is returned unchanged.
    """
    return re.sub(r'@(\w+)', r'user \1', string)

def remove_special_characters(string):
    """Drop every character that is not an ASCII letter, a digit, a period or a space."""
    return re.sub(r'[^a-zA-Z0-9. ]', '', string)


def helper(string):
    """Clean one text value: rewrite @mentions first, then strip special characters."""
    mentions_rewritten = remove_at_the_rate(string)
    return remove_special_characters(mentions_rewritten)

In [25]:
# Close any MLflow runs still active in this kernel (e.g. from an earlier,
# interrupted execution); two calls cover a base run plus one nested run.
for _ in range(2):
    mf.end_run()
experiment_id = mf_utils.create_experiment(exp_name=experiment_name)
print('Experiment Id: ', experiment_id)
base_run_id = mf_utils.get_run_id_by_name(run_name=base_run_name, experiment_ids=[experiment_id])

with mf.start_run(run_name=base_run_name, experiment_id=experiment_id, run_id=base_run_id):
    run_id = mf_utils.get_run_id_by_name(run_name=run_name, experiment_ids=[experiment_id], nested=True)
    print('Run Id: ', run_id)
    # A run_id of None makes MLflow create a new run, so a single call covers
    # both "resume existing" and "create new". Using the nested run as a
    # context manager guarantees it is ended even if the cleaning step raises,
    # which in turn lets the outer block close the base run correctly.
    with mf.start_run(run_id=run_id, run_name=run_name, experiment_id=experiment_id, nested=True):
        # Apply the mention/special-character cleaning to every text value.
        text = data['text'].apply(helper)
        print(text[0])
        data['text'] = text

The provided experiment name senetiment_analysis already exists, the run will be logged in this experiment.
                                 
Experiment Id:  1
Run Id:  9149b374d0f74b1497c0ae8141aa5ab6
user switchfoot httptwitpic.com2y1zl  Awww thats a bummer.  You shoulda got David Carr of Third Day to do it. D


In [29]:
# Close any MLflow runs still active in this kernel before re-opening them.
for _ in range(2):
    mf.end_run()
with mf.start_run(run_id=base_run_id, run_name=base_run_name, experiment_id=experiment_id):
    run_id = mf_utils.get_run_id_by_name(run_name=run_name, experiment_ids=[experiment_id], nested=True)
    print('Run Id: ', run_id)
    # Open the nested run as a context manager so that it is ended when the
    # block finishes (or fails). Without this, the outer block's exit only
    # closes the innermost active run and the base run is left active.
    # A run_id of None makes MLflow create a new nested run.
    with mf.start_run(run_id=run_id, run_name=run_name, experiment_id=experiment_id, nested=True):
        mf.log_param(key='max_sequence_length', value=max_len)
        # NOTE(review): the seed is only logged here; nothing in this cell
        # consumes it. Confirm it matches the seed used elsewhere.
        mf.log_param(key='random seed', value=42)
        ds = Dataset.from_pandas(data)

        tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
        # Reuse the EOS token for padding (presumably the checkpoint defines
        # no dedicated pad token; confirm for other models).
        tokenizer.pad_token_id = tokenizer.eos_token_id
        tokenizer.pad_token = tokenizer.eos_token

        def tokenize(examples):
            """Tokenize a batch of examples, truncating each text to ``max_len`` tokens."""
            return tokenizer(examples['text'], truncation=True, max_length=max_len)

        rm_cols = ['text']
        collator = DataCollatorWithPadding(tokenizer=tokenizer)

        # Tokenize in batches and drop the raw text column from the result.
        tokenized_datasets = ds.map(tokenize, batched=True, remove_columns=rm_cols)
        tokenized_datasets.set_format("torch")

    
        

Run Id:  9149b374d0f74b1497c0ae8141aa5ab6


Map: 100%|████████████████████████████████████████████| 1600000/1600000 [00:40<00:00, 39131.93 examples/s]


In [30]:
# Show the resulting dataset summary (feature names and row count).
print(tokenized_datasets)

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 1600000
})


In [31]:
# Payload handed to the next step: the tokenized data is exposed under the "test" key.
output = {'datasets': {"test": tokenized_datasets}}

In [32]:
# MLflow identifiers for this step. If no run id was found earlier (the nested
# run was newly created), it is looked up by name here.
metadata = {
    "run_name": run_name,
    "run_id": run_id or mf_utils.get_run_id_by_name(run_name=run_name, experiment_ids=[experiment_id]),
    "base_run_id": base_run_id,
    "base_run_name": base_run_name,
}

In [33]:
# Emit the notebook's results through dagstermill under their output names.
# NOTE(review): the output names presumably have to match the outputs declared
# for this notebook in the pipeline definition — confirm against the caller.
dgm.yield_result(output, output_name='tokenized_dataset')
dgm.yield_result(metadata, output_name='step2_1_run_metadata')

{'run_name': '2024-07-03_6:52',
 'run_id': '9149b374d0f74b1497c0ae8141aa5ab6',
 'base_run_id': 'ed170880e9c14795bb3f0018be900f60',
 'base_run_name': 'INFERNECE DATA PROCESSING'}