In [1]:
import pandas as pd
import dagstermill as dgm
import numpy as np 
import re
import mlflow as mf
import ftzard.utils.mlflow as mf_utils
from omegaconf import DictConfig, OmegaConf
from hydra import initialize, compose
from ftzard.utils.common import get_current_date_time
from dagster import file_relative_path
import os

In [2]:
# Root of the project inside the container; the other paths are derived from it.
base_path = '/app/ftzard'
config_path = f'{base_path}/config/'
# Expose the config directory as "config_link" in the working directory; a later
# cell passes that name to hydra's initialize().
try:
    os.symlink(config_path, "config_link")
except FileExistsError:
    # Expected on every run after the first: the link is already in place.
    print("Symlink already created...")
except OSError as e:
    # Any other failure (permissions, read-only filesystem, ...) is reported with
    # its real cause instead of being mislabelled as "already created".
    print(f"Could not create config symlink: {e}")
relative_config_path = file_relative_path(os.getcwd(), '../../config')
data_path = f"{base_path}/data/training.csv"
config_name = 'config'

Symlink already created...


In [3]:
# Compose the hydra configuration through the symlinked config directory and
# pull out the two MLflow settings used by the rest of the notebook.
with initialize(version_base=None, config_path="config_link"):
    cfg = compose(config_name=config_name)
    tracking_uri = cfg.MLFLOW.TRACKING.URI
    experiment_name = cfg.MLFLOW.EXPERIMENT.NAME


In [4]:
# Point MLflow at the tracking server taken from the composed config.
os.environ['MLFLOW_TRACKING_URI'] = tracking_uri
# Fixed name of the parent run; the nested run is named after the current timestamp.
base_run_name = 'DATA CLEANING'
run_name = get_current_date_time()
# Echo the settings this execution will use.
for _label, _value in (
    ("Base Run Name: ", base_run_name),
    ('Data Path: ', data_path),
    ('Mlflow Experiment Name: ', experiment_name),
    ('Mlflow Run Name: ', run_name),
):
    print(_label, _value)

Base Run Name:  DATA CLEANING
Data Path:  /app/ftzard/data/training.csv
Mlflow Experiment Name:  senetiment_analysis
Mlflow Run Name:  2024-07-01_8:1


In [5]:
# Load the raw training CSV, using its unnamed first column as the index.
data = pd.read_csv(
    data_path,
    encoding='latin-1',
    index_col='Unnamed: 0',
)

In [6]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,target,id,date,no_query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [7]:
# Summarise column dtypes, non-null counts and memory usage of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   target    1600000 non-null  int64 
 1   id        1600000 non-null  int64 
 2   date      1600000 non-null  object
 3   no_query  1600000 non-null  object
 4   user      1600000 non-null  object
 5   text      1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 85.4+ MB


In [8]:
# Force the free-text columns to str so the string-cleaning helpers can be
# applied to every row.
for _column in ('user', 'text', 'no_query'):
    data[_column] = data[_column].astype('str')

In [9]:
def remove_at_the_rate(string):
    """Rewrite every ``@name`` mention in ``string`` as ``user name``.

    A mention is an ``@`` followed by one or more word characters. Text
    without mentions is returned unchanged.
    """
    # The captured group is the handle without its leading "@".
    return re.sub(r'@(\w+)', r'user \1', string)

def remove_special_characters(string):
    """Strip every character that is not an ASCII letter, digit, dot or space."""
    return re.sub(r'[^a-zA-Z0-9. ]', '', string)


def helper(string):
    """Clean one tweet: rewrite ``@`` mentions first, then drop special characters."""
    mentions_rewritten = remove_at_the_rate(string)
    return remove_special_characters(mentions_rewritten)

In [11]:
experiment_id = mf_utils.create_experiment(exp_name=experiment_name)
print('Experiment Id: ', experiment_id)
# Look up an existing parent run with this name so it can be resumed.
# Presumably the helper returns a falsy value when no run matches -- verify in mf_utils.
base_run_id = mf_utils.get_run_id_by_name(run_name=base_run_name, experiment_ids=[experiment_id])

with mf.start_run(run_name=base_run_name, experiment_id=experiment_id, run_id=base_run_id) as base_run:
    # When the parent run was just created the lookup above found nothing, so
    # take the id from the active run; downstream metadata then always carries it.
    base_run_id = base_run.info.run_id

    run_id = mf_utils.get_run_id_by_name(run_name=run_name, experiment_ids=[experiment_id], nested=True)
    # A falsy run_id makes MLflow create a fresh nested run; otherwise the existing
    # one is resumed. The context manager ends the nested run even if cleaning
    # raises, so the parent run is never left with a dangling child.
    with mf.start_run(run_id=run_id or None, run_name=run_name, experiment_id=experiment_id, nested=True) as nested_run:
        run_id = nested_run.info.run_id
        print('Run Id: ', run_id)

        # Rewrite mentions and strip special characters from every tweet.
        text = data['text'].apply(helper)
        print(text[0])
        data['text'] = text

The provided experiment name senetiment_analysis already exists, the run will be logged in this experiment.
                                 
Experiment Id:  1
Run Id:  be61f36a158b4affb5d564eece987b80
user switchfoot httptwitpic.com2y1zl  Awww thats a bummer.  You shoulda got David Carr of Third Day to do it. D


In [16]:
# data[['text', 'target']].to_csv(f"{base_path}/data/cleaned_data.csv", index = False)
# Names and ids of the parent and nested MLflow runs used by this notebook.
metadata = dict(
    run_name=run_name,
    run_id=run_id,
    base_run_id=base_run_id,
    base_run_name=base_run_name,
)

In [17]:
# Hand the notebook's results to dagstermill under their output names:
# the cleaned text with its target label, and the MLflow run metadata.
dgm.yield_result(data[['text', 'target']], output_name='cleaned_data')
dgm.yield_result(metadata, output_name='step1_run_metadata')

{'run_name': '2024-07-01_8:1',
 'run_id': 'be61f36a158b4affb5d564eece987b80',
 'base_run_id': 'e442f5277fc34af994196f82669c773f',
 'base_run_name': 'DATA CLEANING'}