In [1]:
import pandas as pd
import dagstermill as dgm
import numpy as np 
import re
import mlflow as mf
import ftzard.utils.mlflow as mf_utils
from omegaconf import DictConfig, OmegaConf
from hydra import initialize, compose
import os

In [2]:
# Root of the local FTzard checkout that the data path is built from.
base_path = 'E:/FTzard/ftzard'

# Hydra settings: config directory and the name of the config to compose.
config_path = '../../config/'
config_name = 'mlflow_experiment'

# Raw sentiment-analysis training CSV loaded later in the notebook.
data_path = base_path + "/data/raw_data/sentiment_analysis/training.csv"

In [3]:
# Compose the Hydra config and pull out the two MLflow settings used below.
with initialize(config_path=config_path, version_base=None):
    cfg = compose(config_name=config_name)
    tracking_uri = cfg.MLFLOW_TRACKING_URI
    experiment_name = cfg.MLFLOW_EXPERIMENT_NAME
    

In [4]:
# Expose the tracking URI through the environment variable MLflow reads.
os.environ['MLFLOW_TRACKING_URI'] = tracking_uri
run_name = 'data_cleaning'

# Echo the settings this run will use.
for label, value in (
    ('Data Path: ', data_path),
    ('Mlflow Experiment Name: ', experiment_name),
    ('Mlflow Run Name: ', run_name),
):
    print(label, value)

Data Path:  E:/FTzard/ftzard/data/raw_data/sentiment_analysis/training.csv
Mlflow Experiment Name:  senetiment_analysis
Mlflow Run Name:  data_cleaning


In [5]:
# Load the raw training CSV; its 'Unnamed: 0' column becomes the index.
data = pd.read_csv(
    data_path,
    index_col='Unnamed: 0',
    encoding='latin-1',
)

In [6]:
# Preview the first rows of the raw frame before any cleaning.
data.head()

Unnamed: 0,target,id,date,no_query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [7]:
# Show column dtypes and non-null counts for the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   target    1600000 non-null  int64 
 1   id        1600000 non-null  int64 
 2   date      1600000 non-null  object
 3   no_query  1600000 non-null  object
 4   user      1600000 non-null  object
 5   text      1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 85.4+ MB


In [8]:
# Cast the text-like columns to str so the string cleaning below can rely on it.
for column in ('user', 'text', 'no_query'):
    data[column] = data[column].astype('str')

In [9]:
def remove_at_the_rate(string):
    """Rewrite ``@name`` mentions as ``user name`` and lower-case the text.

    Every ``@`` that is immediately followed by one or more word characters
    is replaced by ``user `` followed by those characters. An ``@`` that is
    not followed by a word character is left untouched.

    Args:
        string: The text to clean.

    Returns:
        The lower-cased text with mentions rewritten.
    """
    # A single substitution pass with a back-reference gives the same result
    # as collecting the matches and calling str.replace once per match, and
    # it avoids nesting same-type quotes inside an f-string, which is a
    # SyntaxError before Python 3.12.
    return re.sub(r'@(\w+)', r'user \1', string).lower()

def remove_special_characters(string):
    """Strip every character that is not an ASCII letter, digit, dot or space.

    Args:
        string: The text to clean.

    Returns:
        The text with all other characters removed.
    """
    return re.sub(r'[^a-zA-Z0-9. ]', '', string)


def helper(string):
    """Clean one text value: rewrite mentions, then strip special characters.

    Args:
        string: The raw text to clean.

    Returns:
        The result of ``remove_special_characters`` applied to the output of
        ``remove_at_the_rate``.
    """
    with_mentions_rewritten = remove_at_the_rate(string)
    return remove_special_characters(with_mentions_rewritten)

In [10]:
# Look up the MLflow experiment id and any existing run id for this run name.
# NOTE(review): the exact behaviour of the mf_utils helpers is not visible
# here; presumably they create-or-fetch the experiment and return a falsy
# run id when no matching run exists. Confirm against ftzard.utils.mlflow.
experiment_id = mf_utils.create_experiment(exp_name=experiment_name)
print('Experiment Id: ', experiment_id)
run_id = mf_utils.get_run_id_by_name(run_name=run_name, experiment_ids=[experiment_id])
print('Run Id: ', run_id)

# Pass run_id only when one was found; all other arguments are the same
# whether the run is resumed or newly started.
start_run_kwargs = {'run_name': run_name, 'experiment_id': experiment_id}
if run_id:
    start_run_kwargs['run_id'] = run_id

# Use the run as a context manager so it is always ended, even when the
# cleaning step raises; a bare start_run()/end_run() pair would leave the
# run active in the kernel after an error.
with mf.start_run(**start_run_kwargs):
    text = data['text'].apply(helper)
    print(text[0])
    data['text'] = text

The provided experiment name senetiment_analysis already exists, the run will be logged in this experiment.
                                 
Experiment Id:  1
Run Id:  6f2f7a3538de462eb0137477aa1fcf7d
user switchfoot httptwitpic.com2y1zl  awww thats a bummer.  you shoulda got david carr of third day to do it. d


In [12]:
# Hand the 'text' and 'target' columns to dagstermill as the 'cleaned_data' output.
dgm.yield_result(data[['text', 'target']], output_name='cleaned_data')

Unnamed: 0,text,target
0,user switchfoot httptwitpic.com2y1zl awww tha...,0
1,is upset that he cant update his facebook by t...,0
2,user kenichan i dived many times for the ball....,0
3,my whole body feels itchy and like its on fire,0
4,user nationwideclass no its not behaving at al...,0
...,...,...
1599995,just woke up. having no school is the best fee...,4
1599996,thewdb.com very cool to hear old walt intervi...,4
1599997,are you ready for your mojo makeover ask me fo...,4
1599998,happy 38th birthday to my boo of alll time tup...,4
