In [None]:
import numpy   as np 
import pandas  as pd 

# #!pip install -U pandera
# #!pip install -U transformers
# #!pip install -U datasets
# import pandera as pa 
# from   transformers import pipeline, DataCollatorWithPadding
# from   transformers import DistilBertTokenizer, DistilBertForSequenceClassification
# from   datasets import load_dataset, load_metric, Dataset 
# #from  deepchecks.tabular import Dataset

In [None]:
# import keebler 
# from keebler.data.utils import trsfrm_dt
# from keebler.core.io.utils import trsfrm_to_camel

In [None]:
class ModelBase(object):
    """Bundle a DistilBERT tokenizer, padding collator and sequence classifier.

    Parameters
    ----------
    class_map : dict
        Mapping of integer class id to label name. Its length sets the number
        of labels on the classification head.
    model_name : str
        Checkpoint name or path handed to each ``from_pretrained`` call.

    Attributes
    ----------
    tokenizer
        ``DistilBertTokenizer`` loaded from ``model_name``.
    data_collator
        ``DataCollatorWithPadding`` built around ``tokenizer``.
    seq_clf_model
        ``DistilBertForSequenceClassification`` with ``len(class_map)`` labels.
    """

    def __init__(self, class_map:dict, model_name:str):
        super().__init__()
        # Keep the tokenizer and collator on the instance: as bare locals they
        # were thrown away as soon as __init__ returned.
        self.tokenizer      = DistilBertTokenizer.from_pretrained(model_name)
        self.data_collator  = DataCollatorWithPadding(tokenizer=self.tokenizer)
        # load pretrained model with dedicated class map
        num_labels          = len(class_map)
        self.seq_clf_model  = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        self.seq_clf_model.config.id2label = class_map
        # Keep the reverse lookup consistent with id2label.
        self.seq_clf_model.config.label2id = {label: idx for idx, label in class_map.items()}

class ModelStore(object):
    """Push an artifact to the Hugging Face Hub when constructed.

    Parameters
    ----------
    artifact
        Any object exposing ``push_to_hub`` (a plain ``str`` has no such
        method, so the previous ``str`` annotation was dropped).
    repo_id : str, default "profoz/covid"
        Target repository, passed as the first argument to ``push_to_hub``.
    revision : str, default '1'
        Revision passed to ``push_to_hub``.
    auth_token : optional
        Token passed as ``use_auth_token``. When ``None`` the module-level
        ``api_key`` is used, exactly as before.

    Attributes
    ----------
    artifact
        The object that was pushed.
    """

    def __init__(self, artifact, repo_id:str="profoz/covid", revision:str='1', auth_token=None):
        super().__init__()
        self.artifact = artifact
        # Fall back to the global only when no explicit token is supplied, so
        # existing ModelStore(artifact) calls behave as they did.
        token = api_key if auth_token is None else auth_token
        artifact.push_to_hub(
            repo_id,
            use_auth_token=token,
            use_temp_dir=True,
            revision=revision
        )

# Define the checkpoint and label map before anything that depends on them.
model_name = 'distilbert-base-uncased'
class_map  = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
model      = ModelBase(class_map, model_name)

# `bert_tokenizer` was never bound at module level, so the push below raised
# NameError; load the tokenizer explicitly from the same checkpoint first.
bert_tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# NOTE(review): `api_key` is not defined in the visible cells. It must be set
# before this runs -- read it from the environment rather than hard-coding it.
bert_tokenizer.push_to_hub(
    "profoz/covid",
    use_auth_token=api_key,
    use_temp_dir=True,
    revision='1'
)


In [None]:
# Load the CSV's 'train' split as a Hugging Face dataset, record its size,
# and mirror it into pandas.
data_dict = dict(train="/workspaces/keebler-studio/data/chemical-formula/CNH_033_full.csv")
dataset   = load_dataset("csv", data_files=data_dict, delimiter=",", split='train')
metrics   = {"num_rows": dataset.num_rows, "num_columns": dataset.num_columns}
# "Unnamed: 0" is presumably an index column written out by an earlier
# to_csv -- TODO confirm; it is dropped from the pandas copy only.
df        = Dataset.to_pandas(dataset).drop(columns="Unnamed: 0")

# Toy frame for the schema demos: 100 rows with an ascending int column,
# a descending float column and a constant string column.
df_schema_test = pd.DataFrame({
    "int_col":   list(range(100)),
    "float_col": [float(i) for i in range(99, -1, -1)],
    "str_col":   ["hi" for _ in range(100)],
})

# Object-based API, simple schema: only pins the dtype of each column.
schema_simple = pa.DataFrameSchema(
    columns={
        "int_col":   pa.Column(int),
        "float_col": pa.Column(float),
        "str_col":   pa.Column(str),
    }
)
# Object-based API, check-based schema: each column also carries a validation rule.
# Custom callables are possible too, e.g. pa.Check(lambda series: series >= 0)
# or pa.Check(lambda series: series.mean() > 0).
schema_check = pa.DataFrameSchema(
    columns={
        "int_col":   pa.Column(int,   checks=pa.Check.ge(0)),
        "float_col": pa.Column(float, checks=pa.Check.lt(101)),
        "str_col":   pa.Column(str,   checks=pa.Check.eq("hi")),
    }
)
#class-based API: pydantic style 
# `DataFrameModel` is the current name of pandera's class-based schema base;
# `SchemaModel` is its older, deprecated alias and is used only as a fallback
# on pandera releases that predate the rename.
try:
    _SchemaBase = pa.DataFrameModel
except AttributeError:
    _SchemaBase = pa.SchemaModel

# Class-based equivalent of `schema_check`: same three column rules, plus an
# integer index.
class Schema(_SchemaBase):
    int_col:pa.typing.Series[int]     = pa.Field(ge=0)      # values must be >= 0
    float_col:pa.typing.Series[float] = pa.Field(lt=101)    # values must be < 101
    str_col:pa.typing.Series[str]     = pa.Field(eq="hi")   # values must equal "hi"
    index:pa.typing.Index[int]

# Schema.to_schema()
# custom check, like a UDF implementation rather than inline 
@pa.check_types(lazy=True)
def test_schema(df: pa.typing.DataFrame[Schema]) -> pa.typing.DataFrame[Schema]:
    """Return ``df`` unchanged.

    The body does no work of its own: the ``pa.check_types(lazy=True)``
    decorator together with the ``DataFrame[Schema]`` annotations is what
    validates the frame on the way in and on the way out.
    """
    validated = df
    return validated
# @pa.check 
# def custom_check(cls, series:pd.Series) -> pd.Series:
#     return series >= 0 
# Corrupt rows 0..5 (`.loc` slices are label-based and inclusive) so the
# `ge=0` rule on int_col has something to reject.
df_schema_test.loc[:5, "int_col"] = -1
# The validation failure is the point of this demo: catch it and show the
# report instead of letting the cell abort a Restart & Run All.
try:
    print( test_schema(df_schema_test) )
except pa.errors.SchemaErrors as exc:
    print(exc.failure_cases)


# # if the dataframe passes the checks, validation should just return the dataframe
# # on error, the report gives the index and the failure case: the failing values and the condition that failed
# # on further corruption of the dataframe, it will error out immediately (SchemaError)
# try: 
#     report = schema_simple(df_schema_test, lazy=True)
# except pa.errors.SchemaErrors as exc:
#     failure_cases = exc.failure_cases
#     print(failure_cases)

In [None]:
# Goal: ml models built to be goal seeking, but goal needs to be in terms of business outcome
# - if you are not clear about the goal, you are not building the right stuff

# Text (pre-processed) into a format the model can understand
# - SuperAgent to collect and sample more representative samples
# Pre-Processed: passed to the model
# Post-Processed: predictions are post-processed so you can make sense of time

# Measurement
# - If we don't know how to measure it, we don't build any solution
# - What's the definition of good, why are we building this model

# Monitoring
# Human and Machine Partnership
# - Person involved in understanding and monitoring the models   
# Testing: Unintended Consequences, exposed to the risks, companion diagnostics, ensure integrity of the AI model
# - Risk Level Assessment across Use Cases *****
# - Risk Level on data and knowledge (e.g., privacy)
# - High Cost Curve
# Integrity: Design a solution natively with integrity in mind
# - models should not fail on missing features
# - identify bias: data is biased, sampling is biased. 
# - measurement: take the first step to measure (historically, where to be, set targets for where we want to be).
# - measurement: how to measure diversity
# - bias, measurement, understanding, and shaping to ensure no negative outcomes that are inappropriate.
# - corrective action: corrective action



def exec_pipeline(text:str, name:str='ner'):
    """Placeholder for executing a pipeline over ``text``.

    Not implemented yet: both arguments are accepted but ignored, and the
    function always returns ``None``.

    Parameters
    ----------
    text : str
        Input text (currently unused).
    name : str, default 'ner'
        Presumably the pipeline task name (currently unused) -- TODO confirm.

    Examples
    --------
    Usage sketches for the kind of call this is meant to wrap. They require
    a ``pipeline`` callable in scope and are skipped by doctest.

    >>> ner = pipeline("ner", grouped_entities=True)  # doctest: +SKIP
    >>> ner("My name is Sylvain and I work at Hugging Face in Brooklyn.")  # doctest: +SKIP

    >>> cls = pipeline("zero-shot-classification")  # doctest: +SKIP
    >>> cls(  # doctest: +SKIP
    ...     "This is a course about the Transformers library",
    ...     candidate_labels=["education", "politics", "business"],
    ... )

    >>> cls = pipeline("sentiment-analysis")  # doctest: +SKIP
    >>> cls("I've been waiting for a HuggingFace course my whole life.")  # doctest: +SKIP
    >>> cls(["I've been waiting for a HuggingFace course my whole life.", "I hate this so much!"])  # doctest: +SKIP
    """
    return None

# springboard, innovation risk
# speed and innovation, prepared for the worst case
# assurance to business: reputation risk is managed (when adopting AI), nothing bad will happen, impact on revenue
# how to help me to grow
# Cost of being wrong: which is more controlled vs live and die (higher stakes), which ones are tolerable
# OKRs

# `.map` with a dict-returning callback and `.filter` with a row predicate are
# the `datasets.Dataset` API, so the chain starts from `dataset` (the Hugging
# Face dataset loaded above), not from the pandas frame previously bound to `df`.
# NOTE(review): assumes the dataset has 'comments' and 'title' string columns -- confirm.
df = dataset.map(
    # number of whitespace-separated tokens in the comment
    lambda x: {'text_len': len(x['comments'].split())}
).filter(
    # was x['text_len' > 2], which compares a str with an int and raises TypeError
    lambda x: x['text_len'] > 2
)
df = df.map(
    # comment and title, each terminated by a newline ("/n" was a typo for "\n")
    lambda x: {'text': x['comments'] + "\n" + x['title'] + "\n"}
)


#metadata = len(ds.column_names), ds.version, ds.citation
#df = df.shuffle(seed=42).select(range(1000))
#df.head()