In [1]:
# %pip install python-dotenv 
# %pip install langchain
# %pip install huggingface_hub
# %pip install ipywidgets
# %pip install py-readability-metrics

In [2]:
from dotenv import load_dotenv
from langchain_community.llms import HuggingFaceEndpoint

load_dotenv()

True

In [3]:
hub = HuggingFaceEndpoint(repo_id="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO")

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to C:\Users\alanj\.cache\huggingface\token
Login successful


In [4]:
from langchain import PromptTemplate

In [5]:
tasks_template = """<|im_start|>system
You are a helpful designer for a childrens digital game company<|im_end|>
<|im_start|>user
{name} is {age} years old and has a reading level age of {reading_level}. They need help with {teaching_task}.

Make a list of 10 suitable tasks to learn {teaching_task}.

Just list out each item 1 by 1 as a JSON list. Only provide the list of tasks, do not include the question or any other information, just the list of tasks.<|im_end|>
<|im_start|>assistant
"""

tasks_prompt = PromptTemplate(
    template = tasks_template,
    input_variables=["name", "age", "reading_level", "teaching_task"]
)

dialogue_template = """<|im_start|>system
You are a helpful designer for a childrens digital game company<|im_end|>
<|im_start|>user
{name} is {age} years old and has a reading level age of {reading_level}. They need help with {teaching_task}.

Question: What dialogue would be helpful in a game where the tasks are:

{tasks}

Answer: Let's think step by step.<|im_end|>
<|im_start|>assistant
"""

dialogue_prompt = PromptTemplate(
    template = dialogue_template, 
    input_variables=["name", "age", "reading_level", "teaching_task", "tasks"]
)

In [6]:
# Define your desired data structure.
from typing import List
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser


class Tasks(BaseModel):
    list: List = Field(description="list of tasks to learn a teaching task")

# Set up a parser + inject instructions into the prompt template.
parser = JsonOutputParser(pydantic_object=Tasks)

tasks_prompt_json = PromptTemplate(
    template=tasks_template,
    input_variables=["name", "age", "reading_level", "teaching_task"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

In [7]:
tasks_chain = tasks_prompt_json | hub 
# dialogue_chain = dialogue_prompt | hub.bind(stop="<|im_end|>")

# chain = (
#     tasks_chain | dialogue_chain
# )

# Langsmith

In [8]:
from langsmith import Client
client = Client()

## Create dataset

In [9]:
# text = """How to get ready for school
# How to identify shapes
# How to read body language
# How to understand facial expressions
# How to share with others
# How to empathise with others
# How to handle identify emotions
# How to deal with emotions
# How to spell their name
# How to solve a maze
# How to tidy up
# How to ask for help
# How to brush teeth
# How to use the toilet
# How to deal with loud noises
# How to deal with bright lights
# How to deal with strong smells
# How to make friends
# How to help others"""

# dataset = client.create_dataset(
#     dataset_name="DynoLearn Dataset",
#     description="Initial Prompts",
# )

# client.create_examples(
#     inputs=[{"name": "William", "age": '7', "reading_level": '5', "teaching_task": q} for q in text.split("")],
#     outputs=[],
#     dataset_id=dataset.id,
# )

## Evaluate

In [14]:
def write_to_file(string):
    with open("text.txt", "a") as t:
        t.writelines(string)
        t.writelines("\n")

In [11]:
from langchain.smith import RunEvalConfig
from langsmith.evaluation import EvaluationResult, run_evaluator
from readability import Readability

@run_evaluator
def flesch_readability_eval(run, example) -> EvaluationResult:
    try:
        score = Readability(run.outputs.get("output")).flesch().score
        write_to_file(str(run.inputs) + "\t" + run.outputs.get("output").replace("\n", " ") + "\t" + str(score))
    except Exception as e:
        print(e, end='\n\n')
        score = -1
    return EvaluationResult(key="flesch", score=score)

@run_evaluator
def flesch_kincaid_readability_eval(run, example) -> EvaluationResult:
    try:
        score = Readability(run.outputs.get("output")).flesch_kincaid().score
        write_to_file(str(run.inputs) + "\t" + run.outputs.get("output").replace("\n", " ") + "\t"*2 + str(score))
    except Exception as e:
        print(e, end='\n\n')
        score = -1
    return EvaluationResult(key="flesch_kincaid", score=score)

@run_evaluator
def gunning_fog_readability_eval(run, example) -> EvaluationResult:
    try:
        score = Readability(run.outputs.get("output")).gunning_fog().score
        write_to_file(str(run.inputs) + "\t" + run.outputs.get("output").replace("\n", " ") + "\t"*3 + str(score))
    except Exception as e:
        print(e, end='\n\n')
        score = -1
    return EvaluationResult(key="gunning_fog", score=score)

@run_evaluator
def coleman_liau_readability_eval(run, example) -> EvaluationResult:
    try:
        score = Readability(run.outputs.get("output")).coleman_liau().score
        write_to_file(str(run.inputs) + "\t" + run.outputs.get("output").replace("\n", " ") + "\t"*4 + str(score))
    except Exception as e:
        print(e, end='\n\n')
        score = -1
    return EvaluationResult(key="coleman_liau", score=score)

@run_evaluator
def dale_chall_readability_eval(run, example) -> EvaluationResult:
    try:
        score = Readability(run.outputs.get("output")).dale_chall().score
        write_to_file(str(run.inputs) + "\t" + run.outputs.get("output").replace("\n", " ") + "\t"*5 + str(score))
    except Exception as e:
        print(e, end='\n\n')
        score = -1
    return EvaluationResult(key="dale_chall", score=score)

@run_evaluator
def ari_readability_eval(run, example) -> EvaluationResult:
    try:
        score = Readability(run.outputs.get("output")).ari().score
        write_to_file(str(run.inputs) + "\t" + run.outputs.get("output").replace("\n", " ") + "\t"*6 + str(score))
    except Exception as e:
        print(e, end='\n\n')
        score = -1
    return EvaluationResult(key="ari", score=score)

@run_evaluator
def linsear_write_readability_eval(run, example) -> EvaluationResult:
    try:
        score = Readability(run.outputs.get("output")).linsear_write().score
        write_to_file(str(run.inputs) + "\t" + run.outputs.get("output").replace("\n", " ") + "\t"*7 + str(score))
    except Exception as e:
        print(e, end='\n\n')
        score = -1
    return EvaluationResult(key="linsear_write", score=score)
    

@run_evaluator
def smog_readability_eval(run, example) -> EvaluationResult:
    try:
        score = Readability(run.outputs.get("output")).smog().score
        write_to_file(str(run.inputs) + "\t" + run.outputs.get("output").replace("\n", " ") + "\t"*8 + str(score))
    except Exception as e:
        print(e, end='\n\n')
        score = -1
    return EvaluationResult(key="smog", score=score)

@run_evaluator
def spache_readability_eval(run, example) -> EvaluationResult:
    try:
        score = Readability(run.outputs.get("output")).spache().score
        write_to_file(str(run.inputs) + "\t" + run.outputs.get("output").replace("\n", " ") + "\t"*9 + str(score))
    except Exception as e:
        print(e, end='\n\n')
        score = -1
    return EvaluationResult(key="spache", score=score)

eval_config = RunEvalConfig(
    # eval_llm=eval_llm,
    custom_evaluators=[
        spache_readability_eval, 
        ari_readability_eval, 
        coleman_liau_readability_eval, 
        smog_readability_eval, 
        linsear_write_readability_eval,
        dale_chall_readability_eval,
        gunning_fog_readability_eval,
        flesch_kincaid_readability_eval,
        flesch_readability_eval
    ],
    # # You can also use a prebuilt evaluator
    # # by providing a name or RunEvalConfig.<configured evaluator>
    # evaluators=[
    #     # You can specify an evaluator by name/enum.
    #     # In this case, the default criterion is "helpfulness"
    #     "criteria",
    #     # Or you can configure the evaluator
    #     RunEvalConfig.Criteria("harmfulness"),
    #     RunEvalConfig.Criteria(
    #         {
    #             "cliche": "Are the lyrics cliche?"
    #             "Respond Y if they are, N if they're entirely unique."
    #         }
    #     ),
    # ],
)

In [13]:
results = client.arun_on_dataset(
    dataset_name="DynoLearn Dataset",
    llm_or_chain_factory=tasks_chain,
    evaluation=eval_config,
    verbose=True,
    project_name="runnable-test-44",
    # Any experiment metadata can be specified here
    project_metadata={"version": "1.0.0"},
)

View the evaluation results for project 'runnable-test-43' at:
https://smith.langchain.com/o/cb290b0a-415d-5ed7-8ecf-fc36743479bf/datasets/116b0e6d-2428-4ec4-9988-e60242606aae/compare?selectedSessions=9dc14dc8-f2b4-4a2f-b728-c23a381991bd

View all tests for Dataset DynoLearn Dataset at:
https://smith.langchain.com/o/cb290b0a-415d-5ed7-8ecf-fc36743479bf/datasets/116b0e6d-2428-4ec4-9988-e60242606aae
[--------------->                                  ] 6/19100 words required.

[---------------------------->                     ] 11/19100 words required.

[---------------------------------------------->   ] 18/19100 words required.

[------------------------------------------------->] 19/19

Unnamed: 0,feedback.spache,feedback.ari,feedback.coleman_liau,feedback.smog,feedback.linsear_write,feedback.dale_chall,feedback.gunning_fog,feedback.flesch_kincaid,feedback.flesch,error,execution_time,run_id
count,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,0.0,19.0,19
unique,,,,,,,,,,0.0,,19
top,,,,,,,,,,,,863fd050-3716-4b29-8d04-179bc5d5a1d0
freq,,,,,,,,,,,,1
mean,7.942317,11.88526,5.943784,-1.0,16.404039,9.779616,13.808433,11.38348,41.792322,,1.795307,
std,6.552794,19.13308,4.340499,0.0,25.360985,5.2799,16.162248,15.132183,45.47325,,2.469662,
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-48.192609,,0.190329,
25%,5.809895,0.463589,2.93239,-1.0,3.25,9.794755,6.185254,3.863229,-1.0,,0.322097,
50%,7.034857,4.696505,6.419873,-1.0,7.454545,10.911927,8.69899,6.425507,65.162168,,0.396171,
75%,8.018528,9.00095,8.935511,-1.0,11.318182,11.867761,12.824089,9.120113,75.528362,,3.095622,


{'project_name': 'runnable-test-43',
 'results': {'9fe928cd-26c2-4eb3-ac0a-6f7993a783e9': {'input': {'age': '7',
    'name': 'William',
    'reading_level': '5',
    'teaching_task': 'How to help others'},
   'feedback': [EvaluationResult(key='spache', score=5.527025939849624, value=None, comment=None, correction=None, evaluator_info={}, source_run_id=None, target_run_id=None),
    EvaluationResult(key='ari', score=0.310037593984962, value=None, comment=None, correction=None, evaluator_info={}, source_run_id=None, target_run_id=None),
    EvaluationResult(key='coleman_liau', score=2.7383458646616496, value=None, comment=None, correction=None, evaluator_info={}, source_run_id=None, target_run_id=None),
    EvaluationResult(key='smog', score=-1, value=None, comment=None, correction=None, evaluator_info={}, source_run_id=None, target_run_id=None),
    EvaluationResult(key='linsear_write', score=3.075, value=None, comment=None, correction=None, evaluator_info={}, source_run_id=None, target

In [1]:
results

NameError: name 'results' is not defined