In [1]:
from cleanlab_studio import Studio

In [2]:
# Load the Cleanlab Studio API key from a local file so it is not hardcoded in
# the notebook. Keys are available at https://app.cleanlab.ai/account?tab=General
with open('api.txt') as file:
    # Text files usually end with a newline; strip it (and any stray
    # whitespace) so the key is passed to Studio exactly as issued.
    key = file.read().strip()
studio = Studio(key)
tlm = studio.TLM()  # TLM client with default options

In [6]:
# Baseline: the 'base' quality preset runs a plain LLM (GPT-3.5 in this example).
# The recorded output for this cell shows `confidence_score` as nan.
base_llm = studio.TLM(quality_preset="base")
base_llm.prompt(
    "Should perplexity go up or down for a language model to have better performance?"
)

{'response': 'Perplexity should go down for a language model to have better performance. Perplexity is a measure of how well a language model predicts a given sequence of words. A lower perplexity indicates that the model is more confident and accurate in its predictions. Therefore, a language model with lower perplexity is considered to have better performance.',
 'confidence_score': nan}

In [5]:
# Cleanlab TLM at the 'best' quality preset: same question as the baseline,
# now answered with a confidence (reliability) score attached.
tlm = studio.TLM(quality_preset="best")
tlm.prompt(
    "Should perplexity go up or down for a language model to have better performance?"
)

{'response': 'Perplexity should go down for a language model to have better performance. Perplexity is a metric commonly used to evaluate the performance of language models. It measures how well a language model can predict the next word in a sentence or sequence of words. A lower perplexity value indicates that the model is more confident and accurate in predicting the next word. Therefore, decreasing perplexity is generally desired to improve the performance of a language model.',
 'confidence_score': 0.939186011279776}

In [7]:
# Cleanlab TLM at the 'low' quality preset: a faster (slightly less accurate)
# variant that still returns a confidence (reliability) score.
tlm = studio.TLM(quality_preset="low")
tlm.prompt(
    "Should perplexity go up or down for a language model to have better performance?"
)

{'response': 'Perplexity should go down for a language model to have better performance. Perplexity is a measure of how well a language model predicts a given sequence of words. A lower perplexity indicates that the model is more confident and accurate in its predictions. Therefore, a language model with lower perplexity is considered to have better performance.',
 'confidence_score': 0.9392575510655298}

In [8]:
# Long prompt split across adjacent string literals. Python joins adjacent
# literals with no separator, so each fragment ends with an explicit space to
# keep words from running together ("internet,and", "articlesis").
# Re-run this cell to refresh the recorded output after changing the prompt.
tlm.prompt(
    "Given that machine learning models have trained on data from the internet, "
    "and some small amount of that data includes New York times articles "
    "is the New York times in their right to sue AI companies that train on public internet data?"
)

{'response': 'I am not a lawyer, but I can provide some general information on this topic. The use of publicly available data, including data from the internet, for training machine learning models is a common practice. Generally, publicly available data is considered to be free for use by anyone, including AI companies, as long as they comply with applicable laws and regulations.\n\nHowever, it is important to note that the specific terms and conditions of data usage can vary depending on the source. Some websites or platforms may have specific terms of service that restrict the use of their data for certain purposes. If the New York Times or any other website explicitly prohibits the use of their content for training AI models, then AI companies may be in violation of those terms.\n\nIf the New York Times believes that an AI company has violated their terms of service or infringed upon their intellectual property rights, they may have the right to take legal action. Whether they woul

In [9]:
# Example from the Together AI hackathon demo, run at the 'best' quality preset.
# Actual date according to Google: May 5, 1789
tlm = studio.TLM(quality_preset="best")
tlm.prompt(
    "When did the French revolution start?"
)

{'response': 'The French Revolution began on July 14, 1789, with the storming of the Bastille prison in Paris. This event is widely regarded as the start of the revolution.',
 'confidence_score': 0.8320045982823503}

In [10]:
# Follow-up that states the date outright.
# Correct answer according to Google: yes
tlm.prompt(
    "Did the French revolution start on May 5, 1789?"
)

{'response': 'No, the French Revolution did not officially start on May 5, 1789. The revolution is commonly considered to have started on July 14, 1789, when the people of Paris stormed the Bastille prison, which was a symbolic act against the monarchy and the beginning of widespread popular uprising. May 5, 1789, however, was an important event in the pre-revolutionary period, as it marked the opening of the Estates-General, an assembly that played a significant role leading up to the revolution.',
 'confidence_score': 0.6445548568854021}

In [11]:
# Example from Savant AI during the hackathon demo: a question about personal
# information that is not supplied anywhere in the prompt.
tlm.prompt(
    "What's my girlfriend's favorite flowers?"
)

{'response': "I'm sorry, but as an AI, I don't have personal information about individuals unless shared with me in the course of our conversation. It's best to ask your girlfriend directly about her favorite flowers.",
 'confidence_score': 0.8046503750749281}

In [14]:
# Asks for the street address of a specific named place.
tlm.prompt(
    "What is the address of the NeoGenesis House in Hillsborough, CA?"
)

{'response': "I'm sorry, but I couldn't find any specific information about the address of the NeoGenesis House in Hillsborough, CA. It is possible that there may not be a publicly available address or the establishment you are referring to could be located in a different area. It is recommended to contact NeoGenesis House directly for more accurate information.",
 'confidence_score': 0.6696681338330587}

# Some fun ones

In [18]:
# Self-referential prompt: the correct answer depends on the response itself.
tlm.prompt(
    "How many words are in your response to this prompt?"
)

{'response': 'There are 19 words in my response to this prompt, including numbers and contractions.',
 'confidence_score': 0.5531959826860577}

In [24]:
# Letter-counting question, answered by the base LLM (OpenAI GPT-3.5).
base_llm.prompt(
    "How many Ns are there in the word enter?"
)

{'response': 'There are two Ns in the word "enter".', 'confidence_score': nan}

In [22]:
# Same letter-counting question, answered by the Cleanlab TLM.
tlm.prompt(
    "How many Ns are there in the word enter?"
)

{'response': 'There is only 1 "N" in the word "enter".',
 'confidence_score': 0.4701794459993602}

In [25]:
# Open-ended, joke-style question.
tlm.prompt(
    "How many engineers does it take to screw a lightbulb?"
)

{'response': 'It depends on the specific context and situation. Generally, it only takes one engineer to screw in a lightbulb. However, if it is a complex engineering task involving special equipment or specific technical knowledge, it may require a team of engineers working together to install the lightbulb safely and efficiently.',
 'confidence_score': 0.7372778478251901}

In [27]:
# Tricky question: it does not say whether "on a team" means on the field
# in play or on the team in general.
tlm.prompt(
    "How many people are on a football team?"
)

{'response': 'The number of people on a football team may vary depending on the specific league, competition, or level of play. In general, a typical professional football team has 11 players on the field at a time. However, a team is usually comprised of more than just 11 players in order to have substitutes and cover various positions. A professional team may have around 25-30 players on their roster, including additional backup players, goalkeepers, and substitutes.',
 'confidence_score': 0.6657329116856331}

In [28]:
# Cleanlab, the company, was not yet founded by the time GPT-3.5 was trained.
tlm.prompt(
    "When was Cleanlab founded?"
)

{'response': 'There is no information available about Cleanlab, so it is unclear when it was founded. It could be a relatively new company or a less well-known organization.',
 'confidence_score': 0.5891150110988587}

In [30]:
# Asks about the cleanlab open-source package rather than the company.
tlm.prompt(
    "What did the cleanlab open-source package solve back in 2019?"
)

{'response': 'The cleanlab open-source package, introduced in 2019, solved the problem of learning with noisy labels in machine learning models. It provided tools and techniques to detect and correct label errors or mislabeled instances in a given dataset. This is important because noisy labels can significantly affect the performance and accuracy of machine learning algorithms.',
 'confidence_score': 0.9359887873275701}