In [501]:
from openai import OpenAI
import dotenv
dotenv.load_dotenv('.env')
import torch
from torch import nn
import pandas as pd
import mrf
import mrf_data
import json
import llm
from evaluate import kl_divergence
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4o")

In [None]:
prompt = {}

prompt['initial_message'] = [{"role": "system", "content": 'You are a statistician interviewing for a market analytics position. ' +\
                    'During the interview, you are asked to analyze the nightly prices of Airbnb listings in the United States in the year of 2017. ' +\
                    'Although you do not have access to any actual data, you are known for your ability to identify structures and relationships among data variables. ' +\
                    'Rely on commonsense guesses, intuition, and experience to make generalizations. ' +\
                    'The interviewer will guide your thinking through a set of predefined questions. ' +\
                    'Do not jump to conclusions. Reason through the problem concisely before giving the answer.'},]


""" json_dialogue = [{"role": "system", "content": 'You are a transcriber tasked to convert parts of a statistician interview for a market analytics position into JSON. ' +\
                    'During the interview, the interviewee was asked to analyze the nightly prices of Airbnb listings in the United States in 2017. ' +\
                    'The JSON has two parts, Variables and Constraints. Constraints can be left out if not mentioned yet. \nVariables define the variable names and their possible values. For example, Variable X with Values x1, x2, and x3. \n\nEach constraint entry has three parts: (Batch) Target, Condition, and (Batch) Probability. Target an Condition also consist of names and possible values. Probability denotes conditional or unconditional probabilities in interval or number. \nFor example, if there is P(Y=y1|X=x1)=0.9, then Target has Variable Y with Value y1, and Condition Variable X Value x1, and a probability of 0.9. \nIf there are multiple conditions in the value, then we are conditioning on the collection of them: P(Y=y1|X=x1 or X=x2) then Condition has Variable X Value x1, x2.\nThere can also be no conditions, like P(X=x1), and then Condition is left empty.\n For simplicity, we also have Batch notation:\nIf conditioned on x1 probability of y1 is 0.7 and of y2 is 40%-50%, then we use Batch Variable with Name Y and Values y1, y2. In this case there is Batch Probability which must has two entries, 0.7 and [0.4, 0.5].\n When transcribing, always provide the existing Variables, but you do not need to include the previous constraints. If the Variable mentioned in the constraints do not exist, leave it out. \nAt the end of the message there may be an additional JSON that states some necessary Variable and possibly their values which should always be included with their exact name. After the initial training warmup (with hints), you will start converting the actual dialogue. Remember that the warmup is to provide examples, not for actual entries afterwards.'},] """




json_dialogue = [{"role": "system", "content": 'You are a transcriber tasked to convert parts of a statistician interview for a market analytics position into JSON and provide querying messages. ' +\
                    'During the interview, the interviewee was asked to analyze the nightly prices of Airbnb listings in the United States in 2017. ' +\
                    'The JSON record has two parts, Variables and Constraints. Constraints can be left out if not mentioned yet. \nVariables define the variable names and their possible values. You should try to simplify the names if possible for convenience, but make sure to keep the naming consistent. For example, Variable X with Values x1, x2, and x3. \n\nEach constraint entry has three parts: (Batch) Target, Condition, and (Batch) Probability. Target an Condition also consist of names and possible values. Probability denotes conditional or unconditional probabilities in interval or number. \nFor example, if there is P(Y=y1|X=x1)=0.9, then Target has Variable Y with Value y1, and Condition Variable X Value x1, and a probability of 0.9. \nIf there are multiple conditions in the value, then we are conditioning on the collection of them: P(Y=y1|X=x1 or X=x2) then Condition has Variable X Value x1, x2.\nThere can also be no conditions, like P(X=x1), and then Condition is left empty.\n For simplicity, we also have Batch notation:\nIf conditioned on x1 probability of y1 is 0.7 and of y2 is 40%-50%, then we use Batch Variable with Name Y and Values y1, y2. In this case there is Batch Probability which must has two entries, 0.7 and [0.4, 0.5].\n When transcribing for records, always provide the existing Variables, but provide only the latest constraints (exclude constraints from previous dialogues). \n There would also be query to the record, in which case you should provide a format similar to the records. The question may have slightly different wording, but you are to provide a query JSON appropriate to the question to the extent allowed by the Variable defitions, without making additional assumptions. \nAfter the initial training warmup (with hints), you will start converting the actual dialogue. Remember that the warmup is only to provide examples.'},]

# put comments in JSON
user_1 = '[warmup][record] After consideration, I think there should be three variables: \nA with values a1, a2, a3\n B with values b1, b2\n C with values c1, c2. \n'
#user_1 += '[required] ' + '{"Variables": [{"Name": "A"}, {"Name": "C", "Value": ["c2"]}]}'
#user_1 += ' (must include these variables with exact naming)'

assistant_1 = '\n{\n  "Variables": [\n    {\n      "Name": "A",\n      "Value": ["a1", "a2", "a3"]\n    },\n    {\n      "Name": "B",\n      "Value": ["b1", "b2"]\n    },\n    {\n      "Name": "C",\n      "Value": ["c1", "c2"]\n    }\n  ]}'

json_dialogue.append({"role": "user", "content": user_1})
json_dialogue.append({"role": "assistant", "content": assistant_1})


user_2 = '[warmup][record] Conditional probability of a1 given b2: 10%-20%\n' +\
    '(0.1<P(A=a1|B=b2)<0.2)\n\n' +\
    'Conditional probability of c2 given a1 or a2: 50% \n' +\
    '(P(C=c2|A=a1 or A=a2)=0.5)\n\n'


assistant_2 = '\n{\n  "Variables": [\n    {\n      "Name": "A",\n      "Value": ["a1", "a2", "a3"]\n    },\n    {\n      "Name": "B",\n      "Value": ["b1", "b2"]\n    },\n    {\n      "Name": "C",\n      "Value": ["c1", "c2"]\n    }\n  ],\n  "Constraints": [\n    {\n      "Target": [\n        {\n          "Name": "A",\n          "Value": ["a1"]\n        }\n      ],\n      "Condition": [\n        {\n          "Name": "B",\n          "Value": ["b2"]\n        },\n      ],\n      "Probability": [0.1, 0.2]\n    },\n    {\n      "Target": [\n        {\n          "Name": "C",\n          "Value": ["c2"]\n        }\n      ],\n      "Condition": [\n        {\n          "Name": "A",\n          "Value": ["a1", "a2"]\n        }\n      ],\n      "Probability": [0.5]\n    }\n  ]\n}'


json_dialogue.append({"role": "user", "content": user_2})
json_dialogue.append({"role": "assistant", "content": assistant_2})


user_3 = '[warmup][record] Marginal probability of a2: 0.8-0.9\n' +\
    '(0.8<P(A=a2)<0.9)\n\n' + \
    'C=c2:\n' +\
    'b1: 30%-40%, b2: 60% (in this case we use Batch for simplicity, as multiple probabilities of Variable B conditioned on c2 is given)\n' +\
    '(0.3<P(B=b1|C=c2)<0.4, P(B=b2|C=c2)=0.6)\n\n'

assistant_3 = '\n{\n  "Variables": [\n    {\n      "Name": "A",\n      "Value": ["a1", "a2", "a3"]\n    },\n    {\n      "Name": "B",\n      "Value": ["b1", "b2"]\n    },\n    {\n      "Name": "C",\n      "Value": ["c1", "c2"]\n    }\n  ],\n  "Constraints": [\n    {\n      "Target": [\n        {\n          "Name": "A",\n          "Value": ["a2"]\n        }\n      ],\n      "Condition": [],\n      "Probability": [0.8, 0.9]\n    },\n    {\n      "Batch Target": [\n        {\n          "Name": "B",\n          "Value": ["b1", "b2"]\n        }\n      ],\n      "Condition": [\n        {\n          "Name": "C",\n          "Value": ["c2"]\n        }\n      ],\n      "Batch Probability": [\n        [0.3, 0.4],\n        [0.6]\n      ]\n    }\n  ]\n}'


json_dialogue.append({"role": "user", "content": user_3})
json_dialogue.append({"role": "assistant", "content": assistant_3})


user_4 = '[warmup][query] What is Beta is we know that A is at most 2? (Beta would be a different way to refer to B in our Variable, and since a specific value is not provided, we perform a batch target query on B with all possible values of B)'
assistant_4 = '\n{\n  "Queries": [\n    {\n      "Batch Target": [\n        {\n          "Name": "B",\n          "Value": ["b1", "b2"]\n        }\n      ],\n      "Condition": [\n        {\n          "Name": "A",\n          "Value": ["a1", "a2"]\n        }\n      ]\n    }\n  ]\n}'


json_dialogue.append({"role": "user", "content": user_4})
json_dialogue.append({"role": "assistant", "content": assistant_4})


prompt['json_dialogue'] = json_dialogue


prompt['binq'] = 'To model the nightly price of Airbnb listings in 2017, we need to first divide the price into a few ranges. Think about the parametric statistical distribution to approximate the price. Divide the price into 4~5 ranges accordingly.'

# Reason through how these variables might interact or correlate with each other. 
prompt['brainq'] = "Let's brainstorm the factors that might affect or determine the prices of Airbnb listings. Think about the obvious and commonsense aspects that would influence pricing. "

prompt['broadq'] = "What are some salient interactions? Consider how these variables might influence each other, rather than just their impact on price. Think about correlations and dependencies among the variables themselves. You could also suggest new variables."

prompt['moreq'] = "Take your time to reason through additional variables and their possible interactions. Consider other factors that might influence the price, or other existing variables."

prompt['varq'] = "Pick some of the most important variables (at most 6), what exhaustively are their possible values? These values should be few but also concrete, and mutually exclusive. Do not forget that the price is also a variable, which you should also include."

prompt['condq'] = "You should provide an updated response based on this new consideration. "

# p(x)  You can give a credible interval if unsure.
prompt['margq'] = "Well, what are the individual marginal probabilities of all the values of these variables? Give a best guess."

# p(price|bedroom)
prompt['pricecondq'] = 'How would they affect the price? Suggest the probability of price, conditioned on each of these variable values. Give a best guess. However, if you think that the variable or value does not affect the price much, you can skip it.'

# p(bedroom=2|property_type=entire_house)
#interactq = "How do you think the rest of the chosen variables interact with each other? Avoid duplicates from before. After reasoning through, suggest the conditional probability that captures those interaction."
# dependencies <-> interaction? point-wise mutual information 
prompt['interactq'] = "What are some dependencies among these variables other than price? Suggest the conditional probability that captures those interaction. Suggest with the probability value."

# save prompt
with open('./prompt.json', 'w') as file:
    json.dump(prompt, file)


chains = ['binq', 'brainq', 'broadq', 'varq']
saws = [None, 'margq', 'pricecondq', 'interactq']

seq = [chains, saws]

with open('./example_question.json', 'r') as file:
    question = json.load(file)

In [539]:
prompt = {}

prompt['initial_message'] = 'You are a statistician interviewing for a market analytics position. ' +\
                    'During the interview, you are asked to analyze the nightly prices of Airbnb listings in the United States in 2017. ' +\
                    'Although you do not have access to any actual data, you are known for your ability to identify structures and relationships among data variables using intuition and commonsense guesses.' +\
                    'The interviewer will guide your thinking through a set of predefined questions. ' +\
                    '\nUse Intuition: Since there is no data to fit to, rely on intuition and commonsense knowledge to make broad generalizations. ' +\
                    '\nVariable Selection: Consider only the most relevant variables and restrict the number of variables and their possible values to cover likely circumstances.' +\
                    '\nReasoning First: Think and reason through the problem concisely before providing structured answers. Avoid giving explanations after your answers.'



""" json_dialogue = [{"role": "system", "content": 'You are a transcriber tasked to convert parts of a statistician interview for a market analytics position into JSON. ' +\
                    'During the interview, the interviewee was asked to analyze the nightly prices of Airbnb listings in the United States in 2017. ' +\
                    'The JSON has two parts, Variables and Constraints. Constraints can be left out if not mentioned yet. \nVariables define the variable names and their possible values. For example, Variable X with Values x1, x2, and x3. \n\nEach constraint entry has three parts: (Batch) Target, Condition, and (Batch) Probability. Target an Condition also consist of names and possible values. Probability denotes conditional or unconditional probabilities in interval or number. \nFor example, if there is P(Y=y1|X=x1)=0.9, then Target has Variable Y with Value y1, and Condition Variable X Value x1, and a probability of 0.9. \nIf there are multiple conditions in the value, then we are conditioning on the collection of them: P(Y=y1|X=x1 or X=x2) then Condition has Variable X Value x1, x2.\nThere can also be no conditions, like P(X=x1), and then Condition is left empty.\n For simplicity, we also have Batch notation:\nIf conditioned on x1 probability of y1 is 0.7 and of y2 is 40%-50%, then we use Batch Variable with Name Y and Values y1, y2. In this case there is Batch Probability which must has two entries, 0.7 and [0.4, 0.5].\n When transcribing, always provide the existing Variables, but you do not need to include the previous constraints. If the Variable mentioned in the constraints do not exist, leave it out. \nAt the end of the message there may be an additional JSON that states some necessary Variable and possibly their values which should always be included with their exact name. After the initial training warmup (with hints), you will start converting the actual dialogue. Remember that the warmup is to provide examples, not for actual entries afterwards.'},] """


json_dialogue = []

system = 'You are a transcriber tasked to convert parts of a statistician interview for a market analytics position into JSON and provide querying messages. ' +\
        'During the interview, the interviewee was asked to analyze the nightly prices of Airbnb listings in the United States in 2017. '

system += 'The JSON record has two parts, Variables and Constraints. Constraints can be left out if not mentioned yet. \nVariables define the variable names and their possible values. You should try to simplify the names if possible for convenience, but make sure to keep the naming consistent. For example, Variable X with Values x1, x2, and x3. \n\nEach constraint entry has three parts: (Batch) Target, Condition, and (Batch) Probability. Target an Condition also consist of names and possible values. Probability denotes conditional or unconditional probabilities in interval or number. \nFor example, if there is P(Y=y1|X=x1)=0.9, then Target has Variable Y with Value y1, and Condition Variable X Value x1, and a probability of 0.9. \nIf there are multiple conditions in the value, then we are conditioning on the collection of them: P(Y=y1|X=x1 or X=x2) then Condition has Variable X Value x1, x2.\nThere can also be no conditions, like P(X=x1), and then Condition is left empty.\n For simplicity, we also have Batch notation:\nIf conditioned on x1 probability of y1 is 0.7 and of y2 is 40%-50%, then we use Batch Variable with Name Y and Values y1, y2. In this case there is Batch Probability which must has two entries, 0.7 and [0.4, 0.5].\n When transcribing for records, always provide the existing Variables, but provide only the latest constraints (exclude constraints from previous dialogues). \n There would also be query to the record, in which case you should provide a format similar to the records. The question may have slightly different wording, but you are to provide a query JSON appropriate to the question to the extent allowed by the Variable defitions, without making additional assumptions. \nAfter the initial training warmup (with hints), you will start converting the actual dialogue. Remember that the warmup is only to provide examples.'

json_dialogue.append({"role": "system", "content": system})

# put comments in JSON
user_1 = '[warmup][record] After consideration, I think there should be three variables: \nA with values a1, a2, a3\n B with values b1, b2\n C with values c1, c2. \n'
#user_1 += '[required] ' + '{"Variables": [{"Name": "A"}, {"Name": "C", "Value": ["c2"]}]}'
#user_1 += ' (must include these variables with exact naming)'

assistant_1 = '\n{\n  "Variables": [\n    {\n      "Name": "A",\n      "Value": ["a1", "a2", "a3"]\n    },\n    {\n      "Name": "B",\n      "Value": ["b1", "b2"]\n    },\n    {\n      "Name": "C",\n      "Value": ["c1", "c2"]\n    }\n  ]}'

json_dialogue.append({"role": "user", "content": user_1})
json_dialogue.append({"role": "assistant", "content": assistant_1})


user_2 = '[warmup][record] Conditional probability of a1 given b2: 10%-20%\n' +\
    '(0.1<P(A=a1|B=b2)<0.2)\n\n' +\
    'Conditional probability of c2 given a1 or a2: 50% \n' +\
    '(P(C=c2|A=a1 or A=a2)=0.5)\n\n'


assistant_2 = '\n{\n  "Variables": [\n    {\n      "Name": "A",\n      "Value": ["a1", "a2", "a3"]\n    },\n    {\n      "Name": "B",\n      "Value": ["b1", "b2"]\n    },\n    {\n      "Name": "C",\n      "Value": ["c1", "c2"]\n    }\n  ],\n  "Constraints": [\n    {\n      "Target": [\n        {\n          "Name": "A",\n          "Value": ["a1"]\n        }\n      ],\n      "Condition": [\n        {\n          "Name": "B",\n          "Value": ["b2"]\n        },\n      ],\n      "Probability": [0.1, 0.2]\n    },\n    {\n      "Target": [\n        {\n          "Name": "C",\n          "Value": ["c2"]\n        }\n      ],\n      "Condition": [\n        {\n          "Name": "A",\n          "Value": ["a1", "a2"]\n        }\n      ],\n      "Probability": [0.5]\n    }\n  ]\n}'


json_dialogue.append({"role": "user", "content": user_2})
json_dialogue.append({"role": "assistant", "content": assistant_2})


user_3 = '[warmup][record] Marginal probability of a2: 0.8-0.9\n' +\
    '(0.8<P(A=a2)<0.9)\n\n' + \
    'C=c2:\n' +\
    'b1: 30%-40%, b2: 60% (in this case we use Batch for simplicity, as multiple probabilities of Variable B conditioned on c2 is given)\n' +\
    '(0.3<P(B=b1|C=c2)<0.4, P(B=b2|C=c2)=0.6)\n\n'

assistant_3 = '\n{\n  "Variables": [\n    {\n      "Name": "A",\n      "Value": ["a1", "a2", "a3"]\n    },\n    {\n      "Name": "B",\n      "Value": ["b1", "b2"]\n    },\n    {\n      "Name": "C",\n      "Value": ["c1", "c2"]\n    }\n  ],\n  "Constraints": [\n    {\n      "Target": [\n        {\n          "Name": "A",\n          "Value": ["a2"]\n        }\n      ],\n      "Condition": [],\n      "Probability": [0.8, 0.9]\n    },\n    {\n      "Batch Target": [\n        {\n          "Name": "B",\n          "Value": ["b1", "b2"]\n        }\n      ],\n      "Condition": [\n        {\n          "Name": "C",\n          "Value": ["c2"]\n        }\n      ],\n      "Batch Probability": [\n        [0.3, 0.4],\n        [0.6]\n      ]\n    }\n  ]\n}'


json_dialogue.append({"role": "user", "content": user_3})
json_dialogue.append({"role": "assistant", "content": assistant_3})


user_4 = '[warmup][query] What is Beta is we know that A is at most 2? (Beta would be a different way to refer to B in our Variable, and since a specific value is not provided, we perform a batch target query on B with all possible values of B)'
assistant_4 = '\n{\n  "Queries": [\n    {\n      "Batch Target": [\n        {\n          "Name": "B",\n          "Value": ["b1", "b2"]\n        }\n      ],\n      "Condition": [\n        {\n          "Name": "A",\n          "Value": ["a1", "a2"]\n        }\n      ]\n    }\n  ]\n}'


json_dialogue.append({"role": "user", "content": user_4})
json_dialogue.append({"role": "assistant", "content": assistant_4})


prompt['json_dialogue'] = json_dialogue


prompt['binq'] = 'To model the nightly price of Airbnb listings in 2017, we need to first divide the price into a few ranges. Think about the appropriate parametric distribution to approximate the price. Divide the price into 4~5 ranges accordingly. This would be our first variable.'

# Reason through how these variables might interact or correlate with each other. 
prompt['brainq'] = "Let's brainstorm the variables that might affect or determine the prices of Airbnb listings. Think about the commonsense intrinsic or extrinsic factors that would influence pricing. "

prompt['broadq'] = "Identify key interactions among the variables. Consider how they might interact with or correlate to each other. For example, how many bedrooms do entire house listings tend to offer? You can suggest new relevant variables if necessary."


prompt['moreq'] = "Take your time to reason through additional variables and their possible interactions. Consider other factors that might influence the price, or other existing variables."

prompt['varq'] = "Select a few key variables (fewer than 7), including but not limited to price. For each variable, exhaustively list all possible values, ensuring they are concrete and mutually exclusive. Focus on variables that reveal meaningful patterns or relationships, not just their impact on price."

prompt['condq'] = "You should provide an updated response based on this new consideration. "

# p(x)  You can give a credible interval if unsure.
prompt['margq'] = "Estimate the marginal probabilities for the values of each variable based on your best intuition. Give a best guess."

# p(price|bedroom)
prompt['pricecondq'] = 'How would they affect the price? Suggest the numeric probability of price, conditioned on each of these variable values. Give a best guess. However, if you think that the variable or value does not affect the price much, you can skip it.'

# p(bedroom=2|property_type=entire_house)
#interactq = "How do you think the rest of the chosen variables interact with each other? Avoid duplicates from before. After reasoning through, suggest the conditional probability that captures those interaction."
# dependencies <-> interaction? point-wise mutual information 
prompt['interactq'] = "What are some dependencies among these variables other than price? Suggest numeric conditional probability that captures those interaction. "

# save prompt
with open('./prompt.json', 'w') as file:
    json.dump(prompt, file)


chains = ['binq', 'brainq', 'broadq', 'varq']
saws = [None, 'margq', 'pricecondq', 'interactq']

seq = [chains, saws]

with open('./example_question.json', 'r') as file:
    question = json.load(file)

In [556]:


prompt = {}

prompt['initial_message'] = 'You are a statistician interviewing for a market analytics position. ' +\
                    'During the interview, you are asked to analyze the nightly prices of Airbnb listings in the United States in 2017. ' +\
                    'Although you do not have access to any actual data, you are known for your ability to identify structures and relationships among data variables using intuition and commonsense guesses.' +\
                    'The interviewer will guide your thinking through a set of predefined questions. ' +\
                    '\nUse Intuition: Since there is no data to fit to, rely on intuition and commonsense knowledge to make broad generalizations. ' +\
                    '\nVariable Selection: Consider only the most relevant variables and restrict the number of variables and their possible values to cover likely circumstances.' +\
                    '\nReasoning First: Think and reason through the problem concisely before providing structured answers. Avoid giving explanations after your answers.'


json_dialogue = []

system = 'You are a transcriber tasked to convert parts of a statistician interview for a market analytics position into JSON and provide querying messages. ' +\
        'During the interview, the interviewee was asked to analyze the nightly prices of Airbnb listings in the United States in 2017. '

system += 'The JSON record has two parts, Variables and Constraints. Constraints can be left out if not mentioned yet. \nVariables define the variable names and their possible values. You should try to simplify the names if possible for convenience, but make sure to keep the naming consistent. For example, Variable X with Values x1, x2, and x3. \n\nEach constraint entry has three parts: (Batch) Target, Condition, and (Batch) Probability. Target an Condition also consist of names and possible values. Probability denotes conditional or unconditional probabilities in interval or number. \nFor example, if there is P(Y=y1|X=x1)=0.9, then Target has Variable Y with Value y1, and Condition Variable X Value x1, and a probability of 0.9. \nIf there are multiple conditions in the value, then we are conditioning on the collection of them: P(Y=y1|X=x1 or X=x2) then Condition has Variable X Value x1, x2.\nThere can also be no conditions, like P(X=x1), and then Condition is left empty.\n For simplicity, we also have Batch notation:\nIf conditioned on x1 probability of y1 is 0.7 and of y2 is 40%-50%, then we use BatchTarget with Name Y and Values y1, y2. In this case there is BatchProbability which must has two entries, 0.7 and [0.4, 0.5].\n When transcribing for records, always provide the existing Variables, but provide only the latest constraints (exclude constraints from previous dialogues). \n There would also be query to the record, in which case you should provide a format similar to the records. The question may have slightly different wording, but you are to provide a query JSON appropriate to the question to the extent allowed by the Variable defitions, without making additional assumptions. \nAfter the initial training warmup (with hints), you will start converting the actual dialogue. Remember that the warmup is only to provide examples. Generate JSON without line breaks or extra white spaces.'

json_dialogue.append({"role": "system", "content": system})

# put comments in JSON
user_1 = '[warmup][record] After consideration, I think there should be three variables: \nA with values a1, a2, a3\n B with values b1, b2\n C with values c1, c2. \n'
#user_1 += '[required] ' + '{"Variables": [{"Name": "A"}, {"Name": "C", "Value": ["c2"]}]}'
#user_1 += ' (must include these variables with exact naming)'

assistant_1 = '\n{\n  "Variables": [\n    {\n      "Name": "A",\n      "Value": ["a1", "a2", "a3"]\n    },\n    {\n      "Name": "B",\n      "Value": ["b1", "b2"]\n    },\n    {\n      "Name": "C",\n      "Value": ["c1", "c2"]\n    }\n  ]}'


print("assistant 1 before stripping: ", len(encoding.encode(assistant_1)))
assistant_1 = assistant_1.replace('\n', '')
assistant_1 = assistant_1.replace(' ', '')
print("assistant 1 after stripping: ", len(encoding.encode(assistant_1)))


json_dialogue.append({"role": "user", "content": user_1})
json_dialogue.append({"role": "assistant", "content": assistant_1})


user_2 = '[warmup][record] Conditional probability of a1 given b2: 10%-20%\n' +\
    '(0.1<P(A=a1|B=b2)<0.2)\n\n' +\
    'Conditional probability of c2 given a1 or a2: 50% \n' +\
    '(P(C=c2|A=a1 or A=a2)=0.5)\n\n'


assistant_2 = '\n{\n  "Variables": [\n    {\n      "Name": "A",\n      "Value": ["a1", "a2", "a3"]\n    },\n    {\n      "Name": "B",\n      "Value": ["b1", "b2"]\n    },\n    {\n      "Name": "C",\n      "Value": ["c1", "c2"]\n    }\n  ],\n  "Constraints": [\n    {\n      "Target": [\n        {\n          "Name": "A",\n          "Value": ["a1"]\n        }\n      ],\n      "Condition": [\n        {\n          "Name": "B",\n          "Value": ["b2"]\n        },\n      ],\n      "Probability": [0.1, 0.2]\n    },\n    {\n      "Target": [\n        {\n          "Name": "C",\n          "Value": ["c2"]\n        }\n      ],\n      "Condition": [\n        {\n          "Name": "A",\n          "Value": ["a1", "a2"]\n        }\n      ],\n      "Probability": [0.5]\n    }\n  ]\n}'


print("assistant 2 before stripping: ", len(encoding.encode(assistant_2)))
assistant_2 = assistant_2.replace('\n', '')
assistant_2 = assistant_2.replace(' ', '')
print("assistant 2 after stripping: ", len(encoding.encode(assistant_2)))


json_dialogue.append({"role": "user", "content": user_2})
json_dialogue.append({"role": "assistant", "content": assistant_2})


user_3 = '[warmup][record] Marginal probability of a2: 0.8-0.9\n' +\
    '(0.8<P(A=a2)<0.9)\n\n' + \
    'C=c2:\n' +\
    'b1: 30%-40%, b2: 60% (in this case we use Batch for simplicity, as multiple probabilities of Variable B conditioned on c2 is given)\n' +\
    '(0.3<P(B=b1|C=c2)<0.4, P(B=b2|C=c2)=0.6)\n\n'

assistant_3 = '\n{\n  "Variables": [\n    {\n      "Name": "A",\n      "Value": ["a1", "a2", "a3"]\n    },\n    {\n      "Name": "B",\n      "Value": ["b1", "b2"]\n    },\n    {\n      "Name": "C",\n      "Value": ["c1", "c2"]\n    }\n  ],\n  "Constraints": [\n    {\n      "Target": [\n        {\n          "Name": "A",\n          "Value": ["a2"]\n        }\n      ],\n      "Condition": [],\n      "Probability": [0.8, 0.9]\n    },\n    {\n      "Batch Target": [\n        {\n          "Name": "B",\n          "Value": ["b1", "b2"]\n        }\n      ],\n      "Condition": [\n        {\n          "Name": "C",\n          "Value": ["c2"]\n        }\n      ],\n      "Batch Probability": [\n        [0.3, 0.4],\n        [0.6]\n      ]\n    }\n  ]\n}'


print("assistant 3 before stripping: ", len(encoding.encode(assistant_3)))
assistant_3 = assistant_3.replace('\n', '')
assistant_3 = assistant_3.replace(' ', '')
print("assistant 3 after stripping: ", len(encoding.encode(assistant_3)))


json_dialogue.append({"role": "user", "content": user_3})
json_dialogue.append({"role": "assistant", "content": assistant_3})


user_4 = '[warmup][query] What is Beta is we know that A is at most 2? (Beta would be a different way to refer to B in our Variable, and since a specific value is not provided, we perform a batch target query on B with all possible values of B)'
assistant_4 = '\n{\n  "Queries": [\n    {\n      "Batch Target": [\n        {\n          "Name": "B",\n          "Value": ["b1", "b2"]\n        }\n      ],\n      "Condition": [\n        {\n          "Name": "A",\n          "Value": ["a1", "a2"]\n        }\n      ]\n    }\n  ]\n}'



print("assistant 4 before stripping: ", len(encoding.encode(assistant_4)))
assistant_4 = assistant_4.replace('\n', '')
assistant_4 = assistant_4.replace(' ', '')
print("assistant 4 after stripping: ", len(encoding.encode(assistant_4)))


json_dialogue.append({"role": "user", "content": user_4})
json_dialogue.append({"role": "assistant", "content": assistant_4})


prompt['json_dialogue'] = json_dialogue


prompt['binq'] = 'To model the nightly price of Airbnb listings in 2017, we need to first divide the price into a few ranges. Think about the appropriate parametric distribution to approximate the price. Divide the price into 4~5 ranges accordingly. This would be our first variable.'

# Reason through how these variables might interact or correlate with each other. 
prompt['brainq'] = "Let's brainstorm the variables that might affect or determine the prices of Airbnb listings. Think about the commonsense intrinsic or extrinsic factors that would influence pricing. "

prompt['broadq'] = "Identify key interactions among the variables. Consider how they might interact with or correlate to each other. For example, how many bedrooms do entire house listings tend to offer? You can suggest new relevant variables if necessary."


prompt['moreq'] = "Take your time to reason through additional variables and their possible interactions. Consider other factors that might influence the price, or other existing variables."

prompt['varq'] = "Select a few key variables (fewer than 7), including but not limited to price. For each variable, exhaustively list all possible values, ensuring they are concrete and mutually exclusive. Focus on variables that reveal meaningful patterns or relationships, not just their impact on price."

prompt['condq'] = "You should provide an updated response based on this new consideration. "

# p(x)  You can give a credible interval if unsure.
prompt['margq'] = "Estimate the marginal probabilities for the values of each variable based on your best intuition. Give a best guess."

# p(price|bedroom)
prompt['pricecondq'] = 'How would they affect the price? Suggest the numeric probability of price, conditioned on each of these variable values. Give a best guess. However, if you think that the variable or value does not affect the price much, you can skip it.'

# p(bedroom=2|property_type=entire_house)
#interactq = "How do you think the rest of the chosen variables interact with each other? Avoid duplicates from before. After reasoning through, suggest the conditional probability that captures those interaction."
# dependencies <-> interaction? point-wise mutual information 
prompt['interactq'] = "What are some dependencies among these variables other than price? Suggest numeric conditional probability that captures those interaction. "

# save prompt
with open('./prompt.json', 'w') as file:
    json.dump(prompt, file)


chains = ['binq', 'brainq', 'broadq', 'varq']
saws = [None, 'margq', 'pricecondq', 'interactq']

seq = [chains, saws]

with open('./example_question.json', 'r') as file:
    question = json.load(file)

assistant 1 before stripping:  83
assistant 1 after stripping:  47
assistant 2 before stripping:  229
assistant 2 after stripping:  127
assistant 3 before stripping:  225
assistant 3 after stripping:  128
assistant 4 before stripping:  75
assistant 4 after stripping:  40


In [557]:
import importlib
importlib.reload(llm)
#agent = llm.LLM(initial_message, model='gpt-3.5-turbo', json_format=formatq)
agent = llm.LLM(question=question, sequences=seq)
#agent = llm.LLM(question=question, model='gpt-3.5-turbo')
agent.reset()

Modified variable question:  Select a few key variables (fewer than 7), including but not limited to price. For each variable, exhaustively list all possible values, ensuring they are concrete and mutually exclusive. Focus on variables that reveal meaningful patterns or relationships, not just their impact on price.
Additionally without loss of generality, we would like to be able to answer questions such as: What is the nightly price of a aribnb listing that is a private room in a property with at most 3 beds?
Make sure that the updated variables can express such questions with existing variables. 


In [558]:
cur_j, query = agent.button()

user : To model the nightly price of Airbnb listings in 2017, we need to first divide the price into a few ranges. Think about the appropriate parametric distribution to approximate the price. Divide the price into 4~5 ranges accordingly. This would be our first variable.


--------------------------------------------------------------------------------------


assistant : The nightly price of Airbnb listings can be approximated using a log-normal distribution, as prices are typically skewed with a long tail on the right. Based on this, we can divide the price into the following ranges:

1. $0 - $50
2. $51 - $100
3. $101 - $200
4. $201 - $500
5. $501 and above


--------------------------------------------------------------------------------------


user : Let's brainstorm the variables that might affect or determine the prices of Airbnb listings. Think about the commonsense intrinsic or extrinsic factors that would influence pricing. 


------------------------------------------------

In [389]:
# pickle
import pickle

with open('vars.pkl', 'wb') as f:
    pickle.dump(agent.var_json, f)
with open('cons.pkl', 'wb') as f:
    pickle.dump(agent.cons_json, f)
with open('current_json.pkl', 'wb') as f:
    pickle.dump(agent.current_json, f)
with open('query.pkl', 'wb') as f:
    pickle.dump(query, f)


In [227]:
# load pickle
import pickle

with open('vars.pkl', 'rb') as f:
    vars = pickle.load(f)
with open('cons.pkl', 'rb') as f:
    constraints_list = pickle.load(f)
with open('current_json.pkl', 'rb') as f:
    current_json = pickle.load(f)
with open('query.pkl', 'rb') as f:
    query = pickle.load(f)

In [510]:
agent.var_json

[{'Name': 'PriceRange',
  'Value': ['$0-$50', '$51-$100', '$101-$200', '$201-$400', '$401 and above']},
 {'Name': 'PropertyType',
  'Value': ['Entire home/apt', 'Private room', 'Shared room']},
 {'Name': 'Bedrooms', 'Value': ['0', '1', '2', '3', '4 and above']},
 {'Name': 'Beds', 'Value': ['1', '2', '3', '4 and above']},
 {'Name': 'Location', 'Value': ['Urban', 'Suburban', 'Rural']},
 {'Name': 'Amenities', 'Value': ['Basic', 'Premium']}]

In [559]:
reload(mrf_data)
dat = mrf_data.data()
#dat.set_bins([50, 100, 150, 300, 500])
dat.set_bins([50, 100, 200, 500])
cond = lambda x: x[(x['room_type'] == 'Private room') & (x['beds'] <= 3)]
ground = dat.marg(cond=cond)
ground = torch.tensor(ground)

All columns available:  Index(['id', 'log_price', 'property_type', 'room_type', 'amenities',
       'accommodates', 'bathrooms', 'bed_type', 'cancellation_policy',
       'cleaning_fee', 'city', 'description', 'first_review',
       'host_has_profile_pic', 'host_identity_verified', 'host_response_rate',
       'host_since', 'instant_bookable', 'last_review', 'latitude',
       'longitude', 'name', 'neighbourhood', 'number_of_reviews',
       'review_scores_rating', 'thumbnail_url', 'zipcode', 'bedrooms', 'beds',
       'price'],
      dtype='object')
Columns used:  ['log_price', 'bathrooms', 'property_type', 'room_type', 'accommodates', 'bedrooms', 'beds', 'amenities', 'review_scores_rating', 'city']
-1 - 50: 	22.24%
50 - 100: 	57.95%
100 - 200: 	16.96%
200 - 500: 	2.38%
500 - 1999: 	0.47%


In [560]:
from importlib import reload
reload(mrf)


# guidance package?
vars = agent.var_json
constraints_list = agent.cons_json
test = mrf.Brute()
#build_ret = test.from_json(vars=vars, constraints=constraints_list)
build_ret = test.from_json(entire_json=agent.current_json)
test.update(10000)

here

Variable:  Price Range
Value:  $0 - $50, $51 - $100, $101 - $200, $201 - $500, $501 and above 


Variable:  Property Type
Value:  Entire Home/Apartment, Private Room, Shared Room 


Variable:  Number of Beds
Value:  1 Bed, 2 Beds, 3 Beds, 4+ Beds 


Variable:  Location
Value:  Urban, Suburban, Rural 


Variable:  Amenities
Value:  Basic, Premium, Luxury 


Variable:  Season
Value:  Peak Season, Off-Peak Season 

-----------------------------------------------
Finished building
Features:  132
Constraints:  131
-----------------------------------------------


  1%|          | 123/10000 [00:00<00:08, 1227.88it/s]

Loss:  tensor(13.1980, grad_fn=<SumBackward0>) tensor(0.1007, grad_fn=<MeanBackward0>)


 23%|██▎       | 2263/10000 [00:01<00:05, 1407.83it/s]

Loss:  tensor(0.0797, grad_fn=<SumBackward0>) tensor(0.0006, grad_fn=<MeanBackward0>)


 42%|████▏     | 4245/10000 [00:03<00:04, 1382.22it/s]

Loss:  tensor(0.0735, grad_fn=<SumBackward0>) tensor(0.0006, grad_fn=<MeanBackward0>)


 61%|██████▏   | 6137/10000 [00:04<00:02, 1375.60it/s]

Loss:  tensor(0.0713, grad_fn=<SumBackward0>) tensor(0.0005, grad_fn=<MeanBackward0>)


 82%|████████▏ | 8234/10000 [00:06<00:01, 1637.78it/s]

Loss:  tensor(0.0704, grad_fn=<SumBackward0>) tensor(0.0005, grad_fn=<MeanBackward0>)


100%|██████████| 10000/10000 [00:07<00:00, 1357.93it/s]

Loss:  tensor(0.0699, grad_fn=<SumBackward0>) tensor(0.0005, grad_fn=<MeanBackward0>)





In [563]:
query = agent.query_q
if type(query) == str:
    query = json.loads(query)

#query['Queries'][0]['Condition'].append({'Name': 'Location', 'Value': ['Urban']})

print(query)
res = test.query(query)
print(res)
print(kl_divergence(ground, res))


{'Queries': [{'BatchTarget': [{'Name': 'Price Range', 'Value': ['$0 - $50', '$51 - $100', '$101 - $200', '$201 - $500', '$501 and above']}], 'Condition': [{'Name': 'Property Type', 'Value': ['Private Room']}, {'Name': 'Number of Beds', 'Value': ['1 Bed', '2 Beds', '3 Beds']}]}]}
[{'Name': 'Property Type', 'Value': ['Private Room']}, {'Name': 'Number of Beds', 'Value': ['1 Bed', '2 Beds', '3 Beds']}]
tensor([0.1976, 0.4914, 0.2517, 0.0463, 0.0131])
tensor(0.0343, dtype=torch.float64)


In [564]:
query = agent.query_q
if type(query) == str:
    query = json.loads(query)
#query['Queries'][0]['Condition'].append({'Name': 'Location', 'Value': ['Urban']})
#query['Queries'][0]['Condition'] = []
print(query)
res = test.query(query)
print(res)
print(kl_divergence(ground, torch.tensor([0.05, 0.35, 0.4, 0.18, 0.02])))


{'Queries': [{'BatchTarget': [{'Name': 'Price Range', 'Value': ['$0 - $50', '$51 - $100', '$101 - $200', '$201 - $500', '$501 and above']}], 'Condition': [{'Name': 'Property Type', 'Value': ['Private Room']}, {'Name': 'Number of Beds', 'Value': ['1 Bed', '2 Beds', '3 Beds']}]}]}
[{'Name': 'Property Type', 'Value': ['Private Room']}, {'Name': 'Number of Beds', 'Value': ['1 Bed', '2 Beds', '3 Beds']}]
tensor([0.1976, 0.4914, 0.2517, 0.0463, 0.0131])
tensor(0.4235, dtype=torch.float64)


In [404]:
que = query.copy()
que['Queries'][0]['Batch Target'] = [{'Name': 'Price'}]
que['Queries'][0]['Condition'] = [{'Name': 'Property Type', 'Value': ['Private Room']}, {'Name': 'Number of Beds', 'Value': ['1 Bed']}]
print(test.query(que))

[{'Name': 'Property Type', 'Value': ['Private Room']}, {'Name': 'Number of Beds', 'Value': ['1 Bed']}]
tensor([0.1718, 0.3613, 0.2856, 0.1082, 0.0412, 0.0318])


In [396]:
que = query.copy()
que['Queries'][0]['Batch Target'] = [{'Name': 'Number of Beds'}]
que['Queries'][0]['Condition'] = [{'Name': 'Property Type', 'Value': ['Private Room']}]
print(test.query(que))

[{'Name': 'Property Type', 'Value': ['Private Room']}]
tensor([0.5706, 0.3243, 0.0835, 0.0215])
