In [11]:
import utils
from jsonl_dump import download_parse_delivered_into_jsonl
from utils import DATA_DIR
# Passed as `limit_items_to_first_n` to every analytics generator below;
# None means no limit (full run).
LIMIT_ITEMS_TO_FIRST_N = 2 # None for full
# Batch ids handed to download_parse_delivered_into_jsonl; the number of
# batches processed later is derived from the length of this list.
BATCH_IDS = [1,2,3,4]
# Passed as `max_workers` to every analytics generator below
# (presumably a worker-pool size — confirm in the generator modules).
max_workers = 20

In [12]:
# Fetch and parse the delivered data for every id in BATCH_IDS.
# NOTE(review): `no_work=False` presumably means "really do the download/parse"
# rather than reuse earlier output — confirm in jsonl_dump.
delivered4 = download_parse_delivered_into_jsonl(BATCH_IDS, no_work=False)

In [13]:
# Pull out the two entries of the download result that later cells use.
# `delivered_df` is later merged on its 'task_link' column.
delivered_df = delivered4['delivered_df']
conversations = delivered4['conversations']

In [14]:
# reload() re-executes each local analytics module so that edits made to it
# since the first import take effect without restarting the kernel. The
# `from ... import` that follows each reload rebinds the function name to the
# freshly loaded definition, so the order of these statements matters.
from importlib import reload
import use_case
reload(use_case)
from use_case import get_use_case_data_batch_conversations
import areas_of_focus
reload(areas_of_focus)
from areas_of_focus import get_areas_of_focus_data_batch_conversations
import behaviour_tagging
reload(behaviour_tagging)
from behaviour_tagging import get_behavioural_tags_data_batch_conversations

In [15]:
from utils import DATA_DIR
import os
import pickle
from datetime import datetime

batch_folder_prefix = DATA_DIR + 'jsonl_conversations/batch_'
progress_backup_folder = DATA_DIR + 'progress_backup/'
current_date = datetime.now().strftime('%Y-%m-%d__%H-%M-%S')

# batch id -> analytics type -> list of per-conversation result dicts.
batch_results = {}
item_types = ['behavioural_tags', 'use_case_data', 'areas_of_focus_data']

# One (result key, progress label, generator) entry per analytics type, in the
# order they are run. All three generators are called with the same arguments.
analytics_steps = [
    ('behavioural_tags', 'behavioural tags', get_behavioural_tags_data_batch_conversations),
    ('use_case_data', 'use case data', get_use_case_data_batch_conversations),
    ('areas_of_focus_data', 'areas of focus data', get_areas_of_focus_data_batch_conversations),
]

# A single timestamped pickle is rewritten after every step, so an interrupted
# run leaves behind everything computed so far.
progress_backup_path = os.path.join(progress_backup_folder, f'{current_date}.pkl')
os.makedirs(os.path.dirname(progress_backup_path), exist_ok=True)

number_of_batches_total = len(BATCH_IDS)
# Iterate over the configured ids themselves rather than 1..N: with
# BATCH_IDS = [2, 5] a positional counter would read batch_1/batch_2 and file
# the results under the wrong keys.
# NOTE(review): assumes the folders are named batch_<id> — confirm in jsonl_dump.
for position, batch_id in enumerate(BATCH_IDS, start=1):
    print('='*60)
    print(f"Processing batch {position} of {number_of_batches_total}")
    batch_folder = batch_folder_prefix + str(batch_id)
    batch_results[batch_id] = {}

    for item_type, label, generate in analytics_steps:
        print(f"  Generating {label}...")
        batch_results[batch_id][item_type] = generate(
            batch_folder, max_workers=max_workers, limit_items_to_first_n=LIMIT_ITEMS_TO_FIRST_N
        )
        # Save progress after each item
        with open(progress_backup_path, 'wb') as backup_file:
            pickle.dump(batch_results, backup_file)

print('DONE.')

Processing batch 1 of 4
  Generating behavioural tags...


Processing Conversations: 100%|██████████| 2/2 [00:07<00:00,  3.69s/it]


  Generating use case data...


100%|██████████| 2/2 [00:10<00:00,  5.34s/it]


  Generating areas of focus data...


Processing Conversations: 100%|██████████| 2/2 [00:04<00:00,  2.01s/it]


Processing batch 2 of 4
  Generating behavioural tags...


Processing Conversations: 100%|██████████| 2/2 [00:13<00:00,  6.83s/it]


  Generating use case data...


100%|██████████| 2/2 [00:09<00:00,  4.52s/it]


  Generating areas of focus data...


Processing Conversations: 100%|██████████| 2/2 [00:03<00:00,  1.86s/it]


Processing batch 3 of 4
  Generating behavioural tags...


Processing Conversations: 100%|██████████| 2/2 [00:21<00:00, 10.86s/it]


  Generating use case data...


100%|██████████| 2/2 [00:06<00:00,  3.49s/it]


  Generating areas of focus data...


Processing Conversations: 100%|██████████| 2/2 [00:16<00:00,  8.07s/it]


Processing batch 4 of 4
  Generating behavioural tags...


Processing Conversations: 100%|██████████| 2/2 [00:27<00:00, 13.62s/it]


  Generating use case data...


100%|██████████| 2/2 [00:33<00:00, 16.79s/it]


  Generating areas of focus data...


Processing Conversations: 100%|██████████| 2/2 [00:07<00:00,  3.90s/it]

DONE.





In [16]:
import pandas as pd

# Function to flatten the batch_results into a list of dictionaries
def flatten_batch_results(batch_results):
    """Flatten nested batch results into one record per analytics item.

    `batch_results` maps batch id -> analytics type -> iterable of item dicts.
    Each returned record holds the batch id, the item's 'colab_link', and,
    under a key named after the analytics type, the remaining fields of the
    item (everything except 'colab_link' and 'id').
    """
    excluded_keys = ('colab_link', 'id')
    return [
        {
            'batch_id': batch_id,
            'colab_link': entry['colab_link'],
            analytics_type: {
                field: value for field, value in entry.items() if field not in excluded_keys
            },
        }
        for batch_id, per_type in batch_results.items()
        for analytics_type, entries in per_type.items()
        for entry in entries
    ]

# Flatten the nested batch_results dict into a list of per-item records
flattened_data = flatten_batch_results(batch_results)

# One row per record; each row fills only the column of its own analytics
# type, the other analytics columns are NaN for that row.
batch_results_df = pd.DataFrame(flattened_data)

# Function to extend the delivered_df with analytics data
def extend_delivered_df(delivered_df, batch_results_df, analytics_types=None):
    """Left-join every analytics column of `batch_results_df` onto `delivered_df`.

    Rows are matched on `delivered_df['task_link']` == `batch_results_df['colab_link']`.
    For each analytics type, only the rows of `batch_results_df` where that
    type (and both id columns) are non-null take part in the join.

    Parameters:
        delivered_df: frame with a 'task_link' column; it is not modified.
        batch_results_df: frame with 'batch_id', 'colab_link' and one column
            per analytics type.
        analytics_types: names of the analytics columns to join, in order.
            Defaults to every column of `batch_results_df` other than
            'batch_id' and 'colab_link', so the function no longer depends on
            a notebook-level global and tolerates a type with no results.

    Returns:
        A new frame: `delivered_df` plus 'batch_id', 'colab_link' (as produced
        by the first join, unless already present) and one column per
        analytics type.
    """
    id_columns = ['batch_id', 'colab_link']
    if analytics_types is None:
        analytics_types = [
            column for column in batch_results_df.columns if column not in id_columns
        ]

    extended_df = delivered_df
    for analytics_type in analytics_types:
        analytics_df = batch_results_df[id_columns + [analytics_type]].dropna()
        merged_df = extended_df.merge(
            analytics_df,
            left_on='task_link',
            right_on='colab_link',
            how='left',
            suffixes=('', '_right'),
        )
        # From the second join on, the id columns already exist on the left,
        # so the right-hand copies come back suffixed; discard them.
        # NOTE(review): this keeps the id values from the first join only.
        extended_df = merged_df.drop(
            columns=['batch_id_right', 'colab_link_right'], errors='ignore'
        )
    return extended_df

# Extend the delivered_df with the batch_results_df
extended_delivered_df = extend_delivered_df(delivered_df, batch_results_df)

# Keep only the rows where at least one analytics column is populated.
# The columns are selected by name from item_types: a substring match on
# '_data' would ignore 'behavioural_tags' and would also pick up any unrelated
# column whose name happens to contain '_data'. reindex() makes a type that
# produced no column count as all-null instead of raising KeyError.
has_analytics = extended_delivered_df.reindex(columns=item_types).notnull().any(axis=1)
filtered_extended_delivered_df = extended_delivered_df[has_analytics]

filtered_extended_delivered_df


Unnamed: 0,task_link,jsonl_link,metadata__topic,number_of_turns,duration_mins,batch_id,colab_link,behavioural_tags,use_case_data,areas_of_focus_data
33,https://colab.research.google.com/drive/1Iz1hQ...,https://drive.google.com/file/d/1dp99pw8zVR327...,algorithms > by_topic > bit_manipulation,1,10,1,https://colab.research.google.com/drive/1Iz1hQ...,{'behaviours': [{'top_level': 'Continuation Fo...,{'summary': 'The user is utilizing the Assista...,"{'top_level': 'Basic scripting', 'sub_level': ..."
231,https://colab.research.google.com/drive/1hTG6I...,https://drive.google.com/file/d/1NloPbJ2VooS9H...,algorithms > by_data_structure > trees,1,10,1,https://colab.research.google.com/drive/1hTG6I...,{'behaviours': [{'top_level': 'Continuation Fo...,{'summary': 'The user is utilizing the Assista...,"{'top_level': 'Basic scripting', 'sub_level': ..."
532,https://colab.research.google.com/drive/1YW-M2...,https://drive.google.com/file/d/1MVsHt6ALfw348...,unit_testing_methodology > data_quality_tests,1,20,2,https://colab.research.google.com/drive/1YW-M2...,{'behaviours': [{'top_level': 'Continuation Fo...,{'summary': 'The user employs the Assistant to...,"{'top_level': 'Write unit test', 'sub_level': ..."
733,https://colab.research.google.com/drive/1caCIY...,https://drive.google.com/file/d/1NWZjVItaf56kI...,algorithms > by_topic > greedy_algorithms,3,35,2,https://colab.research.google.com/drive/1caCIY...,{'behaviours': [{'top_level': 'Continuation Fo...,{'summary': 'The user is utilizing the Assista...,"{'top_level': 'General coding help', 'sub_leve..."
898,https://colab.research.google.com/drive/1ZjvRQ...,https://drive.google.com/file/d/163aw3WQKYHEg8...,web_development > web_servers,5,30,3,https://colab.research.google.com/drive/1ZjvRQ...,{'behaviours': [{'top_level': 'Continuation Fo...,{'summary': 'The user is utilizing the Assista...,"{'top_level': 'Basic scripting', 'sub_level': ..."
977,https://colab.research.google.com/drive/1dpcio...,https://drive.google.com/file/d/1ATiUtzQ10bna6...,web_development > web_crawling,3,32,3,https://colab.research.google.com/drive/1dpcio...,{'behaviours': [{'top_level': 'Continuation Fo...,{'summary': 'The user is utilizing the Assista...,"{'top_level': 'General coding help', 'sub_leve..."
1658,https://colab.research.google.com/drive/12vH9f...,https://drive.google.com/file/d/12KtrDnC-aoOKW...,unit_testing_methodology > test_ai_and_ml_models,4,25,4,https://colab.research.google.com/drive/12vH9f...,{'behaviours': [{'top_level': 'Continuation Fo...,{'summary': 'The user is utilizing the Assista...,"{'top_level': 'Basic scripting', 'sub_level': ..."
1740,https://colab.research.google.com/drive/1bynoO...,https://drive.google.com/file/d/1UIspm0yAJZg61...,python_language_and_scripting > logging,3,35,4,https://colab.research.google.com/drive/1bynoO...,{'behaviours': [{'top_level': 'Continuation Fo...,{'summary': 'The user is utilizing the Assista...,"{'top_level': 'Basic scripting', 'sub_level': ..."


In [17]:

def extract_summary(data):
    """Return the 'summary' entry of a use-case record.

    Yields '' when the record is null (None/NaN) or has no 'summary' key.
    """
    if pd.isnull(data):
        return ''
    return data.get('summary', '')

def extract_tags(data):
    """Return the record's 'tags' joined with '||'.

    Yields '' when the record is null (None/NaN) or has no 'tags' key.
    """
    if pd.isnull(data):
        return ''
    tags = data.get('tags', [])
    return '||'.join(tags)

# Replace the nested 'use_case_data' dicts with two flat text columns,
# working on a fresh frame so the filtered frame itself is left untouched.
expanded_enriched_df = filtered_extended_delivered_df.assign(
    use_case_summary=lambda frame: frame['use_case_data'].map(extract_summary),
    use_case_tags=lambda frame: frame['use_case_data'].map(extract_tags),
).drop(columns=['use_case_data'])

def extract_area_of_focus(data):
    """Map an area-of-focus record onto the four flat 'area_of_focus_*' fields.

    A null record (None/NaN) is treated like an empty one: the three level
    fields default to '' and the custom-category flag defaults to False.
    """
    record = {} if pd.isnull(data) else data
    return {
        'area_of_focus_top_level': record.get('top_level', ''),
        'area_of_focus_sub_level': record.get('sub_level', ''),
        'area_of_focus_detailed_level': record.get('detailed_level', ''),
        'area_of_focus_custom_category': record.get('new_filled_category', False),
    }

# Turn each area-of-focus dict into a row of flat columns, append those columns
# to the frame, then discard the nested source column.
areas_of_focus_columns = (
    expanded_enriched_df['areas_of_focus_data']
    .map(extract_area_of_focus)
    .apply(pd.Series)
)
expanded_enriched_df = pd.concat(
    [expanded_enriched_df, areas_of_focus_columns], axis=1
).drop(columns=['areas_of_focus_data'])

def extract_behavioural_tags(data):
    """Serialise a record's 'behaviours' list into one delimited string.

    Each behaviour becomes 'top->sub;;detailed;;custom_flag' (missing fields
    default to '' and the flag to False); behaviours are joined with '||'.
    Yields '' when the record is null (None/NaN) or has no 'behaviours' key.
    """
    if pd.isnull(data) or 'behaviours' not in data:
        return ''
    return '||'.join(
        f"{behaviour.get('top_level', '')}->{behaviour.get('sub_level', '')};;"
        f"{behaviour.get('detailed_level', '')};;"
        f"{behaviour.get('new_filled_category', False)}"
        for behaviour in data['behaviours']
    )

# Replace the nested 'behavioural_tags' dicts with their flat string form.
behavioural_tags_column = expanded_enriched_df['behavioural_tags'].map(extract_behavioural_tags)
expanded_enriched_df['user_behaviour_tags'] = behavioural_tags_column
expanded_enriched_df = expanded_enriched_df.drop(columns=['behavioural_tags'])

# Move the custom-category flag to the last column.
cols = list(expanded_enriched_df.columns)
cols.append(cols.pop(cols.index('area_of_focus_custom_category')))
expanded_enriched_df = expanded_enriched_df[cols]

# Derive the batch part of the file name from BATCH_IDS instead of hard-coding
# "1-4", so a run over other batches neither mislabels its output nor
# overwrites the 1-4 file. A contiguous run of ids is written "first-last",
# anything else as the ids joined by "_". With the default [1, 2, 3, 4] the
# file name is unchanged.
batch_ids_sorted = sorted(BATCH_IDS)
if len(batch_ids_sorted) > 1 and batch_ids_sorted == list(
    range(batch_ids_sorted[0], batch_ids_sorted[-1] + 1)
):
    batch_label = f'{batch_ids_sorted[0]}-{batch_ids_sorted[-1]}'
else:
    batch_label = '_'.join(str(batch_id) for batch_id in batch_ids_sorted)
# NOTE(review): there is no separator between the batch label and the size
# label ("1-4" + "2items" reads as "1-42items"); kept as-is so existing file
# names do not change.
size_label = "full" if LIMIT_ITEMS_TO_FIRST_N is None else str(LIMIT_ITEMS_TO_FIRST_N) + "items"

expanded_enriched_df.to_csv(DATA_DIR + f'batches{batch_label}{size_label}-use_case-area_of_focus-behaviour_tagging.csv')
expanded_enriched_df

Unnamed: 0,task_link,jsonl_link,metadata__topic,number_of_turns,duration_mins,batch_id,colab_link,use_case_summary,use_case_tags,area_of_focus_top_level,area_of_focus_sub_level,area_of_focus_detailed_level,user_behaviour_tags,area_of_focus_custom_category
33,https://colab.research.google.com/drive/1Iz1hQ...,https://drive.google.com/file/d/1dp99pw8zVR327...,algorithms > by_topic > bit_manipulation,1,10,1,https://colab.research.google.com/drive/1Iz1hQ...,The user is utilizing the Assistant to underst...,programming||Python||bit manipulation||technic...,Basic scripting,Write simple code in Python,Bit manipulation operations,Continuation Follow up->Incrementally Build;;A...,False
231,https://colab.research.google.com/drive/1hTG6I...,https://drive.google.com/file/d/1NloPbJ2VooS9H...,algorithms > by_data_structure > trees,1,10,1,https://colab.research.google.com/drive/1hTG6I...,The user is utilizing the Assistant to solve a...,programming assistance||educational resource||...,Basic scripting,Write simple code in Python,Merging binary trees,Continuation Follow up->Incrementally Build;;A...,False
532,https://colab.research.google.com/drive/1YW-M2...,https://drive.google.com/file/d/1MVsHt6ALfw348...,unit_testing_methodology > data_quality_tests,1,20,2,https://colab.research.google.com/drive/1YW-M2...,The user employs the Assistant to understand a...,unit testing||Python||date validation||educati...,Write unit test,Cover different languages,Writing a unit test in Python to verify date r...,Continuation Follow up->Incrementally Build;;A...,False
733,https://colab.research.google.com/drive/1caCIY...,https://drive.google.com/file/d/1NWZjVItaf56kI...,algorithms > by_topic > greedy_algorithms,3,35,2,https://colab.research.google.com/drive/1caCIY...,The user is utilizing the Assistant to debug a...,debugging||code optimization||algorithm explan...,General coding help,Debug and fix code,Fixing algorithm for optimal coin change,Continuation Follow up->Incrementally Build;;A...,False
898,https://colab.research.google.com/drive/1ZjvRQ...,https://drive.google.com/file/d/163aw3WQKYHEg8...,web_development > web_servers,5,30,3,https://colab.research.google.com/drive/1ZjvRQ...,The user is utilizing the Assistant to learn h...,Flask setup||Web server creation||Dependency m...,Basic scripting,Write simple code in Python,Setting up a Flask server,Continuation Follow up->Incrementally Build;;A...,False
977,https://colab.research.google.com/drive/1dpcio...,https://drive.google.com/file/d/1ATiUtzQ10bna6...,web_development > web_crawling,3,32,3,https://colab.research.google.com/drive/1dpcio...,The user is utilizing the Assistant to enhance...,web scraping||BeautifulSoup||HTML parsing||CSV...,General coding help,Explain complex code snippets,Web scraping with BeautifulSoup and CSV output,Continuation Follow up->Incrementally Build;;U...,False
1658,https://colab.research.google.com/drive/12vH9f...,https://drive.google.com/file/d/12KtrDnC-aoOKW...,unit_testing_methodology > test_ai_and_ml_models,4,25,4,https://colab.research.google.com/drive/12vH9f...,The user is utilizing the Assistant to underst...,neural network testing||dropout layer simulati...,Basic scripting,ML code snippets,Simulating dropout during neural network testing,Continuation Follow up->Incrementally Build;;A...,False
1740,https://colab.research.google.com/drive/1bynoO...,https://drive.google.com/file/d/1UIspm0yAJZg61...,python_language_and_scripting > logging,3,35,4,https://colab.research.google.com/drive/1bynoO...,The user is utilizing the Assistant to underst...,logging||rotating file handler||Python||log ma...,Basic scripting,Write simple code in Python,Logging configuration with RotatingFileHandler,Continuation Follow up->Request for clarificat...,False


In [18]:
# Print the raw analytics dicts of the first filtered row, one per line.
for column in ('use_case_data', 'areas_of_focus_data', 'behavioural_tags'):
    print(filtered_extended_delivered_df[column].iloc[0])


{'summary': "The user is utilizing the Assistant to understand and implement a programming technique in Python, specifically for clearing the lowest set bit of an integer. The user's use case involves seeking step-by-step guidance and a clear explanation of the concept, followed by a practical example of how to apply the technique in code. The Assistant provides a detailed explanation of the operation, a code snippet to demonstrate the method, and an example to illustrate the result of the operation.", 'tags': ['programming', 'Python', 'bit manipulation', 'technical guidance', 'code example']}
{'top_level': 'Basic scripting', 'sub_level': 'Write simple code in Python', 'detailed_level': 'Bit manipulation operations', 'new_filled_category': False}
{'behaviours': [{'top_level': 'Continuation Follow up', 'sub_level': 'Incrementally Build', 'detailed_level': 'Assistant provides a step-by-step explanation and example code to clear the lowest set bit.', 'new_filled_category': False}, {'top_l

In [19]:
# Re-read the delivered data for the configured batches. Uses BATCH_IDS rather
# than a second hard-coded id list so this cell cannot drift from the config.
# NOTE(review): `no_work=True` presumably reuses previously downloaded output
# instead of fetching again — confirm in jsonl_dump. This rebinds delivered4,
# delivered_df and conversations from the earlier cells.
delivered4 = download_parse_delivered_into_jsonl(BATCH_IDS, no_work=True)
delivered_df = delivered4['delivered_df']
conversations = delivered4['conversations']

In [20]:
# Behaviour taxonomy grouped as top level -> sub level -> detailed levels.
# Further categories can be added under "General" or any other top level.
behaviour_taxonomy = {
    # User Behaviors
    "User": {
        "Emotional Response": ["Confusion", "Frustration"],
        "Behavior": ["Self-Contradiction", "Error Identification", "Making Errors"],
        "Feedback": ["Satisfaction Expression", "Compliment Offering"],
        "Intent": ["Information Seeking", "Transactional", "Casual Interaction"],
        "Conversation Progression": [
            "Building Upon Discussion",
            "Extending Scope",
            "Seeking Clarification",
            "Exploring Alternatives",
        ],
        "Conversation Direction": ["Topic Shift", "Goal Reassessment"],
    },
    # Assistant Behaviors
    "Assistant": {
        "Interaction": ["Providing Information", "Concluding Interaction", "Resolving Ambiguities"],
        "Performance": ["Misunderstanding", "Accurate Information", "Issue Escalation"],
        "Technical Support": ["Issue Identification", "Support Provision"],
    },
    # Conversation Closure
    "General": {
        "Conversation Closure": [
            "Satisfied Conclusion",
            "Unsatisfied Conclusion",
            "Inactive Termination",
        ],
    },
}

# Flatten to (top level, sub level, detailed level) tuples in declaration order.
data = [
    (top_level, sub_level, detailed_level)
    for top_level, sub_levels in behaviour_taxonomy.items()
    for sub_level, detailed_levels in sub_levels.items()
    for detailed_level in detailed_levels
]


import pandas as pd

# Tabulate the three-element tuples in `data`, one row per tuple, and display the frame.
behavioural_tags_df2 = pd.DataFrame(data, columns=["Top_Level_Category", "Sub_Level_Category", "Detailed_Level_Category"])
behavioural_tags_df2






Unnamed: 0,Top_Level_Category,Sub_Level_Category,Detailed_Level_Category
0,User,Emotional Response,Confusion
1,User,Emotional Response,Frustration
2,User,Behavior,Self-Contradiction
3,User,Behavior,Error Identification
4,User,Behavior,Making Errors
5,User,Feedback,Satisfaction Expression
6,User,Feedback,Compliment Offering
7,User,Intent,Information Seeking
8,User,Intent,Transactional
9,User,Intent,Casual Interaction
