In [5]:
import sys
import os
import pandas as pd


# Make the project root (three levels above the current working directory)
# importable so the `config` and `src` packages resolve in a Jupyter or
# interactive session.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../../"))
if project_root not in sys.path:
    sys.path.append(project_root)

from config.path_config import DATA_DIR, PROJECT_DIR, DEPENDENCIES_DIR
from src.helper_functions.data_pre_processing.data_upload_processor.processor import *
from src.helper_functions.database_upload.postgres_uploader import *

from dotenv import load_dotenv

In [6]:
# Name of the YAML mapping file that is later handed to load_full_survey_dataset.
# NOTE(review): this is a bare relative name, so it presumably resolves against
# the kernel's working directory — confirm where the loader looks for it.
yaml_file_path = "testing_mapping.yaml"
print(yaml_file_path)

testing_mapping.yaml


In [4]:
# Resolve both wave-10 workbooks under DATA_DIR first, then read them.
responses_path = os.path.join(DATA_DIR, "wave_10_responses.xlsx")
mapping_path = os.path.join(DATA_DIR, "wave_10_question_mapping.xlsx")

# raw_data holds the survey responses; data_map holds the question mapping.
raw_data = pd.read_excel(responses_path)
data_map = pd.read_excel(mapping_path)

In [7]:
# Load variables from the .env file in the dependencies directory, then read
# the OpenAI key that is passed to the question-data pipeline.
env_path = os.path.join(DEPENDENCIES_DIR, ".env")
load_dotenv(env_path)
api_key = os.getenv("OPENAI_API_KEY")

# Fail here with a clear message instead of handing None (or an empty string)
# to the pipeline and getting an obscure error further down.
if not api_key:
    raise RuntimeError(
        f"OPENAI_API_KEY is not set; expected it in the environment or in {env_path}"
    )

In [8]:
# Run the question-data pipeline on the mapping and response frames.
# The final argument is the wave label (it shows up in the generated
# primary keys such as "wave_10_0").
data_dict = run_question_data_pipeline(
    data_map,
    raw_data,
    api_key,
    'wave_10',
)


🚨 DEBUG: Inside process_question_guide
Shape of data_map: (786, 2)
Columns in data_map: ['Q80', 'Where do you live?']
First 5 rows:
   Q80 Where do you live?
0   1      United States
1   2     United Kingdom
2   3              Italy
3   4             France
4   5            Germany


  text_splitter = SemanticChunker(OpenAIEmbeddings(api_key=api_key), breakpoint_threshold_type="standard_deviation")


In [9]:
# Build the Postgres engine used by every upload/load cell below.
# Connection settings are taken from the standard libpq environment variables
# when they are set (for example through the .env file loaded earlier), so
# credentials do not have to live in the notebook. The fallbacks are exactly
# the values this cell used to hard-code, so behaviour is unchanged when the
# variables are absent.
# NOTE(review): the positional order is assumed to be
# (user, password, host, port, database) — confirm against the signature of
# create_postgres_engine.
engine = create_postgres_engine(
    os.getenv("PGUSER", "postgres"),
    os.getenv("PGPASSWORD", "postgres"),
    os.getenv("PGHOST", "localhost"),
    int(os.getenv("PGPORT", "5432")),  # the original passed the port as an int
    os.getenv("PGDATABASE", "cl_survey_data"),
)

In [10]:
# Show which entries the pipeline returned.
data_dict.keys()

dict_keys(['question_guide', 'mapped_data', 'type_subtype', 'question_dict', 'embeddings_metadata_df', 'embedding_df'])

In [19]:
# Inspect the 'embedding_df' entry of the pipeline output.
data_dict['embedding_df']

Unnamed: 0,primary_key,chunk_embedding
0,wave_10_0,"[0.008924829303182252, -0.011800192605846187, ..."
1,wave_10_1,"[-0.0014093561579214353, -0.00326208426778693,..."
2,wave_10_2,"[0.0013897184133592167, -0.0027689482725226987..."
3,wave_10_3,"[0.005397405175799886, 0.010119272765629822, -..."
4,wave_10_4,"[0.01133035702695665, -0.010589252418149469, -..."
...,...,...
73,wave_10_73,"[-0.02961198816351107, -0.007391252131821712, ..."
74,wave_10_74,"[-0.027036405265667418, -0.02393788211700151, ..."
75,wave_10_75,"[-0.011293693201839913, -0.041105235970841536,..."
76,wave_10_76,"[0.0036923528226332934, -0.04142120753422593, ..."


In [11]:
# Pipeline outputs to upload, paired with the table name each one is pushed under.
table_uploads = (
    ('embeddings_metadata_df', 'wave_10_embeddings_metadata'),
    ('type_subtype', 'wave_10_type_subtype'),
    ('question_guide', 'wave_10_question_guide'),
)
for dict_key, table_name in table_uploads:
    push_dataframe_to_postgres_db(data_dict[dict_key], file_name=table_name, engine=engine)

# Hand the mapped response data to process_raw_data together with the wave
# label, the engine, the respondent id column and a chunk size of 60.
process_raw_data(
    data_dict['mapped_data'],
    'wave_10',
    engine,
    response_id_col='Respondent',
    chunk_size=60,
)

2025-04-14 13:30:03.609 
  command:

    streamlit run C:\Users\73315\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\LocalCache\local-packages\Python313\site-packages\ipykernel_launcher.py [ARGUMENTS]


In [12]:
# Fetch the full 'wave_10' survey dataset through the engine, passing the
# YAML mapping path and max_chunks=100.
a = load_full_survey_dataset(
    engine,
    'wave_10',
    yaml_file_path,
    max_chunks=100,
)



In [13]:
# List the entries returned by load_full_survey_dataset.
a.keys()

dict_keys(['mapped_raw_data', 'type_subtype', 'question_guide', 'embeddings_metadata'])

In [14]:
# Display the 'mapped_raw_data' entry of the loaded dataset.
a['mapped_raw_data']

Unnamed: 0,Respondent,Status,Term_reason,Start_time_GMT,End_time_GMT,Panel,country,first_param,second_param,income_class,...,Q78.AI_will_replace_my_job,Q78.AI_will_steal_my_identity__e.g.__replicate_my_voice__facial_recognition_,Q78.AI_will_monitor_my_daily_activities_and_movements,Q78.AI_will_spread_false_information,Q78.AI_will_turn_against_humans,Q78.AI_will_use_my_data_to_help_employers__lenders__etc._evaluate_me,Q78.AI_will_replace_human_interactions_with_family___friends,Q78.None_of_the_above_,Q79,Q79__user_input
0,5AfmucpVJF63JuU32MC8Bg**,Qualified,,2024-10-01 14:13:15.565,2024-10-01 14:22:36.490,Dynata - France,4,Charente-Maritime,Nouvelle-Aquitaine,3,...,,,,AI will replace my job,,"AI will steal my identity (e.g., replicate my ...",AI will monitor my daily activities and movements,,No one,
1,5AfmucpVJF6BwCDVcf_cQw**,Qualified,,2024-10-01 14:14:50.335,2024-10-01 14:24:07.254,Dynata - Italy,3,Ravenna,Emilia-Romagna,2,...,,AI will replace my job,,"AI will steal my identity (e.g., replicate my ...",AI will monitor my daily activities and movements,,,,"Cyber security firms (e.g., McAfee, Norton, Bi...",
2,5AfmucpVJF5qH3Ly_L-u1Q**,Qualified,,2024-10-01 14:16:36.935,2024-10-01 14:24:34.457,Dyanta - Spain,7,Madrid,Madrid,3,...,,,AI will replace my job,,,AI will monitor my daily activities and movements,"AI will steal my identity (e.g., replicate my ...",,Myself or a family member,
3,5AfmucpVJF4WooWl3om3mw**,Qualified,,2024-10-01 14:15:35.866,2024-10-01 14:24:42.186,Dynata - Poland,6,Małopolskie,Powiat proszowicki,1,...,"AI will steal my identity (e.g., replicate my ...",AI will replace my job,,,,AI will monitor my daily activities and movements,,,No one,
4,5AfmucpVJF6P1Eb2aefL1A**,Qualified,,2024-10-01 14:16:47.263,2024-10-01 14:25:02.217,Dynata - UK,2,E30000254,E12000008,1,...,,,,,,,,1.0,No one,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10495,xjUBnUXDVJ3QeEngCPefXw**,Qualified,,2024-10-11 14:25:45.250,2024-10-11 14:59:56.577,Dynata - Poland,6,Pomorskie,Powiat wejherowski,2,...,,,AI will monitor my daily activities and movements,AI will replace my job,"AI will steal my identity (e.g., replicate my ...",,,,No one,
10496,xjUBnUXDVJ1oT0NhUlBY6g**,Qualified,,2024-10-11 14:40:46.093,2024-10-11 15:00:57.319,Dynata - Poland,6,Lubelskie,"Chełm County, Chełm",3,...,AI will replace my job,"AI will steal my identity (e.g., replicate my ...",,,AI will monitor my daily activities and movements,,,,"Big technology companies (e.g., Google, Apple,...",
10497,xjUBnUXDVJ1HREX-hkINKQ**,Qualified,,2024-10-11 14:27:23.980,2024-10-11 15:02:17.112,Dynata - Poland,6,Mazowieckie,Warszawa,1,...,,,,"AI will steal my identity (e.g., replicate my ...",AI will monitor my daily activities and movements,,AI will replace my job,,Grassroots activist groups “Hacktivists” (e.g....,
10498,xjUBnUXDVJ0lCWdkspmwQQ**,Qualified,,2024-10-11 14:26:29.472,2024-10-11 15:05:30.095,Dynata - Poland,6,Wielkopolskie,Poznań,2,...,,"AI will steal my identity (e.g., replicate my ...",,AI will replace my job,,,AI will monitor my daily activities and movements,,"Cyber security firms (e.g., McAfee, Norton, Bi...",


In [15]:
# Display the 'question_guide' entry of the loaded dataset.
a['question_guide']

Unnamed: 0,Question_code,Question_string,answer_code,answer_string
0,Q4,What is your age?,,What is your age?
1,Q5,What is your <zip code|post code (Outward or f...,,What is your <zip code|post code (Outward or f...
2,Q20,Please describe what are you most excited / op...,,Please describe what are you most excited / op...
3,Q21,Please describe what are you most worried / pe...,,Please describe what are you most worried / pe...
4,Q30,You said your household typically eats dinner ...,,You said your household typically eats dinner ...
...,...,...,...,...
620,Q79,Who would you trust the most to protect you fr...,4,"Cyber security firms (e.g., McAfee, Norton, Bi..."
621,Q79,Who would you trust the most to protect you fr...,5,Grassroots activist groups “Hacktivists” (e.g....
622,Q79,Who would you trust the most to protect you fr...,6,Myself or a family member
623,Q79,Who would you trust the most to protect you fr...,7,No one


In [16]:
# Display the 'embeddings_metadata' entry of the loaded dataset.
a['embeddings_metadata']

Unnamed: 0,question_code,question_text,question_with_answers,answer_options,source,chunk_text,primary_key
0,Q10,What is your current employment status?,What is your current employment status? Full-t...,Full-time employed in one job (working >30 hou...,wave_10,What is your current employment status? Full-t...,wave_10_0
1,Q11,What was your total household income before ta...,What was your total household income before ta...,"Less than <currency>25,000|| <currency>25,000 ...",wave_10,What was your total household income before ta...,wave_10_1
2,Q12,What was your total personal income before tax...,What was your total personal income before tax...,"Less than <currency>25,000|| <currency>25,000 ...",wave_10,What was your total personal income before tax...,wave_10_2
3,Q13,What is your household / family situation? Ple...,What is your household / family situation? Ple...,I have no children|| I have a child(ren) under...,wave_10,What is your household / family situation? Ple...,wave_10_3
4,Q14,Who do you live with? Please select all that a...,Who do you live with? Please select all that a...,Live alone|| Live with my partner|| Live with ...,wave_10,Who do you live with? Please select all that a...,wave_10_4
...,...,...,...,...,...,...,...
73,Q77,How often do you use the following with the su...,How often do you use the following with the su...,1\nNot at all|| 2|| 3|| 4|| 5\nVery often|| I ...,wave_10,How often do you use the following with the su...,wave_10_73
74,Q78,Please rank the top 3 potential downsides of A...,Please rank the top 3 potential downsides of A...,AI will replace my job|| AI will steal my iden...,wave_10,Please rank the top 3 potential downsides of A...,wave_10_74
75,Q79,Who would you trust the most to protect you fr...,Who would you trust the most to protect you fr...,Governments|| International organizations (e.g...,wave_10,Who would you trust the most to protect you fr...,wave_10_75
76,Q8,If you relocated (moved to a new place and est...,If you relocated (moved to a new place and est...,I have not relocated in the past year|| I was ...,wave_10,If you relocated (moved to a new place and est...,wave_10_76


In [17]:
# Display the 'type_subtype' entry of the loaded dataset.
a['type_subtype']

Unnamed: 0,Question_no.,Question,Type,Sub-type
0,Q4,What is your age?,Multiple choice,single-select
1,Q5,What is your <zip code|post code (Outward or f...,Multiple choice,single-select
2,Q20,Please describe what are you most excited / op...,Multiple choice,single-select
3,Q21,Please describe what are you most worried / pe...,Multiple choice,single-select
4,Q30,You said your household typically eats dinner ...,Matrix,single-select
...,...,...,...,...
73,Q75,How excited are you about AI’s ability to impr...,Matrix,single-select
74,Q76,How much do you trust the following if they we...,Matrix,single-select
75,Q77,How often do you use the following with the su...,Matrix,single-select
76,Q78,Please rank the top 3 potential downsides of A...,Matrix,single-select


In [18]:
# Show the wave names that get_wave_n_list reports for this engine.
get_wave_n_list(engine)

['wave_10']