# From JSON dump to Parquet with embeddings

In [4]:
# Load IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
import os
import json
import pandas as pd

from functions.functions_cleancode import remove_soft_hyphens, create_embeddings_in_batches
from llm.setup import create_azure_client

## Data

In [None]:
# Directory containing the JSON dump files
json_dir = '../data/json_files'

# Cleaned records gathered from every JSON file
all_cleaned_data = []

# os.listdir returns entries in arbitrary order. Sorting makes the run
# reproducible: drop_duplicates below keeps the FIRST occurrence of each id,
# so the surviving row would otherwise depend on filesystem ordering.
for filename in sorted(os.listdir(json_dir)):
    if not filename.endswith('.json'):
        continue

    file_path = os.path.join(json_dir, filename)

    # Load the JSON file
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Clean the data with the project helper and add this file's records
    # to the combined list
    cleaned_data = remove_soft_hyphens(data)
    all_cleaned_data.extend(cleaned_data)

# Combine all cleaned data into a single DataFrame with one row per id.
# reset_index(drop=True) restores a contiguous 0..n-1 index; without it the
# dropped duplicates leave gaps, and label-based lookups such as
# df['description'][i] raise KeyError on the missing labels.
df = (
    pd.DataFrame(all_cleaned_data)
    .drop_duplicates(subset=['id'])
    .reset_index(drop=True)
)
print(f'The number of news articles: {len(df)}')

# Display the first few rows of the DataFrame
display(df.head())

In [None]:
# Combine 'title' and 'summary' into a new column 'description':
# the title alone when the summary is missing, otherwise "<title>. <summary>".
# Series.where keeps the title where the condition (summary is missing) holds
# and takes the concatenation everywhere else; this is the vectorized
# equivalent of a row-wise apply.
titles = df['title']
summaries = df['summary']
df['description'] = titles.where(summaries.isna(), titles + '. ' + summaries)

# Preview the first descriptions. head(5) selects by POSITION, so it works
# when the index has gaps (e.g. after drop_duplicates) and when there are
# fewer than five rows; df['description'][i] looks up index LABELS and raises
# KeyError in both situations.
for description in df['description'].head(5):
    print("- " + description)

# Independent single-column frame used as input for the embedding step
df_description = df[['description']].copy()

## Embeddings

Embed the text using the Azure OpenAI endpoint and the **text-embedding-3-large** model.

In [None]:
# Create the Azure client that is passed to the embedding helper below.
# NOTE(review): async_mode=False presumably selects a synchronous client —
# confirm against llm.setup.create_azure_client.
client = create_azure_client(async_mode=False)

In [None]:
# Create embeddings for the rows of df_description using the Azure client.
# NOTE(review): this call is active (not commented out), so every run of this
# cell invokes the embedding helper again; comment the line out to skip it.
# The result is later passed to df_description.index.map, so it is presumably
# a mapping keyed by the DataFrame's index labels — TODO confirm in
# functions.functions_cleancode.create_embeddings_in_batches.
all_embeddings = create_embeddings_in_batches(df=df_description, llm_client=client)

In [None]:
# Add the embeddings to the DataFrame by looking up each index label in
# all_embeddings
df_description['embeddings'] = df_description.index.map(all_embeddings)

# When all_embeddings is a dict or Series, index.map yields NaN for every
# label it does not contain. Report that instead of silently writing an
# incomplete file.
missing_count = int(df_description['embeddings'].isna().sum())
if missing_count:
    print(f'Warning: {missing_count} rows have no embedding')

# Save to a parquet file. The output directory is created first because
# to_parquet does not create missing parent directories.
output_path = '../data/embeddings/embeddings.parquet'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_description.to_parquet(output_path, index=False)