In [1]:
!pip install spacy google-cloud-storage google-cloud-bigquery
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m75.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [13]:
import re
import spacy
import pandas as pd
from spacy.matcher import Matcher
from google.cloud import storage, bigquery
from datetime import datetime
import io

In [14]:
# Load the small English spaCy pipeline; it supplies the tokenizer and vocab
# used by the Matcher below.
nlp = spacy.load("en_core_web_sm")

# Token-level patterns for boilerplate phrases that should be stripped from
# answers. "LOWER" compares against the lowercased token text, so matching is
# case-insensitive.
patterns = [
    [
        {"LOWER": "thank"}, {"LOWER": "you"}, {"LOWER": "for"},
        {"LOWER": "your"}, {"LOWER": "query"},
    ],
    [
        {"LOWER": "i"}, {"LOWER": "understand"},
        {"LOWER": "your"}, {"LOWER": "concern"},
    ],
    [
        {"LOWER": "sorry"}, {"IS_PUNCT": True}, {"LOWER": "the"},
        {"LOWER": "images"}, {"LOWER": "were"}, {"LOWER": "not"},
        {"LOWER": "clear"},
    ],
    [
        {"LOWER": "attachment"}, {"LOWER": "removed"}, {"LOWER": "to"},
        {"LOWER": "protect"}, {"LOWER": "patient"}, {"LOWER": "identity"},
    ],
]

# Register all patterns under a single rule name in one call.
matcher = Matcher(nlp.vocab)
matcher.add("UNWANTED_PHRASE", patterns)

In [16]:
def list_gcs_files(bucket_name, prefix=""):
    """Return the names of the ``.json`` blobs in a GCS bucket.

    Args:
        bucket_name: Name of the Cloud Storage bucket to list.
        prefix: Only blobs whose name starts with this prefix are listed.

    Returns:
        A list of blob names (strings) ending in ``.json``.
    """
    client = storage.Client()
    json_names = []
    for blob in client.bucket(bucket_name).list_blobs(prefix=prefix):
        # Skip anything under the prefix that is not a JSON file.
        if blob.name.endswith('.json'):
            json_names.append(blob.name)
    return json_names

def load_json_from_gcs(bucket_name, file_name):
    """Download a JSON-lines blob from GCS and parse it into a DataFrame.

    Args:
        bucket_name: Name of the Cloud Storage bucket.
        file_name: Name (path) of the blob inside the bucket.

    Returns:
        A pandas DataFrame built by reading the blob's text with
        ``lines=True`` (one JSON record per line).
    """
    client = storage.Client()
    raw_text = client.bucket(bucket_name).blob(file_name).download_as_text()
    # Wrap the text in a file-like buffer so read_json treats it as content
    # rather than as a path.
    buffer = io.StringIO(raw_text)
    return pd.read_json(buffer, lines=True)

def remove_sentences_with_keywords(text, keywords=None):
    """Remove every sentence that contains one of the given keywords.

    The text is split into sentences after ``.``, ``!`` or ``?`` followed by
    whitespace. A sentence is dropped when any keyword occurs in it as a whole
    word or phrase, compared case-insensitively. The surviving sentences keep
    their original casing and are re-joined with single spaces.

    Args:
        text: The input string.
        keywords: Iterable of keyword strings. ``None`` selects the built-in
            list of greeting/sign-off words; an empty list disables filtering.

    Returns:
        The text with the matching sentences removed.
    """
    if keywords is None:
        keywords = ["chat doctor", "welcome", "thanks", "thank you", "concerns", "hello", "regards"]
    if not keywords:
        # An empty alternation would match at every word boundary and wipe
        # out the whole text, so there is nothing to filter here.
        return text
    # Case-insensitivity is handled by the regex flag, so the text itself does
    # not need to be lowercased (which would destroy the casing of the output).
    pattern = re.compile(
        r'\b(?:' + '|'.join(map(re.escape, keywords)) + r')\b',
        re.IGNORECASE,
    )
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return ' '.join(sentence for sentence in sentences if not pattern.search(sentence))

def remove_unwanted_phrases(text):
    """Strip phrases matched by the module-level spaCy ``matcher`` from ``text``.

    The text is tokenised with ``nlp``; for each match, every occurrence of
    the matched span's text is deleted from the string.

    Args:
        text: The input string.

    Returns:
        The text with matched phrases removed and outer whitespace stripped.
    """
    doc = nlp(text)
    cleaned = text
    for _match_id, start, end in matcher(doc):
        # Remove the matched phrase by its surface text.
        cleaned = cleaned.replace(doc[start:end].text, "")
    return cleaned.strip()

def clean_answer(text):
    """Apply cleaning transformations to the Answer field.

    Steps, in order:
      1. Drop sentences containing greeting/sign-off keywords.
      2. Remove boilerplate phrases found by the spaCy matcher.
      3. Remove the "(attachment removed ...)" placeholder.
      4. Collapse whitespace, strip every character other than letters,
         digits, ``.``, ``,``, ``?`` and space, then collapse the spaces the
         stripping may have left behind.
      5. Reject answers whose cleaned text has fewer than 20 words.

    Args:
        text: Raw answer text. Non-string values (e.g. ``None`` or NaN from a
            missing field) are treated as "no answer".

    Returns:
        The cleaned answer string, or ``None`` when the input is not a string
        or the cleaned text is shorter than 20 words.
    """
    if not isinstance(text, str):
        # Missing values arrive as None/NaN; string methods would fail on them.
        return None
    text = remove_sentences_with_keywords(text)
    text = remove_unwanted_phrases(text)
    text = re.sub(r'\(attachment removed to protect the patient\'s identity\)', '', text, flags=re.IGNORECASE)
    # Normalise whitespace first so newlines/tabs become spaces instead of
    # being deleted (which would glue neighbouring words together).
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^A-Za-z0-9.,? ]+', '', text)
    # Stripping a symbol-only token leaves two adjacent spaces; collapse them.
    text = re.sub(r' {2,}', ' ', text).strip()
    # Measure the length on the final text so the threshold reflects what is
    # actually returned.
    if len(text.split()) < 20:
        return None
    return text

def process_json_files(bucket_name, prefix, table_id):
    """Clean every JSON file under a GCS prefix and load the result to BigQuery.

    For each ``.json`` blob found under ``prefix`` the file is read into a
    DataFrame, the ``output`` column is cleaned into ``Answer`` (rows whose
    cleaned answer is ``None`` are dropped), ``input`` is renamed to
    ``Question``, and the ``source``, ``last_updated_date`` and ``Title``
    columns are added. All per-file frames are concatenated and loaded into
    ``table_id`` in a single load job.

    Args:
        bucket_name: Name of the Cloud Storage bucket holding the raw files.
        prefix: Blob-name prefix to process.
        table_id: Destination BigQuery table identifier.

    Returns:
        None. If no JSON files are found, a message is printed and nothing is
        loaded.
    """
    file_names = list_gcs_files(bucket_name, prefix)
    if not file_names:
        # pd.concat raises ValueError on an empty list, so stop early with a
        # clear message instead.
        print(f"No JSON files found in bucket '{bucket_name}' under prefix '{prefix}'; nothing to load.")
        return

    output_columns = ['Title', 'Question', 'Answer', 'source', 'last_updated_date']
    all_data = []

    for file_name in file_names:
        print(f"Processing file: {file_name}")
        df = load_json_from_gcs(bucket_name, file_name)

        # Apply cleaning and transformations
        df['Answer'] = df['output'].apply(clean_answer)
        df = df.dropna(subset=['Answer']).reset_index(drop=True)
        df = df.rename(columns={'input': 'Question'})
        df['source'] = 'healthcaremagic'
        # Store today's date as a datetime column.
        df['last_updated_date'] = datetime.now().date()
        df['last_updated_date'] = pd.to_datetime(df['last_updated_date'], errors='coerce')
        df['Title'] = ''

        # Select and reorder columns
        all_data.append(df[output_columns])

    combined_df = pd.concat(all_data, ignore_index=True)
    load_to_bigquery(combined_df, table_id)
    print("All data loaded successfully.")

def load_to_bigquery(df, table_id):
    """Load a DataFrame into a BigQuery table and wait for completion.

    Args:
        df: The pandas DataFrame to upload.
        table_id: Destination BigQuery table identifier.
    """
    bq_client = bigquery.Client()
    load_job = bq_client.load_table_from_dataframe(df, table_id)
    # Wait for the load job to finish before reporting success.
    load_job.result()
    print("Data loaded successfully into BigQuery.")



In [17]:
# Pipeline configuration: source bucket, blob prefix and destination table.
# The prefix spelling matches the folder name that actually exists in the bucket.
bucket_name = "raw_dataset_genai"
prefix = "heathcaremagic/"
table_id = "health-ai-agent-sjsu.transformed_data.all_merged"

# Run the end-to-end clean-and-load job.
process_json_files(
    bucket_name=bucket_name,
    prefix=prefix,
    table_id=table_id,
)


Processing file: heathcaremagic/HealthCareMagic_chunk_5_1.json
Data loaded successfully into BigQuery.
All data loaded successfully.
