In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm



Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m78.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import re
import spacy
import pandas as pd
from spacy.matcher import Matcher
from google.cloud import storage, bigquery
from datetime import datetime

In [None]:
# Load the spaCy pipeline used to tokenize answers before phrase matching.
nlp = spacy.load("en_core_web_sm")

# Token-based matcher sharing the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

# Boilerplate phrases to strip from answers. Each pattern is a sequence of token
# specs: LOWER compares the lowercased token text (case-insensitive match) and
# IS_PUNCT matches a single punctuation token (e.g. the comma after "hello").
patterns = [
    [{"LOWER": "thank"}, {"LOWER": "you"}, {"LOWER": "for"}, {"LOWER": "your"}, {"LOWER": "query"}],
    [{"LOWER": "i"}, {"LOWER": "understand"}, {"LOWER": "your"}, {"LOWER": "concern"}],
    [{"LOWER": "sorry"}, {"IS_PUNCT": True}, {"LOWER": "the"}, {"LOWER": "images"}, {"LOWER": "were"}, {"LOWER": "not"}, {"LOWER": "really"}, {"LOWER": "very"}, {"LOWER": "clear"}],
    [{"LOWER": "attachment"}, {"LOWER": "removed"}, {"LOWER": "to"}, {"LOWER": "protect"}, {"LOWER": "patient"}, {"LOWER": "identity"}],
    [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "welcome"}, {"LOWER": "to"}, {"LOWER": "icliniq.com"}],
    [{"LOWER": "i"}, {"LOWER": "read"}, {"LOWER": "your"}, {"LOWER": "query"}, {"LOWER": "and"}, {"LOWER": "understand"}, {"LOWER": "your"}, {"LOWER": "concern"}],
    [{"LOWER": "hi"}, {"IS_PUNCT": True}, {"LOWER": "glad"}, {"LOWER": "to"}, {"LOWER": "hear"}, {"LOWER": "from"}, {"LOWER": "you"}],
    [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "welcome"}, {"LOWER": "to"}, {"LOWER": "icliniq"}],
    [{"LOWER": "thank"}, {"LOWER": "you"}, {"LOWER": "for"}, {"LOWER": "stopping"}, {"LOWER": "by"}]
]

# Register the patterns with the matcher. Without this call the matcher holds no
# rules, so matcher(doc) returns no matches and no phrase is ever removed.
matcher.add("UNWANTED_PHRASES", patterns)

In [None]:
def list_gcs_files(bucket_name, prefix=""):
    """Return the names of all ``.csv`` objects in a GCS bucket under ``prefix``.

    Args:
        bucket_name: Name of the Cloud Storage bucket to scan.
        prefix: Object-name prefix passed to the listing call; the default
            empty string applies no prefix restriction.

    Returns:
        A list of object names (strings) that end with ``.csv``.
    """
    client = storage.Client()
    csv_names = []
    for blob in client.bucket(bucket_name).list_blobs(prefix=prefix):
        # Keep only CSV objects; anything else under the prefix is ignored.
        if blob.name.endswith('.csv'):
            csv_names.append(blob.name)
    return csv_names

import io

def load_csv_from_gcs(bucket_name, file_name):
    """Read one CSV object from a GCS bucket into a pandas DataFrame.

    Args:
        bucket_name: Name of the Cloud Storage bucket holding the object.
        file_name: Full object name (path inside the bucket) of the CSV file.

    Returns:
        The parsed CSV as a ``pandas.DataFrame``.
    """
    blob = storage.Client().bucket(bucket_name).blob(file_name)

    # Fetch the whole object as text and parse it from an in-memory buffer.
    buffer = io.StringIO(blob.download_as_text())
    return pd.read_csv(buffer)

def remove_sentences_with_keywords(text, keywords=None):
    """Drop every sentence of ``text`` that mentions one of ``keywords``.

    Sentences are split on whitespace that follows ``.``, ``!`` or ``?``.
    A sentence is discarded when any keyword occurs in it as a whole word or
    phrase, compared case-insensitively; the remaining sentences are joined
    back together with single spaces.

    Args:
        text: The string to filter.
        keywords: Literal words/phrases to look for. When ``None``, a built-in
            list of boilerplate terms is used.

    Returns:
        The text made of the sentences that contain none of the keywords.
    """
    if keywords is None:
        keywords = ["patients identity", "welcome", "icliniq.com", "more details", "concern", "thank you", "picture", "concerns", "iclinq.com"]
    # Escape each keyword so it is matched literally, then OR them together.
    alternation = '|'.join(re.escape(word) for word in keywords)
    keyword_re = re.compile(r'(?i)\b(?:' + alternation + r')\b')
    kept = []
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if keyword_re.search(sentence) is None:
            kept.append(sentence)
    return ' '.join(kept)

def remove_unwanted_phrases(text):
    """Strip phrases found by the module-level spaCy ``matcher`` from ``text``.

    The text is parsed with ``nlp``; the surface string of every match is then
    deleted from the original text with a plain (case-sensitive) string
    replacement, and surrounding whitespace is trimmed.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all matched phrases removed and outer whitespace stripped.
    """
    doc = nlp(text)
    # Gather the matched surface strings first, then delete each one from the raw text.
    matched_texts = [doc[start:end].text for _match_id, start, end in matcher(doc)]
    cleaned = text
    for phrase in matched_texts:
        cleaned = cleaned.replace(phrase, "")
    return cleaned.strip()

def clean_answer(text, min_words=20):
    """Apply cleaning transformations to the Answer field.

    Steps, in order: drop sentences containing boilerplate keywords, remove
    phrases found by the spaCy matcher, remove the "(attachment removed ...)"
    marker, reject answers that are too short, collapse whitespace, delete
    every character other than letters, digits, ``.``, ``,``, ``?`` and space,
    and finally strip any leading non-letter characters.

    Args:
        text: Raw answer text. Non-string values (e.g. ``None`` or the NaN that
            pandas uses for an empty CSV cell) are treated as missing.
        min_words: Minimum number of whitespace-separated words the answer must
            still have after phrase removal; shorter answers are rejected.

    Returns:
        The cleaned answer string, or ``None`` when the input is not a string
        or the cleaned answer has fewer than ``min_words`` words.
    """
    # The regex and spaCy helpers below only accept strings; a missing value
    # would otherwise raise inside them, so report it as "no usable answer".
    if not isinstance(text, str):
        return None
    text = remove_sentences_with_keywords(text)
    text = remove_unwanted_phrases(text)
    text = re.sub(r'\(attachment removed to protect the patient\'s identity\)', '', text, flags=re.IGNORECASE)
    # The length check runs before character stripping, so it counts raw
    # whitespace-separated tokens.
    if len(text.split()) < min_words:
        return None
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^A-Za-z0-9.,? ]+', '', text)
    return re.sub(r'^[^A-Za-z]+', '', text)

def process_and_load_data(bucket_name, prefix, table_id):
    """Process all CSVs in the GCS bucket under a specific prefix and load to BigQuery.

    For every CSV found, the ``Answer`` column is cleaned with ``clean_answer``,
    rows whose cleaned answer is missing are dropped, and ``source`` and
    ``last_updated_date`` columns are added. All files are then concatenated
    and loaded into ``table_id`` in a single call to ``load_to_bigquery``.

    Args:
        bucket_name: Name of the Cloud Storage bucket to read from.
        prefix: Object-name prefix selecting which CSV files to process.
        table_id: Destination BigQuery table identifier.

    Returns:
        None. Progress is reported with ``print``.
    """
    file_names = list_gcs_files(bucket_name, prefix=prefix)

    # pd.concat raises ValueError on an empty list, so stop early with a clear
    # message when the prefix matched no CSV files.
    if not file_names:
        print(f"No CSV files found in bucket '{bucket_name}' under prefix '{prefix}'; nothing to load.")
        return

    all_data = []
    for file_name in file_names:
        print(f"Processing file: {file_name}")
        df = load_csv_from_gcs(bucket_name, file_name)
        df['Cleaned_Answer'] = df['Answer'].apply(clean_answer)
        # clean_answer returns None for rejected answers; drop those rows.
        df = df.dropna(subset=['Cleaned_Answer']).reset_index(drop=True)
        # Replace the raw Answer column with the cleaned one.
        df = df.drop(columns=['Answer']).rename(columns={'Cleaned_Answer': 'Answer'})
        df['source'] = 'iqlinic'
        # Stamp rows with today's date, then convert the column to a datetime dtype.
        df['last_updated_date'] = datetime.now().date()
        df['last_updated_date'] = pd.to_datetime(df['last_updated_date'], errors='coerce')

        all_data.append(df)

    combined_df = pd.concat(all_data, ignore_index=True)
    load_to_bigquery(combined_df, table_id)
    print("All data loaded successfully.")

def load_to_bigquery(df, table_id):
    """Upload a DataFrame to a BigQuery table and wait for the load job.

    Args:
        df: The ``pandas.DataFrame`` to upload.
        table_id: Destination BigQuery table identifier.
    """
    bq_client = bigquery.Client()
    load_job = bq_client.load_table_from_dataframe(df, table_id)
    # Block until the load job has finished before reporting success.
    load_job.result()
    print("Data loaded successfully into BigQuery.")



In [None]:
# Set GCS bucket, prefix, and BigQuery table details
# Pipeline settings: source bucket, object prefix to scan, and destination table.
bucket_name = "raw_dataset_genai"
prefix = "iqlinic/"
table_id = "health-ai-agent-sjsu.transformed_data.all_merged"

# Run the pipeline: clean every CSV under the prefix and load the result into BigQuery.
process_and_load_data(
    bucket_name=bucket_name,
    prefix=prefix,
    table_id=table_id,
)

Processing file: iqlinic/icliniq_chunk_5_1.csv
Data loaded successfully into BigQuery.
All data loaded successfully.


In [None]:
# Running this code will query a table in BigQuery and download
# the results to a Pandas DataFrame named `results`.
# Learn more here: https://cloud.google.com/bigquery/docs/visualize-jupyter

%%bigquery results --project health-ai-agent-sjsu
SELECT * FROM `health-ai-agent-sjsu.transformed_data.all_merged` #this table name was set based on the table you chose to query

Query is running:   0%|          |

Downloading:   0%|          |

In [None]:
# You can view the resulting Pandas DataFrame and work with using the Pandas library.
# https://pandas.pydata.org/docs/getting_started/index.html#getting-started
results

Unnamed: 0,Title,Question,Answer,source,last_updated_date
0,Why are there burning and itching in vagina wi...,I am currently not taking any medication. I am...,I have seen the case summary. I think your fun...,iqlinic,2024-11-05 00:00:00+00:00
1,I have plaque deposits on my teeth. What to do?,I have plaque deposits on my teeth and gums. M...,"I have viewed your image , and it is calculus ...",iqlinic,2024-11-05 00:00:00+00:00
2,How can a medical aspirant overcome his fear o...,I am a medical aspirant and I have my medical ...,I think you should stop getting worried. Just ...,iqlinic,2024-11-05 00:00:00+00:00
3,What are the methods to treat flaccid penis?,Hello doctorI am recently married. I am unable...,"Yes, taking Tablet.Vigora 100 Sildenafil will ...",iqlinic,2024-11-05 00:00:00+00:00
4,"What are the home remedies for a sore throat, ...","For the entire day, my cough did not stop for ...","Nowadays, colds with sore throat and fever are...",iqlinic,2024-11-05 00:00:00+00:00
...,...,...,...,...,...
976,Thought of getting rid of masturbation makes m...,I think I am so much addicted to masturbation....,Doing masturbation three or four days a week i...,iqlinic,2024-11-05 00:00:00+00:00
977,"I have acne marks, dark spots and pigmentation...","I do have acne marks, dark spots and pigmentat...",I went through the history and photos you prov...,iqlinic,2024-11-05 00:00:00+00:00
978,What is the most effective treatment for strep...,I have shortness of breath (worsens on mild ex...,"Yes, Augmentin Amoxicillin and Clavulanic acid...",iqlinic,2024-11-05 00:00:00+00:00
979,Is it normal to get decreased taste sensation ...,"I am a 40 year old female, non smoker. I had a...",We will help you alleviate the problem. I unde...,iqlinic,2024-11-05 00:00:00+00:00


# Example 2: Query a table with BigQuery DataFrames

In [None]:


import bigframes.pandas as bf

# Session-wide BigQuery DataFrames settings: the BigQuery location and the
# project to use. Both values were pre-filled from the dataset chosen to query.
bf.options.bigquery.location = "US" #this variable is set based on the dataset you chose to query
bf.options.bigquery.project = "health-ai-agent-sjsu" #this variable is set based on the dataset you chose to query

In [None]:
# Read the table through BigQuery DataFrames (bigframes.pandas).
# NOTE(review): this reads `transformed_merged`, while the pipeline cell earlier in
# this notebook loads into `all_merged` — confirm this is the intended table.
df = bf.read_gbq("health-ai-agent-sjsu.transformed_data.transformed_merged") #this variable is set based on the dataset you chose to query



In [None]:
# BigFrames can work with tables that are too large to fit in the notebook memory.
# Look at the first 20 rows.
df.head(20)

Unnamed: 0,Title,Question,Answer
0,What can be the reason for pain while resting ...,My nephew had a tumor on the left side of his ...,I see it is indeed a difficult case. I suggest...
1,Does weight training cause nerve issues in the...,"For the last couple of years, I have had some ...",Please attach the nerve conduction study repor...
2,What could be the cause consistent bloating an...,I have been consistently experiencing symptoms...,The symptoms that you are experiencing are lik...
3,I got my periods 4 days earlier than my due da...,I had protected sex last month. Just to be sur...,I have gone through your thorough description ...
4,How to prevent stomach bloating in 25-year-old?,I am 25 years old and facing constant belly bl...,"Add regular probiotics to your diet, either yo..."
5,Should a patient with blood in his stools duri...,I am 40 years old and noticed blood during bow...,Mostly it will be an anal fissure and not canc...
6,How long does it take for someone with irregul...,"I am 22 years old, female weighing 187 pounds....",Let me start on a positive note that you are a...
7,"Is my hymen intact, and to what degree is it d...",Please check the attached images and tell me i...,I looked over the image . Your vagina and hyme...
8,Why am I facing bowel problem whenever I go to...,I am a 21 year old male. I came to my native r...,The problem you are experiencing seems to occu...
9,How to deal with blocked ears after cold?,I had a cold and a catarrh last week. After th...,I am here to answer all your questions. Sorry ...


In [None]:
# Disabled maintenance snippet: the whole cell is one triple-quoted string, so
# none of the statements inside it execute when the cell is run.
# WARNING: if re-enabled, it issues `DELETE ... WHERE TRUE`, which removes every
# row of `transformed_merged`.
'''from google.cloud import bigquery

# Set up BigQuery client
client = bigquery.Client(project="health-ai-agent-sjsu")

# Define table ID
table_id = "health-ai-agent-sjsu.transformed_data.transformed_merged"

# Create a query to delete all rows from the table
query = f"DELETE FROM `{table_id}` WHERE TRUE"

# Run the query
query_job = client.query(query)

# Wait for the job to complete
query_job.result()

print("All rows have been deleted from the table.")
'''

All rows have been deleted from the table.
