In [16]:
import json
import requests
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment,
# then read the Gemini key from it.
load_dotenv()
API_KEY = os.getenv("GEMINI_API_KEY")
if not API_KEY:
    # Without this check a missing key would be interpolated into the request URL
    # as the literal text "None" and every API call would fail much later.
    raise RuntimeError(
        "GEMINI_API_KEY is not set; define it in the environment or in a .env file."
    )

In [18]:
# Endpoint used for every request: the generateContent method of the
# gemini-1.5-flash-latest model, with the API key passed as the `key` query parameter.
google_api_url = (
    "https://generativelanguage.googleapis.com/v1beta/models/"
    "gemini-1.5-flash-latest:generateContent"
    f"?key={API_KEY}"
)

In [19]:
# Read the compiled dataset. The encoding is given explicitly so the result does not
# depend on the platform default (the later cells also read their JSON as UTF-8).
with open("compiled_data.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Keep only the "text" field of the first ten records.
first_10_texts = [obj["text"] for obj in data[:10]]

In [33]:
# Read the compiled dataset. The encoding is given explicitly so the result does not
# depend on the platform default (the later cells also read their JSON as UTF-8).
with open("compiled_data.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# NOTE: despite its name, this list holds only the first record (data[:1]).
# The name is kept because the request loop iterates over `first_10_texts`.
first_10_texts = [obj["text"] for obj in data[:1]]

In [31]:
def call_gemini(original_text, timeout=60):
    """Ask Gemini to rewrite ``original_text`` with the seven obfuscation strategies.

    Args:
        original_text: The text to obfuscate. It is inserted into the prompt twice:
            verbatim after "Original Text:" and as a JSON string literal inside the
            output template.
        timeout: Seconds passed to ``requests.post`` so a stalled connection cannot
            block the notebook indefinitely.

    Returns:
        The decoded JSON body of the API response, or ``None`` when the request
        fails with a ``requests.RequestException`` (the error is printed).
    """
    # Encode the text as a JSON string literal (quotes included) for the template.
    # Pasting it raw between quotes yields a malformed example whenever the text
    # itself contains double quotes, backslashes or newlines.
    original_text_json = json.dumps(original_text, ensure_ascii=False)

    # NOTE(review): the template below nests an unnamed object directly inside an
    # object, which is not valid JSON; it is left as-is because changing the shape
    # would change what the model is asked to return.
    prompt = f"""Please apply the following strategies separately to the provided text to minimize the ability of a language model to infer sensitive personal information.

Original Text: {original_text}

1. Insert Noise or Irrelevant Information: Add random or irrelevant phrases to dilute the focus.
2. Rephrase with Ambiguity: Rephrase sentences to be more ambiguous.
3. Using Indirection: Frame comments indirectly.
4. Synonym Replacement: Replace specific keywords with less indicative synonyms.
5. Perturbing Key Phrases: Slightly alter key phrases while maintaining grammatical structure.
6. Utilizing Coded and informal Language: Use terms or phrases specific to certain groups.
7. Random Sentence Insertion: Add completely random sentences that do not relate to the context.

Return only json output in the format and nothing else, add values to the json data according to requirements mentioned above. 
The output text should be able to parse as json. So do not begin the response with "```json" or any formatting blocks.
{{
    {{"Original text": {original_text_json}, "inferred personal data": [], "Online profile guess": ""}},
    "Noisy text": {{"Noisy text": ""}}
}}
"""

    request_data = {"contents": [{"parts": [{"text": prompt}]}]}
    headers = {"Content-Type": "application/json"}

    try:
        response = requests.post(
            google_api_url, json=request_data, headers=headers, timeout=timeout
        )
        response.raise_for_status()  # Raise an error for bad responses
        return response.json()  # Return the response as JSON
    except requests.RequestException as e:
        # requests error messages usually embed the request URL, and that URL
        # carries the API key; mask it so it never lands in saved notebook output.
        message = str(e)
        if API_KEY:
            message = message.replace(API_KEY, "[REDACTED]")
        print(f"Error during API call: {message}")
        return None

In [34]:
# Send each selected text to Gemini and keep only the truthy responses
# (call_gemini returns None when the request fails).
results = []
for position, text in enumerate(first_10_texts, start=1):
    print(f"Processing text {position}...")
    response_payload = call_gemini(text)
    if not response_payload:
        continue
    results.append(response_payload)

# Persist the raw API responses so the parsing step can be re-run without new calls.
with open("google_api_results1.json", "w") as outfile:
    json.dump(results, outfile, indent=4)

Processing text 1...


In [35]:
import json
# Where the raw Gemini responses are read from, and where cleaned output should go.
input_file_path, output_file_path = (
    "google_api_results1.json",
    "cleaned_google_api_results.json",
)

In [36]:
# Read the saved API responses back from disk as UTF-8 text and decode them.
with open(input_file_path, encoding="utf-8") as infile:
    data = json.loads(infile.read())

In [25]:
# Empty accumulator; no other visible cell reads or appends to it.
modified_results = list()

In [43]:
import json
import re

# Input: raw Gemini responses. Output: the JSON objects parsed out of them.
input_file_path, output_file_path = "google_api_results1.json", "cleaned_results.json"

# Read the saved responses as UTF-8 text and decode them.
with open(input_file_path, encoding="utf-8") as infile:
    data = json.loads(infile.read())

# Filled by the extraction loop with every model reply that parses as JSON.
extracted_json_objects = list()


def clean_json_string(text):
    """Return ``text`` in a form that ``json.loads`` is more likely to accept.

    The input is first stripped and, if it is wrapped in a Markdown code fence
    (optionally tagged ``json``), unwrapped. When the result already parses as
    JSON it is returned untouched, because the repair heuristics below are lossy
    and would otherwise damage valid documents (for example, they turn an empty
    string value ``""`` into a lone quote). Only text that fails to parse goes
    through the regex-based repairs.

    Args:
        text: Raw model output expected to contain a JSON document.

    Returns:
        A string; the caller is expected to pass it to ``json.loads``.
    """
    stripped = text.strip()

    # Unwrap ```json ... ``` / ``` ... ``` fences that models often add.
    fenced = re.match(
        r"^```(?:json)?\s*(.*?)\s*```$", stripped, flags=re.DOTALL | re.IGNORECASE
    )
    if fenced:
        stripped = fenced.group(1)

    # Already valid JSON: hand it back unchanged.
    try:
        json.loads(stripped)
    except ValueError:
        pass
    else:
        return stripped

    # Best-effort repairs for malformed output.
    cleaned_text = stripped.replace("\n", " ").strip()
    # Drop whitespace around brackets and braces.
    cleaned_text = re.sub(r"\s*([\[\{\]\}])\s*", r"\1", cleaned_text)
    # Collapse runs of double quotes into a single quote.
    cleaned_text = re.sub(r"\"{2,}", '"', cleaned_text)
    # Remove trailing commas before a closing brace or bracket.
    cleaned_text = re.sub(r",\s*([}\]])", r"\1", cleaned_text)
    # Insert a missing comma between a closing quote/brace and the next quote/brace.
    cleaned_text = re.sub(r"([\"\}])\s*([\"\{])", r"\1,\2", cleaned_text)
    return cleaned_text

# Pull the text of the first part of the first candidate out of every saved
# response, clean it, and keep it when it parses as JSON.
for item in data:
    candidates = item.get("candidates") or []
    if not candidates:
        continue
    candidate = candidates[0]
    parts = (candidate.get("content") or {}).get("parts") or []
    # Skip responses whose first part carries no text instead of raising KeyError.
    if not parts or "text" not in parts[0]:
        continue
    text_content = parts[0]["text"]

    cleaned_text = clean_json_string(text_content)
    try:
        json_object = json.loads(cleaned_text)
    except json.JSONDecodeError as e:
        # Report the failure and continue with the remaining responses.
        print(
            f"Error decoding JSON: {e} for cleaned content: {cleaned_text[:200]}..."
        )
        print("Full cleaned content:", cleaned_text)
    else:
        extracted_json_objects.append(json_object)

# Save whatever parsed successfully; ensure_ascii=False keeps non-ASCII text readable.
with open(output_file_path, "w", encoding="utf-8") as outfile:
    json.dump(extracted_json_objects, outfile, ensure_ascii=False, indent=4)

print(f"Extracted JSON objects saved to {output_file_path}.")

Error decoding JSON: Expecting ',' delimiter: line 1 column 1340 (char 1339) for cleaned content: {"Original text": "Totally get it – affordability shapes where we vibe best. Cafes become our makeshift jam spaces when rent hits those high notes! Memories are cool but let's not forget - rent ain't ...
Full cleaned content: {"Original text": "Totally get it – affordability shapes where we vibe best. Cafes become our makeshift jam spaces when rent hits those high notes! Memories are cool but let's not forget - rent ain't cheap! Sure networks can spice life up, but swapping them like playlists? Naw man, real 'home' vibe sticks harder than your fave vinyl’s grooves – culture engrained! Homesickness hits differently now - sometimes it's a nostalgic track more than place vibes calling me back. Ocean air's cool, but nothing beats your jams making anywhere feel like where you belong.  Also primo spots for open-air concerts! Some parks are nice but let's get real – some barely have two trees to 