In [48]:
# Import required libraries
import os
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from readability import Document
from serpapi import GoogleSearch


# SerpAPI setup
# The key is read from the environment so the secret is never stored in (or
# committed with) the notebook.
# NOTE(review): a literal key was previously embedded on this line; treat it as
# leaked and revoke it.
api_key = os.environ.get("SERPAPI_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the SERPAPI_API_KEY environment variable before running this cell."
    )
article_citation_id = "1NtVbf1efHoJ"  # Replace with your citation ID

# List to store URLs
urls = []
# Paginate through results, stopping once MAX_RESULTS URLs have been collected
start = 0  # Pagination index
MAX_RESULTS = 50
PAGE_SIZE = 10  # Offset step between consecutive result pages

while len(urls) < MAX_RESULTS:
    params = {
        "engine": "google_scholar",
        "cites": article_citation_id,
        "api_key": api_key,
        "start": start
    }

    results = GoogleSearch(params).get_dict()
    organic_results = results.get("organic_results", [])

    if not organic_results:
        # Surface an API-side message (bad key, quota, end of results) instead
        # of silently ending up with a short or empty URL list.
        if results.get("error"):
            print(f"SerpAPI stopped pagination: {results['error']}")
        break  # No more results, exit the loop

    for result in organic_results:
        link = result.get("link")
        if not link:
            continue  # Entries without a "link" field have nothing to fetch
        urls.append(link)
        if len(urls) >= MAX_RESULTS:
            break  # Stop once we reach the desired number of articles

    start += PAGE_SIZE  # Move to the next page

# Save the URLs to a file, one per line
with open("urls.txt", "w", encoding="utf-8") as file:
    file.writelines(url + "\n" for url in urls)

print(f"Retrieved {len(urls)} URLs (limited to {MAX_RESULTS}).")



Retrieved 50 URLs (limited to 50).


In [49]:
def clean_html_content(html_content):
    """Return the text content of an HTML string with all markup removed.

    The input is parsed with BeautifulSoup's "html.parser" backend and the
    result of ``get_text(separator=' ', strip=True)`` is returned.
    """
    parsed = BeautifulSoup(html_content, "html.parser")
    return parsed.get_text(separator=' ', strip=True)

In [50]:
# Function to fetch and parse article content

# Sent with every request. Without it, requests announces itself with its
# default "python-requests/x.y" User-Agent.
# NOTE(review): several publishers answered 403 to the default client; an
# explicit User-Agent may not clear every such block - confirm per site.
REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; citation-survey-notebook/1.0)",
    "Accept": "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8",
}


def fetch_article_content(url, timeout=10):
    """Download a page and extract its main readable text and title.

    Args:
        url: Address of the article page to download.
        timeout: Seconds passed to ``requests.get`` as its timeout
            (default 10, the value that was previously hardcoded).

    Returns:
        ``(text, title)`` where ``text`` is the plain text of the readability
        summary and ``title`` is the readability title, or ``(None, None)``
        when anything goes wrong (the failure is printed, never raised).
    """
    try:
        response = requests.get(url, headers=REQUEST_HEADERS, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes
        doc = Document(response.text)
        cleaned_text = clean_html_content(doc.summary())
        return cleaned_text, doc.title()
    except Exception as e:
        # Deliberately best-effort: one bad URL must not abort the whole crawl.
        print(f"Failed to process {url}: {e}")
        return None, None

# Loop through URLs and fetch content
# An article is kept only if it has at least MIN_WORDS words and mentions at
# least one keyword (case-insensitive substring match).
keywords = ["PhysiCell", "cell simulation", "agent-based model"]
MIN_WORDS = 50
# Lower-cased once here instead of once per article per keyword.
keywords_lower = [keyword.lower() for keyword in keywords]

articles = []

for url in urls:
    content, title = fetch_article_content(url)
    if not content:
        continue  # Fetch failed or the page yielded no text
    if len(content.split()) < MIN_WORDS:
        continue  # Too short to be a real article body
    content_lower = content.lower()
    if not any(keyword in content_lower for keyword in keywords_lower):
        continue  # Off-topic for this survey
    articles.append({"url": url, "title": title, "content": content})
    print(f"Fetched: {title}")

print(f"Successfully fetched content for {len(articles)} articles.")


Failed to process https://www.cell.com/cell-systems/fulltext/S2405-4712(19)30234-0: 403 Client Error: Forbidden for url: https://www.cell.com/cell-systems/fulltext/S2405-4712(19)30234-0
Failed to process https://ascopubs.org/doi/abs/10.1200/CCI.18.00069: 403 Client Error: Forbidden for url: https://ascopubs.org/doi/abs/10.1200/CCI.18.00069
Failed to process https://www.cell.com/trends/cell-biology/fulltext/S0962-8924(22)00240-9: 403 Client Error: Forbidden for url: https://www.cell.com/trends/cell-biology/fulltext/S0962-8924(22)00240-9
Failed to process https://www.tandfonline.com/doi/abs/10.1080/19336918.2022.2055520: 403 Client Error: Forbidden for url: https://www.tandfonline.com/doi/abs/10.1080/19336918.2022.2055520
Fetched: Multiscale Agent-Based and Hybrid Modeling of the Tumor Immune Microenvironment
Failed to process https://www.cell.com/trends/cancer/fulltext/S2405-8033(20)30017-0: 403 Client Error: Forbidden for url: https://www.cell.com/trends/cancer/fulltext/S2405-8033(20)3

In [37]:
# Spot-check one fetched article (index 1 is the second entry). Guarded so the
# cell does not raise IndexError when fewer than two articles passed the filter.
if len(articles) > 1:
    print(articles[1])
else:
    print(f"Only {len(articles)} article(s) available; nothing at index 1.")

{'url': 'https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1007961', 'title': 'Mathematical modelling reveals cellular dynamics within tumour spheroids | PLOS Computational Biology', 'content': 'Abstract Tumour spheroids are widely used as an in vitro assay for characterising the dynamics and response to treatment of different cancer cell lines. Their popularity is largely due to the reproducible manner in which spheroids grow: the diffusion of nutrients and oxygen from the surrounding culture medium, and their consumption by tumour cells, causes proliferation to be localised at the spheroid boundary. As the spheroid grows, cells at the spheroid centre may become hypoxic and die, forming a necrotic core. The pressure created by the localisation of tumour cell proliferation and death generates an cellular flow of tumour cells from the spheroid rim towards its core. Experiments by Dorie et al . showed that this flow causes inert microspheres to infiltrate into tumour

In [51]:
import google.generativeai as genai

# Set your Google API key
# The key is read from the environment so the secret is never stored in (or
# committed with) the notebook.
# NOTE(review): a literal key was previously embedded on this line; treat it as
# leaked and revoke it.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this cell."
    )
genai.configure(api_key=GOOGLE_API_KEY)

def analyze_article_with_gemini(content, max_retries=3, backoff_seconds=2.0):
    """
    Ask Google's Gemini model to summarize what an article concludes about PhysiCell.

    Args:
        content: Plain-text body of the article. Blank or non-string input is
            rejected up front without calling the API.
        max_retries: Total number of attempts made before giving up (values
            below 1 are treated as 1).
        backoff_seconds: Delay before the second attempt; the delay doubles
            after every further failure.

    Returns:
        The generated summary text, or None when the input is blank or every
        attempt failed. Failures are printed, never raised.
    """
    if not isinstance(content, str) or not content.strip():
        return None

    # Create a prompt for the article summary
    prompt = (
        "Concisely summarize the key conclusions about the use of PhysiCell "
        "in the following academic article:\n\n" + content
    )

    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            model = genai.GenerativeModel("gemini-pro")
            response = model.generate_content(prompt)
            return response.text  # Extract the generated text from the response
        except Exception as e:
            # Every failure is retried, since transient server/quota errors are
            # not distinguished from permanent ones here.
            print("Error:", e)
            if attempt < attempts:
                time.sleep(backoff_seconds * 2 ** (attempt - 1))

    return None

# Example Usage

# Attach a Gemini-generated summary (or None on failure) to each article dict
for article in articles:
    article["summary"] = analyze_article_with_gemini(article["content"])

# Tabulate the articles and persist them as CSV (row index omitted)
df = pd.DataFrame(articles)
df.to_csv("articles_summary.csv", index=False)

print(f"Retrieved {len(df)} articles and saved to 'articles_summary.csv'.")


Error: 500 An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
Error: 429 Resource has been exhausted (e.g. check quota).
Retrieved 19 articles and saved to 'articles_summary.csv'.


In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# .get() tolerates article dicts that never received a "summary" key
summaries = [article.get("summary") for article in articles]
# 1. Filter out None and empty strings
cleaned_summaries = [s for s in summaries if isinstance(s, str) and s.strip()]

if cleaned_summaries:
    # 2. Initialize and fit TfidfVectorizer
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(cleaned_summaries)

    # 3. Compute relevance scores (sum of TF-IDF values per summary)
    relevance_scores = tfidf_matrix.sum(axis=1)  # One row total per summary
else:
    # Fitting on an empty corpus raises; fall through with an empty table instead.
    print("No non-empty summaries available; the relevance table will be empty.")
    relevance_scores = np.empty((0, 1))

# Flatten to a plain 1-D float array regardless of the container type that
# sum(axis=1) returned, so no per-row [0, 0] indexing is needed.
relevance_values = np.asarray(relevance_scores, dtype=float).ravel()

# 4. Create a DataFrame with relevance scores
relevance_df = pd.DataFrame({
    "Summary": cleaned_summaries,
    "Relevance Score": relevance_values
})

# 5. Display the relevance scores
print("Relevance Scores for Summaries:")
print(relevance_df)

# Save to a CSV file (optional)
relevance_df.to_csv("relevance_scores.csv", index=False)

Relevance Scores for Summaries:
                                              Summary  Relevance Score
0   **Key Conclusions about the Use of PhysiCell i...         8.200998
1   **Key Conclusions about PhysiCell in Spheroid ...         8.701063
2   **Key Conclusions about PhysiCell**\n\n* **Inc...         8.290720
3   **Key Conclusions about the Use of PhysiCell:*...         8.334801
4   **Key Conclusions Regarding Use of PhysiCell**...         5.980114
5   **Key Conclusions about the Use of PhysiCell i...         7.645453
6   **Key Conclusions about the Use of PhysiCell:*...         8.496177
7   **Key Conclusions of PhysiCell in the Academic...         8.971893
8   **Key Conclusions about the Use of PhysiCell i...         7.090108
9   **Key Conclusions on PhysiCell Use in the Stud...         8.667714
10  PhysiCell is an open-source agent-based cell s...         8.443123
11  Conclusions about the use of PhysiCell in a pa...         4.210139
12  **Key Conclusions**\n\nPhysiCell, a multi