<a href="https://colab.research.google.com/github/anishasingh23/wikipediaArticleSummary/blob/main/wikipedia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Wikipedia**

In [1]:
!pip install streamlit pyngrok wikipedia-api transformers plotly requests pillow numpy scikit-learn

Collecting streamlit
  Downloading streamlit-1.44.1-py3-none-any.whl.metadata (8.9 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Collecting wikipedia-api
  Downloading wikipedia_api-0.8.1.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.44.1-py3-none-any.whl (9.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [2]:
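# NOTE: superseded first draft of the app, kept commented out for reference only;
# the active version is written to wiki_app_advanced.py by the %%writefile cells below.
# %%writefile wiki_app_advanced.py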
# import streamlit as st
# import wikipediaapi
# import plotly.express as px
# from transformers import BertTokenizer, BertModel
# import numpy as np
# from sklearn.metrics.pairwise import cosine_similarity
# from PIL import Image
# import requests
# from io import BytesIO

# # --- Initialize Models with Caching ---
# @st.cache_resource
# def load_models():
#     tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#     model = BertModel.from_pretrained('bert-base-uncased')
#     return tokenizer, model

# tokenizer, model = load_models()

# def get_bert_embedding(text):
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
#     outputs = model(**inputs)
#     return outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()

# # --- Enhanced UI ---
# st.set_page_config(layout="wide")
# st.title("🌐 Wikipedia Semantic Explorer")

# # --- Search & Display Logic ---
# wiki = wikipediaapi.Wikipedia(
#     language="en",
#     user_agent="MyApp/1.0"
# )

# query = st.text_input("**Search Wikipedia**", "Artificial Intelligence")

# if query:
#     with st.spinner("Fetching and analyzing..."):
#         page = wiki.page(query)
#         if page.exists():
#             col1, col2 = st.columns([2, 1])

#             with col1:
#                 st.subheader(f"📖 {page.title}")
#                 st.markdown(page.summary[:1000])

#                 # Related articles
#                 try:
#                     links = list(page.links.keys())[:5]
#                     embeddings = np.array([get_bert_embedding(link) for link in [query] + links])
#                     similarities = cosine_similarity(embeddings[0:1], embeddings[1:])[0]

#                     st.subheader("🔗 Top Related Articles")
#                     for link, score in zip(links, similarities):
#                         st.write(f"- {link} (Relevance: {score:.2f})")
#                 except Exception as e:
#                     st.error(f"Similarity calculation failed: {str(e)}")

#             with col2:
#                 # Fixed image handling
#                 try:
#                     # Get image URL from Wikipedia API differently
#                     image_url = f"https://en.wikipedia.org/w/api.php?action=query&titles={query}&prop=pageimages&format=json&pithumbsize=300"
#                     response = requests.get(image_url).json()
#                     pages = response.get('query', {}).get('pages', {})
#                     thumbnail = next(iter(pages.values())).get('thumbnail', {}).get('source')

#                     if thumbnail:
#                         img_response = requests.get(thumbnail)
#                         img = Image.open(BytesIO(img_response.content))
#                         st.image(img, caption=page.title, use_column_width=True)
#                     else:
#                         st.info("No image available for this page")
#                 except Exception as e:
#                     st.warning(f"Couldn't load image: {str(e)}")

#                 # Similarity graph
#                 if 'similarities' in locals():
#                     fig = px.bar(x=links, y=similarities,
#                                 title="Semantic Similarity Scores",
#                                 labels={'x': 'Article', 'y': 'Relevance'})
#                     st.plotly_chart(fig)

#         else:
#             st.error("Page not found. Try another search term.")

In [3]:
%%writefile wiki_app_advanced.py
import streamlit as st

# MUST be the first Streamlit command
# Page-level settings, collected in one mapping and applied in a single call.
_PAGE_SETTINGS = {
    "page_title": "🌐 Wikipedia Semantic Explorer+",
    "page_icon": "🔍",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)

# Now import other libraries
import wikipediaapi
import plotly.express as px
from transformers import BertTokenizer, BertModel, pipeline
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from PIL import Image
import requests
from io import BytesIO
import time

# --- Custom CSS ---
# Stylesheet for the "article-card" <div>s rendered in the related-articles list.
_ARTICLE_CARD_CSS = """
<style>
.article-card {
    border: 1px solid #e0e0e0;
    border-radius: 10px;
    padding: 15px;
    margin: 10px 0;
    transition: box-shadow 0.3s;
}
.article-card:hover {
    box-shadow: 0 4px 8px rgba(0,0,0,0.1);
}
</style>
"""
# unsafe_allow_html is required for the raw <style> tag to be passed through.
st.markdown(_ARTICLE_CARD_CSS, unsafe_allow_html=True)

# --- Initialize Models with Caching ---
@st.cache_resource
def load_models():
    """Load the BERT tokenizer, the BERT encoder and a BART summarization pipeline.

    Wrapped in ``st.cache_resource`` so the loaded objects are cached by
    Streamlit rather than rebuilt each time the script body executes.

    Returns:
        tuple: ``(tokenizer, model, summarizer)`` in that order.
    """
    bert_checkpoint = 'bert-base-uncased'
    bert_tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
    bert_encoder = BertModel.from_pretrained(bert_checkpoint)
    bart_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    return bert_tokenizer, bert_encoder, bart_summarizer


# NOTE(review): `summarizer` is not referenced anywhere else in this script;
# confirm it is still needed before paying the cost of loading it.
tokenizer, model, summarizer = load_models()

def get_bert_embedding(text):
    """Embed `text` by mean-pooling BERT's last hidden layer over the token axis.

    Args:
        text: String to embed. Input longer than 512 tokens is truncated by
            the tokenizer.

    Returns:
        numpy.ndarray: The pooled embedding with singleton dimensions squeezed
        out (a 1-D vector when a single string is passed).
    """
    # Local import: torch is not imported at module level in this script.
    import torch

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: no_grad() avoids building an autograd graph for every
    # embedding, which the original did and then discarded via .detach().
    with torch.no_grad():
        outputs = model(**inputs)
        pooled = outputs.last_hidden_state.mean(dim=1).squeeze()
    return pooled.detach().numpy()

# --- Main App ---
st.title("🌐 Wikipedia Semantic Explorer+")

# --- Search & Display Logic ---
# Client used for every Wikipedia page lookup below.
wiki = wikipediaapi.Wikipedia(user_agent="WikiExplorer/1.0", language="en")

# Search box, pre-filled with a default article title.
query = st.text_input(
    label="**Search Wikipedia**",
    value="Artificial Intelligence",
    help="Try 'Machine Learning' or 'Quantum Computing'",
)

# Local stdlib imports for the rendering code below (HTML / URL escaping).
from html import escape
from urllib.parse import quote

_WIKI_API_URL = "https://en.wikipedia.org/w/api.php"
# NOTE(review): Wikimedia's User-Agent policy asks clients to identify
# themselves; this reuses the identifier given to the wikipediaapi client.
_HTTP_HEADERS = {"User-Agent": "WikiExplorer/1.0"}


def _article_url(title):
    """Return the en.wikipedia.org URL for `title`, percent-encoding unsafe characters."""
    return "https://en.wikipedia.org/wiki/" + quote(title.replace(' ', '_'))


def _short_label(title, limit=20):
    """Truncate `title` to `limit` characters, adding "..." only when text was cut."""
    return title if len(title) <= limit else title[:limit] + "..."


if query:
    with st.spinner("🔍 Fetching and analyzing content..."):
        start_time = time.time()
        page = wiki.page(query)

        if page.exists():
            col1, col2 = st.columns([2, 1])

            # Defined up front so the chart and caption never depend on whether
            # the similarity step succeeded (replaces the `in locals()` probe).
            links = list(page.links.keys())[:8]
            ranked = []  # (title, score) pairs, most similar first

            with col1:
                # Article Header with Link
                st.subheader(f"📖 [{page.title}]({_article_url(page.title)})")

                # Summary with Read More toggle
                with st.expander("Show Summary", expanded=True):
                    summary = page.summary[:1500]
                    st.markdown(summary)

                # Related Articles Section
                st.subheader("🔗 Top Related Articles")

                try:
                    if links:
                        embeddings = np.array([get_bert_embedding(link) for link in [query] + links])
                        similarities = cosine_similarity(embeddings[0:1], embeddings[1:])[0]
                        ranked = sorted(zip(links, similarities), key=lambda pair: -pair[1])[:5]
                    else:
                        # Previously an empty link list surfaced as a similarity error.
                        st.info("No linked articles found for this page.")

                    for link, score in ranked:
                        related_page = wiki.page(link)
                        if related_page.exists():
                            # Titles and summaries are remote text: escape them before
                            # they are rendered with unsafe_allow_html.
                            card_href = escape(_article_url(link))
                            card_title = escape(link)
                            # Collapse whitespace so the snippet stays on one line.
                            card_snippet = escape(" ".join(related_page.summary[:200].split()))
                            # Built without leading indentation so Markdown cannot
                            # mistake the markup for an indented code block.
                            card_html = (
                                '<div class="article-card">'
                                f'<h4><a href="{card_href}" target="_blank">{card_title}</a> (Relevance: {score:.2f})</h4>'
                                f'<p>{card_snippet}...</p>'
                                '</div>'
                            )
                            with st.container():
                                st.markdown(card_html, unsafe_allow_html=True)

                except Exception as e:
                    st.error(f"Similarity calculation failed: {str(e)}")

            with col2:
                # Image and Quick Facts
                try:
                    # `params=` lets requests URL-encode the title; interpolating the raw
                    # query into the URL broke on characters such as '&' or '#'.
                    response = requests.get(
                        _WIKI_API_URL,
                        params={
                            "action": "query",
                            "titles": query,
                            "prop": "pageimages",
                            "format": "json",
                            "pithumbsize": 400,
                        },
                        headers=_HTTP_HEADERS,
                        timeout=5,
                    )
                    response.raise_for_status()
                    api_pages = response.json().get('query', {}).get('pages', {})
                    first_entry = next(iter(api_pages.values()), {})
                    thumbnail = first_entry.get('thumbnail', {}).get('source')

                    if thumbnail:
                        img_response = requests.get(thumbnail, headers=_HTTP_HEADERS, timeout=5)
                        img_response.raise_for_status()
                        img = Image.open(BytesIO(img_response.content))
                        # use_column_width is deprecated; use_container_width matches
                        # the plotly_chart call below.
                        st.image(img, caption=page.title, use_container_width=True)
                    else:
                        st.info("🎨 No preview image available")
                except Exception:
                    # Best-effort: a missing image must not break the page, but do not
                    # swallow KeyboardInterrupt/SystemExit as the bare `except:` did.
                    st.warning("⚠️ Couldn't load image")

                with st.expander("⚡ Quick Facts"):
                    # page.sections is a list (not a dict), so index it directly.
                    sections = page.sections
                    facts = sections[0].text[:500] if sections else ""
                    st.write(facts or "No quick facts available")

                if ranked:
                    # Plot the same top-ranked articles that are listed in the left
                    # column (the original plotted the first five links instead).
                    top_titles = [title for title, _ in ranked]
                    top_scores = [float(score) for _, score in ranked]
                    fig = px.bar(
                        x=[_short_label(title) for title in top_titles],
                        y=top_scores,
                        title="📊 Semantic Similarity",
                        labels={'x': 'Article', 'y': 'Relevance Score'},
                        color=top_scores,
                        color_continuous_scale='Teal'
                    )
                    st.plotly_chart(fig, use_container_width=True)

            st.caption(f"⏱️ Generated in {time.time() - start_time:.2f} seconds | 📝 {len(page.summary)} characters | 🔗 {len(links)} related articles")

        else:
            st.error("❌ Page not found. Try another search term.")

# Footer
st.markdown("---")
st.markdown("Built with ♥ using Wikipedia API, BERT, and Streamlit")

Writing wiki_app_advanced.py


In [8]:
%%writefile wiki_app_advanced.py
import streamlit as st

# MUST be the first Streamlit command
# Page-level settings, collected in one mapping and applied in a single call.
_PAGE_SETTINGS = {
    "page_title": "🌐 Wikipedia Semantic Explorer+",
    "page_icon": "🔍",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)

# Now import other libraries
import wikipediaapi
import plotly.express as px
from transformers import BertTokenizer, BertModel, pipeline
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from PIL import Image
import requests
from io import BytesIO
import time

# --- Custom CSS ---
# Stylesheet for the "article-card" <div>s rendered in the related-articles list.
_ARTICLE_CARD_CSS = """
<style>
.article-card {
    border: 1px solid #e0e0e0;
    border-radius: 10px;
    padding: 15px;
    margin: 10px 0;
    transition: box-shadow 0.3s;
}
.article-card:hover {
    box-shadow: 0 4px 8px rgba(0,0,0,0.1);
}
</style>
"""
# unsafe_allow_html is required for the raw <style> tag to be passed through.
st.markdown(_ARTICLE_CARD_CSS, unsafe_allow_html=True)

# --- Initialize Models with Caching ---
@st.cache_resource
def load_models():
    """Load the BERT tokenizer, the BERT encoder and a BART summarization pipeline.

    Wrapped in ``st.cache_resource`` so the loaded objects are cached by
    Streamlit rather than rebuilt each time the script body executes.

    Returns:
        tuple: ``(tokenizer, model, summarizer)`` in that order.
    """
    bert_checkpoint = 'bert-base-uncased'
    bert_tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
    bert_encoder = BertModel.from_pretrained(bert_checkpoint)
    bart_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    return bert_tokenizer, bert_encoder, bart_summarizer


# NOTE(review): `summarizer` is not referenced anywhere else in this script;
# confirm it is still needed before paying the cost of loading it.
tokenizer, model, summarizer = load_models()

def get_bert_embedding(text):
    """Embed `text` by mean-pooling BERT's last hidden layer over the token axis.

    Args:
        text: String to embed. Input longer than 512 tokens is truncated by
            the tokenizer.

    Returns:
        numpy.ndarray: The pooled embedding with singleton dimensions squeezed
        out (a 1-D vector when a single string is passed).
    """
    # Local import: torch is not imported at module level in this script.
    import torch

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: no_grad() avoids building an autograd graph for every
    # embedding, which the original did and then discarded via .detach().
    with torch.no_grad():
        outputs = model(**inputs)
        pooled = outputs.last_hidden_state.mean(dim=1).squeeze()
    return pooled.detach().numpy()

# --- Main App ---
st.title("🌐 Wikipedia Semantic Explorer+")

# --- Search & Display Logic ---
# Client used for every Wikipedia page lookup below.
wiki = wikipediaapi.Wikipedia(
    user_agent="WikiExplorer/1.0 (contact@example.com)",
    language="en",
)

# Search box, pre-filled with a default article title.
query = st.text_input(
    label="**Search Wikipedia**",
    value="Artificial Intelligence",
    help="Try 'Machine Learning' or 'Quantum Computing'",
)

# Local stdlib imports for the rendering code below (HTML / URL escaping).
from html import escape
from urllib.parse import quote

_WIKI_API_URL = "https://en.wikipedia.org/w/api.php"
# NOTE(review): Wikimedia's User-Agent policy asks clients to identify
# themselves; this reuses the identifier given to the wikipediaapi client.
_HTTP_HEADERS = {"User-Agent": "WikiExplorer/1.0 (contact@example.com)"}


def _article_url(title):
    """Return the en.wikipedia.org URL for `title`, percent-encoding unsafe characters."""
    return "https://en.wikipedia.org/wiki/" + quote(title.replace(' ', '_'))


def _short_label(title, limit=20):
    """Truncate `title` to `limit` characters, adding "..." only when text was cut."""
    return title if len(title) <= limit else title[:limit] + "..."


if query:
    with st.spinner("🔍 Fetching and analyzing content..."):
        start_time = time.time()
        page = wiki.page(query)

        if page.exists():
            col1, col2 = st.columns([2, 1])

            # Defined up front so the chart and caption never depend on whether
            # the similarity step succeeded (replaces the `in locals()` probe).
            links = list(page.links.keys())[:8]
            ranked = []  # (title, score) pairs, most similar first

            with col1:
                # Article Header with Link
                st.subheader(f"📖 [{page.title}]({_article_url(page.title)})")

                # Summary with Read More toggle
                with st.expander("Show Summary", expanded=True):
                    summary = page.summary[:1500]
                    st.markdown(summary)

                # Related Articles Section
                st.subheader("🔗 Top Related Articles")

                try:
                    if links:
                        embeddings = np.array([get_bert_embedding(link) for link in [query] + links])
                        similarities = cosine_similarity(embeddings[0:1], embeddings[1:])[0]
                        ranked = sorted(zip(links, similarities), key=lambda pair: -pair[1])[:5]
                    else:
                        # Previously an empty link list surfaced as a similarity error.
                        st.info("No linked articles found for this page.")

                    for link, score in ranked:
                        related_page = wiki.page(link)
                        if related_page.exists():
                            # Titles and summaries are remote text: escape them before
                            # they are rendered with unsafe_allow_html.
                            card_href = escape(_article_url(link))
                            card_title = escape(link)
                            # Collapse whitespace so the snippet stays on one line.
                            card_snippet = escape(" ".join(related_page.summary[:200].split()))
                            # Built without leading indentation so Markdown cannot
                            # mistake the markup for an indented code block.
                            card_html = (
                                '<div class="article-card">'
                                f'<h4><a href="{card_href}" target="_blank">{card_title}</a> (Relevance: {score:.2f})</h4>'
                                f'<p>{card_snippet}...</p>'
                                '</div>'
                            )
                            with st.container():
                                st.markdown(card_html, unsafe_allow_html=True)

                except Exception as e:
                    st.error(f"Similarity calculation failed: {str(e)}")

            with col2:
                # Image and Quick Facts
                try:
                    # `params=` lets requests URL-encode the title; interpolating the raw
                    # query into the URL broke on characters such as '&' or '#'.
                    response = requests.get(
                        _WIKI_API_URL,
                        params={
                            "action": "query",
                            "titles": query,
                            "prop": "pageimages",
                            "format": "json",
                            "pithumbsize": 400,
                        },
                        headers=_HTTP_HEADERS,
                        timeout=5,
                    )
                    response.raise_for_status()
                    api_pages = response.json().get('query', {}).get('pages', {})
                    first_entry = next(iter(api_pages.values()), {})
                    thumbnail = first_entry.get('thumbnail', {}).get('source')

                    if thumbnail:
                        img_response = requests.get(thumbnail, headers=_HTTP_HEADERS, timeout=5)
                        img_response.raise_for_status()
                        img = Image.open(BytesIO(img_response.content))
                        # use_column_width is deprecated; use_container_width matches
                        # the plotly_chart call below.
                        st.image(img, caption=page.title, use_container_width=True)
                    else:
                        st.info("🎨 No preview image available")
                except Exception:
                    # Best-effort: a missing image must not break the page, but do not
                    # swallow KeyboardInterrupt/SystemExit as the bare `except:` did.
                    st.warning("⚠️ Couldn't load image")

                with st.expander("⚡ Quick Facts"):
                    # page.sections is a list. The original nested the fallback message
                    # inside `if page.sections:`, so it could never be shown and a page
                    # without sections rendered an empty expander.
                    sections = page.sections
                    facts = sections[0].text[:500] if sections else ""
                    st.write(facts or "No quick facts available")

                if ranked:
                    # Plot the same top-ranked articles that are listed in the left
                    # column (the original plotted the first five links instead).
                    top_titles = [title for title, _ in ranked]
                    top_scores = [float(score) for _, score in ranked]
                    fig = px.bar(
                        x=[_short_label(title) for title in top_titles],
                        y=top_scores,
                        title="📊 Semantic Similarity",
                        labels={'x': 'Article', 'y': 'Relevance Score'},
                        color=top_scores,
                        color_continuous_scale='Teal'
                    )
                    st.plotly_chart(fig, use_container_width=True)

            st.caption(f"⏱️ Generated in {time.time() - start_time:.2f} seconds | 📝 {len(page.summary)} characters | 🔗 {len(links)} related articles")

        else:
            st.error("❌ Page not found. Try another search term.")

# Footer
st.markdown("---")
st.markdown("Built with ♥ using Wikipedia API, BERT, and Streamlit")

Overwriting wiki_app_advanced.py


In [4]:
!pip install pyngrok
from pyngrok import ngrok




In [9]:
# 1. Cleanup
from pyngrok import ngrok
import os
ngrok.kill()
os.system('pkill ngrok')  # Force kill any lingering processes

# 2. Restart
ngrok.set_auth_token("2vbuJtKmjEPjgnqYitA5hA8QKmR_776hVR3ruaifkgxMapJDg")
public_url = ngrok.connect(addr='8501')
print(f"🚀 Fresh tunnel: {public_url}")
!streamlit run wiki_app_advanced.py --server.port 8501 &>/dev/null &

🚀 Fresh tunnel: NgrokTunnel: "https://b7af-104-196-194-229.ngrok-free.app" -> "http://localhost:8501"


In [6]:
!pip install streamlit wikipedia-api transformers plotly requests pillow numpy scikit-learn
!streamlit run wiki_app_advanced.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8502[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8502[0m
[34m  External URL: [0m[1mhttp://104.196.194.229:8502[0m
[0m
[34m  Stopping...[0m
[34m  Stopping...[0m
Exception ignored in atexit callback: <function shutdown at 0x7be946dfd440>
Traceback (most recent call last):
  File "/usr/lib/python3.11/logging/__init__.py", line 2185, in shutdown
    h.flush()
  File "/usr/lib/python3.11/logging/__init__.py", line 1093, in flush
    if self.stream and hasattr(self.stream, "flush"):
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/streamlit/web/bootstrap.py", line 44, in signal_handler
    server.stop()
  File "/usr/local/lib/python3.11/dist-packages/streamlit/web/server/server.py", line 470, in stop
    self._runtime