# In [None]:
import pandas as pd
import json
import streamlit as st
from transformers import pipeline
@st.cache_data
def load_data():
    """Load the arXiv metadata snapshot and keep only computer-science papers.

    The snapshot is read line by line and each non-blank line is parsed as one
    JSON record. Lines that fail to parse are reported with ``st.error`` and
    skipped. Progress is shown in a Streamlit progress bar.

    Returns:
        A ``pandas.DataFrame`` (index reset) holding the rows whose
        ``categories`` value lists at least one ``cs.*`` category. An empty
        ``DataFrame`` is returned when the file is missing or any other error
        occurs; the error itself is shown with ``st.error``.
    """
    data_path = '/Users/cansarma/Desktop/arxiv-metadata-oai-snapshot.json'

    st.write("Loading data in chunks...")

    papers = []

    try:
        # Count lines first so the progress bar can show a fraction. This is
        # done inside the ``try`` (so a missing file is reported instead of
        # crashing) and inside ``with`` (so the handle is closed).
        with open(data_path, 'r', encoding='utf-8') as file:
            total_lines = sum(1 for _ in file)
        progress_bar = st.progress(0)

        with open(data_path, 'r', encoding='utf-8') as file:
            for i, line in enumerate(file):
                if i % 1000 == 0:
                    # Clamp in case the file grew after it was counted.
                    progress_bar.progress(min(i / total_lines, 1.0))

                if not line.strip():
                    # A blank line is not a record; do not report it as an error.
                    continue

                try:
                    papers.append(json.loads(line))
                except json.JSONDecodeError as e:
                    st.error(f"Error decoding line: {e}")

        progress_bar.progress(1.0)

        st.write(f"Total records loaded: {len(papers)}")

        papers_df = pd.DataFrame(papers)
        st.write(f"Total papers in DataFrame: {len(papers_df)}")

        # Match "cs." only at the start of a category token. An unanchored
        # 'cs.' pattern also matches e.g. "physics.optics", and the unescaped
        # dot matches any character. ``na=False`` keeps the mask boolean when
        # a record has no categories.
        # NOTE(review): assumes ``categories`` is a whitespace-separated
        # string of category codes -- confirm against the dataset.
        is_cs = papers_df['categories'].str.contains(r'(?:^|\s)cs\.', na=False)
        cs_papers = papers_df[is_cs].reset_index(drop=True)
        st.write(f"Computer science papers: {len(cs_papers)}")
        return cs_papers

    except FileNotFoundError:
        st.error("File not found. Please check the file path and try again.")
        return pd.DataFrame()
    except Exception as e:
        # UI boundary: surface the problem to the user rather than crash.
        st.error(f"Error loading data: {e}")
        return pd.DataFrame()

@st.cache_resource
def get_summarizer():
    """Create the ``transformers`` summarization pipeline.

    The function is wrapped in ``st.cache_resource``; a status message is
    written to the page before the pipeline is constructed.

    Returns:
        The object returned by ``pipeline("summarization")``.
    """
    st.write("Loading summarization model...")
    summarization_pipeline = pipeline("summarization")
    return summarization_pipeline

def summarize_paper(abstract):
    """Return a short machine-generated summary of a paper abstract.

    Args:
        abstract: The abstract text. Anything that is not a non-blank string
            (``None``, NaN from a DataFrame cell, ``""``, whitespace) is
            treated as "nothing to summarize".

    Returns:
        The ``summary_text`` of the first result produced by the summarizer,
        or an empty string when there is no usable abstract.
    """
    # Guard first: missing abstracts surface as None/NaN in a DataFrame, and
    # there is nothing meaningful to feed the model in that case.
    if not isinstance(abstract, str) or not abstract.strip():
        return ""

    summarizer = get_summarizer()
    summary = summarizer(
        abstract,
        max_length=150,
        min_length=30,
        do_sample=False,
        # Clip inputs that exceed the model's maximum sequence length instead
        # of letting the model fail on them.
        truncation=True,
    )
    return summary[0]['summary_text']

def search_papers(query, papers_df):
    """Return the papers whose title or abstract contains ``query``.

    The match is a case-insensitive *literal* substring test, so characters
    such as ``+``, ``(`` or ``.`` in the user's keywords are matched as typed
    rather than being interpreted as regular-expression syntax.

    Args:
        query: Text to look for.
        papers_df: DataFrame with ``title`` and ``abstract`` columns.

    Returns:
        The rows of ``papers_df`` where either column contains ``query``.
        Missing titles/abstracts never match.
    """
    title_hits = papers_df['title'].str.contains(
        query, case=False, na=False, regex=False
    )
    abstract_hits = papers_df['abstract'].str.contains(
        query, case=False, na=False, regex=False
    )
    return papers_df[title_hits | abstract_hits]

def main():
    """Render the Streamlit page: paper search plus a placeholder Q&A box.

    Loads the paper table via ``load_data``. If it is empty, a failure notice
    is shown and nothing else is rendered. Otherwise the sidebar offers a
    keyword search (each displayed hit is shown with its abstract, a generated
    summary and an arXiv link) and an "Ask" box that currently always replies
    with a fixed message.
    """
    # Each displayed hit triggers a model call in ``summarize_paper``, so the
    # number of hits rendered per search is bounded.
    max_results_shown = 10

    st.title("AI Research Chatbot")

    papers_df = load_data()

    if papers_df.empty:
        st.write("Failed to load data. Please check the error message above.")
        return

    st.sidebar.header("Search Papers")
    search_query = st.sidebar.text_input("Enter keywords to search for papers:")

    if st.sidebar.button("Search"):
        keywords = search_query.strip()
        if not keywords:
            # An empty query would match every paper in the table.
            st.write("Please enter at least one keyword to search.")
        else:
            results = search_papers(keywords, papers_df)
            if not results.empty:
                st.write(f"Found {len(results)} papers:")
                if len(results) > max_results_shown:
                    st.write(f"Showing the first {max_results_shown} results.")
                for _, row in results.head(max_results_shown).iterrows():
                    st.subheader(row['title'])
                    st.write(f"Abstract: {row['abstract']}")
                    st.write(f"Summary: {summarize_paper(row['abstract'])}")
                    st.write(f"[View Full Paper](http://arxiv.org/abs/{row['id']})")
            else:
                st.write("No papers found.")

    st.sidebar.header("Ask a Question")
    # The typed question is not used yet: the reply below is a fixed
    # placeholder, so the widget's value is intentionally not stored.
    st.sidebar.text_input("What would you like to know?")

    if st.sidebar.button("Ask"):
        response = "I'm currently unable to answer that question directly. Please check the papers."
        st.write(response)

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
