In [1]:
import pandas as pd
from transformers import pipeline, set_seed
from sentence_transformers import SentenceTransformer
import requests
from bs4 import BeautifulSoup
import chromadb
from nltk.tokenize import sent_tokenize
from tqdm.notebook import tqdm

In [2]:
# Load the picture table from CSV. Later cells read its 'link' column
# (to download article text) and its 'title' column (as document metadata).
picture_info = pd.read_csv(filepath_or_buffer='../data/best_picture_2000.csv')

In [3]:
def get_text(link, timeout=30):
    """Download a web page and return the text of its paragraphs.

    Parameters
    ----------
    link : str
        URL of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    str
        The text of every ``<p>`` element found inside the element whose id
        is ``mw-content-text``, joined with newlines. When the page has no
        such element, every ``<p>`` element of the document is used instead.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    # Without a timeout a single unresponsive host can hang the whole run.
    response = requests.get(link, timeout=timeout)
    # Fail loudly on error responses instead of indexing an error page's text.
    response.raise_for_status()
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed and emits a GuessedAtParserWarning.
    soup = BeautifulSoup(response.text, 'html.parser')
    # Prefer the main content container; fall back to the whole document so a
    # page without that container does not fail with AttributeError on None.
    content = soup.find(id='mw-content-text')
    if content is None:
        content = soup
    # Extract plain text from each paragraph and join with newlines.
    return '\n'.join(p.get_text() for p in content.find_all('p'))
    

In [4]:
# Download the article text behind every row's link (get_text issues one
# HTTP request per row) and keep it in a new 'body_text' column.
picture_info['body_text'] = picture_info['link'].apply(func=get_text)

In [None]:
# Expose the row labels as an 'index' column (add_picture builds its document
# ids from it). Guarded so re-running this cell does not keep stacking extra
# 'level_0' columns and eventually raise ValueError.
if 'index' not in picture_info.columns:
    picture_info = picture_info.reset_index()
# Keep only the first five rows (DataFrame.head default).
picture_info = picture_info.head()

In [None]:
# Show the current picture table as this cell's output.
picture_info

In [None]:
# On-disk Chroma store, kept in ./chromadb relative to the working directory.
client = chromadb.PersistentClient(path="./chromadb")
# The store persists between kernel sessions, so a plain create_collection
# fails once the collection exists; get_or_create_collection reuses it.
collection = client.get_or_create_collection("picture_info")

In [None]:
def add_picture(picture):
    """Split one picture's article text into sentences and store them.

    Parameters
    ----------
    picture : mapping (e.g. a DataFrame row)
        Must provide 'body_text' (the article text), 'index' (used as the
        prefix of each document id) and 'title' (stored as metadata).

    Each sentence becomes one document in the module-level ``collection``
    with id ``"<index>_<sentence number>"`` and metadata
    ``{'picture': <title>}``. Nothing is written when the text yields no
    sentences.
    """
    sentences = sent_tokenize(picture['body_text'])
    # Chroma rejects empty id/document lists, so skip rows with no text.
    if not sentences:
        return
    # upsert rather than add: re-running the indexing cell then overwrites
    # the same ids instead of clashing with the records already stored.
    collection.upsert(
        documents=sentences,
        ids=[f'{picture["index"]}_{i}' for i in range(len(sentences))],
        # One dict per sentence, so the entries do not alias a single object.
        metadatas=[{'picture': picture['title']} for _ in sentences],
    )

In [None]:
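# One-time setup: `sent_tokenize` needs NLTK's Punkt tokenizer data.
# If it raises LookupError, uncomment and run these two lines once
# (newer NLTK releases may ask for 'punkt_tab' instead of 'punkt'):
# import nltk
# nltk.download('punkt')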

In [None]:
# Index every picture's sentences. iterrows() is a generator with no length,
# so pass the row count explicitly to let the progress bar show a total.
for _, row in tqdm(picture_info.iterrows(), total=len(picture_info)):
    add_picture(row)

In [None]:
# Load the question table from CSV; later cells read its 'question' column.
questions = pd.read_csv(filepath_or_buffer='../data/QAs.csv')

In [None]:
def context(question, n_results=1):
    """Return the stored sentences closest to `question`.

    Parameters
    ----------
    question : str
        Text used to query the module-level ``collection``.
    n_results : int, optional
        How many documents to retrieve (default 1, the original behaviour).

    Returns
    -------
    str
        The retrieved documents joined with newlines.
    """
    results = collection.query(query_texts=[question], n_results=n_results)
    # One query text was sent, so its hits are the first entry of 'documents'.
    return '\n'.join(results['documents'][0])

In [None]:
# Look up the retrieved context for every question and keep it in a new
# 'context' column.
questions['context'] = questions['question'].apply(func=context)

Question answering models

In [None]:
# Sentence-embedding model, loaded by its model id.
encoder = SentenceTransformer(
    model_name_or_path='sentence-transformers/all-MiniLM-L6-v2'
)

In [None]:
# Encode each retrieved context separately; the result is a Series holding
# one encoder output per question row.
embeddings = questions['context'].apply(func=encoder.encode)

In [None]:
# Text-generation pipeline backed by the 'distilgpt2' checkpoint.
decoder = pipeline(task='text-generation', model='distilgpt2')

In [None]:
# Extractive question-answering pipeline. The checkpoint is named explicitly
# so results do not depend on whatever default the installed transformers
# version ships (and to avoid the "No model was supplied" warning).
# NOTE(review): this is believed to be the library's default checkpoint for
# the task — confirm against the installed transformers version.
qa = pipeline(
    task="question-answering",
    model="distilbert/distilbert-base-cased-distilled-squad",
)

In [None]:
def answer_generation(row):
    """Answer one question from its retrieved context with the ``qa`` pipeline.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'question' and 'context'.

    Returns
    -------
    str
        The 'answer' field of the pipeline result, or an empty string when
        the row has no usable context to extract an answer from.
    """
    question = row['question']
    # Named `passage` so the local does not shadow the `context` function.
    passage = row['context']
    # The question-answering pipeline raises on an empty context; retrieval
    # can legitimately come back empty, so report "no answer" instead.
    if not isinstance(passage, str) or not passage.strip():
        return ''
    answer = qa(question=question, context=passage)
    return answer['answer']

In [None]:
# Run the QA step row by row and keep each answer in a new
# 'answer_generated' column.
questions['answer_generated'] = questions.apply(
    func=answer_generation, axis='columns'
)

In [None]:
# Preview the first five rows, including the columns added above.
questions.head(n=5)