In [1]:
! pip3 install torch torchvision torchaudio

Collecting torch
  Obtaining dependency information for torch from https://files.pythonhosted.org/packages/74/07/edce54779f5c3fe8ab8390eafad3d7c8190fce68f922a254ea77f4a94a99/torch-2.1.0-cp311-cp311-win_amd64.whl.metadata
  Downloading torch-2.1.0-cp311-cp311-win_amd64.whl.metadata (25 kB)
Collecting torchvision
  Obtaining dependency information for torchvision from https://files.pythonhosted.org/packages/20/ac/ab6f42af83349e679b03c9bb18354740c6b58b17dba329fb408730230584/torchvision-0.16.0-cp311-cp311-win_amd64.whl.metadata
  Downloading torchvision-0.16.0-cp311-cp311-win_amd64.whl.metadata (6.6 kB)
Collecting torchaudio
  Obtaining dependency information for torchaudio from https://files.pythonhosted.org/packages/11/30/715101782513f94c834ebe3afb9a29b0fae1121f64963db9d39fb80da53e/torchaudio-2.1.0-cp311-cp311-win_amd64.whl.metadata
  Downloading torchaudio-2.1.0-cp311-cp311-win_amd64.whl.metadata (5.7 kB)
Downloading torch-2.1.0-cp311-cp311-win_amd64.whl (192.3 MB)
   ------------------

In [None]:
# Install the Hugging Face Transformers library and its dependencies
! pip install -U transformers
! pip install -U datasets
! pip install -U sentencepiece

# Import the necessary modules
import os
import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, QuestionAnsweringPipeline

# Read "cleaned_output_file.csv" and turn it straight into a lookup table:
# the 'id' column becomes the key and the 'text' column becomes the value.
transcripts = (
    pd.read_csv('cleaned_output_file.csv')
    .set_index('id')['text']
    .to_dict()
)

# Read the questions to be answered from "test.csv"
test = pd.read_csv('test.csv')

# Checkpoint shared by the tokenizer and the model. Keeping it in one place guarantees
# the two are always loaded from the same checkpoint.
# Any model from https://huggingface.co/models?filter=question-answering can be used instead.
MODEL_NAME = "bert-large-uncased-whole-word-masking-finetuned-squad"

# Load the pre-trained tokenizer and question answering model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)

# Create a question answering pipeline using the tokenizer and model
qa_pipeline = QuestionAnsweringPipeline(model=model, tokenizer=tokenizer)

# Collect one [answer text, question id] record per test row
answer_rows = []

# Walk the questions together with their transcript ids and question ids
for question, transcript_id, question_id in zip(test['Question'], test['Transcript'], test['Id']):
  # Get the transcript text from the transcripts dictionary
  # (raises KeyError if the test set references an unknown transcript id)
  transcript = transcripts[transcript_id]

  # Run extractive question answering over the transcript. The pipeline returns a dict
  # with 'score', 'start', 'end' and 'answer' keys, not a bare string.
  prediction = qa_pipeline(question=question, context=transcript)

  # Keep only the answer string; storing the whole dict would write its repr
  # into the "Text" column of the submission file.
  answer_rows.append([prediction['answer'], question_id])

# Convert the collected records to a pandas dataframe
answers = pd.DataFrame(answer_rows, columns=["Text", "Id"])

# Save the answers dataframe to a csv file
answers.to_csv("sample_submission_9nov.csv", index=False)