In [1]:
from transformers import TapasTokenizer, TapasForQuestionAnswering
import pandas as pd
import torch

# Load the tokenizer and model
tokenizer = TapasTokenizer.from_pretrained('google/tapas-base-finetuned-wtq')
model = TapasForQuestionAnswering.from_pretrained('google/tapas-base-finetuned-wtq')


In [2]:
# Generate a large table with more than 300 rows
data = {
    'Actors': ["Actor " + str(i) for i in range(301)],
    'Age': [str(20 + i % 40) for i in range(301)],
    'Number of Movies': [str(10 + i % 50) for i in range(301)]
}
large_table = pd.DataFrame.from_dict(data)


In [3]:
# large_table.to_csv('large_table.csv', index=False)

In [4]:
def chunk_table(table, chunk_size=50):
    return [table[i:i + chunk_size] for i in range(0, table.shape[0], chunk_size)]


In [5]:
import collections
import numpy as np

def compute_prediction_sequence(model, data, device):
  """Computes predictions using model's answers to the previous questions."""

  # prepare data
  input_ids = data["input_ids"].to(device)
  attention_mask = data["attention_mask"].to(device)
  token_type_ids = data["token_type_ids"].to(device)

  all_logits = []
  prev_answers = None

  num_batch = data["input_ids"].shape[0]

  for idx in range(num_batch):

    if prev_answers is not None:
        coords_to_answer = prev_answers[idx]
        # Next, set the label ids predicted by the model
        prev_label_ids_example = token_type_ids_example[:,3] # shape (seq_len,)
        model_label_ids = np.zeros_like(prev_label_ids_example.cpu().numpy()) # shape (seq_len,)

        # for each token in the sequence:
        token_type_ids_example = token_type_ids[idx] # shape (seq_len, 7)
        for i in range(model_label_ids.shape[0]):
          segment_id = token_type_ids_example[:,0].tolist()[i]
          col_id = token_type_ids_example[:,1].tolist()[i] - 1
          row_id = token_type_ids_example[:,2].tolist()[i] - 1
          if row_id >= 0 and col_id >= 0 and segment_id == 1:
            model_label_ids[i] = int(coords_to_answer[(col_id, row_id)])

        # set the prev label ids of the example (shape (1, seq_len) )
        token_type_ids_example[:,3] = torch.from_numpy(model_label_ids).type(torch.long).to(device)

    prev_answers = {}
    # get the example
    input_ids_example = input_ids[idx] # shape (seq_len,)
    attention_mask_example = attention_mask[idx] # shape (seq_len,)
    token_type_ids_example = token_type_ids[idx] # shape (seq_len, 7)
    # forward pass to obtain the logits
    outputs = model(input_ids=input_ids_example.unsqueeze(0),
                    attention_mask=attention_mask_example.unsqueeze(0),
                    token_type_ids=token_type_ids_example.unsqueeze(0))
    logits = outputs.logits
    all_logits.append(logits)

    # convert logits to probabilities (which are of shape (1, seq_len))
    dist_per_token = torch.distributions.Bernoulli(logits=logits)
    probabilities = dist_per_token.probs * attention_mask_example.type(torch.float32).to(dist_per_token.probs.device)

    # Compute average probability per cell, aggregating over tokens.
    # Dictionary maps coordinates to a list of one or more probabilities
    coords_to_probs = collections.defaultdict(list)
    prev_answers = {}
    for i, p in enumerate(probabilities.squeeze().tolist()):
      segment_id = token_type_ids_example[:,0].tolist()[i]
      col = token_type_ids_example[:,1].tolist()[i] - 1
      row = token_type_ids_example[:,2].tolist()[i] - 1
      if col >= 0 and row >= 0 and segment_id == 1:
        coords_to_probs[(col, row)].append(p)

    # Next, map cell coordinates to 1 or 0 (depending on whether the mean prob of all cell tokens is > 0.5)
    coords_to_answer = {}
    for key in coords_to_probs:
      coords_to_answer[key] = np.array(coords_to_probs[key]).mean() > 0.5
    prev_answers[idx+1] = coords_to_answer

  logits_batch = torch.cat(tuple(all_logits), 0)

  return logits_batch

In [6]:
# ! pip install dask

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def score_chunk_relevance(chunk, query, vectorizer):
    # Combine the chunk's text into a single string
    # print(chunk)
    chunk_text = str(chunk)
    query_text = str(query)
    
    # Transform texts to TF-IDF vectors
    texts = [chunk_text, query_text]
    tfidf_matrix = vectorizer.transform(texts)
    
    # Calculate cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    
    return cosine_sim[0][0]

# Initialize TF-IDF Vectorizer
# print(chunk)
vectorizer = TfidfVectorizer().fit([" ".join(large_table.fillna('').values.flatten())])
def chunk_table_with_context(table, chunk_size=50, overlap=1):
    chunks = []
    for i in range(0, table.shape[0], chunk_size):
        start_idx = max(0, i - overlap)
        end_idx = min(table.shape[0], i + chunk_size)
        chunks.append(table.iloc[start_idx:end_idx])
    return chunks


In [14]:
# !pip install dask

Collecting dask
  Downloading dask-2024.5.2-py3-none-any.whl.metadata (3.8 kB)
Collecting cloudpickle>=1.5.0 (from dask)
  Downloading cloudpickle-3.0.0-py3-none-any.whl.metadata (7.0 kB)
Collecting partd>=1.2.0 (from dask)
  Downloading partd-1.4.2-py3-none-any.whl.metadata (4.6 kB)
Collecting toolz>=0.10.0 (from dask)
  Using cached toolz-0.12.1-py3-none-any.whl.metadata (5.1 kB)
Collecting locket (from partd>=1.2.0->dask)
  Downloading locket-1.0.0-py2.py3-none-any.whl.metadata (2.8 kB)
Downloading dask-2024.5.2-py3-none-any.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m0m
[?25hDownloading cloudpickle-3.0.0-py3-none-any.whl (20 kB)
Downloading partd-1.4.2-py3-none-any.whl (18 kB)
Using cached toolz-0.12.1-py3-none-any.whl (56 kB)
Downloading locket-1.0.0-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: toolz, locket, cloudpickle, partd, dask
  Attempting uninstall: cloudp

In [19]:
from transformers import TapasTokenizer, TapasForQuestionAnswering
import pandas as pd
import torch
from io import StringIO
from dask import dataframe as dd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def process_chunk(model, tokenizer, chunk, queries):
    model.to(device)
    # Check relevance
    relevance_score = score_chunk_relevance(chunk, queries, vectorizer)
    if relevance_score < 0.66:  # Threshold for relevance
        return None
    df = pd.read_csv(StringIO(chunk))
    inputs = tokenizer(table=df, queries=queries, padding='max_length', return_tensors="pt", truncation=True)
    logits = compute_prediction_sequence(model, inputs, device)
    predicted_answer_coordinates, = tokenizer.convert_logits_to_predictions(inputs, logits.cpu().detach())

    if predicted_answer_coordinates[0]:
        row, col = predicted_answer_coordinates[0][0]
        chunk = pd.read_csv(StringIO(chunk))
        return chunk.iloc[row, col]

# Assuming `large_table` and `chunk_table_with_context` are defined elsewhere

# Chunk the large table with context preservation
chunks = chunk_table_with_context(large_table)

# Convert to Dask DataFrame
dask_chunks = dd.from_pandas(pd.DataFrame({'chunks': chunks}), npartitions=len(chunks))

# Define queries
queries = ["actor 100 number of movies"]

# Process each chunk in parallel
results = dask_chunks.map_partitions(lambda df: df.apply(lambda row: process_chunk(model, tokenizer, row['chunks'], queries), axis=1))

# Compute the results
# computed_results = results.compute()

# Filter out None values and print results
answers = [res for res in results if res is not None]
print(f"Predicted answers: {answers}")


Predicted answers: [<NA>, <NA>, '100  Actor 100  40               10', <NA>, <NA>, <NA>, <NA>]


In [None]:
# Streamlit app
st.title('TAPAS Question Answering on Tables')

# File uploader for CSV files
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is not None:
    # Read the CSV file
    large_table = pd.read_csv(uploaded_file)
    st.write("Uploaded Table:")
    st.write(large_table)
    
    queries = st.text_input("Enter your question:", "How many movies has Actor 150 acted in?")
    
    if st.button("Get Answer"):
        # Initialize TF-IDF Vectorizer
        vectorizer = TfidfVectorizer().fit([" ".join(large_table.astype(str).fillna('').values.flatten())])
        
        # Chunk the table
        chunks = chunk_table_with_context(large_table)
        dask_chunks = dd.from_pandas(pd.DataFrame({'chunks': chunks}), npartitions=len(chunks))
        
        # Process each chunk
        results = dask_chunks.map_partitions(lambda df: df.apply(lambda row: process_chunk(model, tokenizer, row['chunks'], [queries], device), axis=1)).compute()
        answers = [res for res in results if res is not None]
        
        st.write(f"Predicted answers: {answers}")

In [21]:
# !pip install pypdf
# !pip install qdrant-client

Collecting qdrant-client
  Using cached qdrant_client-1.9.1-py3-none-any.whl.metadata (9.5 kB)
Collecting grpcio-tools>=1.41.0 (from qdrant-client)
  Downloading grpcio_tools-1.64.1-cp39-cp39-macosx_10_9_universal2.whl.metadata (5.3 kB)
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant-client)
  Using cached portalocker-2.8.2-py3-none-any.whl.metadata (8.5 kB)
Collecting protobuf<6.0dev,>=5.26.1 (from grpcio-tools>=1.41.0->qdrant-client)
  Using cached protobuf-5.27.1-cp38-abi3-macosx_10_9_universal2.whl.metadata (592 bytes)
Collecting grpcio>=1.41.0 (from qdrant-client)
  Downloading grpcio-1.64.1-cp39-cp39-macosx_10_9_universal2.whl.metadata (3.3 kB)
Using cached qdrant_client-1.9.1-py3-none-any.whl (229 kB)
Downloading grpcio_tools-1.64.1-cp39-cp39-macosx_10_9_universal2.whl (5.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading grpcio-1.64.1-cp39-cp39-macosx_10_9_universal2.whl

In [12]:
large_table.loc[large_table['Actors'] == 'Actor 50']

Unnamed: 0,Actors,Age,Number of Movies
50,Actor 50,30,10


In [28]:
from transformers import TrainingArguments, Trainer

# Dummy data for weak supervision
train_data = {
    'queries': ["How many movies has Actor 50 acted in?", "How old is Actor 1?"],
    'answers': [10, 21]
}

# Convert to DataFrame
train_df = pd.DataFrame(train_data)

# Tokenize training data by chunks
train_chunks = chunk_table(large_table)
train_inputs_list = [tokenizer(table=chunk, queries=train_df['queries'].tolist(), padding='max_length', return_tensors="pt", truncation=True) for chunk in train_chunks]

# Create dataset
class TableDataset(torch.utils.data.Dataset):
    def __init__(self, inputs_list, answers):
        self.inputs_list = inputs_list
        self.answers = answers

    def __len__(self):
        return len(self.answers)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.inputs_list[idx].items()}
        item['labels'] = torch.tensor(self.answers[idx])
        return item

train_dataset = TableDataset(train_inputs_list, train_df['answers'].tolist())

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_dir='./logs',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()


IndexError: iloc cannot enlarge its target object

In [28]:
Ollama?

[0;31mInit signature:[0m
[0mOllama[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mname[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbase_url[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'http://localhost:11434'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmodel[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'llama2'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmirostat[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mint[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmirostat_eta[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mfloat[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmirostat_tau[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mfloat[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnum_ctx[0m[0;34m:[0m [0m

  warn_deprecated(


I take a sip of my latte as I casually glance over at the couple sitting at the next table, eavesdropping on their conversation about space exploration. They seem engrossed in their discussion, oblivious to my attention.

"...and then we'd have to develop new propulsion systems," one of them says. "Something that could withstand the stresses of interstellar travel."

I lean in slightly, intrigued by the topic. The other person nods thoughtfully. "I agree, but what about radiation protection? We can't just expose our astronauts to all that cosmic radiation."

Their conversation sparks my imagination. I start daydreaming about what it would be like to travel through space, visiting distant planets and encountering new forms of life. The possibilities seem endless.

As they continue discussing the challenges of space exploration, I find myself mentally designing a futuristic spaceship, complete with advanced life support systems and gravity manipulation technology. The café's tranquil atm