### Create parquet file for tutorial

This notebook takes the Quora duplicate-questions dataset and turns it into a table of embeddings. Each question is encoded as a 300-value vector, and every vector element is stored in its own column (`e_0` … `e_299`) of a parquet file.

In [2]:
# Imports
import pandas as pd
import requests, os, zipfile
import datetime
import jsonlines

from sentence_transformers import SentenceTransformer

# Download dataset
DATA_DIR = "tmp"
QA_DIR = f"{DATA_DIR}/quora_duplicate_questions"
QA_FILE = f"{DATA_DIR}/quora_duplicate_questions.tsv"
QA_URL = "https://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"


def download_data():
    """Download the Quora duplicate-questions TSV to QA_FILE unless it is cached.

    DATA_DIR is created when missing. Nothing is fetched if either QA_DIR or
    QA_FILE already exists on disk.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.Timeout: the server stopped responding.
    """
    os.makedirs(DATA_DIR, exist_ok=True)

    # A previous run already left the data on disk, so there is nothing to do.
    if os.path.exists(QA_DIR) or os.path.exists(QA_FILE):
        return

    # Without a timeout a stalled connection would hang the notebook forever.
    r = requests.get(QA_URL, timeout=60)
    # Fail loudly on an HTTP error instead of saving the error page as the
    # dataset, which would only surface later as a confusing parse problem.
    r.raise_for_status()
    with open(QA_FILE, "wb") as f:
        f.write(r.content)

# Fetch the Quora TSV (a no-op when it is already on disk)
download_data()

# Allow up to 500 characters per cell so long questions are shown in full
pd.set_option('display.max_colwidth', 500)

# Read the whole file into memory, renumber the rows and drop rows that are
# exact duplicates across all columns.
# (Optional: pass usecols=["qid1", "question1"] to read_csv to load fewer columns.)
df = (
    pd.read_csv(QA_FILE, sep='\t', index_col=False)
    .reset_index(drop=True)
    .drop_duplicates()
)
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in share market in india?,What is the step by step guide to invest in share market?,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Diamond?,What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?,0
2,2,5,6,How can I increase the speed of my internet connection while using a VPN?,How can Internet speed be increased by hacking through DNS?,0
3,3,7,8,Why am I mentally very lonely? How can I solve it?,"Find the remainder when [math]23^{24}[/math] is divided by 24,23?",0
4,4,9,10,"Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?",Which fish would survive in salt water?,0


In [8]:
# Set any value for number of questions
NUM_OF_QUESTIONS = 10000

model = SentenceTransformer('average_word_embeddings_komninos')

# Use only the defined number of rows. The slice is copied so the column
# assignments below write to an independent frame rather than to a view of the
# full dataset (which triggers pandas' SettingWithCopyWarning).
df = df[:NUM_OF_QUESTIONS].copy()

# Create an embedding for each question; str() guards against non-string cells
# such as NaN for a missing question.
df['question_vector'] = df.question1.apply(lambda x: model.encode(str(x)))

# Create timestamps: one naive-UTC "created" value shared by every row, plus the
# same instant floored to the hour. datetime.utcnow() is deprecated since
# Python 3.12; this form yields the same naive UTC value.
df['created'] = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
df['datetime'] = df['created'].dt.floor('h')

# Generate one column per vector element (e_0, e_1, ...). The column count is
# taken from the vectors themselves instead of a hard-coded 300, so the cell
# keeps working if the model is swapped for one with a different output width.
df2 = df.question_vector.apply(pd.Series)
df2.columns = [f'e_{i}' for i in range(df2.shape[1])]
result = pd.concat([df, df2], axis=1)

# The raw vector column is redundant once it has been expanded
result = result.drop(['question_vector'], axis=1)

# Change directory if needed (basename works with any path separator)
if os.path.basename(os.getcwd()) != 'feature_repo':
    os.chdir('feature_repo')

# Save to parquet file. `result` already holds at most NUM_OF_QUESTIONS rows,
# so no further slicing is required.
# NOTE(review): this writes to the hidden directory '.data/', whereas the
# Natural Questions cell writes to './data/' - confirm which one is intended.
result.to_parquet('.data/questions.parquet')

### Recreate process with different dataset and model
This uses Google's [Natural Questions benchmark](https://ai.google.com/research/NaturalQuestions) with the most popular sentence-transformers model from Hugging Face ([all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)), which produces 384 values per vector.

In [72]:
# The Natural Questions file is not fetched by this notebook; download it first
# (see https://ai.google.com/research/NaturalQuestions/download):
#   gsutil -m cp -R gcp_filepath <path to your data directory>
# with gcp_filepath = gs://natural_questions/v1.0-simplified/simplified-nq-train.jsonl.gz
# NOTE(review): the download is gzip-compressed while the path below is a plain
# .jsonl - presumably the archive has to be decompressed first; confirm.
# pandas' native JSON reader was tried as well; jsonlines worked better.

# Step back out of feature_repo to where the file is read from. The guard keeps
# a re-run of this cell from climbing one directory higher each time.
if os.path.basename(os.getcwd()) == 'feature_repo':
    os.chdir("..")

filepath = 'simplified-nq-train.jsonl'
rows_list = []
# Stream the file record by record, keeping only the two fields used later
with jsonlines.open(filepath) as reader:
    for idx, obj in enumerate(reader):
        rows_list.append({
            'question_text': obj['question_text'],
            'document_url': obj['document_url'],
        })
        # Progress indicator: print the record index every 20,000 records
        if idx % 20000 == 0:
            print(idx)

# Build the frame once from the collected records
df = pd.DataFrame(rows_list)

0
20000
40000
60000
80000
100000
120000
140000
160000
180000
200000
220000
240000
260000
280000
300000


In [73]:
# Preview the first five rows to check the two extracted columns
df.head(n=5)

Unnamed: 0,question_text,document_url
0,which is the most common use of opt-in e-mail marketing,https://en.wikipedia.org//w/index.php?title=Email_marketing&amp;oldid=814071202
1,how i.met your mother who is the mother,https://en.wikipedia.org//w/index.php?title=The_Mother_(How_I_Met_Your_Mother)&amp;oldid=802354471
2,what type of fertilisation takes place in humans,https://en.wikipedia.org//w/index.php?title=Human_fertilization&amp;oldid=831042507
3,who had the most wins in the nfl,https://en.wikipedia.org//w/index.php?title=List_of_National_Football_League_career_quarterback_wins_leaders&amp;oldid=818143757
4,what happened to the lost settlement of roanoke,https://en.wikipedia.org//w/index.php?title=Roanoke_Colony&amp;oldid=843591647


In [80]:
model = SentenceTransformer('all-MiniLM-L6-v2')
# Number of elements per vector; names the e_0 ... e_383 columns below and must
# match the width of the vectors the model returns.
dimensions = 384
NUM_OF_QUESTIONS = 10000

# Use only the defined number of rows. The slice is copied so the column
# assignments below write to an independent frame rather than to a view of the
# full dataset (which triggers pandas' SettingWithCopyWarning).
df = df[:NUM_OF_QUESTIONS].copy()

# Create an embedding for each question
df['question_vector'] = df.question_text.apply(lambda x: model.encode(str(x)))

# Create timestamps: one naive-UTC "created" value shared by every row, plus the
# same instant floored to the hour. datetime.utcnow() is deprecated since
# Python 3.12; this form yields the same naive UTC value.
df['created'] = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
df['datetime'] = df['created'].dt.floor('h')

# Generate one column per vector element; assigning the names raises if the
# model's output width differs from `dimensions`.
df2 = df.question_vector.apply(pd.Series)
df2.columns = [f'e_{i}' for i in range(dimensions)]
result = pd.concat([df, df2], axis=1)

# Drop the raw vector column, then expose the row index as the question id
result = result.drop(['question_vector'], axis=1)
result = result.reset_index()
result = result.rename(columns={"index": "qid"}, errors="raise")

# Change directory if needed (basename works with any path separator)
if os.path.basename(os.getcwd()) != 'feature_repo':
    os.chdir('feature_repo')

# Save to parquet file
result.to_parquet('./data/NQ_questions.parquet')