In [38]:
!pip install --upgrade --quiet arxiv

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [118]:
import arxiv

# Construct the default API client.
client = arxiv.Client()

# Search for most recently submitted papers related to ML (cs.LG) or AI (cs.AI)
search = arxiv.Search(
  query = "cat:cs.LG OR cat:cs.AI",
  max_results = 1000,
  sort_by = arxiv.SortCriterion.SubmittedDate
)

results = client.results(search)


In [119]:
import json
import datetime

def default_parser(value):
    if isinstance(value, arxiv.Result):
        data = value.__dict__.copy()
        data.pop('_raw')
        return data
    elif isinstance(value, datetime.date):
        return str(value.isoformat())
    else:
        return value.__dict__

result_jsons = []

for result in results:
    result_json = json.dumps(result, default=default_parser, indent=2)
    result_jsons.append(json.loads(result_json))

with open("data.json", "w") as f:
    json.dump(result_jsons, f, indent=2)

In [120]:
import weaviate

client = weaviate.connect_to_custom(
    http_host="weaviate",
    http_port="8080",
    http_secure=False,
    grpc_host="weaviate",
    grpc_port="50051",
    grpc_secure=False,
)

In [121]:
client.get_meta()

{'hostname': 'http://[::]:8080',
 'modules': {'text2vec-transformers': {'model': {'_name_or_path': './models/model',
    'add_cross_attention': False,
    'architectures': ['BertModel'],
    'attention_probs_dropout_prob': 0.1,
    'bad_words_ids': None,
    'begin_suppress_tokens': None,
    'bos_token_id': None,
    'chunk_size_feed_forward': 0,
    'classifier_dropout': None,
    'cross_attention_hidden_size': None,
    'decoder_start_token_id': None,
    'diversity_penalty': 0,
    'do_sample': False,
    'early_stopping': False,
    'encoder_no_repeat_ngram_size': 0,
    'eos_token_id': None,
    'exponential_decay_length_penalty': None,
    'finetuning_task': None,
    'forced_bos_token_id': None,
    'forced_eos_token_id': None,
    'gradient_checkpointing': False,
    'hidden_act': 'gelu',
    'hidden_dropout_prob': 0.1,
    'hidden_size': 384,
    'id2label': {'0': 'LABEL_0', '1': 'LABEL_1'},
    'initializer_range': 0.02,
    'intermediate_size': 1536,
    'is_decoder': False

In [129]:
client.collections.delete("Document")

In [130]:
import weaviate.classes as wvc

client.collections.create(
    "Document",
    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_transformers(),
    vector_index_config=wvc.config.Configure.VectorIndex.hnsw(),
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="summary", data_type=wvc.config.DataType.TEXT),
    ]
)

<weaviate.collections.collection.Collection at 0x7f075490f7f0>

In [131]:
document = client.collections.get("Document")
config = document.config.get()

print(config)

_CollectionConfig(name='Document', description=None, generative_config=None, inverted_index_config=_InvertedIndexConfig(bm25=_BM25Config(b=0.75, k1=1.2), cleanup_interval_seconds=60, index_null_state=False, index_property_length=False, index_timestamps=False, stopwords=_StopwordsConfig(preset=<StopwordsPreset.EN: 'en'>, additions=None, removals=None)), multi_tenancy_config=_MultiTenancyConfig(enabled=False), properties=[_Property(name='title', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=_PropertyVectorizerConfig(skip=False, vectorize_property_name=True), vectorizer='text2vec-transformers'), _Property(name='summary', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=_PropertyVectorizerConfig(skip=False, vectorize_proper

In [132]:
with open("data.json", "r") as f:
    data = json.load(f)

fields = ['title', 'summary']

with client.batch.fixed_size(batch_size=200) as batch:
    for data_row in data:
        properties = {field: data_row[field] for field in fields}
        batch.add_object(
            collection='Document',
            properties=properties,
        )

In [133]:
docs = client.collections.get("Document")
response = docs.aggregate.over_all(total_count=True)

print(response.total_count)

1000


In [134]:
response = docs.query.near_text(
    query="car",
    limit=5,
    return_metadata=wvc.query.MetadataQuery(distance=True)
)

for o in response.objects:
    print(o.properties['title'])
    print(o.metadata.distance)

Towards Robust Car Following Dynamics Modeling via Blackbox Models: Methodology, Analysis, and Recommendations
0.7079105377197266
Driving Everywhere with Large Language Model Policy Adaptation
0.7538900375366211
Vehicle Behavior Prediction by Episodic-Memory Implanted NDT
0.7546801567077637
Estimating On-road Transportation Carbon Emissions from Open Data of Road Network and Origin-destination Flow Data
0.8088363409042358
CityFlowER: An Efficient and Realistic Traffic Simulator with Embedded Machine Learning Models
0.8152095079421997


In [143]:
import requests

example_query = 'overview or survey of LLMs'

payload = {'text': example_query}

response = requests.post('http://t2v-transformers:8080/vectors/', json=payload).json()
print(response)

vector = response['vector']

{'text': 'overview or survey of LLMs', 'vector': [0.44878119230270386, -0.04536544159054756, -0.23652151226997375, -0.3060486316680908, 0.3254667818546295, 0.5029256343841553, -0.07935827970504761, -0.0038907788693904877, -0.0341285765171051, 0.2556675970554352, -0.7224306464195251, -0.22549724578857422, 0.37613701820373535, -0.041613515466451645, -0.457935094833374, 0.06133721396327019, 0.2547215223312378, -0.18404462933540344, 0.4099758267402649, 0.20028559863567352, 0.3450828194618225, -0.1943768858909607, 0.8460269570350647, -0.21073603630065918, -0.05531223118305206, -0.16577176749706268, 0.34324216842651367, 0.016099900007247925, -0.2967534363269806, -0.6374852657318115, -0.3311845064163208, 0.6175627708435059, 0.38582342863082886, -0.005985047668218613, -0.14287568628787994, 0.06119507551193237, 0.7380911111831665, 0.5302543640136719, -0.08264720439910889, 0.31282293796539307, 0.2013060599565506, -0.5277292728424072, 0.10601353645324707, 0.4626431465148926, 0.348145991563797, -0

In [151]:
response1 = docs.query.near_text(
    query=example_query,
    limit=10,
    return_metadata=wvc.query.MetadataQuery(distance=True)
)
result1 = [(o.properties['title'], o.metadata.distance) for o in response1.objects]

response2 = docs.query.near_vector(
    near_vector=vector,
    limit=10,
    return_metadata=wvc.query.MetadataQuery(distance=True)
)
result2 = [(o.properties['title'], o.metadata.distance) for o in response2.objects]

assert result1==result2