In [2]:
import numpy as np
import requests
import random
#baseUrl = "https://notAvailableAnymore"
baseUrl = "http://localhost:8000"
csvfile = "./data.csv"

## Read the sample data

Just a quick ugly snippet to read the CSV and get it in a format suitable to be served to the API


In [2]:
# Load every data row of the CSV as strings; the header line is skipped.
data = np.genfromtxt(csvfile, skip_header=1, dtype=str, delimiter=",")

# genfromtxt squeezes a file with a single data row into a 1-D array of fields.
# Promote it back to one row so the loop below iterates over rows, not fields.
if data.ndim == 1 and data.size:
    data = np.atleast_2d(data)

n_compounds = len(data)
compounds = []

# Bounds for the optional random "Density" property (see the commented block below)
d_min = 6
d_max = 30

# Each row is expected to be:
#   compound_name, prop_name_1, prop_value_1, prop_name_2, prop_value_2
for i, entry in enumerate(data):
    compound = {
        "compound": entry[0],
        "properties": [
            {
                "name": entry[1],
                "value": entry[2]
            },
            {
                "name": entry[3],
                "value": entry[4]
            },
            # I could even add any arbitrary Scalar and/or text property here,
            # and the API would still save them and search correctly.
            # Nowhere in the API I defined Density, it just works by design.
            # Uncomment to try it out.
            #{
            #    "name": "Density",
            #    "value": str(d_min + (d_max-d_min)*random.random())
            #}
        ]
    }
    compounds.append(compound)
    
#compounds

In [6]:
def csv_to_compounds(csvfile):
    """
      Read compounds from the provided csvfile.
      The file is assumed to exist and be properly formatted.

      This routine can read any csv formatted in the following way

          compound_name, prop_name_1, prop_value_1, prop_name_2, prop_value_2, prop_name_3, prop_value_3, ...

      So additional properties and values can be added at will, as long as
      every row has the same number of columns. The first line of the file
      is treated as a header and skipped.

      Parameters
      ----------
      csvfile : path (or anything else accepted by np.genfromtxt)

      Returns
      -------
      list of dict
          One {"compound": name, "properties": [{"name": ..., "value": ...}, ...]}
          entry per data row. All names and values are plain `str`.

      Raises
      ------
      ValueError
          If a row does not hold complete (name, value) property pairs.
    """
    data = np.genfromtxt(csvfile, skip_header=1, dtype=str, delimiter=",")

    # No data rows at all: nothing to convert
    if data.size == 0:
        return []

    # genfromtxt squeezes a single data row into a 1-D array; restore the
    # (rows, columns) layout so that we always iterate over rows.
    data = np.atleast_2d(data)

    compounds = []
    for entry in data:
        # each line in the file is assumed to be:
        # compound_name, prop_name_1, prop_value_1, prop_name_2, prop_value_2, ...
        raw_props = entry[1:]
        if len(raw_props) % 2 != 0:
            raise ValueError("The csv is not formatted properly")

        # Walk the (name, value) pairs; str() turns numpy.str_ into plain str
        properties = [
            {"name": str(raw_props[k]), "value": str(raw_props[k + 1])}
            for k in range(0, len(raw_props), 2)
        ]
        compounds.append({"compound": str(entry[0]), "properties": properties})

    return compounds

# Display the parsed compounds. The return value is not assigned, so `compounds`
# still refers to the list built in the "Read the sample data" cell.
csv_to_compounds(csvfile)

[{'compound': 'Cd1I2',
  'properties': [{'name': 'Band gap', 'value': '3.19'},
   {'name': 'Color', 'value': 'White'}]},
 {'compound': 'Zr1S2',
  'properties': [{'name': 'Band gap', 'value': '1.68'},
   {'name': 'Color', 'value': 'Violet'}]},
 {'compound': 'Ga1Sb1',
  'properties': [{'name': 'Band gap', 'value': '0.812'},
   {'name': 'Color', 'value': 'Light Gray'}]},
 {'compound': 'P',
  'properties': [{'name': 'Band gap', 'value': '1.6'},
   {'name': 'Color', 'value': 'Red'}]},
 {'compound': 'Ca1Te1',
  'properties': [{'name': 'Band gap', 'value': '4.07'},
   {'name': 'Color', 'value': 'White'}]},
 {'compound': 'Fe1S2',
  'properties': [{'name': 'Band gap', 'value': '1.2'},
   {'name': 'Color', 'value': 'Yellow'}]},
 {'compound': 'Al1Sb1',
  'properties': [{'name': 'Band gap', 'value': '1.6'},
   {'name': 'Color', 'value': 'Dark Gray'}]},
 {'compound': 'Hf1S3',
  'properties': [{'name': 'Band gap', 'value': '2.85'},
   {'name': 'Color', 'value': 'Ocher'}]},
 {'compound': 'In2Te3',
  

## Clear the database
- unique identifiers for each compound were not provided, so clear the database to avoid duplicates

In [18]:
def api_clear(baseUrl, verbose=True):
    """
      Remove every compound stored by the API.

      Sends a POST to `<baseUrl>/data/clear/` and expects a 204 response;
      anything else triggers an AssertionError.
      `verbose` is accepted for symmetry with the other helpers but is not used.
    """
    clear_url = baseUrl + "/data/clear/"
    response = requests.post(clear_url)
    assert response.status_code == 204, "Error while cleaning the database"
            
# Wipe the database behind `baseUrl` before uploading anything
api_clear(baseUrl)

## Upload the data (slow)
- for each entry make a `POST` request to the proper endpoint
- see below for a faster way to upload a large number of new compounds

In [19]:
def api_add(baseUrl, compounds, verbose=True):
    """
      Upload the compounds one by one, after clearing the database.

      Each compound is sent as the JSON body of its own POST to
      `<baseUrl>/data/add/`, and a 201 response is expected for every request
      (AssertionError otherwise). With `verbose` the index of each uploaded
      compound is printed on a single line.
    """
    # always start from an empty database
    api_clear(baseUrl, verbose)

    add_url = baseUrl + "/data/add/"
    for index, payload in enumerate(compounds):
        response = requests.post(add_url, json=payload)
        assert response.status_code == 201, "Error while adding compound# {}".format(index)
        if verbose:
            print(index, end=" ")

# Upload the `compounds` list built in the "Read the sample data" cell, one request per compound
api_add(baseUrl, compounds)

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 

## Upload the data in a single request (faster)

- clear the Database (unique identifiers for each compound were not provided, so clear the database to avoid duplicates)
- make a single `POST` request that contains all the compounds to be added
- leveraging `bulk_create` in the backend, we will also minimize the number of connections to the database

In [20]:
def api_batchadd(baseUrl, compounds, verbose=True):
    """
      Upload all the compounds with a single request, after clearing the database.

      The whole `compounds` list is sent as the JSON body of one POST to
      `<baseUrl>/data/batchadd/`; a 201 response is expected (AssertionError
      otherwise). With `verbose`, "OK" is printed on success.
    """
    # always start from an empty database
    api_clear(baseUrl, verbose)

    batch_url = baseUrl + "/data/batchadd/"
    response = requests.post(batch_url, json=compounds)
    assert response.status_code == 201, "Error while adding compounds"
    if verbose:
        print("OK")
            
# Re-upload the same `compounds` list, this time with a single request
api_batchadd(baseUrl, compounds)

OK


## Filter

### `Compound` filter
#### Implemented `logic`: 
- `contains`
- `eq`
- `startswith`
- `endswith`
- `any`


### `ScalarProperty` filter (for band gap, density, etc)

#### Implemented `logic`: 
- `eq` 
- `lt`
- `lte`
- `gt`
- `gte`
- `any`

### `TextProperty` filter (for color, and other non-numerical props)

#### Implemented `logic`: 
- `eq`
- `contains`
- `any`


In [27]:
def api_search(baseUrl, filter_dict, verbose=True):
    """
      Run a search against the API and return the raw response.

      `filter_dict` is sent as the JSON body of a POST to `<baseUrl>/data/search/`.
      The `requests.Response` is always returned untouched, so the caller can
      inspect `status_code` and `json()`.

      With `verbose`, the matching compounds are printed when the request
      succeeded; for an error response the status code and body are printed
      instead, so an error payload is never reported as a list of matches.
    """
    r = requests.post(baseUrl + "/data/search/", json=filter_dict)
    if verbose:
        if r.ok:
            # parse the body only once
            matches = r.json()
            print("Found {} matching compounds".format(len(matches)))
            for compound in matches:
                print(compound)
        else:
            print("Search failed with status code {}: {}".format(r.status_code, r.text))
    return r

## Local Search Routines

Some hacky throw-away routines to search the compound dictionary that we generated from the csv, to ensure that the result from the API actually matches the result evaluated locally

In [22]:
def local_search(compounds, filter_dict):
    """
        Throw-away local counterpart of the search endpoint.

        Filters the `compounds` list with the same `filter_dict` that is sent
        to the API, so that both results can be compared.
        A compound is kept when every rule under "properties" matches and,
        if present, the "compound" (name) rule matches too.
        No validation is done here: `filter_dict` must always be well formed.
    """
    def _rule_holds(candidate, rule):
        # numeric-looking rule values use the scalar logic, everything else the text logic
        is_numeric = isinstance(_sanitize_value(rule["value"]), float)
        checker = _scalar_prop_match if is_numeric else _text_prop_match
        return checker(candidate, rule)

    def _is_hit(candidate):
        if "properties" in filter_dict:
            # all() stops at the first rule that does not match
            if not all(_rule_holds(candidate, rule) for rule in filter_dict["properties"]):
                return False
        if "compound" in filter_dict:
            # the name rule is only evaluated once the properties matched
            return _name_match(candidate, filter_dict["compound"])
        return True

    return [candidate for candidate in compounds if _is_hit(candidate)]
        
        
def _name_match(compound, rule):
    match = True
    if rule["logic"] == "eq":
        match = compound["compound"] == rule["value"]
    elif rule["logic"] == "startswith":
        match = compound["compound"].startswith(rule["value"])
    elif rule["logic"] == "endswith":
        match = compound["compound"].endswith(rule["value"])
    elif rule["logic"] == "contains":
        match = rule["value"] in compound["compound"]
    # anything else stays true
    #print(compound, rule, match)
    #print()
    return match
    

def _text_prop_match(compound, rule):
    if rule["logic"] == "any": return True
    
    match = True
    compound_prop = _get_prop(compound, rule["name"])
    
    # this compound doesn't even have the requested prop
    if compound_prop is None: return False
    
    if rule["logic"] == "eq":
        match = compound_prop["value"] == rule["value"]
    elif rule["logic"] == "contains":
        match = rule["value"] in compound_prop["value"]
    #print(compound_prop, rule, match)
    #print()
    return match
    

def _scalar_prop_match(compound, rule):
    if rule["logic"] == "any": return True
    
    match = True
    compound_prop = _get_prop(compound, rule["name"])
    
    # this compound doesn't even have the requested prop
    if compound_prop is None: return False
    
    rule_val = _sanitize_value(rule["value"])
    prop_val = _sanitize_value(compound_prop["value"])
    if rule["logic"] == "eq":
        match = rule_val == prop_val
    elif rule["logic"] == "gte":
        match = prop_val >= rule_val
    elif rule["logic"] == "gt":
        match = prop_val > rule_val
    elif rule["logic"] == "lte":
        match = prop_val <= rule_val
    elif rule["logic"] == "lt":
        match = prop_val < rule_val
    #print(compound, rule, match)
    #print()
    return match

def _get_prop(compound, prop_name):
    compound_prop = None
    for prop in compound["properties"]:
        if prop["name"] == prop_name:
            compound_prop = prop
            break
    return compound_prop

def _sanitize_value(value_in):
    """
      According to the assignment, all property values are passed as strings,
      but we may need to implement a different logic depending on whether the
      property is an actual string, or should be treated as a number.
      This simple helper routine does just that.
    """
    try:
        value = float(value_in)
    except ValueError:
        value = str(value_in)
    return value

## Testing the Search API


### Empty filter

In [23]:
# Empty filter returns all the compounds in the DB
empty_filter = {}

# Run the same filter against the API and against the local compounds list
api_res = api_search(baseUrl,empty_filter,verbose=False)
local_res = local_search(compounds,empty_filter)

# Only the number of matches is compared, not the matches themselves
assert len(api_res.json()) == len(local_res)

### Name filter

In [24]:
# Only filter the name of the compound, not any property
name_filter = {
    "compound": {
        "value": "Pb",
        "logic": "contains"
    }
}

# Run the same filter against the API and against the local compounds list
api_res = api_search(baseUrl,name_filter,verbose=False)
local_res = local_search(compounds,name_filter)

# Only the number of matches is compared, not the matches themselves
assert len(api_res.json()) == len(local_res)

### Complete filter (Name + Properties)

In [25]:
# Complete filter: a name rule plus one numeric and one text property rule.
# A compound has to satisfy all of them to match.
complete_filter = {
    "compound": {
        "value": "Se",
        "logic": "contains"
    },
    "properties": [
        {
            "name": "Band gap",
            "value": "3",
            "logic": "lt"
        },
        {
            "name": "Color",
            "value": "Gray",
            "logic": "contains"
        },
        # Showing that density or any other property works as well
        #{
        #    "name": "Density",
        #    "value": "15",
        #    "logic": "gte"
        #}
    ]
}

# Run the same filter against the API and against the local compounds list
api_res = api_search(baseUrl,complete_filter,verbose=False)
local_res = local_search(compounds,complete_filter)

# Only the number of matches is compared, not the matches themselves
assert len(api_res.json()) == len(local_res)

### Wrong filter (400 response expected)

In [28]:
# Wrong filter: the keys of the request body do not match what the API expects
# ("wrong_value"/"wrong_logic" instead of "value"/"logic"), so a 400 error code will be returned.
wrong_filter = {
    "compound": {
        "wrong_value": "Se",
        "wrong_logic": "contains"
    }
}

# No local counterpart here: local_search does not validate its input
api_res = api_search(baseUrl,wrong_filter,verbose=False)

assert api_res.status_code == 400

In [8]:
# Scratch check: a failing assert raises AssertionError, which is a subclass
# of Exception, so it is caught by a generic `except Exception`.
try:
    assert 3 ==2
except Exception:
    print("whoops")

whoops
