# Test Environment

This notebook serves as an environment to test various scripts and bits of code.  Nothing here should be dependend on.

In [1]:
# set the project path
%cd ~/projects/drug-pricing

/Users/cccdenhart/projects/drug-pricing


In [2]:
%load_ext autoreload

In [3]:
%autoreload 2
import pandas as pd
from tqdm import tqdm
import numpy as np
from scipy.special import softmax

from src.scripts.aggregate_locations import main
from src.utils import connect_to_mongo, get_nlp
from src.schema import User, Post, SubmissionPost, CommentPost, Location
from src.models.v1.__init__ import load_locations, V1, DENYLIST, DenylistFilter, LocationFilter, FrequencyRanker

In [23]:
main()

Reading data .....
Transforming data .....
Wrote grouped locations to data/locations/grouped-locations.csv .....


In [5]:
connect_to_mongo()

In [24]:
df = pd.read_csv("data/locations/grouped-locations.csv")

In [26]:
df.head()

Unnamed: 0,neighborhood,city,county,state,country,metro,state_full
0,northeast dallas,dallas,dallas,tx,united states of america,dallas-fort worth-arlington,texas
1,maryvale,phoenix,maricopa,az,united states of america,phoenix-mesa-scottsdale,arizona
2,paradise,las vegas,clark,nv,united states of america,las vegas-henderson-paradise,nevada
3,upper west side,new york,new york,ny,united states of america,new york-newark-jersey city,new york
4,south los angeles,los angeles,los angeles,ca,united states of america,los angeles-long beach-anaheim,california


In [25]:
df.shape

(16378, 7)

In [8]:
neigh = pd.read_csv("data/locations/neighborhoods.csv")

In [51]:
# define model
fp = "data/locations/grouped-locations.csv"
locations = pd.read_csv(fp)
filters = [
    DenylistFilter(DENYLIST),
    LocationFilter(locations)
]
rankers = [FrequencyRanker()]
nlp = get_nlp()

model = V1(filters, rankers, locations, nlp)

In [65]:
use_top = True
if use_top:
    # get the users with the most posts
    # pipeline = [{"$sortByCount": "$user"}, {"$limit": 6}]
    pipeline = [{"$limit": 6}]
    res = Post.objects().aggregate(pipeline)
    ids = [str(r["_id"]) for r in res if r["_id"]]
    users = User.objects(id__in=ids)
else:
    usernames = pd.read_csv("data/rand_user_200.csv", squeeze=True).tolist()
    users = User.objects(username__in=usernames).all()

In [66]:
entscores = [model.score_entities(u) for u in users]

In [67]:
locscores = [model.score_locations(e) for e in entscores]

In [68]:
[len(e) for e in entscores]

[]

In [64]:
[len(l) for l in locscores]

[]

In [58]:
entscores[1]

{'san diego': 1.0,
 'virginia': 1.0,
 'massachusetts': 2.0,
 'cambodia': 1.0,
 'arizona': 1.0,
 'phoenix': 1.0,
 'san antonio': 1.0,
 'louisville': 1.0,
 'lexington': 2.0,
 'missouri': 2.0,
 'florida': 1.0,
 'chicago': 1.0,
 'minnesota': 2.0,
 'kentucky': 1.0,
 'texas': 1.0,
 'canada': 1.0,
 'illinois': 1.0,
 'new york': 1.0}

In [55]:
locscores[3]

[('elkhart-goshen', 0.0007792002494621963),
 ('sacramento--roseville--arden-arcade', 0.0007792002494621963),
 ('new orleans-metairie', 0.0007792002494621963),
 ('chicago-naperville-elgin', 0.00028665175233280127),
 ('orlando-kissimmee-sanford', 0.00028665175233280127),
 ('miami-fort lauderdale-west palm beach', 0.00028665175233280127),
 ('indianapolis-carmel-anderson', 0.00028665175233280127),
 ('anchorage', 0.00028665175233280127),
 ('north port-sarasota-bradenton', 0.00028665175233280127),
 ('tampa-st. petersburg-clearwater', 0.00028665175233280127),
 ('urban honolulu', 0.00028665175233280127),
 ('jacksonville', 0.00028665175233280127),
 ('cape coral-fort myers', 0.00028665175233280127),
 ('tallahassee', 0.00028665175233280127),
 ('charleston', 0.00028665175233280127),
 ('panama city', 0.00028665175233280127),
 ('homosassa springs', 0.00028665175233280127),
 ('port st. lucie', 0.00028665175233280127),
 ('key west', 0.00028665175233280127),
 ('new haven-milford', 0.0002866517523328012