# Drug Pricing Project - Analysis

## Import Packages and Data

In [43]:
import csv
import re
import os
import nltk
import argparse
import itertools
import glob
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict, deque, OrderedDict
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from itertools import chain
pd.options.display.max_colwidth = 450
porter_stemmer = PorterStemmer()

In [44]:
import sys
sys.path.insert(0, '/Users/jackiereimer/Dropbox/Drug Pricing Project/code/reddit_preprocessing')
from reddit_preprocessing import reddit_preprocessing as rp

In [None]:
pwd

In [3]:
class ArgumentContainer(object):
    """Plain attribute bag holding the folder and file names used by this notebook.

    Every attribute is a fixed default assigned at construction time; the
    constructor takes no arguments.
    """

    def __init__(self):
        # (attribute name, default value) pairs, assigned in this order.
        defaults = (
            ("data_folder", "opiates"),
            ("keyterm_folder", "keyterm_lists"),
            ("complete_threads_file", "use_data/threads/all_dumps.csv"),
            ("complete_comments_file", "use_data/comments/all_comments.csv"),
            ("stop_words", "stop_words"),
            ("location_folder", "location"),
            ("mat_folder", "mat"),
            ("unit_folder", "unit"),
            ("currency_folder", "currency"),
            ("output_folder", "output"),
            ("file_folder", None),
        )
        for attribute, value in defaults:
            setattr(self, attribute, value)


# Build the settings object only when no `args` name exists yet, so an
# already-defined `args` is left untouched when this cell is re-run.
try:
    args
except NameError:
    args = ArgumentContainer()

In [4]:
# Resolve every input/output path from the settings held in `args`.
(
    locations_filepath,
    mat_filepath,
    all_comments_filepath,
    all_dumps_filepath,
    unit_filepath,
    currency_filepath,
    output_filepath,
    stopwords_filepath,
) = rp.assign_location_dirs(
    args.data_folder,
    args.complete_threads_file,
    args.complete_comments_file,
    args.location_folder,
    args.mat_folder,
    args.unit_folder,
    args.currency_folder,
    args.output_folder,
    args.stop_words,
    args.file_folder,
)

# Load the keyterm lists. The loader returns a sequence of lists; presumably
# one list per file found in the given folder -- TODO confirm against
# reddit_preprocessing.
locations, state_init = rp.generates_non_case_sensitive_list_of_keyterms(
    locations_filepath
)
meth_words, sub_words, nalt_words, narc_words = (
    rp.generates_non_case_sensitive_list_of_keyterms(mat_filepath)
)

# For these folders only the first returned list is used.
currencies = rp.generates_non_case_sensitive_list_of_keyterms(currency_filepath)[0]
units = rp.generates_non_case_sensitive_list_of_keyterms(unit_filepath)[0]
more_stops = rp.generates_non_case_sensitive_list_of_keyterms(stopwords_filepath)[0]

All thread file: /Users/jackiereimer/Dropbox/drug_pricing_data/opiates/use_data/threads/all_dumps.csv
All comment file: /Users/jackiereimer/Dropbox/drug_pricing_data/opiates/use_data/comments/all_comments.csv
Stop Words file: /Users/jackiereimer/Dropbox/Drug Pricing Project/keyterm_lists/stop_words
Locations file: /Users/jackiereimer/Dropbox/Drug Pricing Project/keyterm_lists/location
MAT file: /Users/jackiereimer/Dropbox/Drug Pricing Project/keyterm_lists/mat
Unit file: /Users/jackiereimer/Dropbox/Drug Pricing Project/keyterm_lists/unit
Currency file: /Users/jackiereimer/Dropbox/Drug Pricing Project/keyterm_lists/currency
Output folder: /Users/jackiereimer/Dropbox/Drug Pricing Project/keyterm_lists/output
/Users/jackiereimer/Dropbox/Drug Pricing Project/keyterm_lists/location/locations.csv
/Users/jackiereimer/Dropbox/Drug Pricing Project/keyterm_lists/location/state_init.csv
/Users/jackiereimer/Dropbox/Drug Pricing Project/keyterm_lists/mat/naloxone_words.csv
/Users/jackiereimer/Dropb

In [5]:
# Number of location keyterms that were loaded.
len(locations)

774

In [6]:
# Lower-case every location keyterm, then keep the first 50 entries as a
# smaller working list.
locations = [name.lower() for name in locations]
locations_50 = locations[0:50]

In [48]:
# Read all threads and all comments from their aggregated CSV files.
total_thread_tuples, total_threads = rp.list_of_threads_from_csv(
    args.data_folder, all_dumps_filepath
)
total_comment_tuples, total_comments = rp.list_of_comments_from_csv(
    args.data_folder, all_comments_filepath
)
total_posts = total_threads + total_comments

# Column names used when the tuples are loaded into DataFrames.
thread_tuples_headers = ["post_id", "time", "no_comments", "post_title", "post_body"]
comment_tuples_headers = ["comment_id", "time", "reply_id", "post_body"]

# NLTK's English stop words extended with the project's own list.
stop = stopwords.words("english")
stop_all = stop + more_stops

All r/opiates/ threads aggregated
All r/opiates/ comments aggregated


## Put Data into DataFrame and Define Regex Filters

In [47]:
# Regex templates. Templates containing `{}` must be filled with str.format()
# before use; inside those templates a literal regex quantifier is written
# with doubled braces (`{{1,3}}`) so that format() leaves it intact.

# Matches text containing the keyword as a standalone word.
# NOTE: `.` does not match a newline, so with the default flags this only
# matches when the whole text is a single line.
general_re = r"^.*\b({})\b.*$"

# Matches a standalone number of 1 to 3 digits preceded by whitespace.
digit_re = r"\s\b\d{1,3}\b"

# Matches text containing a currency-formatted number preceded by one of the
# given currency symbols (optionally followed by one whitespace character).
price_re = r'^.*[{}]\s?\d{{1,3}}(?:[.,]\d{{3}})*(?:[.,]\d{{1,2}})?.*$'

# Matches 'digit(s)/letter(s)' with an optional leading currency symbol
# (e.g. $40/gram, 5/mg).
unit_price_re = r'[{}]?\d+[/]\D\S+'

# Captures the five words before and after a '$' amount (no formatting needed).
surrounding_dollar_re = r'(?P<before>(?:\w+\W+){5})\$\d+(?:\.\d+)?(?P<after>(?:\W+\w+){5})'

# Captures N words before and M words after an amount introduced by one of the
# given symbols. Takes three format inputs: (N, symbols, M).
# The quantifiers are written `{{{}}}` so that format(5, '$', 5) produces
# `{5}`; a bare `{}` would insert a literal "5" into the pattern instead of a
# repeat count.
surrounding_words_re = r'(?P<before>(?:\w+\W+){{{}}})[{}]\d+(?:\.\d+)?(?P<after>(?:\W+\w+){{{}}})'

In [46]:
def convert_key_word_threads_to_df(df, search_for, regexp, case_sensitive=False):
    """Flag, for every row of ``df``, which keywords occur in the row's text.

    Each row is flattened into one string by converting every column value to
    ``str`` and joining the values with a single space. For every keyword the
    template ``regexp`` is filled in with ``regexp.format(keyword)`` and
    searched for in that string; the boolean result is stored in a new column
    named after the keyword.

    Args:
        df: DataFrame whose rows are searched. It is not modified.
        search_for: Sized iterable of keywords; each becomes a column name.
            Keywords are inserted into the pattern as-is (not regex-escaped).
        regexp: Regex template with one ``{}`` placeholder for the keyword.
        case_sensitive: When False (default) matching ignores case.

    Returns:
        A copy of ``df`` with one additional boolean column per keyword.
    """
    print('Number of strings searched: %s' % df.shape[0])
    print('Number of keywords searching for: %s' % len(search_for))
    dt_start = datetime.datetime.now()
    print('Starting time:', dt_start)

    regex_flags = 0 if case_sensitive else re.IGNORECASE

    # Flatten every row once, up front: the text does not depend on the
    # keyword, and building it before any flag column is added keeps earlier
    # results ("True"/"False") out of the searched text. The space separator
    # keeps the last word of one column from fusing with the first word of the
    # next, which would defeat `\b` word-boundary patterns.
    row_texts = [" ".join(map(str, row)) for row in df.astype(str).values]

    # Work on a copy so the caller's frame is left untouched.
    new_df = df.copy()
    total = len(search_for)
    for position, keyword in enumerate(search_for, start=1):
        print('Word %s out of %s' % (position, total))
        print('Time elapsed:', datetime.datetime.now() - dt_start)
        pattern = re.compile(regexp.format(keyword), flags=regex_flags)
        new_df[keyword] = [pattern.search(text) is not None for text in row_texts]
    return new_df

In [49]:
# One frame per post type, using the header lists defined with the data load.
comment_df = pd.DataFrame(total_comment_tuples, columns=comment_tuples_headers)
thread_df = pd.DataFrame(total_thread_tuples, columns=thread_tuples_headers)

# Stack threads on top of comments. pd.concat replaces DataFrame.append, which
# was deprecated in pandas 1.4 and removed in pandas 2.0. Columns present in
# only one of the frames are filled with NaN for the other; row labels of both
# frames are kept as they are (ignore_index is left at its default).
reddit_df = pd.concat([thread_df, comment_df], sort=False)

# Fix the column order explicitly.
reddit_df = reddit_df[['time', 'post_id', 'reply_id', 'no_comments', 'comment_id', 'post_title', 'post_body']]
#reddit_df = reddit_df.applymap(lambda s:s.lower() if type(s) == str else s)

# STOP HERE FOR FULL r/Opiates DF

### MAT DataFrames

In [None]:
# Flag every post with one boolean column per keyword in narc_words
# (standalone-word match, case-insensitive).
narc_df = convert_key_word_threads_to_df(reddit_df, narc_words, general_re, case_sensitive=False)

In [None]:
# Flag every post with one boolean column per keyword in meth_words
# (standalone-word match, case-insensitive).
meth_df = convert_key_word_threads_to_df(reddit_df, meth_words, general_re, case_sensitive=False)

In [None]:
# Flag every post with one boolean column per keyword in nalt_words
# (standalone-word match, case-insensitive).
nalt_df = convert_key_word_threads_to_df(reddit_df, nalt_words, general_re, case_sensitive=False)

In [None]:
# Flag every post with one boolean column per keyword in sub_words
# (standalone-word match, case-insensitive).
sub_df = convert_key_word_threads_to_df(reddit_df, sub_words, general_re, case_sensitive=False)

In [None]:
# Pull the 'time' column out of each per-treatment frame.
# NOTE(review): narc_only_df, meth_only_df, nalt_only_df and sub_only_df are
# not defined in the visible part of this notebook (the cells above create
# narc_df, meth_df, nalt_df and sub_df) -- confirm where they come from.
narc_time = narc_only_df['time']
meth_time = meth_only_df['time']
nalt_time = nalt_only_df['time']
sub_time = sub_only_df['time']

### Drug Pricing DataFrames

Because of the high volume of potential locations, it is most efficient to first apply the other filters, remove non-matching features/observations, and only then filter for locations.

In [None]:
# Flag every post with one boolean column per currency symbol, True when the
# post contains that symbol followed by a currency-formatted number (price_re).
price_df = convert_key_word_threads_to_df(reddit_df, currencies, price_re, case_sensitive=False)

In [None]:
# Keep only the columns that hold at least one value different from 0/False.
price_df_nonzero = price_df.loc[:, (price_df != 0).any(axis=0)]
# Then keep only the rows in which at least one boolean column is True.
# `axis` is passed by keyword: pandas 2.x no longer accepts it positionally.
price_df_nonzero = price_df_nonzero[price_df_nonzero.select_dtypes([bool]).any(axis=1)]

In [None]:
# Within the price-matching posts, flag every post with one boolean column per
# unit keyword (standalone-word match, case-insensitive).
unit_df = convert_key_word_threads_to_df(price_df_nonzero, units, general_re, case_sensitive=False)
# Keep only the columns that hold at least one value different from 0/False.
unit_df_nonzero = unit_df.loc[:, (unit_df != 0).any(axis=0)]

In [51]:
# Flag every post with one boolean column per location in locations_10
# (standalone-word match, case-insensitive).
# NOTE: locations_10 is assigned in the last cell of this file, so that cell
# has to be executed before this one.
location_df = convert_key_word_threads_to_df(reddit_df, locations_10, general_re, case_sensitive=False)

Number of strings searched: 2000449
Number of keywords searching for: 10
Starting time: 2019-02-22 16:03:37.370174
Word 1 out of 10
Time elapsed: 0:00:00.000228




Word 2 out of 10
Time elapsed: 0:00:29.882786
Word 3 out of 10
Time elapsed: 0:00:58.888928
Word 4 out of 10
Time elapsed: 0:01:33.392682
Word 5 out of 10
Time elapsed: 0:02:06.430665
Word 6 out of 10
Time elapsed: 0:02:41.995413
Word 7 out of 10
Time elapsed: 0:03:19.432200
Word 8 out of 10
Time elapsed: 0:04:01.531544
Word 9 out of 10
Time elapsed: 0:04:42.823146
Word 10 out of 10
Time elapsed: 0:05:28.142118


In [57]:
# Keep only the columns that hold at least one value different from 0/False
# (this drops location columns that never matched).
location_df_nonzero = location_df.loc[:, (location_df != 0).any(axis=0)]

Define a column of the matched words

In [55]:
# location_df['Matches'] = location_df.eq(True).dot(location_df.columns+',').str[:-1].str.split()
#location_df.drop(["No. Matches"], axis=1)
# Per row, build the list of column names whose value equals True: the boolean
# frame is dotted with "name," strings (concatenating the names of the True
# columns), the trailing comma is cut off and the result is split on ','.
# NOTE(review): a row without any True value produces the empty string, and
# ''.split(',') is [''], so such rows get a one-element list rather than an
# empty one.
# NOTE(review): .eq(True) is applied to every column, not only the keyword
# flags; a numeric cell equal to 1 would also compare equal to True -- verify
# the dtypes of the non-flag columns.
location_df['Matches'] = location_df.eq(True).dot(location_df.columns+',').str[:-1].str.split(',')

In [60]:
# (rows, columns) of the multi-match subset.
# NOTE: location_df_matches is assigned in a cell that appears further down in
# this file; that cell has to be executed first.
location_df_matches.shape

(273, 19)

In [56]:
# Length of each row's 'Matches' list.
location_df['No. Matches'] = location_df['Matches'].apply(len)

In [58]:
# Rows whose 'No. Matches' value is greater than 1.
location_df_matches = location_df[location_df['No. Matches'].gt(1)]

In [59]:
# Rows in which none of the searched locations matched.
# 'Matches' is built with str.split(','), which turns the empty string of a
# no-match row into [''], so such rows have 'No. Matches' == 1 exactly like
# rows with a single real match. Testing 'No. Matches' == 1 therefore mixes
# the two; select rows whose 'Matches' list has no non-empty entry instead.
location_df_0_matches = location_df.loc[
    location_df['Matches'].apply(lambda found: not any(found))
]

In [36]:
# Keep at most the first 750 rows.
location_df_0_matches = location_df_0_matches.head(750)

In [61]:
# Stack the two subsets into one test set. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0; row labels are kept.
location_test_subset = pd.concat([location_df_matches, location_df_0_matches])

In [40]:
# Display the combined test subset.
location_test_subset

Unnamed: 0,time,post_id,reply_id,no_comments,comment_id,post_title,post_body,new york,nyc,brooklyn,los angeles,chicago,chitown,philly,miami,jacksonville,detroit,Matches,No. Matches
688,1472677524.0,50jieo,,25,,why isn't dope more popular in fl?,"i've always heard florida has a pretty lame dope scene, with generally higher prices and lower purity than just about anywhere else on the east coast (besides perhaps maine and other isolated areas). is it because of the popularity of painkillers, which still seem to be pretty widely available, despite the crackdown on pill mills? i understand that the port of miami inspects more cans in anticipation of drug trafficking, but then why is coke ...",False,False,False,False,False,False,True,True,False,False,"[philly, miami]",2
43101,1401723091.0,2748wi,,30,,is heroin use increasing nationwide or is it just in my area/a couple cities?,i go to school on the west coast but i'm home for the east coast all summer. where i go to school i know literally nobody but myself that uses opiates and the only opiate news i hear is them cracking down on crooked doctors. when i come home it seems at least once a week a kid is overdosing on h(im sure many many more people in the area od than that but its only the ones 20 and under they show for the story) and it's some sort of new(as of l...,False,True,False,False,False,False,True,False,False,False,"[nyc, philly]",2
51956,1399675674.0,25608q,,9,,"court ordered rehab: how to pay, experiences?","posted on here last week: my bf was arrested in nyc, brooklyn specifically, with 45 bags of h. got sent to drug treatment court where they postponed his plea hearing to evaluate his addiction. since last week he has been going thru confusing hell trying to apply for treatment. the case worker wants him to show he's serious on getting a jump on detox and inpatient rehab. he's without insurance and makes low income trying to get medicaid so he ...",False,True,True,False,False,False,False,False,False,False,"[nyc, brooklyn]",2
70882,1376773787.0,1kkife,,9,,i had the strangest fuckin dream last night...,me and my boy john had somehow walked from the bay area to the outskirts of nyc. we were both fiending for some good ecp and i had the bright idea to hit up r/opiates to find a connect in brooklyn. guess that's what happens when i take a fat shot of tar to the dome at 2am.,False,True,True,False,False,False,False,False,False,False,"[nyc, brooklyn]",2
88147,1502028922.0,6ryjyp,,20,,so why is it that cali only gets tar,"first time post whats good so of course i assume that the entire west coast, pac northwest, southwest etc. are plugged up by the same distribution network and maybe tar is the choice just because it's cheaper not to refine and because the users are used to it as far as what's been around but then why is it that new york philly bmore etc. are getting that nice white fluffy while i'm out here confusing my dabs with my shit? (assuming by now the...",True,False,False,False,False,False,True,False,False,False,"[new york, philly]",2
95612,1525487824.0,8h4n4r,,3,,mafia,to those of you in the larger cities do you ever see any trace of the mob in your dealings with per se nyc or chicago underworld? is it prevalent still?,False,True,False,False,True,False,False,False,False,False,"[nyc, chicago]",2
95624,1363577939.0,1ai6jd,,4,,need some advice on copping in new area,so i just moved to nyc from philly recently and don't know how to go about finding myself a new connect. i've scoured the streets trying to find some but they don't advertise it here like they do in camden and north philly. i'm in washington heights so it's there i'm just bad about approaching anyone and asking outright. any nyc bropiates have any advice for me? anything would help at this point.,False,True,False,False,False,False,True,False,False,False,"[nyc, philly]",2
102404,1522617236.0,88u2iu,,46,,what do people call dope where you live?,"im in new york (edit: the state, not nyc) and everyone calls it ""m"". i've never heard it called this anywhere else, but dominicans/ricans run the game in my city (m = montega). coke also goes by ""p"" (for perica (sic))",True,True,False,False,False,False,False,False,False,False,"[new york, nyc]",2
108882,1405124715.0,2ah2bt,,7,,open air markets,"i was readitng this thing online that new york no longer really has open air markets, but chicago still does.. wonder why that is... does anyone have suggestions on how to cope when there is none?",True,False,False,False,True,False,False,False,False,False,"[new york, chicago]",2
117944,1530351482.0,8v0s4i,,6,,how is heroin usually packaged in nj?,"glasaine like nyc, or ziplock with wax paper inside like philly? thanks!",False,True,False,False,False,False,True,False,False,False,"[nyc, philly]",2


In [62]:
# Folder that receives the exported test subset.
analysis_output_dir = '/Users/jackiereimer/Dropbox/Drug Pricing Project/analysis_output'
# sys.path only controls where Python looks for importable modules; it has no
# effect on where files are written. The call is kept so that imports from
# this folder keep working.
sys.path.insert(0, analysis_output_dir)
# Write into the folder explicitly; a bare file name would be created in the
# current working directory instead.
location_test_subset.to_csv(os.path.join(analysis_output_dir, 'location_df_test_subset.csv'))

In [None]:
# Monthly histogram of the posts in location_df_3_matches.
# NOTE(review): location_df_3_matches is not defined in the visible part of
# this notebook -- confirm where it is created.

# The figure resolution has to be configured before the figure is created;
# changing rcParams afterwards does not affect an existing figure.
plt.rcParams['figure.dpi'] = 450 # default for me was 75

# 'time' is interpreted as seconds since the Unix epoch (unit='s'). Convert it
# to a number first so the datetime conversion does not depend on how string
# values are parsed.
location_df_3_matches['time'] = pd.to_numeric(location_df_3_matches['time'])
location_df_3_matches['time_new'] = pd.to_datetime(location_df_3_matches['time'], unit='s')
# 'date' is kept as a datetime column because the .dt accessor is used below.
location_df_3_matches['date'] = pd.to_datetime(location_df_3_matches['time_new'])

# Count posts per (year, month) and draw one bar per group.
location_df_3_match_hist = location_df_3_matches['date'].groupby(
    [location_df_3_matches["date"].dt.year, location_df_3_matches["date"].dt.month]
).count().plot(kind="bar")

# Show only every n-th tick so the labels stay readable.
n = 12
n_posts = len(location_df_3_matches.index)
ticks = location_df_3_match_hist.xaxis.get_ticklocs()
ticklabels = [l.get_text() for l in location_df_3_match_hist.xaxis.get_ticklabels()]
location_df_3_match_hist.xaxis.set_ticks(ticks[::n])
location_df_3_match_hist.xaxis.set_ticklabels(ticklabels[::n])
location_df_3_match_hist.set_title('3 Matches')
# The groups are keyed by year first, then month, so label the axis in that order.
location_df_3_match_hist.set_xlabel('(Year, Month)')
location_df_3_match_hist.set_ylabel('No. Posts')
location_df_3_match_hist.text(2, 130, '$n=%s$ ' % n_posts)

plt.show()



In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os

import numpy as np
from snorkel import SnorkelSession
session = SnorkelSession()

In [None]:
from snorkel.models import candidate_subclass
# Create a candidate class named 'Keyword' with a single 'keyword' argument.
# NOTE(review): the class is bound to the Python name `Person` although it is
# declared as 'Keyword' -- presumably a leftover name; confirm before renaming.
Person = candidate_subclass('Keyword', ['keyword'])

In [None]:
def find_keyword(s, keyword):
    """Return 1 if ``keyword`` is one of the space-separated tokens of ``s``, else 0.

    Args:
        s: Text to search; it is split on single spaces.
        keyword: Token to look for. The comparison is exact (case-sensitive,
            whole token).

    Returns:
        1 when ``keyword`` equals one of the tokens, otherwise 0.
    """
    # The membership test runs on the token list itself; a list has no
    # get_parent() method, so calling it here raised AttributeError.
    tokens = s.split(' ')
    return 1 if keyword in tokens else 0

In [50]:
# Ten location keyterms used for the trial location search.
locations_10 = [
    'New York',
    'nyc',
    'Brooklyn',
    'Los Angeles',
    'Chicago',
    'chitown',
    'Philly',
    'Miami',
    'Jacksonville',
    'Detroit',
]