# Drug Pricing Project - Analysis

## Import Packages and Data

In [1]:
import csv
import re
import os
import nltk
import argparse
import itertools
import glob
import datetime
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter, defaultdict, deque, OrderedDict
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from itertools import chain
pd.options.display.max_colwidth = 450

In [11]:
os.chdir('code/')
from reddit_preprocessing.reddit_preprocessing import reddit_preprocessing as rp

In [4]:
class ArgumentContainer(object):
    def __init__(self):
        self.data_folder = "opiates"
        self.keyterm_folder = "keyterm_lists"
        self.complete_threads_file = "use_data/threads/all_dumps.csv"
        self.complete_comments_file = "use_data/comments/all_comments.csv"
        self.stop_words = "stop_words"
        self.location_folder = "location"
        self.mat_folder = "mat"
        self.unit_folder = "unit"
        self.currency_folder = "currency"
        self.output_folder = "output"
        self.file_folder = None


if 'args' not in dir():
    args = ArgumentContainer()

In [12]:
print(dir(rp))

['Counter', 'OrderedDict', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'argparse', 'assign_location_dirs', 'chain', 'csv', 'datetime', 'defaultdict', 'deque', 'generates_non_case_sensitive_list_of_keyterms', 'glob', 'itertools', 'list_of_comments_from_csv', 'list_of_threads_from_csv', 'nltk', 'os', 'pd', 're', 'sent_tokenize', 'stopwords', 'word_tokenize']


In [21]:
locations_filepath, mat_filepath, all_comments_filepath, all_dumps_filepath, unit_filepath, currency_filepath, output_filepath, stopwords_filepath = rp.assign_location_dirs(args.data_folder, args.complete_threads_file, args.complete_comments_file, args.location_folder, args.mat_folder, args.unit_folder, args.currency_folder, args.output_folder, args.stop_words, args.file_folder)
state_init, locations = rp.generates_non_case_sensitive_list_of_keyterms(locations_filepath)
nalt_words, sub_words, meth_words, narc_words = rp.generates_non_case_sensitive_list_of_keyterms(mat_filepath)
currencies = rp.generates_non_case_sensitive_list_of_keyterms(currency_filepath)[0]
units = rp.generates_non_case_sensitive_list_of_keyterms(unit_filepath)[0]
more_stops = rp.generates_non_case_sensitive_list_of_keyterms(stopwords_filepath)[0]

All thread file: /Users/jackiereimer/Dropbox/drug_pricing_data/opiates/use_data/threads/all_dumps.csv
All comment file: /Users/jackiereimer/Dropbox/drug_pricing_data/opiates/use_data/comments/all_comments.csv
Stop Words file: /Users/jackiereimer/Dropbox/Drug Pricing Project/keyterm_lists/stop_words
Locations file: /Users/jackiereimer/Dropbox/Drug Pricing Project/keyterm_lists/location
MAT file: /Users/jackiereimer/Dropbox/Drug Pricing Project/keyterm_lists/mat
Unit file: /Users/jackiereimer/Dropbox/Drug Pricing Project/keyterm_lists/unit
Currency file: /Users/jackiereimer/Dropbox/Drug Pricing Project/keyterm_lists/currency
Output folder: /Users/jackiereimer/Dropbox/Drug Pricing Project/keyterm_lists/output
/Users/jackiereimer/Dropbox/Drug Pricing Project/keyterm_lists/location/state_init.csv
/Users/jackiereimer/Dropbox/Drug Pricing Project/keyterm_lists/location/locations.csv
/Users/jackiereimer/Dropbox/Drug Pricing Project/keyterm_lists/mat/naltrexone_words.csv
/Users/jackiereimer/Dro

In [27]:
total_thread_tuples, total_threads = rp.list_of_threads_from_csv(args.data_folder, all_dumps_filepath)
total_comment_tuples, total_comments = rp.list_of_comments_from_csv(args.data_folder, all_comments_filepath)
total_posts = total_threads + total_comments
thread_tuples_headers = ['post_id','time','no_comments', 'post_title', 'post_body']
comment_tuples_headers = ['comment_id', 'time', 'reply_id', 'post_body']
stop = stopwords.words('english')
stop_all = stop + more_stops

All r/opiates/ threads aggregated
All r/opiates/ comments aggregated


## Put Data into DataFrame and Define Regex Filters

In [28]:
general_re = r"^.*\b({})\b.*$" # matches standalone strings
digit_re = r"\s\b\d{1,3}\b" # matches standalone numbers between 1 and 3 digits
price_re = r'^.*[{}]\s?\d{{1,3}}(?:[.,]\d{{3}})*(?:[.,]\d{{1,2}})?.*$' # matches standalone numbers of currency format with preceding currency symbol
unit_price_re = r'[{}]?\d+[/]\D\S+' # matches string of format 'digit(s)/letter(s)' (e.g. $40/gram, 5/mg)
surrounding_dollar_re = r'(?P<before>(?:\w+\W+){5})\$\d+(?:\.\d+)?(?P<after>(?:\W+\w+){5})' # matches the five words that surround the mention of '$'
surrounding_words_re = r'(?P<before>(?:\w+\W+){})[{}]\d+(?:\.\d+)?(?P<after>(?:\W+\w+){})' # requires three inputs (digit, keywords, digit), matches the digit number of words that surround keyword

In [29]:
def convert_key_word_threads_to_df(df, search_for, regexp, case_sensitive=False):
    """
    """
    print('Number of strings searched: %s' % df.shape[0])
    print('Number of keywords searching for: %s' % len(search_for))
    dt_start = datetime.datetime.now()
    print('Starting time:', dt_start)
    if not case_sensitive:
        flag = re.I
    else:
        flag = False
    i = 0
    new_df = df
    for keyword in search_for:
        i += 1
        print('Word %s out of %s' % (i, len(search_for)))
        print('Time elapsed:', datetime.datetime.now() - dt_start)
        word = re.compile(regexp.format(keyword), flags=flag)
        new_df[keyword] = new_df.astype(str).sum(axis=1).str.contains(word, regex=True)
    return new_df

In [30]:
comment_df = pd.DataFrame(total_comment_tuples, columns=comment_tuples_headers)
thread_df = pd.DataFrame(total_thread_tuples, columns=thread_tuples_headers)
reddit_df = thread_df.append(comment_df)
reddit_df = reddit_df[['post_id', 'comment_id', 'reply_id', 'post_title', 'post_body', 'reply_id']]

### MAT DataFrames

In [56]:
narc_df = convert_key_word_threads_to_df(reddit_df, narc_words, general_re, case_sensitive=False)

Number of strings searched: 1995814
Number of keywords searching for: 7
Starting time: 2019-02-01 15:16:20.952179
Word 1 out of 7
Time elapsed: 0:00:00.000111




Word 2 out of 7
Time elapsed: 0:01:14.832369
Word 3 out of 7
Time elapsed: 0:02:13.678160
Word 4 out of 7
Time elapsed: 0:02:55.728131
Word 5 out of 7
Time elapsed: 0:03:40.481408
Word 6 out of 7
Time elapsed: 0:04:29.500893
Word 7 out of 7
Time elapsed: 0:05:23.215066


In [75]:
narc_df_nonzero = narc_df_nonzero.drop(['$','£','€'], axis=1)
narc_df_nonzero = narc_df[narc_df.select_dtypes([bool]).any(1)]

In [72]:
narc_df_nonzero

Unnamed: 0,post_id,comment_id,reply_id,post_title,post_body,reply_id.1,$,£,€,naloxone,nar can,evzio,nrcan,nrcn,narcn,narcan
8,9uj2lg,,,PSA You can buy test strips that test your drugs for fentanyl for $2 a piece - These could have saved Mac Miller and many others lives,,,True,False,False,False,False,False,False,False,False,False
89,5klq7u,,,Straw Poll,£100.00 christmas money: spend on healthy hobbies or spunk the lot on gear on DNM?,,False,True,False,False,False,False,False,False,False,False
115,8bxj30,,,"Fucken Finally, after working from 4-12 bussing tables i re’d up! I was the only busser to so they didn’t give me my hour break 😒but i made $120 in tips and direct deposit hit at midnight! perfect timing! i picked up 8 30s and 3 20s and 4 joints,but i popped 2 30s and faced a J. Feeling so happy🤤",,,True,False,False,False,False,False,False,False,False,False
173,6c5kh8,,,Morphine 30?,"I'm a heavy tar smoker. Dealer says he has ""morphine pill 30 milligram"" for $30. Assuming it's real, would it be worth it for me? Think that'd get me off?",,True,False,False,False,False,False,False,False,False,False
317,37o1qo,,,narcan,does anyone know in which manner it is legally possible in the united states to acquire narcan? i want to make sure i dont die during my hedonistic pleasures,,False,False,False,False,False,False,False,False,False,True
410,31hk8i,,,Total Cost: $48 [Pill Porn],,,True,False,False,False,False,False,False,False,False,False
418,6rw6yx,,,Am I going to OD?,Hey i just took a Vicodin. Watson 3202 and an Aleve. Am I going to be ok? I believe it is 5 mg and I am nodding really hard. Is the Aleve going to be dangerous with the opiate? I called one of my buddies to bring some narcan over.,,False,False,False,False,False,False,False,False,False,True
427,2fy5v3,,,5 xanies and 3 blue M30's $55->advantages of being friends with your dealer :-),,,True,False,False,False,False,False,False,False,False,False
453,5vujw3,,,Just had to perform CPR for the first time on my friend.,"Like the title says I just had to perform CPR on my good friend because he was overdosing on dope. It was one of the scariest things I have ever done in my life. He had drank about a 750 of whisky and vodka, did a bit of coke, and then did a bag (sniffed) and turned blue. I didn't have any narcan or anything but he stopped breathing and i had to do CPR for about 10 to 15 minutes before a EMT showed up. Thankfully he became conscious and start...",,False,False,False,False,False,False,False,False,False,True
526,t96cg,,,Please. A Friendly reminder to my fellow bropiates...,"I just got home from the services for my girlfriend who overdosed and died last week. Worst day ever. Please remember everyone to be very careful, especially those of us who are IV H users, and who are coming back after some clean time, or mixing substances. If you live in a city were you can get Narcan DO IT. I always keep it with me if using, and it has saved a life around me on more than one occasion. Better safe than sorry. You can always...",,False,False,False,False,False,False,False,False,False,True


In [None]:
meth_df = convert_key_word_threads_to_df(reddit_df, meth_words, general_re, case_sensitive=False)
nalt_df = convert_key_word_threads_to_df(reddit_df, nalt_words, general_re, case_sensitive=False)
sub_df = convert_key_word_threads_to_df(reddit_df, sub_words, general_re, case_sensitive=False)

In [None]:
location_df_filtered['tokenized_post_body'] = location_df_filtered.apply(lambda row: nltk.word_tokenize(row['post_body']), axis=1)

In [None]:
location_df_filtered['post_body_no_stops'] = location_df_filtered['tokenized_post_body'].apply(lambda x: [item for item in x if item not in stop_all])

In [None]:
stemmer = PorterStemmer()

location_df_filtered['post_body_stemmed'] = location_df_filtered['post_body_stemmed'].apply()

### Drug Pricing DataFrames

Because of the high volume of potential locations, it is most efficient to apply other filters, remove non-matching features/observations and then filter for locations

In [32]:
price_df = convert_key_word_threads_to_df(reddit_df, currencies, price_re, case_sensitive=False)

Number of strings searched: 1995814
Number of keywords searching for: 3
Starting time: 2019-02-01 11:44:58.407419
Word 1 out of 3
Time elapsed: 0:00:00.002508
Word 2 out of 3
Time elapsed: 0:00:26.730054
Word 3 out of 3
Time elapsed: 0:00:58.637753


In [33]:
price_df_nonzero = price_df.loc[:, (price_df != 0).any(axis=0)]
price_df_nonzero = price_df_nonzero[price_df_nonzero.select_dtypes([bool]).any(1)]

In [34]:
unit_df = convert_key_word_threads_to_df(price_df_nonzero, units, general_re, case_sensitive=False)
unit_df_nonzero = unit_df.loc[:, (unit_df != 0).any(axis=0)]

Number of strings searched: 24841
Number of keywords searching for: 67
Starting time: 2019-02-01 11:46:57.258250
Word 1 out of 67
Time elapsed: 0:00:00.000296




Word 2 out of 67
Time elapsed: 0:00:01.571242
Word 3 out of 67
Time elapsed: 0:00:02.218666
Word 4 out of 67
Time elapsed: 0:00:02.957112
Word 5 out of 67
Time elapsed: 0:00:03.724964
Word 6 out of 67
Time elapsed: 0:00:04.345175
Word 7 out of 67
Time elapsed: 0:00:05.017414
Word 8 out of 67
Time elapsed: 0:00:05.688993
Word 9 out of 67
Time elapsed: 0:00:06.411668
Word 10 out of 67
Time elapsed: 0:00:07.387287
Word 11 out of 67
Time elapsed: 0:00:08.306141
Word 12 out of 67
Time elapsed: 0:00:09.101802
Word 13 out of 67
Time elapsed: 0:00:09.879398
Word 14 out of 67
Time elapsed: 0:00:10.676885
Word 15 out of 67
Time elapsed: 0:00:11.499989
Word 16 out of 67
Time elapsed: 0:00:12.344981
Word 17 out of 67
Time elapsed: 0:00:13.212373
Word 18 out of 67
Time elapsed: 0:00:14.083928
Word 19 out of 67
Time elapsed: 0:00:15.006128
Word 20 out of 67
Time elapsed: 0:00:15.938358
Word 21 out of 67
Time elapsed: 0:00:16.894954
Word 22 out of 67
Time elapsed: 0:00:17.870703
Word 23 out of 67
Tim

In [36]:
location_df = convert_key_word_threads_to_df(unit_df_nonzero, locations, general_re, case_sensitive=False)
location_df_nonzero = location_df.loc[:, (location_df != 0).any(axis=0)]

Number of strings searched: 24841
Number of keywords searching for: 774
Starting time: 2019-02-01 11:56:29.015338
Word 1 out of 774
Time elapsed: 0:00:00.000137


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Word 2 out of 774
Time elapsed: 0:00:03.051354
Word 3 out of 774
Time elapsed: 0:00:05.911197
Word 4 out of 774
Time elapsed: 0:00:08.922828
Word 5 out of 774
Time elapsed: 0:00:11.786054
Word 6 out of 774
Time elapsed: 0:00:14.886487
Word 7 out of 774
Time elapsed: 0:00:17.444228
Word 8 out of 774
Time elapsed: 0:00:20.323078
Word 9 out of 774
Time elapsed: 0:00:23.026965
Word 10 out of 774
Time elapsed: 0:00:26.014430
Word 11 out of 774
Time elapsed: 0:00:28.668347
Word 12 out of 774
Time elapsed: 0:00:31.383477
Word 13 out of 774
Time elapsed: 0:00:34.957722
Word 14 out of 774
Time elapsed: 0:00:39.241461
Word 15 out of 774
Time elapsed: 0:00:43.442956
Word 16 out of 774
Time elapsed: 0:00:46.665417
Word 17 out of 774
Time elapsed: 0:00:49.364167
Word 18 out of 774
Time elapsed: 0:00:51.973556
Word 19 out of 774
Time elapsed: 0:00:54.373231
Word 20 out of 774
Time elapsed: 0:00:56.799436
Word 21 out of 774
Time elapsed: 0:00:59.311673
Word 22 out of 774
Time elapsed: 0:01:01.784345


Word 172 out of 774
Time elapsed: 0:14:10.282052
Word 173 out of 774
Time elapsed: 0:14:16.153712
Word 174 out of 774
Time elapsed: 0:14:22.075663
Word 175 out of 774
Time elapsed: 0:14:28.017002
Word 176 out of 774
Time elapsed: 0:14:34.853409
Word 177 out of 774
Time elapsed: 0:14:41.521514
Word 178 out of 774
Time elapsed: 0:14:48.012339
Word 179 out of 774
Time elapsed: 0:14:55.105518
Word 180 out of 774
Time elapsed: 0:15:01.899845
Word 181 out of 774
Time elapsed: 0:15:08.553568
Word 182 out of 774
Time elapsed: 0:15:14.813919
Word 183 out of 774
Time elapsed: 0:15:21.916945
Word 184 out of 774
Time elapsed: 0:15:30.125390
Word 185 out of 774
Time elapsed: 0:15:37.626621
Word 186 out of 774
Time elapsed: 0:15:45.039162
Word 187 out of 774
Time elapsed: 0:15:51.606823
Word 188 out of 774
Time elapsed: 0:15:57.911935
Word 189 out of 774
Time elapsed: 0:16:04.231962
Word 190 out of 774
Time elapsed: 0:16:11.645427
Word 191 out of 774
Time elapsed: 0:16:18.256064
Word 192 out of 774


Word 340 out of 774
Time elapsed: 0:40:59.960958
Word 341 out of 774
Time elapsed: 0:41:17.843399
Word 342 out of 774
Time elapsed: 0:41:37.094767
Word 343 out of 774
Time elapsed: 0:41:58.756624
Word 344 out of 774
Time elapsed: 0:42:14.425034
Word 345 out of 774
Time elapsed: 0:42:27.990244
Word 346 out of 774
Time elapsed: 0:42:39.418552
Word 347 out of 774
Time elapsed: 0:42:50.250458
Word 348 out of 774
Time elapsed: 0:42:59.954527
Word 349 out of 774
Time elapsed: 0:43:10.996384
Word 350 out of 774
Time elapsed: 0:43:21.088727
Word 351 out of 774
Time elapsed: 0:43:30.844151
Word 352 out of 774
Time elapsed: 0:43:40.601430
Word 353 out of 774
Time elapsed: 0:43:50.456490
Word 354 out of 774
Time elapsed: 0:44:00.388710
Word 355 out of 774
Time elapsed: 0:44:10.442218
Word 356 out of 774
Time elapsed: 0:44:20.666823
Word 357 out of 774
Time elapsed: 0:44:30.633637
Word 358 out of 774
Time elapsed: 0:44:40.459474
Word 359 out of 774
Time elapsed: 0:44:50.261380
Word 360 out of 774


Word 508 out of 774
Time elapsed: 1:21:17.817138
Word 509 out of 774
Time elapsed: 1:21:33.444387
Word 510 out of 774
Time elapsed: 1:21:50.464565
Word 511 out of 774
Time elapsed: 1:22:08.415511
Word 512 out of 774
Time elapsed: 1:22:26.285524
Word 513 out of 774
Time elapsed: 1:22:44.131352
Word 514 out of 774
Time elapsed: 1:23:04.295624
Word 515 out of 774
Time elapsed: 1:23:19.337305
Word 516 out of 774
Time elapsed: 1:23:33.839562
Word 517 out of 774
Time elapsed: 1:23:48.533341
Word 518 out of 774
Time elapsed: 1:24:06.012198
Word 519 out of 774
Time elapsed: 1:24:22.087845
Word 520 out of 774
Time elapsed: 1:24:37.194373
Word 521 out of 774
Time elapsed: 1:24:52.565394
Word 522 out of 774
Time elapsed: 1:25:09.557140
Word 523 out of 774
Time elapsed: 1:25:23.945472
Word 524 out of 774
Time elapsed: 1:25:40.553801
Word 525 out of 774
Time elapsed: 1:25:59.992891
Word 526 out of 774
Time elapsed: 1:26:16.885439
Word 527 out of 774
Time elapsed: 1:26:31.296340
Word 528 out of 774


Word 676 out of 774
Time elapsed: 2:22:28.604987
Word 677 out of 774
Time elapsed: 2:22:52.939936
Word 678 out of 774
Time elapsed: 2:23:14.159693
Word 679 out of 774
Time elapsed: 2:23:37.017121
Word 680 out of 774
Time elapsed: 2:24:02.854478
Word 681 out of 774
Time elapsed: 2:24:23.851504
Word 682 out of 774
Time elapsed: 2:24:52.588989
Word 683 out of 774
Time elapsed: 2:25:18.912361
Word 684 out of 774
Time elapsed: 2:25:39.425132
Word 685 out of 774
Time elapsed: 2:26:03.326678
Word 686 out of 774
Time elapsed: 2:26:27.711074
Word 687 out of 774
Time elapsed: 2:26:49.077717
Word 688 out of 774
Time elapsed: 2:27:11.475689
Word 689 out of 774
Time elapsed: 2:27:38.570888
Word 690 out of 774
Time elapsed: 2:28:04.925462
Word 691 out of 774
Time elapsed: 2:28:36.858525
Word 692 out of 774
Time elapsed: 2:28:57.560272
Word 693 out of 774
Time elapsed: 2:29:24.162676
Word 694 out of 774
Time elapsed: 2:29:48.814641
Word 695 out of 774
Time elapsed: 2:30:12.931681
Word 696 out of 774


Define a column of the matched words

In [38]:
len(location_df_nonzero)

24841

In [86]:
location location_df.apply(lambda x: location_df.columns[x == True].values, axis=1)

ValueError: could not broadcast input array from shape (3) into shape (848)

In [109]:
# location_df['Matches'] = location_df.eq(True).dot(location_df.columns+',').str[:-1].str.split()
#location_df.drop(["No. Matches"], axis=1)
location_df['Matches'] = location_df.eq(True).dot(location_df.columns+',').str[:-1].str.split(',')

In [111]:
location_df['No. Matches'] = location_df['Matches'].str.len()

In [80]:
location_df['No. Matches'] = location_df['Matches'].apply(lambda x: x[0])

In [112]:
location_df

Unnamed: 0,post_id,comment_id,reply_id,post_title,post_body,reply_id.1,$,£,€,miligram,...,Krakow,Belo Horizonte,Thousand Oaks,Toulon,s. dakota,newyork,kentucy,Oregon,Matches,No. Matches
8,9uj2lg,,,PSA You can buy test strips that test your drugs for fentanyl for $2 a piece - These could have saved Mac Miller and many others lives,,,True,False,False,False,...,False,False,False,False,False,False,False,False,[$],1
89,5klq7u,,,Straw Poll,£100.00 christmas money: spend on healthy hobbies or spunk the lot on gear on DNM?,,False,True,False,False,...,False,False,False,False,False,False,False,False,[£],1
115,8bxj30,,,"Fucken Finally, after working from 4-12 bussing tables i re’d up! I was the only busser to so they didn’t give me my hour break 😒but i made $120 in tips and direct deposit hit at midnight! perfect timing! i picked up 8 30s and 3 20s and 4 joints,but i popped 2 30s and faced a J. Feeling so happy🤤",,,True,False,False,False,...,False,False,False,False,False,False,False,False,[$],1
173,6c5kh8,,,Morphine 30?,"I'm a heavy tar smoker. Dealer says he has ""morphine pill 30 milligram"" for $30. Assuming it's real, would it be worth it for me? Think that'd get me off?",,True,False,False,False,...,False,False,False,False,False,False,False,False,[$],1
410,31hk8i,,,Total Cost: $48 [Pill Porn],,,True,False,False,False,...,False,False,False,False,False,False,False,False,[$],1
427,2fy5v3,,,5 xanies and 3 blue M30's $55->advantages of being friends with your dealer :-),,,True,False,False,False,...,False,False,False,False,False,False,False,False,[$],1
628,3u7mi3,,,SO MAD.,"Ok I need to just cry to someone.. walked an hour both ways soo sick last night to buy 2 stamps for $15. I sniffed one on the way home to survive and saved the other for this morning. I've been sick since like Saturday so I have a million things to do for tgives prep today and I wake up dumb excited to do the other bag. Get my kit out, open it up, and ITS EMPTY. not even a crumb to scrape. I wanna cry but I have no tears. This kid who I THOU...",,True,False,False,False,...,False,False,False,False,False,False,False,False,"[$, stamps, bag]",3
663,7unkz3,,,$60 dime of fent.. expensive smoking habit,,,True,False,False,False,...,False,False,False,False,False,False,False,False,[$],1
830,5p2gkw,,,(Shit Post) Considering Something Stupid...,"So I recently was given the opportunity for a fresh start in a new place, and I've kicked my heroin habit and have resorted to drinking almost daily to get by. In the next week I'll be getting a $2500 check and want to bus back to where I came from, where all my contacts are, to pick up a ball of h and bus back in time before anyone notices. It's a day long bus ride one way, I figure a day to pickup and chill for a bit, and 1 day bus ride bac...",,True,False,False,False,...,False,False,False,False,False,False,False,False,"[$, job, bit, post, half]",5
881,76otct,,,Won $1000 at the casino today,Don't buy dope.. Don't buy dope.. Don't buy dope....,,True,False,False,False,...,False,False,False,False,False,False,False,False,[$],1


In [114]:
location_df_3_matches = location_df.loc[location_df['No. Matches'] > 2]

In [117]:
location_df_3_matches.shape

(4076, 848)