In [1]:
import pandas as pd
import numpy as np

# Read the substance tag table: one row per substance, with descriptive
# columns followed by 0/1 indicator columns (see the rendered output below).
dummy_tags = pd.read_csv(filepath_or_buffer='dummy_tags.csv')

# Bare expression so the notebook renders the (truncated) frame.
dummy_tags

Unnamed: 0,substance,PubChemCID,sixwords,notes,smiles1,category,Unnamed: 6,2CB impurity,DMT impurity,GCMS breakdown product,...,tramadol impurity,typtamine,uncertain,uncommon,up-and-down,upper,vasodilator,veterinary,vitamin,weak opioid
0,"1,2-Dibromo-4,5-methylenedioxybenzene",225814.0,"Meth production impurity, rare",,C1OC2=CC(=C(C=C2O1)Br)Br,stimulant,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,"1,3-Diacetin",66924.0,Inactive fentanyl cut common in NC,"Common in fentanyl samples in NC, rare elsewhe...",CC(=O)OCC(COC(=O)C)O,other,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0
2,"1,4-Butanediol",8064.0,"related to GHB, unclear if active",Precurser to gamma-hydroxybutyrate (GHB) - met...,C(CCO)CO,other,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1-2-propanol,7900.0,,,CC(COC)O,,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1-[methyl]cyclopentanol,73830.0,"Leftover from making ketamine, inactive",Appears similar to other starting materials us...,CC1(CCCC1)O,other,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,xylazine,5707.0,"Heavy sedative, causes nasty wounds",,CC1=C(C(=CC=C1)C)NC2=NCCCS2,"opioid,sedative",1,0,0,0,...,0,0,0,1,1,0,0,1,0,0
260,xylitol,6912.0,"Common artificial sweetener in gum, food",,C(C(C(C(CO)O)O)O)O,,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
261,zolpidem,5732.0,Common prescription sleep med Ambien,Rarely detected in street drugs.,CC1=CC=C(C=C1)C2=C(N3C=C(C=CC3=N2)C)CC(=O)N(C)C,sedative,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
262,α-Ethylaminopentiophenone,205593.0,,,CCCC(C(=O)C1=CC=CC=C1)NCC,,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
# Load the per-detection sample results.
all_samples = pd.read_csv('all_samples.csv')

# Exact-match renames applied AFTER parenthesised synonyms are stripped, so
# that names in all_samples line up with the 'substance' names used in the
# tag table (dummy_tags.csv). Series.replace with a dict matches whole values
# only; it never rewrites substrings.
substance_renames = {
    # Names left with a doubled hyphen once "(...)" is removed.
    '1--2-propanol': '1-2-propanol',
    '1-Boc-4--piperidine': '1-Boc-4-piperidine',
    'N--thebaol': 'N-thebaol',

    # NOTE(review): the empty-string key sends EVERY name that becomes empty
    # after synonym stripping (i.e. a name that was entirely parenthesised)
    # to this single compound. Confirm only that compound is affected.
    # NOTE(review): the 'α±' in this target and in the
    # 'α±-Pyrrolidinoisohexanophenone' target below looks like an encoding
    # artefact; confirm the tag table really spells them this way, otherwise
    # these rows cannot join on 'substance'.
    '': '3,4-Methylenedioxy-α±-Cyclohexylaminopropiophenone',
    'α-Pyrrolidinoisohexanophenone': 'α±-Pyrrolidinoisohexanophenone',

    # The tag table lists this compound as 'α-Ethylaminopentiophenone'.
    # Only the truncated spelling needs fixing; the correct spelling is left
    # alone (it was previously rewritten to a garbled 'α±-' form that has no
    # counterpart in the tag table).
    'α-Ethylaminopentiophenon': 'α-Ethylaminopentiophenone',

    # Alternate names / combined labels collapsed to a single name.
    'N-benzyl-N-cyclohexylamine': 'N-benzylcyclohexanamine',
    'Psilocybin / Psilocin': 'psilocin',
    'ephedrine/pseudoephedrine': 'pseudoephedrine',
    'phenethylbromide': 'phenethyl bromide',
    'ethyl 4-ANPP': 'ethyl-4-ANPP',
    'gamma-butyrolactone': 'GBL',
    'gamma-hydroxybutyrate': 'GHB',
    '3-chlorophenmetrazine': '3-CPM',
    'mitragynine': 'kratom',

    # Capitalisation differences.
    '3-Methoxy-PCP': '3-methoxy-PCP',
    '4-Fluoromethylphenidate': '4-fluoromethylphenidate',
    '2-Fluoro-2-oxo PCE': '2-fluoro-2-oxo PCE',
}

all_samples['substance'] = (
    all_samples['substance']
    # Drop any "(synonym)" group together with the whitespace before it.
    .str.replace(r'\s*\(.*?\)', '', regex=True)
    # Then normalise the remaining names by exact match.
    .replace(substance_renames)
)


In [3]:
# Write the cleaned samples to a separate file ('all_samples_cleaned.csv');
# the input CSV is not overwritten. index=False leaves the row index out.
all_samples.to_csv(path_or_buf='all_samples_cleaned.csv', index=False)

# Preview the first five rows.
all_samples.head(n=5)

Unnamed: 0,sampleid,counter,sampletype,expectedsubstance,program,date,color,texture,sensations,texture_notes,...,card_notes,prog_type,program_county,lat_program,lon_program,substance,abundance,method,peak,date_complete
0,6082021,3,unknown,heroin,NC Survivor's Union,6/8/2021,,,,,...,,HR,Guilford County,36.072635,-79.791975,fentanyl,,GCMS,9.28,2022-03-04
1,6082021,3,unknown,heroin,NC Survivor's Union,6/8/2021,,,,,...,,HR,Guilford County,36.072635,-79.791975,4-ANPP,trace,GCMS,,2022-03-04
2,111422,493,spatula,methamphetamine,,10/19/2022,white,powder; chunky; flaky,,,...,,,,,,cocaine,trace,GCMS,,2022-11-14
3,111422,493,spatula,methamphetamine,,10/19/2022,white,powder; chunky; flaky,,,...,,,,,,methyl ecgonidine,trace,GCMS,,2022-11-14
4,111422,493,spatula,methamphetamine,,10/19/2022,white,powder; chunky; flaky,,,...,,,,,,methamphetamine,,GCMS,3.62,2022-11-14


In [4]:
# Drop detections whose abundance mentions "trace". Rows with a missing
# abundance are kept: na=False makes the contains-test False for NaN.
is_trace = all_samples['abundance'].str.contains('trace', na=False)
valid_samples = all_samples[~is_trace]

# Number of non-trace detections per substance, most frequent first.
# The column names are set explicitly: a bare value_counts().reset_index()
# only yields 'substance'/'count' on pandas >= 2.0 (older versions produce
# 'index'/'substance', which would make a later join on 'substance' match
# against the counts instead of the names).
substance_count = (
    valid_samples['substance']
    .value_counts()
    .rename_axis('substance')
    .reset_index(name='count')
)
substance_count

Unnamed: 0,substance,count
0,fentanyl,4368
1,4-ANPP,3434
2,methamphetamine,2109
3,acetaminophen,1398
4,xylazine,1323
...,...,...
344,ADB-4en-PINACA,1
345,clomiphene,1
346,methandrostenolone,1
347,temazepam,1


In [5]:
# Spot check: show every row recorded as pseudoephedrine (trace rows included,
# since this filters all_samples rather than the non-trace subset).
all_samples.loc[all_samples['substance'].eq('pseudoephedrine')]

Unnamed: 0,sampleid,counter,sampletype,expectedsubstance,program,date,color,texture,sensations,texture_notes,...,card_notes,prog_type,program_county,lat_program,lon_program,substance,abundance,method,peak,date_complete
36794,902010,1330,spatula,methamphetamine,TN HR,4/9/2023,light brown,crystals; powder,nice; weaker; long,,...,,,,,,pseudoephedrine,trace,GCMS,,2023-05-04


In [6]:
# Attach the detection count to each tagged substance. This is an inner join
# on 'substance': tag rows with no matching count, and counted substances with
# no tag row, are both dropped from the result.
dummy_prevalence = dummy_tags.merge(substance_count, how='inner', on='substance')
dummy_prevalence

Unnamed: 0,substance,PubChemCID,sixwords,notes,smiles1,category,Unnamed: 6,2CB impurity,DMT impurity,GCMS breakdown product,...,typtamine,uncertain,uncommon,up-and-down,upper,vasodilator,veterinary,vitamin,weak opioid,count
0,"1,2-Dibromo-4,5-methylenedioxybenzene",225814.0,"Meth production impurity, rare",,C1OC2=CC(=C(C=C2O1)Br)Br,stimulant,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,"1,3-Diacetin",66924.0,Inactive fentanyl cut common in NC,"Common in fentanyl samples in NC, rare elsewhe...",CC(=O)OCC(COC(=O)C)O,other,1,0,0,1,...,0,0,1,0,0,0,0,0,0,110
2,"1,4-Butanediol",8064.0,"related to GHB, unclear if active",Precurser to gamma-hydroxybutyrate (GHB) - met...,C(CCO)CO,other,1,0,0,0,...,0,0,0,0,0,0,0,0,0,6
3,1-2-propanol,7900.0,,,CC(COC)O,,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1-[methyl]cyclopentanol,73830.0,"Leftover from making ketamine, inactive",Appears similar to other starting materials us...,CC1(CCCC1)O,other,1,0,0,1,...,0,0,1,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,vitamin E,14985.0,,,CC1=C(C2=C(CCC(O2)(C)CCCC(C)CCCC(C)CCCC(C)C)C(...,other,1,0,0,0,...,0,0,1,0,0,0,0,1,0,1
231,vitamin E acetate,86472.0,,,CC1=C(C(=C(C2=C1OC(CC2)(C)CCCC(C)CCCC(C)CCCC(C...,,1,0,0,1,...,0,0,0,0,0,0,0,1,0,2
232,xylazine,5707.0,"Heavy sedative, causes nasty wounds",,CC1=C(C(=CC=C1)C)NC2=NCCCS2,"opioid,sedative",1,0,0,0,...,0,0,1,1,0,0,1,0,0,1323
233,xylitol,6912.0,"Common artificial sweetener in gum, food",,C(C(C(C(CO)O)O)O)O,,1,0,0,0,...,0,0,1,0,0,0,0,0,0,4


In [7]:
# Display the merged tag/count table again (no changes since the merge cell).
dummy_prevalence

Unnamed: 0,substance,PubChemCID,sixwords,notes,smiles1,category,Unnamed: 6,2CB impurity,DMT impurity,GCMS breakdown product,...,typtamine,uncertain,uncommon,up-and-down,upper,vasodilator,veterinary,vitamin,weak opioid,count
0,"1,2-Dibromo-4,5-methylenedioxybenzene",225814.0,"Meth production impurity, rare",,C1OC2=CC(=C(C=C2O1)Br)Br,stimulant,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,"1,3-Diacetin",66924.0,Inactive fentanyl cut common in NC,"Common in fentanyl samples in NC, rare elsewhe...",CC(=O)OCC(COC(=O)C)O,other,1,0,0,1,...,0,0,1,0,0,0,0,0,0,110
2,"1,4-Butanediol",8064.0,"related to GHB, unclear if active",Precurser to gamma-hydroxybutyrate (GHB) - met...,C(CCO)CO,other,1,0,0,0,...,0,0,0,0,0,0,0,0,0,6
3,1-2-propanol,7900.0,,,CC(COC)O,,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1-[methyl]cyclopentanol,73830.0,"Leftover from making ketamine, inactive",Appears similar to other starting materials us...,CC1(CCCC1)O,other,1,0,0,1,...,0,0,1,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,vitamin E,14985.0,,,CC1=C(C2=C(CCC(O2)(C)CCCC(C)CCCC(C)CCCC(C)C)C(...,other,1,0,0,0,...,0,0,1,0,0,0,0,1,0,1
231,vitamin E acetate,86472.0,,,CC1=C(C(=C(C2=C1OC(CC2)(C)CCCC(C)CCCC(C)CCCC(C...,,1,0,0,1,...,0,0,0,0,0,0,0,1,0,2
232,xylazine,5707.0,"Heavy sedative, causes nasty wounds",,CC1=C(C(=CC=C1)C)NC2=NCCCS2,"opioid,sedative",1,0,0,0,...,0,0,1,1,0,0,1,0,0,1323
233,xylitol,6912.0,"Common artificial sweetener in gum, food",,C(C(C(C(CO)O)O)O)O,,1,0,0,0,...,0,0,1,0,0,0,0,0,0,4


In [8]:
# Export the merged tag/count table; index=False keeps the row index out of the file.
dummy_prevalence.to_csv(path_or_buf='prevalence.csv', index=False)