In [32]:
import pandas as pd
import numpy as np
import itertools

In [33]:
prevalence = pd.read_csv('prevalence.csv')
prevalence

Unnamed: 0,substance,PubChemCID,sixwords,notes,smiles1,category,Unnamed: 6,2CB impurity,DMT impurity,GCMS breakdown product,...,typtamine,uncertain,uncommon,up-and-down,upper,vasodilator,veterinary,vitamin,weak opioid,count
0,"1,2-Dibromo-4,5-methylenedioxybenzene",225814.0,"Meth production impurity, rare",,C1OC2=CC(=C(C=C2O1)Br)Br,stimulant,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,"1,3-Diacetin",66924.0,Inactive fentanyl cut common in NC,"Common in fentanyl samples in NC, rare elsewhe...",CC(=O)OCC(COC(=O)C)O,other,1,0,0,1,...,0,0,1,0,0,0,0,0,0,110
2,"1,4-Butanediol",8064.0,"related to GHB, unclear if active",Precurser to gamma-hydroxybutyrate (GHB) - met...,C(CCO)CO,other,1,0,0,0,...,0,0,0,0,0,0,0,0,0,6
3,1-2-propanol,7900.0,,,CC(COC)O,,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1-[methyl]cyclopentanol,73830.0,"Leftover from making ketamine, inactive",Appears similar to other starting materials us...,CC1(CCCC1)O,other,1,0,0,1,...,0,0,1,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,vitamin E,14985.0,,,CC1=C(C2=C(CCC(O2)(C)CCCC(C)CCCC(C)CCCC(C)C)C(...,other,1,0,0,0,...,0,0,1,0,0,0,0,1,0,1
231,vitamin E acetate,86472.0,,,CC1=C(C(=C(C2=C1OC(CC2)(C)CCCC(C)CCCC(C)CCCC(C...,,1,0,0,1,...,0,0,0,0,0,0,0,1,0,2
232,xylazine,5707.0,"Heavy sedative, causes nasty wounds",,CC1=C(C(=CC=C1)C)NC2=NCCCS2,"opioid,sedative",1,0,0,0,...,0,0,1,1,0,0,1,0,0,1323
233,xylitol,6912.0,"Common artificial sweetener in gum, food",,C(C(C(C(CO)O)O)O)O,,1,0,0,0,...,0,0,1,0,0,0,0,0,0,4


In [34]:
#get the co-occurrences - create columns of each substance that give the number times that substance has occurred with another 
samples = pd.read_csv('all_samples_cleaned.csv')
co_occurences = samples[['sampleid', 'substance']]
co_occurences

Unnamed: 0,sampleid,substance
0,06082021,fentanyl
1,06082021,4-ANPP
2,111422,cocaine
3,111422,methyl ecgonidine
4,111422,methamphetamine
...,...,...
37755,USU213,norcocaine
37756,USU213,eutylone
37757,USUDMT32221,"N,N-dimethyltryptamine"
37758,USUDMT32221,N-methyltryptamine


In [35]:
# samples[samples['substance'] == 'GHB']
# 802464
# 805146	
# 806966

samples[samples['sampleid'] == '806966']


Unnamed: 0,sampleid,counter,sampletype,expectedsubstance,program,date,color,texture,sensations,texture_notes,...,card_notes,prog_type,program_county,lat_program,lon_program,substance,abundance,method,peak,date_complete
31828,806966,8875,spatula,GHB,San Francisco AIDS Fdn,10/28/2024,white,pill,,waxy,...,Dark Web,Confirmatory,San Francisco County,37.779026,-122.419906,GHB,,Derivatized GCMS,4.26,2024-11-15


In [36]:
def samples_to_matrix(df, group, value):

    #dataframe that gets all the combinations for each sample id
    grouped = df.groupby(group)[value].apply(lambda x: list(itertools.combinations(x, 2)) if len(x) > 0 else [])
    combinations_df = pd.DataFrame({group: grouped.index, 'combinations': grouped.values})

    #explode the lists into row values
    df_exploded = combinations_df.explode('combinations')
    df_count = df_exploded.groupby('combinations', as_index = False).count()

    #get all the substances
    unique_values = df['substance'].unique()

    # Create empty matrix
    matrix = pd.DataFrame(0, index=unique_values, columns=unique_values)

    # Fill matrix with counts (symmetric)
    for idx, row in df_count.iterrows():
        source, target = row['combinations']
        matrix.loc[source, target] = row['sampleid']
        matrix.loc[target, source] = row['sampleid']  # Make it symmetric
    
    return matrix

In [37]:
matrix = samples_to_matrix(co_occurences, 'sampleid', 'substance')
matrix = matrix.reset_index(names = 'substance')
matrix

Unnamed: 0,substance,fentanyl,4-ANPP,cocaine,methyl ecgonidine,methamphetamine,xylazine,phenethyl 4-ANPP,despropionyl p-fluorofentanyl,lidocaine,...,R-6890,N-methyl-cyclohexanamine,ADB-5'Br-BUTINACA,tianeptine metabolite,delta-8-THC acetate,venlafaxine,adrafinil,pseudoephedrine,pentobarbital,lysergic acid amide
0,fentanyl,3,2213,367,32,268,837,738,246,583,...,0,0,0,0,0,0,0,0,1,0
1,4-ANPP,2213,6,343,20,259,822,692,209,517,...,0,0,0,0,0,0,0,0,0,0
2,cocaine,367,343,1,182,81,180,153,58,118,...,1,0,0,0,0,0,0,0,0,0
3,methyl ecgonidine,32,20,182,0,11,14,9,6,8,...,0,0,0,0,0,0,0,0,0,0
4,methamphetamine,268,259,81,11,1,127,81,42,76,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376,venlafaxine,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
377,adrafinil,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
378,pseudoephedrine,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
379,pentobarbital,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# now join this table with my entire dataset
merged_df = pd.merge(prevalence, matrix, on='substance', how='inner')
merged_df

Unnamed: 0,substance,PubChemCID,sixwords,notes,smiles1,category,Unnamed: 6,2CB impurity,DMT impurity,GCMS breakdown product,...,R-6890,N-methyl-cyclohexanamine,ADB-5'Br-BUTINACA,tianeptine metabolite,delta-8-THC acetate,venlafaxine,adrafinil,pseudoephedrine,pentobarbital,lysergic acid amide
0,"1,2-Dibromo-4,5-methylenedioxybenzene",225814.0,"Meth production impurity, rare",,C1OC2=CC(=C(C=C2O1)Br)Br,stimulant,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"1,3-Diacetin",66924.0,Inactive fentanyl cut common in NC,"Common in fentanyl samples in NC, rare elsewhe...",CC(=O)OCC(COC(=O)C)O,other,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,"1,4-Butanediol",8064.0,"related to GHB, unclear if active",Precurser to gamma-hydroxybutyrate (GHB) - met...,C(CCO)CO,other,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1-2-propanol,7900.0,,,CC(COC)O,,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1-[methyl]cyclopentanol,73830.0,"Leftover from making ketamine, inactive",Appears similar to other starting materials us...,CC1(CCCC1)O,other,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,vitamin E,14985.0,,,CC1=C(C2=C(CCC(O2)(C)CCCC(C)CCCC(C)CCCC(C)C)C(...,other,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
231,vitamin E acetate,86472.0,,,CC1=C(C(=C(C2=C1OC(CC2)(C)CCCC(C)CCCC(C)CCCC(C...,,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
232,xylazine,5707.0,"Heavy sedative, causes nasty wounds",,CC1=C(C(=CC=C1)C)NC2=NCCCS2,"opioid,sedative",1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
233,xylitol,6912.0,"Common artificial sweetener in gum, food",,C(C(C(C(CO)O)O)O)O,,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
merged_df.to_csv('co_occurrences.csv', index=False)  # index=False prevents writing the index

In [39]:
merged_df[['ketamine']]

Unnamed: 0,ketamine
0,0
1,2
2,1
3,0
4,16
...,...
230,0
231,0
232,11
233,0
