In [1]:
#Imports
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
import random
from ast import literal_eval
from scipy import stats
%matplotlib inline

In [2]:
#Load in addgene pickle files
basedir = '../../../data/'
id2lineage = pd.read_pickle(os.path.join(basedir,'lineages/id2lineage.pickle'))
id2lineage = id2lineage.reset_index()

addgene_full2 = pd.read_pickle(basedir,'full/addgene_full.pickle')
addgene_full2 = addgene_full2.reset_index()

lin2id = pd.read_pickle(os.path.join(basedir,'lineages/lineage2ids.pickle'))
lin2id = lin2id.reset_index()

In [3]:
#Drop lineages with only one unique lab present

#Make DF of lineages, unique_num_labs
lab_per_lin = pd.DataFrame(columns=['lineage','num_unique_labs'])
num_lab_per_lin = []
for lineage in lin2id['lineage']:
    lin_only = addgene_full2.loc[addgene_full2['lineage'] == lineage]
    num_lab_per_lin.append(lin_only['lab'].nunique())
lab_per_lin['lineage'] = lin2id['lineage']
lab_per_lin['num_unique_labs'] = num_lab_per_lin

#Get bad Lineages with only 1 lab 
bad_lin = lab_per_lin.loc[lab_per_lin['num_unique_labs'] <= 1]
bad_lin = bad_lin.reset_index(drop = True)

In [4]:
#Replace bad_lineagues and save news files for addgene_full and id2lineage and lineage2id
id2lineage = id2lineage.loc[-id2lineage['lineage'].isin(bad_lin['lineage'])]
id2lineage = id2lineage.reset_index(drop = True)

lin2id = lin2id.loc[-lin2id['lineage'].isin(bad_lin['lineage'])]
lin2id = lin2id.reset_index(drop = True)

addgene_full2.loc[addgene_full2['lineage'].isin(bad_lin['lineage']), 'lineage'] = np.NaN
addgene_full2 = addgene_full2.reset_index(drop = True)

lin2id.to_pickle('lineage2ids_no_bad_lin.pickle')
id2lineage.to_pickle('id2lineage_no_bad_lin.pickle')
addgene_full2.to_pickle("addgene_full_no_bad_lin.pickle")

In [5]:
#Group all labs with under ten plasmids into one large lab class named Unk Engineered

#Convert value count to df
counts_df = addgene_full2['lab'].value_counts()
counts_df = counts_df.to_frame().reset_index()
counts_df.rename(columns={'index': 'lab', 'lab': 'value_count'}, inplace=True)

#Find all rows with under 10 value_count
low_abund = counts_df.loc[counts_df['value_count'] < 10]

#Replace all low abundance lab names with Unk Egineered
addgene_full2.loc[addgene_full2['lab'].isin(low_abund['lab']),['lab']] = 'Unk Engineered'

In [7]:
#Adding test baseline and adding more to Unk Egineering

#Create DF with labs and number of unique lineages per lab
lin_per_lab = pd.DataFrame(columns=['lab', 'num_unique_lin'])
num_lin_per_lab = []

#Iterate over lab and get df with only a specific lab then find number of unique lineages within that df
for lab in addgene_full2['lab'].unique():
    lab_only = addgene_full2.loc[addgene_full2['lab'] == lab]
    num_lin_per_lab.append(lab_only['lineage'].nunique())

lin_per_lab['lab'] =  addgene_full2['lab'].unique()
lin_per_lab['num_unique_lin'] = num_lin_per_lab

#Important: Sort lin_per_lab  by num_unique_lin ascending = True as it forces 'Unk Engineered' to be at bottom
lin_per_lab = lin_per_lab.sort_values(by = 'num_unique_lin')
lin_per_lab = lin_per_lab.reset_index(drop = True)

test_baseline = pd.DataFrame()
for lab in lin_per_lab['lab']:
    lab_plasmids = addgene_full2.loc[addgene_full2['lab'] == lab]
    no_lin_membership = lab_plasmids.loc[lab_plasmids['lineage'].isnull()]
    if no_lin_membership.shape[0] < 3:
        #Replace Lab name with 'Unk Engineered' and update lin_per-lab with new Unk value
        addgene_full2.loc[addgene_full2['lab'] == lab,['lab']] = 'Unk Engineered'
    else: 
        #Sample 3 random plasmids from no lineage membership and put them into baseline test set 
        test_baseline = test_baseline.append(no_lin_membership.sample(n=3,random_state=69))
test_baseline = test_baseline.reset_index(drop = True)

In [11]:
#Save the updated addgene_full2 
addgene_full2.to_pickle("addgene_full_no_bad_lin_baseline.pickle")

In [12]:
#Find all ids with lineage
ids_with_lineage = id2lineage['addgene_id'].drop_duplicates()

#Find all unique lineage labels
unique_lineage_labels = id2lineage['lineage'].drop_duplicates()

#Find all ids 
all_ids = addgene_full2['addgene_id'].drop_duplicates()

#All plasmid ids that do not have a lineage
no_lineage_plasmids = all_ids[~all_ids.isin(ids_with_lineage)]

In [13]:
#Manipulate id2groups to assign random unique lineages to plasmids without a lineage 

#update id2lineage with removal of test_baseline ids 
id2lineage = id2lineage.loc[~id2lineage['addgene_id'].isin(test_baseline['addgene_id'])]

#Set Seed
random.seed(45)  

#Generate Random numbers that are unique and different than the already existing lineage labels (they are above the max label)
#The max lineage id is 1228 so any number above that will be a dummy index

#Make dataframe of plasmids with no lineage and assign them a dummy lineage
dummy_lineage = pd.DataFrame( columns=['addgene_id','lineage'])

dummy_lineage['addgene_id'] = no_lineage_plasmids
dummy_lineage['lineage'] = random.sample((range(1300,100000)),no_lineage_plasmids.size)

#Combine id2lineage and dummy lineage to have a full dataframe of lineages 
lineages_full = pd.concat([id2lineage,dummy_lineage])
lineages_full=lineages_full.sort_values(by=['addgene_id'])
lineages_full.reset_index(drop=True)

Unnamed: 0,addgene_id,lineage
0,3.0,36935
1,4.0,56047
2,6.0,65266
3,41.0,35072
4,42.0,12032
5,43.0,41039
6,44.0,45698
7,45.0,4130
8,46.0,10780
9,47.0,64772


In [14]:
#Replace lineage column with lineage ids including dummy ids 
addgene_full2['lineage'] = lineages_full['lineage'].values

In [15]:
#Save new version of addgene_full with baseline data taken away
addgene_full2 = addgene_full2.loc[~addgene_full2['addgene_id'].isin(test_baseline['addgene_id'])]
addgene_full2 = addgene_full2.reset_index(drop = True)

In [17]:
#Function for Group Splitting
#Inputs: df-Pandas df
#y_col-String name of column with values to be predicted in mode
#groups_col-String name of column with values specifying group membership (This would be a lineage column in our case ie. which lineage a plasmid belonged to)

def grouped_split(df,y_col, groups_col):
    
    #Split into Training set and combined Validation+Test set (First Step)
    rs1 = 42
    y1 = df[y_col]
    groups1 = df[groups_col]
    gss1 = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state= rs1)
    train_inds, val_test_inds = next(gss1.split(X=df, y=y1, groups=groups1))
    X_train, X_val_test, y_train, y_val_test = df.iloc[train_inds],df.iloc[val_test_inds], y1.iloc[train_inds], y1.iloc[val_test_inds]
    
    #Split Combined Validation+Test Sets into seperate Validation and Test Sets 
    rs2 = 43
    y2 = X_val_test['lab']
    groups2 = X_val_test[groups_col]
    gss2 = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state= rs2)
    val_inds, test_inds = next(gss2.split(X=X_val_test, y=y2, groups=groups2))

    X_val, X_test, y_val, y_test = X_val_test.iloc[val_inds], X_val_test.iloc[test_inds], y2.iloc[val_inds], y2.iloc[test_inds]
    
    return X_train, y_train, X_val, y_val, X_test, y_test

In [18]:
#Perform the split
X_train, y_train, X_val, y_val, X_test, y_test = grouped_split(addgene_full2,'lab','lineage')

In [19]:
#Combine test base X_test 
X_test = pd.concat([X_test,test_baseline])
X_test = X_test.reset_index(drop = True)
y_test = pd.concat([y_test,test_baseline['lab'].to_frame()])
y_test = y_test.reset_index(drop = True)

In [None]:
#Drop lab and lab_id column in data splots 
#X_val = X_val.drop(['lab','lab_id'], axis = 1)
#X_train = X_train.drop(['lab','lab_id'], axis = 1)
#X_test = X_test.drop(['lab','lab_id'], axis = 1)

In [23]:
#Replace Dummy Lineages with NaN as split is done 
X_train['lineage'] = X_train['lineage'].replace(dummy_lineage['lineage'].values, np.nan)
X_val['lineage'] = X_val['lineage'].replace(dummy_lineage['lineage'].values, np.nan)
X_test['lineage'] = X_test['lineage'].replace(dummy_lineage['lineage'].values, np.nan)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [24]:
#Save splits as CSVs (these are with the <10 grouping)
X_val.to_pickle('X_val_baseline.pickle')
y_val.to_pickle('y_val_baseline.pickle')

X_test.to_pickle('X_test_baseline.pickle')
y_test.to_pickle('y_test_baseline.pickle')

X_train.to_pickle('X_train_baseline.pickle')
y_train.to_pickle('y_train_baseline.pickle')