In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [10]:
# Load the cleaned, scaled dataset and keep only the columns used in the rest of the notebook.
pathOut = '../Data/CleanedScaledData/'
cleanData = pd.read_csv(f'{pathOut}CleanedScaledData.csv')

cleanData = cleanData[[
    'Target Gene Symbol', 'sgRNA Context Sequence', 'Domain', 'Orientation',
    'ATAC overlap with peak_sum', 'DHS overlap with peak_sum', 'H3K4me3', 'H3k27ac',
    "sgRNA 'Cut' Site TSS Offset", 'doubleZscore', 'TSS Bucket (-5kb to 5kb)', 'rs3ChenSeqScore',
]].copy()

# Distribution of how often each context sequence occurs: the inner value_counts gives the
# number of rows per sequence, the outer one tallies those per-sequence counts.
cleanData['sgRNA Context Sequence'].value_counts().value_counts()

count
1    79326
2    42104
Name: count, dtype: int64

In [95]:
# One-hot encode the categorical columns so that each level becomes its own column
# (stated purpose: interpreting SHAP values later).
categorical_cols = ['Domain', 'Orientation', 'TSS Bucket (-5kb to 5kb)']
cleanData_encoded = pd.get_dummies(data=cleanData, columns=categorical_cols)

# Cast every boolean column of the encoded frame to integer 0/1.
bool_cols = list(cleanData_encoded.select_dtypes(include='bool').columns)
cleanData_encoded = cleanData_encoded.astype(dict.fromkeys(bool_cols, int))

cleanData_encoded.head()

Unnamed: 0,Target Gene Symbol,sgRNA Context Sequence,ATAC overlap with peak_sum,DHS overlap with peak_sum,H3K4me3,H3k27ac,sgRNA 'Cut' Site TSS Offset,doubleZscore,rs3ChenSeqScore,Domain_Kox1,...,"TSS Bucket (-5kb to 5kb)_[750, 775)","TSS Bucket (-5kb to 5kb)_[775, 800)","TSS Bucket (-5kb to 5kb)_[800, 825)","TSS Bucket (-5kb to 5kb)_[825, 850)","TSS Bucket (-5kb to 5kb)_[850, 875)","TSS Bucket (-5kb to 5kb)_[875, 900)","TSS Bucket (-5kb to 5kb)_[900, 925)","TSS Bucket (-5kb to 5kb)_[925, 950)","TSS Bucket (-5kb to 5kb)_[950, 975)","TSS Bucket (-5kb to 5kb)_[975, 1000)"
0,AATF,AAAACTAAGTTGGAGACTGATGGAAGGATT,2,0,3,3,681.0,0.007172,0.483029,1,...,0,0,0,0,0,0,0,0,0,0
1,AATF,AAAACTAAGTTGGAGACTGATGGAAGGATT,2,0,3,3,681.0,-0.26307,0.483029,0,...,0,0,0,0,0,0,0,0,0,0
2,AATF,AAACTTGGTGCTCGGCTGGATGGGCGGAAC,7,0,3,3,475.0,0.456366,0.009643,1,...,0,0,0,0,0,0,0,0,0,0
3,AATF,AAACTTGGTGCTCGGCTGGATGGGCGGAAC,7,0,3,3,475.0,0.573197,0.009643,0,...,0,0,0,0,0,0,0,0,0,0
4,AATF,AAAGAAGCCGAAGCCCTCCTCCCGAGGCCG,7,0,3,3,420.0,1.399346,0.274955,1,...,0,0,0,0,0,0,0,0,0,0


In [96]:
def clean_column_name(name):
    """Return ``str(name)`` with each of ``[ ] < > ( ) ,`` and the space replaced by ``_``.

    Every other character (including quotes, ``+`` and ``-``) is left untouched.
    Presumably this exists to satisfy a downstream consumer's restrictions on
    feature names — TODO confirm.
    """
    # Each listed character maps one-to-one to an underscore, so a single
    # translation pass is equivalent to replacing them one after another.
    underscore_table = str.maketrans(dict.fromkeys('[]<> ,()', '_'))
    return str(name).translate(underscore_table)

# Rename every column of the encoded frame through clean_column_name.
cleanData_encoded.columns = list(map(clean_column_name, cleanData_encoded.columns))


In [97]:
# Show the column names after cleaning.
list(cleanData_encoded.columns)

['Target_Gene_Symbol',
 'sgRNA_Context_Sequence',
 'ATAC_overlap_with_peak_sum',
 'DHS_overlap_with_peak_sum',
 'H3K4me3',
 'H3k27ac',
 "sgRNA_'Cut'_Site_TSS_Offset",
 'doubleZscore',
 'rs3ChenSeqScore',
 'Domain_Kox1',
 'Domain_Zim3',
 'Orientation_++',
 'Orientation_+-',
 'Orientation_-+',
 'Orientation_--',
 'TSS_Bucket__-5kb_to_5kb___-100__-75_',
 'TSS_Bucket__-5kb_to_5kb___-1000__-975_',
 'TSS_Bucket__-5kb_to_5kb___-1025__-1000_',
 'TSS_Bucket__-5kb_to_5kb___-1050__-1025_',
 'TSS_Bucket__-5kb_to_5kb___-1075__-1050_',
 'TSS_Bucket__-5kb_to_5kb___-1100__-1075_',
 'TSS_Bucket__-5kb_to_5kb___-1125__-1100_',
 'TSS_Bucket__-5kb_to_5kb___-1150__-1125_',
 'TSS_Bucket__-5kb_to_5kb___-1175__-1150_',
 'TSS_Bucket__-5kb_to_5kb___-1200__-1175_',
 'TSS_Bucket__-5kb_to_5kb___-1225__-1200_',
 'TSS_Bucket__-5kb_to_5kb___-125__-100_',
 'TSS_Bucket__-5kb_to_5kb___-1250__-1225_',
 'TSS_Bucket__-5kb_to_5kb___-1275__-1250_',
 'TSS_Bucket__-5kb_to_5kb___-1300__-1275_',
 'TSS_Bucket__-5kb_to_5kb___-1325_

In [98]:
# Split at the gene level: the unique gene symbols are divided 80% / 20%, and every row
# follows its gene into the corresponding split.
geneCategories = cleanData_encoded['Target_Gene_Symbol'].unique()
train_genes, test_genes = train_test_split(
    geneCategories,
    test_size=0.2,
    random_state=6,
)

# Rows whose gene symbol was assigned to the training genes.
train_df = cleanData_encoded.loc[
    lambda frame: frame['Target_Gene_Symbol'].isin(train_genes)
].copy()
# Rows whose gene symbol was assigned to the test genes.
test_df = cleanData_encoded.loc[
    lambda frame: frame['Target_Gene_Symbol'].isin(test_genes)
].copy()


In [48]:
# Sanity check: the numbers of unique genes in the train and test splits should add up to the
# number of unique genes in the full table. A gene symbol present in both splits would be
# counted twice and make the sum exceed the total.
train_df['Target_Gene_Symbol'].nunique() + test_df['Target_Gene_Symbol'].nunique() == cleanData_encoded['Target_Gene_Symbol'].nunique()


True

In [8]:
# Write the train and test splits to CSV without the DataFrame index.
# NOTE(review): this cell's execution count (In [8]) is out of sequence with the split cell
# (In [98]), and the trainData.csv re-imported further down is displayed without an
# sgRNA_Context_Sequence column, so the files on disk may come from a different version of
# train_df/test_df — confirm by re-running the notebook top to bottom.
modelTrainingPath = "../Data/modelTraining/"
train_df.to_csv(f'{modelTrainingPath}trainData.csv', index = False)
test_df.to_csv(f'{modelTrainingPath}testData.csv', index = False)


In [69]:
import pandas as pd

# Directory holding the cleaned external datasets.
cleanedAllData_path = '../Data/CleanedExternalData/'
# Per the original note, DatasetCombineCleaned.csv was the first cleaned version and used the
# original phenotype calculation; the file actually loaded here is DatasetCombineCleaned_v3.csv.
originalCleanedData = pd.read_csv(cleanedAllData_path + 'DatasetCombineCleaned_v3.csv', low_memory=False)
# Upper-case the context sequences. The vectorized .str accessor leaves missing values as NaN
# instead of raising on them.
originalCleanedData['sgRNA Context Sequence'] = originalCleanedData['sgRNA Context Sequence'].str.upper()

# Re-import the saved train/test splits from disk.
modelTrainingPath = "../Data/modelTraining/"
test_df_import = pd.read_csv(f'{modelTrainingPath}testData.csv')
train_df_import = pd.read_csv(f'{modelTrainingPath}trainData.csv')


In [70]:
# Display the training split that was re-imported from trainData.csv.
train_df_import

Unnamed: 0,Target_Gene_Symbol,ATAC_overlap_with_peak_sum,DHS_overlap_with_peak_sum,H3K4me3,H3k27ac,sgRNA_'Cut'_Site_TSS_Offset,doubleZscore,rs3ChenSeqScore,Domain_Kox1,Domain_Zim3,...,TSS_Bucket__-5kb_to_5kb___750__775_,TSS_Bucket__-5kb_to_5kb___775__800_,TSS_Bucket__-5kb_to_5kb___800__825_,TSS_Bucket__-5kb_to_5kb___825__850_,TSS_Bucket__-5kb_to_5kb___850__875_,TSS_Bucket__-5kb_to_5kb___875__900_,TSS_Bucket__-5kb_to_5kb___900__925_,TSS_Bucket__-5kb_to_5kb___925__950_,TSS_Bucket__-5kb_to_5kb___950__975_,TSS_Bucket__-5kb_to_5kb___975__1000_
0,AATF,2,0,3,3,681.0,0.007172,0.483029,1,0,...,0,0,0,0,0,0,0,0,0,0
1,AATF,2,0,3,3,681.0,-0.263070,0.483029,0,1,...,0,0,0,0,0,0,0,0,0,0
2,AATF,7,0,3,3,475.0,0.456366,0.009643,1,0,...,0,0,0,0,0,0,0,0,0,0
3,AATF,7,0,3,3,475.0,0.573197,0.009643,0,1,...,0,0,0,0,0,0,0,0,0,0
4,AATF,7,0,3,3,420.0,1.399346,0.274955,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130695,ZNF830,7,0,3,3,-413.0,-0.510371,-0.391400,1,0,...,0,0,0,0,0,0,0,0,0,0
130696,ZNF830,0,1,3,2,906.0,-0.760791,-0.199034,1,0,...,0,0,0,0,0,0,1,0,0,0
130697,ZNF830,0,0,3,2,-786.0,0.784241,-1.159078,1,0,...,0,0,0,0,0,0,0,0,0,0
130698,ZNF830,0,1,3,2,790.0,0.943780,1.074568,1,0,...,0,1,0,0,0,0,0,0,0,0


In [71]:
# NOTE(review): this displays `test_df`, not the `test_df_import` frame read from testData.csv
# in the preceding cell, so it relies on a `test_df` that already exists in the kernel —
# confirm this is intended.
test_df

Unnamed: 0,Target_Gene_Symbol,sgRNA_Context_Sequence,ATAC_overlap_with_peak_sum,DHS_overlap_with_peak_sum,H3K4me3,H3k27ac,sgRNA_'Cut'_Site_TSS_Offset,doubleZscore,rs3ChenSeqScore,Domain_Kox1,...,TSS_Bucket__-5kb_to_5kb___750__775_,TSS_Bucket__-5kb_to_5kb___775__800_,TSS_Bucket__-5kb_to_5kb___800__825_,TSS_Bucket__-5kb_to_5kb___825__850_,TSS_Bucket__-5kb_to_5kb___850__875_,TSS_Bucket__-5kb_to_5kb___875__900_,TSS_Bucket__-5kb_to_5kb___900__925_,TSS_Bucket__-5kb_to_5kb___925__950_,TSS_Bucket__-5kb_to_5kb___950__975_,TSS_Bucket__-5kb_to_5kb___975__1000_
4605,ATIC,AAAACAACAAACAAAAAGGGAGGATGGTAG,2,0,3,3,-412.0,0.005957,-0.339995,1,...,0,0,0,0,0,0,0,0,0,0
4606,ATIC,AAAACAACAAACAAAAAGGGAGGATGGTAG,2,0,3,3,-412.0,-0.017525,-0.339995,0,...,0,0,0,0,0,0,0,0,0,0
4607,ATIC,AAAATTAGACGGGTGTGGAGGCAGAGGCAG,1,0,2,3,-923.0,-0.239950,0.106215,1,...,0,0,0,0,0,0,0,0,0,0
4608,ATIC,AAAATTAGACGGGTGTGGAGGCAGAGGCAG,1,0,2,3,-923.0,-0.186008,0.106215,0,...,0,0,0,0,0,0,0,0,0,0
4609,ATIC,AAACAAAAGGAATAGTTCATATAGCGGCAA,3,0,3,3,-369.0,0.559061,0.799858,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163529,ZNHIT6,TTTAATGGTTAACACAGATTAGGGTGGGTT,7,0,3,3,-293.0,-0.458460,0.590918,1,...,0,0,0,0,0,0,0,0,0,0
163530,ZNHIT6,TTTACGGCTCTGCGGAGGCCCTGCCGGATT,7,3,3,3,-80.0,-0.478862,-0.455622,1,...,0,0,0,0,0,0,0,0,0,0
163531,ZNHIT6,TTTAGTGGTAAAAGAAGCGAAGGTGGGTGA,6,0,3,3,541.0,-0.386223,0.449871,1,...,0,0,0,0,0,0,0,0,0,0
163532,ZNHIT6,TTTCTGGAGTCAGACGGCGTTCGCGGGGGT,1,0,3,2,943.0,-0.296669,-0.544459,1,...,0,0,0,0,0,0,0,1,0,0


In [72]:
# Keep the rows of the combined external table whose context sequence occurs in the test split.
# NOTE(review): uses the in-kernel `test_df` (not `test_df_import`) — confirm intended.
datasetFind = originalCleanedData.loc[
    originalCleanedData['sgRNA Context Sequence'].isin(test_df['sgRNA_Context_Sequence'])
]

In [73]:
# Number of matched rows per source dataset.
datasetFind['DataSet'].value_counts()

DataSet
InHouse    37703
Nunez      13950
Gilbert     1682
Name: count, dtype: int64

In [78]:
# Reduce to the four identifying columns and drop exact duplicate rows, then recount per dataset.
datasetFind = datasetFind.loc[
    :, ['Target Gene Symbol', 'sgRNA Context Sequence', 'DataSet', 'Domain']
].drop_duplicates()
datasetFind['DataSet'].value_counts()

DataSet
InHouse    18860
Nunez      13950
Gilbert     1682
Name: count, dtype: int64

In [108]:
# Number of distinct (gene, dataset) pairs per dataset among the matched rows.
(
    datasetFind.loc[:, ['Target Gene Symbol', 'DataSet']]
    .drop_duplicates()['DataSet']
    .value_counts()
)

DataSet
Nunez      69
InHouse    31
Gilbert     3
Name: count, dtype: int64

In [109]:
# Total of the per-dataset (gene, dataset) pair counts shown in the In [108] output
# (Nunez 69 + InHouse 31 + Gilbert 3). A gene that occurs in more than one dataset
# would be counted once per dataset here.
69+31+3

103

In [110]:
# Nunez share of the 103 (gene, dataset) pairs; numbers copied from the In [108] output.
69/103

0.6699029126213593

In [111]:
# InHouse share of the 103 (gene, dataset) pairs; numbers copied from the In [108] output.
31/103

0.30097087378640774

In [112]:
# Gilbert share of the 103 (gene, dataset) pairs; numbers copied from the In [108] output.
3/103

0.02912621359223301

In [83]:
# InHouse share of the de-duplicated datasetFind rows; counts copied from the In [78] output.
18860/(18860+13950+1682)

0.5467934593528935

In [84]:
# Nunez share of the de-duplicated datasetFind rows; counts copied from the In [78] output.
13950/(18860+13950+1682)

0.4044416096486142

In [85]:
# Gilbert share of the de-duplicated datasetFind rows; counts copied from the In [78] output.
1682/(18860+13950+1682)

0.048764930998492405

In [115]:
# Number of distinct (gene, dataset) pairs per dataset in the full combined table.
(
    originalCleanedData.loc[:, ['Target Gene Symbol', 'DataSet']]
    .drop_duplicates()['DataSet']
    .value_counts()
)

DataSet
Nunez      336
InHouse    333
Gilbert     36
Name: count, dtype: int64

In [2]:
import pandas as pd
# NOTE(review): absolute, user-specific path (it embeds a username and an e-mail address).
# Other cells in this notebook read and write the splits under the relative
# "../Data/modelTraining/" location instead.
# TODO confirm both locations hold the same trainData.csv / testData.csv before unifying them.
modelTrainingPath = "/Users/fzheng/Library/CloudStorage/GoogleDrive-fzheng@broadinstitute.org/Shared drives/GPP Cloud /R&D/People/Fengyi/rule_set/crispri/data/2024/Manuscript/modelTraining/"

# Reload the saved splits from disk; this rebinds train_df and test_df.
train_df = pd.read_csv(f'{modelTrainingPath}trainData.csv')
test_df = pd.read_csv(f'{modelTrainingPath}testData.csv')


In [3]:
# Number of rows in the training split.
train_df.shape[0]

130700

In [4]:
# Number of rows in the test split.
test_df.shape[0]

32834