In [1]:
# Name, Surname: Gonul Ayci
# E-mail, Date: aycignl@gmail.com, Dec. 2017
# Description: Data preparation for the Probabilistic Topic Modeling project.
#              The project uses the Asgari data set; this notebook converts the protein sequences into 3-mer format.

In [2]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import copy



# merge Asgari data

In [3]:
# Load the two source files: the metadata table (Excel) and the sequence
# table (CSV).
df1 = pd.read_excel('family_classification_metadata.xlsx')
df2 = pd.read_csv('family_classification_sequences.csv')

# Place the two tables side by side; rows are matched on their index.
merge_data = pd.concat([df1, df2], axis='columns')

# Preview the first three merged rows.
merge_data[0:3]

Unnamed: 0,SwissProt Accession ID,Long ID,Protein Name,Family ID,Family Description,Sequences
0,Q6GZX4,001R_FRG3G,Putative transcription factor 001R,Pox_VLTF3,Poxvirus Late Transcription Factor VLTF3 like,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...
1,Q6GZX3,002L_FRG3G,Uncharacterized protein 002L,DUF230,Poxvirus proteins of unknown function,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...
2,Q6GZX0,005R_FRG3G,Uncharacterized protein 005R,US22,US22 like,MQNPLPEVMSPEHDKRTTTPMSKEANKFIRELDKKPGDLAVVSDFV...


# select some columns

In [4]:
# Keep only the family label, the accession ID and the sequence column.
data = merge_data[[
    "Family ID",
    "SwissProt Accession ID",
    "Sequences",
]]

# Report the number of rows, then preview the first three.
print(len(data))
data[0:3]

324018


Unnamed: 0,Family ID,SwissProt Accession ID,Sequences
0,Pox_VLTF3,Q6GZX4,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...
1,DUF230,Q6GZX3,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...
2,US22,Q6GZX0,MQNPLPEVMSPEHDKRTTTPMSKEANKFIRELDKKPGDLAVVSDFV...


# select the 100 families that contain the largest number of proteins

In [5]:
# Group the rows by family, order each family's rows by accession ID in
# descending order, and replace the resulting group index with a fresh one.
family_analysis1 = (
    data
    .groupby(["Family ID"])
    .apply(lambda family_rows: family_rows.sort_values(["SwissProt Accession ID"],
                                                       ascending=False))
    .reset_index(drop=True)
)
family_analysis1

Unnamed: 0,Family ID,SwissProt Accession ID,Sequences
0,120_Rick_ant,Q9ZD49,MSKNGNQDISEFDPLNREFTEAEKQQQMQQEQEFFSQTILDIADDG...
1,120_Rick_ant,Q9AJ83,MRGFMSKDGNLDTSEFDTLANKEYTAEQKQTLEQGQKEFLSQTTTP...
2,120_Rick_ant,Q9AJ81,DTSEFDPLANKEYTEEQKQKLEQEQKELLSQTTTPELEADDGFIVT...
3,120_Rick_ant,Q9AJ80,DTSEFDPLANKEYTEEQKQTLEQEQKEFLSQTTTPALEADDGFIVT...
4,120_Rick_ant,Q9AJ79,MSKDGNLNTSEFDPLANKEYTEEQKQTLEQEQKEFLSQTTTPELEA...
5,120_Rick_ant,Q9AJ77,DTSEFDPLANKEYTEEQKQTEEQEQKEFLSHTTTPALEADDGFIVT...
6,120_Rick_ant,Q9AJ75,DTSEFDPLANKEYTEEQKQTLEQEQKEFLSQTTTPALEADDGFIVT...
7,120_Rick_ant,Q9AJ64,YENDEEYESGIDEKKQEKAALAQPTLDTADDGFSFTPASSTQSTPS...
8,120_Rick_ant,Q9AJ63,MSKNDNQDISEFDPLNREFTEAEKQQQMQQEQEFFSQSILDIVDDG...
9,120_Rick_ant,Q9AJ37,MSKDSDNPGYESGYESDTEEKKQEQAVPAQPISSTANKDGNPDTSE...


In [6]:
# Count the non-null entries of every column per family.
family_analysis2 = family_analysis1.groupby('Family ID').count()

# Rank the families by their number of accession IDs, largest first.
family_analysis3 = family_analysis2.sort_values(by='SwissProt Accession ID',
                                                ascending=False)

# Show the 100 largest families.
family_analysis3.head(100)

Unnamed: 0_level_0,SwissProt Accession ID,Sequences
Family ID,Unnamed: 1_level_1,Unnamed: 2_level_1
MMR_HSR1,3084,3084
Helicase_C,2518,2518
ATP-synt_ab,2387,2387
7tm_1,1820,1820
AA_kinase,1750,1750
AAA,1711,1711
tRNA-synt_1,1634,1634
tRNA-synt_2,1419,1419
MFS_1,1303,1303
HSP70,1272,1272


# get names of selected 100 families

In [7]:
# Names of the 100 largest families, read from the index of the ranked count
# table and encoded as UTF-8.
family_ids = [family_name.encode('utf-8')
              for family_name in family_analysis3[0:100].index]
family_ids

['MMR_HSR1',
 'Helicase_C',
 'ATP-synt_ab',
 '7tm_1',
 'AA_kinase',
 'AAA',
 'tRNA-synt_1',
 'tRNA-synt_2',
 'MFS_1',
 'HSP70',
 'Oxidored_q1',
 'His_biosynth',
 'Cpn60_TCP1',
 'EPSP_synthase',
 'Aldedh',
 'Shikimate_DH',
 'GHMP_kinases_N',
 'Ribosomal_S2',
 'Ribosomal_S4',
 'Ribosomal_L16',
 'KOW',
 'UPF0004',
 'Ribosom_S12_S23',
 'GHMP_kinases_C',
 'Ribosomal_S14',
 'Ribosomal_S11',
 'UVR',
 'Ribosomal_L33',
 'BRCT',
 'RF-1',
 'Ank_2',
 'Ribosomal_L20',
 'RNA_pol_Rpb2_1',
 'Ribosomal_S18',
 'ATP-synt_B',
 'Peptidase_M20',
 'Ribosomal_L18e',
 'GIDA',
 'Oxidored_q2',
 'Ldh_1_N',
 'HD',
 'Ribosomal_S10',
 'PALP',
 'Ribosomal_L18p',
 'Ribosomal_L3',
 'tRNA-synt_1g',
 'UbiA',
 'Ribosomal_L4',
 'Ribosomal_S13',
 'Ribosomal_S16',
 'Methyltransf_5',
 'Ribosomal_L32p',
 'EF_TS',
 'THF_DHG_CYH',
 'OSCP',
 'tRNA-synt_1e',
 'SecA_SW',
 'Ribosomal_L31',
 'RNase_HII',
 'IPPT',
 'Ribosomal_L27',
 'LepA_C',
 'Ribosomal_L17',
 'Ribosomal_L23',
 'Ribosomal_L10',
 'Ribosomal_L19',
 'Ribosomal_S20p',
 '

# get 500 samples for each family

In [8]:
# Draw a fixed-size random sample of rows from every selected family.
# The draw uses a seeded generator so that re-running the notebook from a
# fresh kernel selects the same rows again.
l = len(family_ids)  # number of selected families; reused by later cells
sample_rng = np.random.RandomState(0)
sample_list = []

for family_id in family_ids:
    family_rows = family_analysis1[family_analysis1["Family ID"] == family_id]
    # assumes every selected family has at least 500 rows
    sample_list.append(family_rows.sample(n=500, random_state=sample_rng))

# split data into train and test 

In [9]:
# Split every family's sample into a training part (80%) and a test part (20%).
# Each sample is split exactly once and both parts are taken from that single
# split, so the training and test rows of a family never overlap and together
# cover the whole sample. A seeded generator keeps the split repeatable.
split_rng = np.random.RandomState(0)
sample_train_data = []
sample_test_data = []

for family_sample in sample_list:
    train_part, test_part = train_test_split(family_sample, test_size=0.2,
                                             random_state=split_rng)
    sample_train_data.append(train_part)
    sample_test_data.append(test_part)

# sliding window 3-mers

In [10]:
# Work on a deep copy of the training frames so that later changes do not
# touch the frames held in sample_train_data.
classification_train_3_data = copy.deepcopy(sample_train_data)

In [11]:
# Sizes used while tokenising, and the k-mer length.
train_size = 400    # rows read from each family's training frame
test_size = 100     # presumably the per-family test count; unused in this cell
family_size = 100   # number of family frames processed
k = 3               # k-mer length

trainingList = []
training = []


for family_index in range(family_size):
    # Column 2 holds the sequence string. Fetch that column once per family
    # instead of rebuilding the frame's value array for every row.
    family_sequences = classification_train_3_data[family_index].iloc[:, 2].values
    for seq_index in range(train_size):
        sequence = family_sequences[seq_index]
        # Slide a window of width k along the sequence one position at a time.
        # Stopping at len(sequence) - k + 1 keeps every token exactly k
        # characters long; running up to len(sequence) would also append
        # truncated tokens of length k-1 ... 1 at the end of each list.
        trainingList.append([sequence[start:start + k]
                             for start in range(len(sequence) - k + 1)])

In [12]:
# Inspect the k-mer list that was built for the first training sequence.
trainingList[0]

['MTF',
 'TFA',
 'FAR',
 'ARS',
 'RSS',
 'SSR',
 'SRS',
 'RSG',
 'SGL',
 'GLQ',
 'LQD',
 'QDT',
 'DTI',
 'TIA',
 'IAA',
 'AAV',
 'AVA',
 'VAT',
 'ATA',
 'TAP',
 'APG',
 'PGS',
 'GSA',
 'SAG',
 'AGV',
 'GVG',
 'VGI',
 'GIV',
 'IVR',
 'VRV',
 'RVS',
 'VSG',
 'SGP',
 'GPR',
 'PRA',
 'RAL',
 'ALE',
 'LEI',
 'EIA',
 'IAD',
 'ADG',
 'DGL',
 'GLF',
 'LFR',
 'FRG',
 'RGK',
 'GKR',
 'KRR',
 'RRP',
 'RPS',
 'PSA',
 'SAT',
 'ATP',
 'TPG',
 'PGG',
 'GGR',
 'GRF',
 'RFL',
 'FLF',
 'LFG',
 'FGE',
 'GEL',
 'ELH',
 'LHA',
 'HAD',
 'ADE',
 'DEE',
 'EEL',
 'ELL',
 'LLD',
 'LDE',
 'DEG',
 'EGL',
 'GLC',
 'LCL',
 'CLV',
 'LVF',
 'VFR',
 'FRG',
 'RGP',
 'GPR',
 'PRS',
 'RSY',
 'SYT',
 'YTG',
 'TGE',
 'GED',
 'EDV',
 'DVA',
 'VAE',
 'AEV',
 'EVQ',
 'VQT',
 'QTH',
 'THG',
 'HGS',
 'GSP',
 'SPA',
 'PAV',
 'AVL',
 'VLS',
 'LSR',
 'SRV',
 'RVL',
 'VLA',
 'LAR',
 'ART',
 'RTL',
 'TLD',
 'LDL',
 'DLG',
 'LGA',
 'GAR',
 'ARL',
 'RLA',
 'LAR',
 'ARP',
 'RPG',
 'PGE',
 'GEF',
 'EFT',
 'FTL',
 'TLR',
 'LRA',
 'RAY',


In [13]:
# Turn every k-mer list into one comma-separated string per sequence.
# Iterating over trainingList itself keeps the result in step with the number
# of sequences that were actually tokenised, instead of a hard-coded count.
training_x = [','.join(kmers) for kmers in trainingList]

In [14]:
# Stack the per-family training frames into one frame with a single concat
# call (concatenating inside a loop re-copies the growing frame every pass).
# The frames are stacked in list order and keep their original row labels.
three_sample_train_data = pd.concat(classification_train_3_data[:family_size])

In [15]:
# Overwrite the raw sequences with their comma-separated k-mer strings.
# training_x was filled family by family and row by row, i.e. in the same
# order in which the family frames were concatenated; the assignment relies on
# training_x having exactly one entry per row of the frame.
three_sample_train_data['Sequences'] = training_x

In [16]:
# Preview the first three rows of the combined training frame.
three_sample_train_data[0:3]

Unnamed: 0,Family ID,SwissProt Accession ID,Sequences
153078,MMR_HSR1,Q9RVL1,"MTF,TFA,FAR,ARS,RSS,SSR,SRS,RSG,SGL,GLQ,LQD,QD..."
153707,MMR_HSR1,Q5QYB3,"MLP,LPV,PVV,VVA,VAL,ALV,LVG,VGR,GRP,RPN,PNV,NV..."
154101,MMR_HSR1,Q2JDP2,"MPT,PTF,TFV,FVD,VDR,DRV,RVV,VVL,VLH,LHA,HAT,AT..."
