In [1]:
import pandas as pd
# Load the three raw tables. header=None makes pandas treat the first line
# of each file as data rather than as column labels.
positive_data = pd.read_csv("data/NifH_Q10_S10.csv", header=None)
negative_data = pd.read_csv("data/NifH_Qminus10_to_Qminus50.csv", header=None)
negative_seeds_data = pd.read_csv("data/nifD.seeds_clean.txt", sep="\t", header=None)

# Give every table the same two column labels.
for frame in (positive_data, negative_data, negative_seeds_data):
    frame.columns = ['protein_name', 'protein_sequence']

In [2]:
# Summary of the negative seed table (count / unique / top / freq per column).
negative_seeds_data.describe()

Unnamed: 0,protein_name,protein_sequence
count,290,290
unique,290,264
top,Q8KIP2 670 bp,mstvednkqliqdvleaypdkakkkrekhlnvheegktdcgvksnv...
freq,1,7


In [3]:
# Sanity check: display the first row (by position) of the negative seed table.
negative_seeds_data.iloc[0]

protein_name                                            O27605 670 bp
protein_sequence    mpfklfdvdaeiperkkhvyikkkedpeedlplcntktipgcmter...
Name: 0, dtype: object

In [4]:
# Clean the sequence column of all three tables:
#   1. strip leading/trailing whitespace,
#   2. remove every character that is not an ASCII letter.
# regex=True is passed explicitly: since pandas 2.0 the default of
# Series.str.replace is regex=False, in which case '[^a-zA-Z]' would be
# searched for as a literal string and nothing would be removed.
for frame in (positive_data, negative_data, negative_seeds_data):
    frame['protein_sequence'] = (
        frame['protein_sequence']
        .str.strip()
        .str.replace('[^a-zA-Z]', '', regex=True)
    )



In [5]:
# All upper case?
# positive_data['sequence_isupper'] = map(lambda x: x.isupper(), positive_data['protein_sequence'])
# negative_data['sequence_isupper'] = map(lambda x: x.isupper(), negative_data['protein_sequence'])
# negative_seeds_data['sequence_isupper'] = map(lambda x: x.isupper(), negative_seeds_data['protein_sequence'])
# negative_seeds_data.describe()


In [6]:
# Convert every sequence to upper case.
# The vectorised .str accessor is used instead of .apply(lambda s: s.upper()):
# it leaves missing values as missing, whereas the lambda raises
# AttributeError as soon as it meets a NaN.
for frame in (positive_data, negative_data, negative_seeds_data):
    frame['protein_sequence'] = frame['protein_sequence'].str.upper()
negative_seeds_data.describe()

Unnamed: 0,protein_name,protein_sequence
count,290,290
unique,290,264
top,Q8KIP2 670 bp,MSTVEDNKQLIQDVLEAYPDKAKKKREKHLNVHEEGKTDCGVKSNV...
freq,1,7


In [7]:
# Select the rows of the negative seed table whose sequence repeats an
# earlier row's sequence.
dups = negative_seeds_data.loc[
    negative_seeds_data.duplicated(subset='protein_sequence')
]
# Show the repeated rows, then the summary of the full table.
print(dups)
negative_seeds_data.describe()

      protein_name                                   protein_sequence
5    B9E5J9 670 bp  MKKVLDQVLEVYPAKTFKNRKKHILIKSNDEPNPVIQANVRTVPGI...
13   C4IC65 670 bp  MSKIDSVLDKYSAKVYKNRKKHVLELEHETQEIDANRRSVPGLINH...
48   C5U390 670 bp  MTTLKTDPGVDLESFVDEVTSLYPPKVAKKRRAHMVVRKDGDPGLA...
59   B5XPH1 670 bp  MTNATGERNLALIQEVLEVFPETARKERRKHMMISDPQMESVGKCI...
80   C7DP79 670 bp  MSLDYENDGALHAKLIEEVLSQYPDKAAKRRKKHLSVAKSGDEAGE...
81   C7DP97 670 bp  MSLDYENDGALHAKLIEEVLSQYPDKAAKRRKKHLSVAKSGDEAGE...
97   C7DP52 670 bp  MSREYENDGALHAKLIEEVLSHYPDKAAKRRKKHLNVAKSGNEAGG...
141  C5TI44 670 bp  MSLNEEETIFNTRLIEEVLEAYPAKAKKRRQKHLTVAKAPDTEADP...
166  B5ER75 670 bp  MSITAEETREQIVAETKTRNRALIDEVLKVYPEKTAKRRAKHLNVF...
178  B3QB46 670 bp  MSTAVAESPADIKERNKKLIGEVLEAYPDKSAKRRAKHLNTYDAEK...
194  C7QV34 670 bp  MSTVEDRKQLIQDVLDTYPEKLAKKRSKHLNVYEEGKDDCGVKSNI...
198  B3F2I1 670 bp  MSTVEDNKQLIQDVLEAYPDKAKKKREKHLNVHEEGKTDCGVKSNV...
199  B3F2H9 670 bp  MSTVEDNKQLIQDVLEAYPDKAKKKREKHLNVHEEGKTDCGVKSNV...
200  B3F2H7 670 bp  

Unnamed: 0,protein_name,protein_sequence
count,290,290
unique,290,264
top,Q8KIP2 670 bp,MSTVEDNKQLIQDVLEAYPDKAKKKREKHLNVHEEGKTDCGVKSNV...
freq,1,7


In [8]:
# Remove repeated sequences from each table in place, keeping the first
# occurrence of every sequence.
for frame in (positive_data, negative_data, negative_seeds_data):
    frame.drop_duplicates(subset='protein_sequence', keep='first', inplace=True)
negative_seeds_data.describe()


Unnamed: 0,protein_name,protein_sequence
count,264,264
unique,264,264
top,B3F2J4 670 bp,MSTVEDNKQLIQDVLEAYPEKAKKQRAKHLNVHEEGKADCGVKSNV...
freq,1,1


In [9]:
# Check for duplicates between the different data sets.
# The three tables are concatenated; the keys 'x', 'y' and 'z' label the
# source table in the outer level of the resulting index.
pieces = dict(x=positive_data, y=negative_data, z=negative_seeds_data)
df_piece = pd.concat(pieces)
# In the summary, count == unique means no sequence occurs in more than one set.
df_piece.describe()

Unnamed: 0,protein_name,protein_sequence
count,42741,42741
unique,42741,42741
top,AEP26283.1,GKGGIGKSTTTQNTVAALAEMGKKVMVVGCDPKADSTRLLLNGLCQ...
freq,1,1


In [11]:
# How many distinct amino-acid letters do we have?
# Only a sample of the first `sample_size` positive sequences is inspected.
# The sample is taken with an iloc slice, which simply yields fewer rows when
# the table is shorter than the sample, instead of raising IndexError the way
# positional access with iloc[i] does.
sample_size = 200
letters = set()
for sequence in positive_data['protein_sequence'].iloc[:sample_size]:
    letters.update(sequence)
print(letters)
print(len(letters))
# The recorded run found 21 distinct letters: the 20 standard amino-acid
# codes plus 'X' (conventionally the placeholder for an unknown residue).

{'H', 'F', 'Q', 'K', 'I', 'Y', 'R', 'G', 'V', 'P', 'S', 'C', 'M', 'X', 'A', 'W', 'L', 'E', 'N', 'T', 'D'}
21


In [47]:
## Building the data tensor for the 'positive_data' data set
##
## Every residue letter becomes one row of 7 values: the 7 low bits of its
## ASCII code, most significant bit first, with a set bit stored as 255 and a
## clear bit as 0. Each sequence therefore becomes a (sequence length, 7) matrix.
import numpy as np

multi_2d_sequences_3d = []
for seq_str in positive_data['protein_sequence']:
    # One byte per character; raises UnicodeEncodeError on non-ASCII input.
    codes = np.frombuffer(seq_str.encode('ascii'), dtype=np.uint8)
    # unpackbits yields 8 bits per byte (MSB first). The leading bit is always
    # 0 for ASCII, so it is dropped. For letters this matches bin(ord(ch))[2:],
    # but the width stays fixed at 7 for every character.
    bits = np.unpackbits(codes[:, np.newaxis], axis=1)[:, 1:]
    multi_2d_sequences_3d.append(bits.astype(int) * 255)

if len({matrix.shape for matrix in multi_2d_sequences_3d}) <= 1:
    # All sequences have the same length: a regular 3-D array can be built.
    multi_2d_sequences_3d_np = np.array(multi_2d_sequences_3d)
else:
    # Sequences of different lengths: np.array() on such ragged input raises
    # ValueError on NumPy >= 1.24, so build the 1-D object array explicitly
    # (one 2-D matrix per element).
    multi_2d_sequences_3d_np = np.empty(len(multi_2d_sequences_3d), dtype=object)
    for row, matrix in enumerate(multi_2d_sequences_3d):
        multi_2d_sequences_3d_np[row] = matrix


In [9]:
## Building the data tensor for the 'negative_data' data set
##
## Every residue letter becomes one row of 7 values: the 7 low bits of its
## ASCII code, most significant bit first, with a set bit stored as 255 and a
## clear bit as 0. Each sequence therefore becomes a (sequence length, 7) matrix.
import numpy as np

n_multi_2d_sequences_3d = []
for seq_str in negative_data['protein_sequence']:
    # One byte per character; raises UnicodeEncodeError on non-ASCII input.
    codes = np.frombuffer(seq_str.encode('ascii'), dtype=np.uint8)
    # unpackbits yields 8 bits per byte (MSB first). The leading bit is always
    # 0 for ASCII, so it is dropped. For letters this matches bin(ord(ch))[2:],
    # but the width stays fixed at 7 for every character.
    bits = np.unpackbits(codes[:, np.newaxis], axis=1)[:, 1:]
    n_multi_2d_sequences_3d.append(bits.astype(int) * 255)

if len({matrix.shape for matrix in n_multi_2d_sequences_3d}) <= 1:
    # All sequences have the same length: a regular 3-D array can be built.
    n_multi_2d_sequences_3d_np = np.array(n_multi_2d_sequences_3d)
else:
    # Sequences of different lengths: np.array() on such ragged input raises
    # ValueError on NumPy >= 1.24, so build the 1-D object array explicitly
    # (one 2-D matrix per element).
    n_multi_2d_sequences_3d_np = np.empty(len(n_multi_2d_sequences_3d), dtype=object)
    for row, matrix in enumerate(n_multi_2d_sequences_3d):
        n_multi_2d_sequences_3d_np[row] = matrix

In [67]:
# Save every encoded positive sequence as an image file
# p_images/nifH<index>.jpg.
import os
from PIL import Image

# Image.save does not create missing directories, so make sure the target
# directory exists before writing into it.
os.makedirs('p_images', exist_ok=True)
# NOTE(review): JPEG is a lossy format, so the stored pixels are not
# guaranteed to stay exactly 0 / 255 - confirm this is acceptable downstream.
for index in range(multi_2d_sequences_3d_np.shape[0]):
    img = Image.fromarray(multi_2d_sequences_3d_np[index].astype('uint8'))
    img.save('p_images/nifH' + str(index) + '.jpg', 'JPEG')


In [11]:
# Save every encoded negative sequence as an image file
# n_images/nifH<index>.jpg.
import os
from PIL import Image

# Image.save does not create missing directories, so make sure the target
# directory exists before writing into it.
os.makedirs('n_images', exist_ok=True)
# NOTE(review): JPEG is a lossy format, so the stored pixels are not
# guaranteed to stay exactly 0 / 255 - confirm this is acceptable downstream.
for index in range(n_multi_2d_sequences_3d_np.shape[0]):
    img = Image.fromarray(n_multi_2d_sequences_3d_np[index].astype('uint8'))
    img.save('n_images/nifH' + str(index) + '.jpg', 'JPEG')

