In [4]:
# this script is intended to map the catalytic residues 
# to the kinases domains of kinases proteins containing Cis regulatory elements (CREs)

In [1]:
import pandas as pd

# Load data

In [2]:
# Dataset of kinases containing CREs
kinases = pd.read_csv('../datasets/kinases.tsv', sep= '\t')
kinases.columns = kinases.columns.str.lower().str.replace(" ", "_")

In [3]:
# Table with pdb ids of kinases containing CREs dataset
pdbs_kd_cre = pd.read_csv('../datasets/pdbs_kd_cre.tsv', sep= '\t')
pdbs_kd_cre

Unnamed: 0,uniprot_acc,id_pdb,id_cre,id_kd,pdb
0,A5K0N4,5147,10,10,5fet
1,O74536,8034,21,21,3h4j
2,O74536,8035,21,21,3h4j
3,O75582,579,22,23,3kn5
4,O75582,582,22,23,3kn5
...,...,...,...,...,...
1532,Q16644,2845,142,147,3fhr
1533,Q16644,5339,142,147,7nrb
1534,Q61846,1754,178,183,4bfm
1535,Q61846,7063,178,183,4cqg


In [5]:
# Number of proteins with PDBs containing both regions
pdbs_kd_cre.uniprot_acc.nunique()

42

In [6]:
# Number of PDB structures in those proteins
pdbs_kd_cre.pdb.nunique()

785

In [7]:
# List of proteins with the architecture N----KD----CRE---C-ter
c_ter = (kinases[kinases["cre_(start)"] > kinases["kd_(end)"]]).uniprot.unique().tolist()

In [8]:
with open('../datasets/kd_cre_cter.txt', 'w') as fp:
    for item in c_ter:
        # write each item on a new line
        fp.write("%s\n" % item)

In [12]:
# tabla de uniprots con PDBs que incluyen ambas regiones y tienen el CRE al c-ter
x = pdbs_kd_cre[['uniprot_acc', 'pdb']].drop_duplicates()
uniprot_pdb_c_ter = x[x.uniprot_acc.isin(c_ter)].sort_values("uniprot_acc")

In [13]:
uniprot_pdb_c_ter

Unnamed: 0,uniprot_acc,pdb
1,O74536,3h4j
3,O75582,3kn5
5,O75582,7up7
7,O75582,3kn6
9,O75582,7up5
...,...,...
1325,Q9UIK4,1z9x
1328,Q9UIK4,2cke
1332,Q9UIK4,2a27
1340,Q9UIK4,1wmk


In [65]:
# Distribution of PDBs per protein
uniprot_pdb_c_ter.groupby("uniprot_acc")['pdb'].size().sort_values().reset_index().rename(columns={'pdb': "count"})

Unnamed: 0,uniprot_acc,count
0,O74536,1
1,Q6PHZ2,1
2,Q63450,1
3,Q16566,1
4,Q13555,1
5,Q13554,1
6,Q13131,2
7,Q61846,2
8,Q13043,3
9,Q8VDF3,3


In [69]:
uniprot_pdb_c_ter.uniprot_acc.nunique()

22

In [67]:
uniprot_pdb_c_ter[uniprot_pdb_c_ter.uniprot_acc == 'O75582']

Unnamed: 0,uniprot_acc,pdb
3,O75582,3kn5
5,O75582,7up7
7,O75582,3kn6
9,O75582,7up5
11,O75582,7up6
12,O75582,7up4
14,O75582,7up8


In [70]:
kinases[kinases.uniprot == 'P35968']

Unnamed: 0,uniprot,organism,cre_(start),cre_(end),cre_(s_or_d),length,kd_(start),kd_(end),pdb_id,pdb_res_(å),af,status
80,P35968,H. sapiens,1163,1356,D,,834,1162,no,no,AF-P35968-F1,known


In [27]:
x_group = x.groupby("uniprot_acc")['pdb'].size().sort_values()
x_group

uniprot_acc
A5K0N4      1
O74536      1
Q96NX5      1
Q6PHZ2      1
Q63450      1
P04049      1
Q13554      1
Q16566      1
P29323      1
P36507      1
Q13555      1
Q01974      2
Q61846      2
Q13131      2
P45985      3
P43403      3
Q13043      3
Q15418      3
Q8VDF3      3
Q13557      3
Q14012      4
Q16644      5
P51812      5
Q8IU85      5
Q13188      5
Q9UIK4      6
P36888      6
O75582      7
P54646      8
P53355     11
P31751     16
P07333     19
P10721     22
P49137     26
P31749     28
Q14680     30
P00519     39
P35968     42
Q02750     60
P00523     69
P28482    123
P00533    215
Name: pdb, dtype: int64

In [33]:
#x_group.plot(kind= 'barh', figsize = (8,10))

In [26]:
pdbs_kd_cre[pdbs_kd_cre.uniprot_acc == 'Q01974']

Unnamed: 0,uniprot_acc,id_pdb,id_cre,id_kd,pdb
1354,Q01974,1210,111,114,3zzw
1355,Q01974,1212,111,114,3zzw
1356,Q01974,2425,111,114,4gt4
1357,Q01974,2429,111,114,4gt4


---

# Catalytic site atlas

### flat files

In [14]:
# 'CSA Style' files with list of curated catalytic residues (03-10-22)
csa_pdb_residues = pd.read_csv('../raw_data/literature_pdb_residues.csv')
csa_pdb_residues.columns = csa_pdb_residues.columns.str.lower().str.replace(" ", "_")
csa_pdb_residues

Unnamed: 0,pdb_id,site_number,residue_type,chain_id,residue_number,chemical_function,evidence_type,literature_entry
0,12as,0,Asp,A,46,S,LIT,12as
1,12as,0,Arg,A,100,S,LIT,12as
2,12as,0,Gln,A,116,S,LIT,12as
3,12as,0,Asp,A,235,S,LIT,12as
4,12as,0,Glu,A,248,S,LIT,12as
...,...,...,...,...,...,...,...,...
4465,9pap,0,Gln,A,19,S,LIT,9pap
4466,9pap,0,Cys,A,25,S,LIT,9pap
4467,9pap,0,Cys,A,25,N,LIT,9pap
4468,9pap,0,His,A,159,S,LIT,9pap


In [15]:
csa_pdb_residues.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4470 entries, 0 to 4469
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   pdb_id             4470 non-null   object
 1   site_number        4470 non-null   int64 
 2   residue_type       4470 non-null   object
 3   chain_id           4470 non-null   object
 4   residue_number     4470 non-null   int64 
 5   chemical_function  4470 non-null   object
 6   evidence_type      4470 non-null   object
 7   literature_entry   4470 non-null   object
dtypes: int64(2), object(6)
memory usage: 279.5+ KB


In [8]:
csa_pdb_residues.evidence_type.value_counts()

LIT    4470
Name: evidence_type, dtype: int64

In [16]:
# M-CSA curated data flat-file
#pd.read_csv('../raw_data/curated_data.csv')#, lineterminator='\n',  error_bad_lines=False)

In [17]:
### Loop the data lines
with open("../raw_data/curated_data.csv", 'r') as temp_f:
    # get No of columns in each line
    col_count = [ len(l.split(",")) for l in temp_f.readlines() ]

### Generate column names  (names will be 0, 1, 2, ..., maximum columns - 1)
column_names = [i for i in range(0, max(col_count))]

### Read csv
df = pd.read_csv("../raw_data/curated_data.csv", header=None, delimiter=",", names=column_names)

In [18]:
new_header = df.iloc[0] #grab the first row for the header
df = df[1:] #take the data less the header row
df.columns = new_header #set the header row as the df header

In [20]:
df.columns = df.columns.str.lower().str.replace(" ", "_").str.replace("-", '_').str.replace("/", "_")

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28188 entries, 1 to 28188
Data columns (total 20 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   m_csa_id                           28188 non-null  object 
 1   uniprot_ids                        28188 non-null  object 
 2   pdb                                28138 non-null  object 
 3   ec                                 28188 non-null  object 
 4   residue_reactant_product_cofactor  28188 non-null  object 
 5   pdb_code                           23938 non-null  object 
 6   chain_kegg_compound                25285 non-null  object 
 7   resid_chebi_id                     28188 non-null  object 
 8   function_location_name             28188 non-null  object 
 9   role                               23964 non-null  object 
 10  role_type                          23825 non-null  object 
 11  role_group                         23806 non-null  obj

In [26]:
csa_curated_data = df[['m_csa_id', 'uniprot_ids', 'pdb', 'pdb_code', 'resid_chebi_id']]
csa_curated_data

Unnamed: 0,m_csa_id,uniprot_ids,pdb,pdb_code,resid_chebi_id
1,M0001,P56868,1b73,Asp,7
2,M0001,P56868,1b73,Asp,7
3,M0001,P56868,1b73,Asp,7
4,M0001,P56868,1b73,Asp,7
5,M0001,P56868,1b73,Asp,7
...,...,...,...,...,...
28184,M0966,P15723,4x9e,,61429
28185,M0966,P15723,4x9e,,15377
28186,M0966,P15723,4x9e,,17172
28187,M0966,P15723,4x9e,,18036


In [28]:
csa_pdb_residues[['pdb_id', 'chain_id', 'residue_type', 'residue_number']]

Unnamed: 0,pdb_id,chain_id,residue_type,residue_number
0,12as,A,Asp,46
1,12as,A,Arg,100
2,12as,A,Gln,116
3,12as,A,Asp,235
4,12as,A,Glu,248
...,...,...,...,...
4465,9pap,A,Gln,19
4466,9pap,A,Cys,25
4467,9pap,A,Cys,25
4468,9pap,A,His,159


In [33]:
# Catalytic sites in all kinases dataset
csa_curated_data[csa_curated_data.uniprot_ids.isin(kinases.uniprot.unique())]

Unnamed: 0,m_csa_id,uniprot_ids,pdb,pdb_code,resid_chebi_id
14075,M0246,P06213,1ir3,Asp,1132
14076,M0246,P06213,1ir3,Asp,1132
14077,M0246,P06213,1ir3,Asp,1132
14078,M0246,P06213,1ir3,Asp,1132
14079,M0246,P06213,1ir3,Asp,1132
14080,M0246,P06213,1ir3,Arg,1136
14081,M0246,P06213,1ir3,Arg,1136
14082,M0246,P06213,1ir3,Arg,1136
14083,M0246,P06213,1ir3,Arg,1136
14084,M0246,P06213,1ir3,Arg,1136


In [38]:
# Catalytic sites in kinases with pdbs in both regions
csa_curated_data[csa_curated_data.uniprot_ids.isin(pdbs_kd_cre.uniprot_acc.unique())]

Unnamed: 0,m_csa_id,uniprot_ids,pdb,pdb_code,resid_chebi_id
16991,M0282,P36507,1s9i,Asp,194
16992,M0282,P36507,1s9i,Asp,194
16993,M0282,P36507,1s9i,Asp,194
16994,M0282,P36507,1s9i,Asp,194
16995,M0282,P36507,1s9i,Asp,194
16996,M0282,P36507,1s9i,Asp,221
16997,M0282,P36507,1s9i,Asp,221
16998,M0282,P36507,1s9i,Asp,221
16999,M0282,P36507,1s9i,Asp,221
17000,M0282,P36507,1s9i,Asp,221


In [39]:
pdbs_kd_cre

Unnamed: 0,uniprot_acc,id_pdb,id_cre,id_kd,pdb
0,A5K0N4,5147,10,10,5fet
1,O74536,8034,21,21,3h4j
2,O74536,8035,21,21,3h4j
3,O75582,579,22,23,3kn5
4,O75582,582,22,23,3kn5
...,...,...,...,...,...
1532,Q16644,2845,142,147,3fhr
1533,Q16644,5339,142,147,7nrb
1534,Q61846,1754,178,183,4bfm
1535,Q61846,7063,178,183,4cqg


In [43]:
csa_curated_data[csa_curated_data.uniprot_ids.isin(pdbs_kd_cre.uniprot_acc.unique())]

Unnamed: 0,m_csa_id,uniprot_ids,pdb,pdb_code,resid_chebi_id
16991,M0282,P36507,1s9i,Asp,194
16992,M0282,P36507,1s9i,Asp,194
16993,M0282,P36507,1s9i,Asp,194
16994,M0282,P36507,1s9i,Asp,194
16995,M0282,P36507,1s9i,Asp,194
16996,M0282,P36507,1s9i,Asp,221
16997,M0282,P36507,1s9i,Asp,221
16998,M0282,P36507,1s9i,Asp,221
16999,M0282,P36507,1s9i,Asp,221
17000,M0282,P36507,1s9i,Asp,221


In [61]:
csa_curated_data[csa_curated_data.pdb.isin(pdbs_kd_cre.pdb.unique())]

Unnamed: 0,m_csa_id,uniprot_ids,pdb,pdb_code,resid_chebi_id
16991,M0282,P36507,1s9i,Asp,194
16992,M0282,P36507,1s9i,Asp,194
16993,M0282,P36507,1s9i,Asp,194
16994,M0282,P36507,1s9i,Asp,194
16995,M0282,P36507,1s9i,Asp,194
16996,M0282,P36507,1s9i,Asp,221
16997,M0282,P36507,1s9i,Asp,221
16998,M0282,P36507,1s9i,Asp,221
16999,M0282,P36507,1s9i,Asp,221
17000,M0282,P36507,1s9i,Asp,221


In [64]:
csa_pdb_residues[csa_pdb_residues.pdb_id.isin(pdbs_kd_cre.pdb.unique())]

Unnamed: 0,pdb_id,site_number,residue_type,chain_id,residue_number,chemical_function,evidence_type,literature_entry
3177,1s9i,0,Asp,A,194,S,LIT,1s9i
3178,1s9i,0,Asp,A,221,S,LIT,1s9i


### API search

consulta por API. Necesito un listado de uniprots separados por ,

In [45]:
l = pdbs_kd_cre.uniprot_acc.unique().tolist()

In [54]:
query = ""
for i, unip in enumerate(l):
    if i == len(l)-1:
        query = query + unip
    else:
        query = query + unip + ","

In [57]:
url = "https://www.ebi.ac.uk/thornton-srv/m-csa/api/entries/?format=json&entries.proteins.sequences.uniprot_ids="

In [58]:
url + query

'https://www.ebi.ac.uk/thornton-srv/m-csa/api/entries/?format=json&entries.proteins.sequences.uniprot_ids=A5K0N4,O74536,O75582,P00519,P00523,P00533,P04049,P07333,P10721,P28482,P29323,P31749,P31751,P35968,P36507,P36888,P43403,P45985,P49137,P51812,P53355,P54646,Q6PHZ2,Q8IU85,Q8VDF3,Q9UIK4,Q96NX5,Q01974,Q02750,Q13043,Q13131,Q13188,Q13554,Q13555,Q13557,Q14012,Q14680,Q15418,Q16566,Q16644,Q61846,Q63450'

esta busqueda solo arrojo tres proteinas: P00518;P06213;P36507

---

In [4]:
catalytic_homologues = pd.read_json('../raw_data/catalytic_residues_homologues.json')
catalytic_homologues

Unnamed: 0,mcsa_id,roles_summary,function_location_abv,ptm,roles,residue_sequences,residue_chains
0,1,"activator, electrostatic stabiliser, hydrogen ...",,,"[{'group_function': '', 'function_type': 'inte...","[{'code': 'Glu', 'resid': 147, 'uniprot_id': '...","[{'code': 'Glu', 'resid': 147, 'auth_resid': 1..."
1,1,"electrostatic stabiliser, hydrogen bond donor",,,"[{'group_function': '', 'function_type': 'inte...","[{'code': 'His', 'resid': 180, 'uniprot_id': '...","[{'code': 'His', 'resid': 180, 'auth_resid': 1..."
2,1,"activator, electrostatic stabiliser, hydrogen ...",,,"[{'group_function': 'activator', 'function_typ...","[{'code': 'Cys', 'resid': 70, 'uniprot_id': 'P...","[{'code': 'Cys', 'resid': 70, 'auth_resid': 70..."
3,1,"activator, hydrogen bond acceptor, proton acce...",,,"[{'group_function': 'activator', 'function_typ...","[{'code': 'Cys', 'resid': 178, 'uniprot_id': '...","[{'code': 'Cys', 'resid': 178, 'auth_resid': 1..."
4,1,"electrostatic stabiliser, hydrogen bond donor,...",,,"[{'group_function': '', 'function_type': 'inte...","[{'code': 'Ser', 'resid': 8, 'uniprot_id': 'P5...","[{'code': 'Ser', 'resid': 8, 'auth_resid': 8, ..."
...,...,...,...,...,...,...,...
5015,967,"covalently attached, nucleofuge, nucleophile, ...",,,"[{'group_function': 'covalent catalysis', 'fun...","[{'code': 'Cys', 'resid': 188, 'uniprot_id': '...","[{'code': 'Cys', 'resid': 197, 'auth_resid': 1..."
5016,967,electrostatic stabiliser,,,[{'group_function': 'electrostatic interaction...,"[{'code': 'His', 'resid': 227, 'uniprot_id': '...","[{'code': 'His', 'resid': 236, 'auth_resid': 2..."
5017,967,"activator, proton acceptor, proton donor",,,"[{'group_function': 'activator', 'function_typ...","[{'code': 'Asp', 'resid': 195, 'uniprot_id': '...","[{'code': 'Asp', 'resid': 204, 'auth_resid': 2..."
5018,968,"electrostatic interaction, electrostatic stabi...",,,[{'group_function': 'electrostatic interaction...,"[{'code': 'Glu', 'resid': 1270, 'uniprot_id': ...","[{'code': 'Glu', 'resid': 1270, 'auth_resid': ..."


In [8]:
catalytic_homologues.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5020 entries, 0 to 5019
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   mcsa_id                5020 non-null   int64 
 1   roles_summary          5020 non-null   object
 2   function_location_abv  5020 non-null   object
 3   ptm                    5020 non-null   object
 4   roles                  5020 non-null   object
 5   residue_sequences      5020 non-null   object
 6   residue_chains         5020 non-null   object
dtypes: int64(1), object(6)
memory usage: 274.7+ KB


In [9]:
catalytic_homologues.roles_summary[0]

'activator, electrostatic stabiliser, hydrogen bond acceptor, hydrogen bond donor, increase basicity, proton donor'

In [13]:
prueba = catalytic_homologues[['mcsa_id', 'residue_sequences']]

In [17]:
prueba.residue_sequences.explode(",").apply(pd.Series)

KeyboardInterrupt: 

In [14]:
prueba.residue_sequences.apply(pd.Series)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2479,2480,2481,2482,2483,2484,2485,2486,2487,2488
0,"{'code': 'Glu', 'resid': 147, 'uniprot_id': 'P...","{'code': 'Glu', 'resid': 153, 'uniprot_id': 'A...","{'code': 'Glu', 'resid': 152, 'uniprot_id': 'Q...","{'code': 'Glu', 'resid': 154, 'uniprot_id': 'Q...","{'code': 'Glu', 'resid': 168, 'uniprot_id': 'A...","{'code': 'Glu', 'resid': 170, 'uniprot_id': 'P...","{'code': 'Glu', 'resid': 170, 'uniprot_id': 'B...","{'code': 'Glu', 'resid': 152, 'uniprot_id': 'Q...","{'code': 'Glu', 'resid': 152, 'uniprot_id': 'C...","{'code': 'Glu', 'resid': 174, 'uniprot_id': 'B...",...,,,,,,,,,,
1,"{'code': 'His', 'resid': 180, 'uniprot_id': 'P...","{'code': 'His', 'resid': 195, 'uniprot_id': 'Q...","{'code': 'His', 'resid': 185, 'uniprot_id': 'A...","{'code': 'His', 'resid': 187, 'uniprot_id': 'Q...","{'code': 'His', 'resid': 187, 'uniprot_id': 'P...","{'code': 'His', 'resid': 185, 'uniprot_id': 'P...","{'code': 'His', 'resid': 185, 'uniprot_id': 'Q...","{'code': 'His', 'resid': 185, 'uniprot_id': 'C...","{'code': 'His', 'resid': 208, 'uniprot_id': 'C...","{'code': 'His', 'resid': 188, 'uniprot_id': 'Q...",...,,,,,,,,,,
2,"{'code': 'Cys', 'resid': 70, 'uniprot_id': 'P5...","{'code': 'Cys', 'resid': 74, 'uniprot_id': 'A1...","{'code': 'Cys', 'resid': 74, 'uniprot_id': 'Q8...","{'code': 'Cys', 'resid': 73, 'uniprot_id': 'A5...","{'code': 'Cys', 'resid': 74, 'uniprot_id': 'Q0...","{'code': 'Cys', 'resid': 75, 'uniprot_id': 'P9...","{'code': 'Cys', 'resid': 92, 'uniprot_id': 'P6...","{'code': 'Cys', 'resid': 73, 'uniprot_id': 'C0...","{'code': 'Cys', 'resid': 92, 'uniprot_id': 'B7...","{'code': 'Cys', 'resid': 76, 'uniprot_id': 'Q4...",...,,,,,,,,,,
3,"{'code': 'Cys', 'resid': 178, 'uniprot_id': 'P...","{'code': 'Cys', 'resid': 193, 'uniprot_id': 'Q...","{'code': 'Cys', 'resid': 185, 'uniprot_id': 'P...","{'code': 'Cys', 'resid': 185, 'uniprot_id': 'Q...","{'code': 'Cys', 'resid': 204, 'uniprot_id': 'B...","{'code': 'Cys', 'resid': 185, 'uniprot_id': 'Q...","{'code': 'Cys', 'resid': 186, 'uniprot_id': 'Q...","{'code': 'Cys', 'resid': 204, 'uniprot_id': 'P...","{'code': 'Cys', 'resid': 183, 'uniprot_id': 'Q...","{'code': 'Cys', 'resid': 206, 'uniprot_id': 'C...",...,,,,,,,,,,
4,"{'code': 'Ser', 'resid': 8, 'uniprot_id': 'P56...","{'code': 'Ser', 'resid': 16, 'uniprot_id': 'Q2...","{'code': 'Ser', 'resid': 13, 'uniprot_id': 'A0...","{'code': 'Ser', 'resid': 14, 'uniprot_id': 'Q8...","{'code': 'Ser', 'resid': 29, 'uniprot_id': 'B5...","{'code': 'Ser', 'resid': 11, 'uniprot_id': 'Q0...","{'code': 'Ser', 'resid': 13, 'uniprot_id': 'P9...","{'code': 'Ser', 'resid': 29, 'uniprot_id': 'P6...","{'code': 'Ser', 'resid': 10, 'uniprot_id': 'A3...","{'code': 'Ser', 'resid': 8, 'uniprot_id': 'Q3J...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5015,"{'code': 'Cys', 'resid': 188, 'uniprot_id': ''...",,,,,,,,,,...,,,,,,,,,,
5016,"{'code': 'His', 'resid': 227, 'uniprot_id': ''...",,,,,,,,,,...,,,,,,,,,,
5017,"{'code': 'Asp', 'resid': 195, 'uniprot_id': ''...",,,,,,,,,,...,,,,,,,,,,
5018,"{'code': 'Glu', 'resid': 1270, 'uniprot_id': '...",,,,,,,,,,...,,,,,,,,,,


---

# Catalytic sites from UniProt

map the catalytic sites for the whole dataset

In [4]:
kinases.uniprot.nunique()

280

In [None]:
path = '../datasets/uniprot_files/'
pos_global = []
for filename in os.listdir(path):
    f = os.path.join(path, filename)
    if os.path.isfile(f):
        #print(f)
        # string to search in file
        with open(f, 'r') as fp:
            # read all lines using readline()
            lines = fp.readlines()
            filename = os.path.basename(filename)
            filename = os.path.splitext(filename)[0]
            for i, row in enumerate(lines):
                #print(i)
                # uniprot accesion
                #"AC   A5K0N4"
                # check if string present on a current line
                pattern = "^FT   ACT_SITE        (.+)$"
        #         print(row.find(word))
                # find() method returns -1 if the value is not found,
                # if found it return 0
                k = re.findall(pattern, row)
                if k:
                    pos_global.append({'uniprot': filename, 'pos': k[0]})
                #print(row)
                #print(pos)
                # if row.find(word) == 0:
                #     print('string exists in file')
                #     #print('line Number:', lines.index(line))