In [1]:
import pandas as pd
import os
from collections import defaultdict
import paramiko # for ssh
import traceback #  for error handling
import ast
import requests
import re
from Bio.SeqUtils import seq1

# import from ../utils.py
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from utils import get_pdb_ids, get_resolution, get_dbref_data, read_finalsum, read_finalsum_decomp, \
    read_finalsum_decomp, get_gene_name, get_uniprot_sequence, calculate_sbna_and_download, dwl_pdb_file, align_finalsum_with_uniprot

  "class": algorithms.Blowfish,


In [2]:
cgc_data = pd.read_csv('Census_allTue Apr  9 04_58_38 2024.csv')
cgc_data.head()

Unnamed: 0,Gene Symbol,Name,Entrez GeneId,Genome Location,Tier,Hallmark,Chr Band,Somatic,Germline,Tumour Types(Somatic),Tumour Types(Germline),Cancer Syndrome,Tissue Type,Molecular Genetics,Role in Cancer,Mutation Types,Translocation Partner,Other Germline Mut,Other Syndrome,Synonyms
0,A1CF,APOBEC1 complementation factor,29974.0,10:50799421-50885675,2,,10q11.23,yes,,melanoma,,,E,,oncogene,Mis,,,,"ACF,ACF64,ACF65,APOBEC1CF,ASP,CCDS73133.1,ENSG..."
1,ABI1,abl-interactor 1,10006.0,10:26746593-26860935,1,Yes,10p12.1,yes,,AML,,,L,Dom,"TSG, fusion",T,KMT2A,,,"ABI-1,CCDS7150.1,E3B1,ENSG00000136754.17,NM_00..."
2,ABL1,v-abl Abelson murine leukemia viral oncogene h...,25.0,9:130713946-130885683,1,Yes,9q34.12,yes,,"CML, ALL, T-ALL",,,L,Dom,"oncogene, fusion","T, Mis","BCR, ETV6, NUP214",,,"ABL,CCDS35165.1,ENSG00000097007.17,JTK7,NM_007..."
3,ABL2,"c-abl oncogene 2, non-receptor tyrosine kinase",27.0,1:179099327-179229601,1,,1q25.2,yes,,AML,,,L,Dom,"oncogene, fusion",T,ETV6,,,"ABLL,ARG,CCDS30947.1,ENSG00000143322.19,NM_007..."
4,ACKR3,atypical chemokine receptor 3,57007.0,2:236569641-236582358,1,Yes,2q37.3,yes,,lipoma,,,M,Dom,"oncogene, fusion",T,HMGA2,,,"CCDS2516.1,CMKOR1,CXCR7,ENSG00000144476.5,GPR1..."


In [14]:
# add pdb structures to cgc_data
pdb_structures = []
count = 1
for _, row in cgc_data.iterrows():
    # print curent count, replace previous line    
    gene_symbol = row['Gene Symbol']
    entrez_gene_id = row['Entrez GeneId']
    synonyms = row['Synonyms']
    print(f"Processing {count} of {len(cgc_data)} ({gene_symbol})", end='\r')
    count += 1

    try:
        status, pdb = get_pdb_ids(gene_symbol, entrez_gene_id, synonyms)
    except Exception as e:
        print(f"Failed to get pdb ids for {gene_symbol} with error {e}")
        pdb_structures.append(None)
        continue

    if not pdb:
        print(f"Failed to get pdb ids for {gene_symbol} with status code {status}")
        pdb_structures.append(None)
    else:
        if type(pdb) == list:
            pdb_structures.append(pdb)
        else:
            pdb_structures.append([pdb])

cgc_data['PDB Structures'] = pdb_structures

Failed to get pdb ids for ACSL3 with status code 200
Failed to get pdb ids for ACSL6 with status code 200
Failed to get pdb ids for AFF3 with status code 200
Failed to get pdb ids for AKAP9 with status code 200
Failed to get pdb ids for ARHGEF10 with status code 200
Failed to get pdb ids for ARHGEF10L with status code 200
Failed to get pdb ids for ASPM with status code 200
Failed to get pdb ids for ASXL2 with status code 200
Failed to get pdb ids for ATF1 with status code 200
Failed to get pdb ids for ATP2B3 with status code 200
Failed to get pdb ids for BCL11B with status code 200
Failed to get pdb ids for BCL2L12 with status code 200
Failed to get pdb ids for BCL7A with status code 200
Failed to get pdb ids for BMP5 with status code 200
Failed to get pdb ids for BTG1 with status code 200
Failed to get pdb ids for CARS with error list indices must be integers or slices, not str
Failed to get pdb ids for CBFA2T3 with status code 200
Failed to get pdb ids for CCDC6 with status code 200


In [15]:
cgc_data.head(3)

Unnamed: 0,Gene Symbol,Name,Entrez GeneId,Genome Location,Tier,Hallmark,Chr Band,Somatic,Germline,Tumour Types(Somatic),...,Tissue Type,Molecular Genetics,Role in Cancer,Mutation Types,Translocation Partner,Other Germline Mut,Other Syndrome,Synonyms,PDB Structures,n_structures
0,A1CF,APOBEC1 complementation factor,29974.0,10:50799421-50885675,2,,10q11.23,yes,,melanoma,...,E,,oncogene,Mis,,,,"ACF,ACF64,ACF65,APOBEC1CF,ASP,CCDS73133.1,ENSG...",[2CPD],4
1,ABI1,abl-interactor 1,10006.0,10:26746593-26860935,1,Yes,10p12.1,yes,,AML,...,L,Dom,"TSG, fusion",T,KMT2A,,,"ABI-1,CCDS7150.1,E3B1,ENSG00000136754.17,NM_00...",[7LXE],4
2,ABL1,v-abl Abelson murine leukemia viral oncogene h...,25.0,9:130713946-130885683,1,Yes,9q34.12,yes,,"CML, ALL, T-ALL",...,L,Dom,"oncogene, fusion","T, Mis","BCR, ETV6, NUP214",,,"ABL,CCDS35165.1,ENSG00000097007.17,JTK7,NM_007...","[1AB2, 1AWO, 1BBZ, 1JU5, 1OPL, 1ZZP, 2ABL, 2E2...",81


In [16]:
# count number of elements in each pdb_structure
lens = []
for i in pdb_structures:
    # if value is list
    if isinstance(i, list):
        lens.append(len(i))
    elif i is not None:
        lens.append(1)
    else:
        lens.append(0)

cgc_data['n_structures'] = lens
cgc_data.head(3)

Unnamed: 0,Gene Symbol,Name,Entrez GeneId,Genome Location,Tier,Hallmark,Chr Band,Somatic,Germline,Tumour Types(Somatic),...,Tissue Type,Molecular Genetics,Role in Cancer,Mutation Types,Translocation Partner,Other Germline Mut,Other Syndrome,Synonyms,PDB Structures,n_structures
0,A1CF,APOBEC1 complementation factor,29974.0,10:50799421-50885675,2,,10q11.23,yes,,melanoma,...,E,,oncogene,Mis,,,,"ACF,ACF64,ACF65,APOBEC1CF,ASP,CCDS73133.1,ENSG...",[2CPD],1
1,ABI1,abl-interactor 1,10006.0,10:26746593-26860935,1,Yes,10p12.1,yes,,AML,...,L,Dom,"TSG, fusion",T,KMT2A,,,"ABI-1,CCDS7150.1,E3B1,ENSG00000136754.17,NM_00...",[7LXE],1
2,ABL1,v-abl Abelson murine leukemia viral oncogene h...,25.0,9:130713946-130885683,1,Yes,9q34.12,yes,,"CML, ALL, T-ALL",...,L,Dom,"oncogene, fusion","T, Mis","BCR, ETV6, NUP214",,,"ABL,CCDS35165.1,ENSG00000097007.17,JTK7,NM_007...","[1AB2, 1AWO, 1BBZ, 1JU5, 1OPL, 1ZZP, 2ABL, 2E2...",81


In [17]:
cgc_data.to_csv('Census_all_with_pdb.csv', index=False)

In [9]:
tmp_genes = ["KRAS", "EGFR", "TP53", "HRAS"]

In [10]:
# download pdb files

resolutions = defaultdict(dict)

# count number of structures for all gene
n_total = cgc_data[cgc_data['Gene Symbol'].isin(tmp_genes)]["n_structures"].sum()

for gene in tmp_genes:
    pdb_structures = cgc_data[cgc_data['Gene Symbol'] == gene]['PDB Structures'].values[0]
    if pdb_structures:
        for pdb_id in pdb_structures:
            n_total -= 1
            print(f"Downloading PDB file {pdb_id} for {gene} ({n_total} structures remaining)", end='\r')
            # check if the file already exists
            if not os.path.exists(f'pdb_files/{pdb_id}.pdb'):
                dwl_pdb_file(pdb_id)
            resolution = get_resolution(pdb_id)
            resolutions[gene][pdb_id] = resolution

Failed to download PDB file 7Q9U. Status code: 404 remaining))
Failed to get resolution for 7Q9U with error [Errno 2] No such file or directory: 'pdb_files/7Q9U.pdb'
Downloading PDB file 8FG4 for HRAS (0 structures remaining)))

In [18]:
def sort_pdb_ids_by_resolution(gene_id):
    pdb_resolutions = resolutions[gene_id]
    sorted_pdb_ids = sorted(pdb_resolutions, key=lambda pdb_id: float('inf') if pdb_resolutions[pdb_id] is None else float(pdb_resolutions[pdb_id]))
    sorted_resolutions = [(pdb_id, pdb_resolutions[pdb_id]) for pdb_id in sorted_pdb_ids]
    return sorted_resolutions

top_structures = dict()
n = 30
for gene_id in resolutions:
    sorted_pdb_resolutions = sort_pdb_ids_by_resolution(gene_id)
    print(f"\nTop {n} structures for Gene {gene_id}:")
    for pdb_id, resolution in sorted_pdb_resolutions[:n]:
        print(f"PDB ID: {pdb_id}, Resolution: {resolution}")
    top_structures[gene_id] = [i[0] for i in sorted_pdb_resolutions[:n]]


Top 30 structures for Gene KRAS:
PDB ID: 6P0Z, Resolution: 1.01
PDB ID: 8ONV, Resolution: 1.01
PDB ID: 8AZZ, Resolution: 1.02
PDB ID: 4QL3, Resolution: 1.04
PDB ID: 8AZX, Resolution: 1.04
PDB ID: 8B00, Resolution: 1.04
PDB ID: 8AZV, Resolution: 1.05
PDB ID: 8AZY, Resolution: 1.09
PDB ID: 8B78, Resolution: 1.11
PDB ID: 8AFB, Resolution: 1.12
PDB ID: 4TQA, Resolution: 1.13
PDB ID: 4LDJ, Resolution: 1.15
PDB ID: 6TAN, Resolution: 1.16
PDB ID: 7O70, Resolution: 1.18
PDB ID: 8EDY, Resolution: 1.18
PDB ID: 8EER, Resolution: 1.18
PDB ID: 4Q03, Resolution: 1.20
PDB ID: 7R0N, Resolution: 1.20
PDB ID: 8EBZ, Resolution: 1.20
PDB ID: 8TXH, Resolution: 1.20
PDB ID: 4OBE, Resolution: 1.24
PDB ID: 6QUW, Resolution: 1.24
PDB ID: 5XCO, Resolution: 1.25
PDB ID: 7RT1, Resolution: 1.27
PDB ID: 7T47, Resolution: 1.27
PDB ID: 4LUC, Resolution: 1.29
PDB ID: 4Q01, Resolution: 1.29
PDB ID: 6B0V, Resolution: 1.29
PDB ID: 7RT5, Resolution: 1.29
PDB ID: 7RPZ, Resolution: 1.30

Top 30 structures for Gene EGFR:
PD

In [36]:
# save top_structures to a file
with open(f'top_{n}_structures.txt', 'w') as f:
	f.write(str(top_structures))

In [2]:
# read top_structures from file
with open(f'top_30_structures.txt', 'r') as f:
	top_structures = ast.literal_eval(f.read())
top_structures

{'KRAS': ['6P0Z',
  '8ONV',
  '8AZZ',
  '4QL3',
  '8AZX',
  '8B00',
  '8AZV',
  '8AZY',
  '8B78',
  '8AFB',
  '4TQA',
  '4LDJ',
  '6TAN',
  '7O70',
  '8EDY',
  '8EER',
  '4Q03',
  '7R0N',
  '8EBZ',
  '8TXH',
  '4OBE',
  '6QUW',
  '5XCO',
  '7RT1',
  '7T47',
  '4LUC',
  '4Q01',
  '6B0V',
  '7RT5',
  '7RPZ'],
 'EGFR': ['8A27',
  '8A2D',
  '5UG9',
  '5HG8',
  '8A2A',
  '5UG8',
  '3POZ',
  '6TFV',
  '6TG0',
  '3VRP',
  '5HG5',
  '5CNO',
  '5UGC',
  '3G5Y',
  '5U8L',
  '6TG1',
  '7SI1',
  '8A2B',
  '3W33',
  '6TFY',
  '4I22',
  '6WXN',
  '6V66',
  '3P0Y',
  '3W32',
  '4I24',
  '5GNK',
  '6TFZ',
  '5UGA',
  '7JXQ'],
 'TP53': ['3D06',
  '5MHC',
  '6GGC',
  '6SHZ',
  '4MZI',
  '6GGE',
  '3LW1',
  '5O1E',
  '6RL3',
  '8E7A',
  '5O1C',
  '5O1H',
  '6GGB',
  '6GGF',
  '7B4N',
  '3ZME',
  '5AOK',
  '5G4N',
  '5O1G',
  '6V4F',
  '5AB9',
  '5O1D',
  '8A92',
  '5G4M',
  '5O1F',
  '7B4H',
  '1YC5',
  '3D08',
  '5A7B',
  '5O1I'],
 'HRAS': ['2CE2',
  '2EVW',
  '2CLD',
  '2CL6',
  '2CL7',
  '5WDQ',
  '1C

In [3]:
host = "m3-dtn.massive.org.au"
username = "yliy0004"
with open("pw.txt", "r") as f:
    password = f.read()
base_path = 'ym65_scratch/yliy0004/NetworkAnalysis'

client = paramiko.client.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect(host, username=username, password=password)

# check if R module is loaded (uses .bashrc - not the best solution because STRUDEL cannot be used, make sure to correct this if using STRUDEL)
stdin,stdout,stderr=client.exec_command('R --version') 
print(stdout.read().decode())
print(stderr.read().decode())

sftp_client=client.open_sftp()

# ==================== SBNA ====================
# ## multiple structures by gene
for gene_id in top_structures:
    for pdb_id in top_structures[gene_id]:

# ## or certain genes
# # for gene in ['HRAS']:
#     # tmp = ast.literal_eval(cgc_data.loc[cgc_data['Gene Symbol'] == gene, 'PDB Structures'].values[0])
#     # for pdb_id in tmp:
#     # for pdb_id in top_structures[gene]:
        try:
            resolution = get_resolution(pdb_id)
            if resolution and float(resolution) <= 3.0:
                calculate_sbna_and_download(client, sftp_client, pdb_id, base_path)
        except Exception as e:
            print(f"Failed to process {pdb_id} with error:")
            print(traceback.print_exc())
            continue

## or simply a few pdbs
# for pdb_id in ["1D5R", "1JM7", "1T29", "4FMQ", "4NIF"]:
#     try:
#         calculate_sbna_and_download(client, sftp_client, pdb_id, base_path)
#     except Exception as e:
#         print(f"Failed to process {pdb_id} with error:")
#         print(traceback.print_exc())
#         continue

## or one pdb
# calculate_sbna_and_download(client, sftp_client, "1C26", base_path, chains=['A'])

# close the connection
client.close()
sftp_client.close()

R version 4.0.5 (2021-03-31) -- "Shake and Throw"
Copyright (C) 2021 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under the terms of the
GNU General Public License versions 2 or 3.
For more information about these matters see
https://www.gnu.org/licenses/.




Processing 5CNO with chain X...
Starting phenix.pdbtools
on Fri Apr 26 19:56:54 2024 by yliy0004

Processing files:
-------------------------------------------------------------------------------

  Found model, 5CNO.pdb

Processing PHIL parameters:
-------------------------------------------------------------------------------

  Adding command-line PHIL:
  -------------------------
    keep=Chain X

Final processed PHIL parameters:
-------------------------------------------------------------------------------
  data_manager {
    model {
      file = "5CNO.pdb"
    }
    default_model = "5CNO.pdb"


In [13]:
client.close()
sftp_client.close()

# if you want to add more results to final_data
(after having done more sbna iterations)
* reads the pdb structures related to each gene in `cgc_data`
* read the current `final_data.csv`
* appends the new sbna_results to `final_data`

In [2]:
cgc_data = pd.read_csv('Census_all_with_pdb.csv')
cgc_data.head(3)

Unnamed: 0,Gene Symbol,Name,Entrez GeneId,Genome Location,Tier,Hallmark,Chr Band,Somatic,Germline,Tumour Types(Somatic),...,Tissue Type,Molecular Genetics,Role in Cancer,Mutation Types,Translocation Partner,Other Germline Mut,Other Syndrome,Synonyms,PDB Structures,n_structures
0,A1CF,APOBEC1 complementation factor,29974.0,10:50799421-50885675,2,,10q11.23,yes,,melanoma,...,E,,oncogene,Mis,,,,"ACF,ACF64,ACF65,APOBEC1CF,ASP,CCDS73133.1,ENSG...",['2CPD'],1
1,ABI1,abl-interactor 1,10006.0,10:26746593-26860935,1,Yes,10p12.1,yes,,AML,...,L,Dom,"TSG, fusion",T,KMT2A,,,"ABI-1,CCDS7150.1,E3B1,ENSG00000136754.17,NM_00...",['7LXE'],1
2,ABL1,v-abl Abelson murine leukemia viral oncogene h...,25.0,9:130713946-130885683,1,Yes,9q34.12,yes,,"CML, ALL, T-ALL",...,L,Dom,"oncogene, fusion","T, Mis","BCR, ETV6, NUP214",,,"ABL,CCDS35165.1,ENSG00000097007.17,JTK7,NM_007...","['1AB2', '1AWO', '1BBZ', '1JU5', '1OPL', '1ZZP...",81


In [6]:
#tmp_genes = ["KRAS", "EGFR", "TP53", "HRAS"]
# tmp_genes = ["KRAS", "TP53", "HRAS"]
# tmp_genes = ["EGFR"]

In [3]:
final_data = pd.DataFrame()

In [7]:
final_data = list(final_data.values)

In [8]:
print(len(final_data))

33470


In [4]:
filtered_data = cgc_data[cgc_data['PDB Structures'].notna()] # pdb_structures not nan
# filtered_data = filtered_data[filtered_data['Gene Symbol'].isin(tmp_genes)]

count = filtered_data.shape[0]
for i, row in filtered_data.iterrows():
    print(f"Processing {i+1} of {count}", end='\r')
    gene_symbol = row['Gene Symbol']
    pdb_structures = row['PDB Structures']
    for pdb_id in ast.literal_eval(pdb_structures):
        if not os.path.exists(f'../pdb_files/{pdb_id}.pdb'):
            continue
        else:
            data = get_dbref_data(pdb_id)
            resolution = get_resolution(pdb_id)
            for i in data:
                chain = i['chain']

                # check if finalsum exists or if data is already in final_data
                if not os.path.exists(f'../sbna_results/{pdb_id}/{chain}/{pdb_id}_monomer/FinalSum') or\
                      not os.path.exists(f'../sbna_results/{pdb_id}/{chain}/{pdb_id}_monomer/FinalSum_Decomp') or \
                    len(final_data) and len(final_data[(final_data['pdb_id']==pdb_id) & (final_data['chain']==chain)]):
                    continue    
                      
                uniprot_id = i['uniprot']
                assoc_gene = get_gene_name(uniprot_id)
                if assoc_gene != gene_symbol:
                    # sometimes protein chain extends multiple genes, skip if not the same
                    # in future can link if gene is in cgc_data/tcga_data
                    continue

                # read finalSum file and add to table
                final_sum = read_finalsum(pdb_id, chain)
                decomp = read_finalsum_decomp(pdb_id, chain)
                final_sum = final_sum.merge(decomp, on=['res_code', 'num']).drop(['res_x', 'res_y', 'Acid'], axis=1)
                final_sum = align_finalsum_with_uniprot(final_sum, pdb_id, chain)                
                final_sum['residue_match'] = final_sum['uniprot_res'] == final_sum['res_code'] 

                final_sum['pdb_id'] = pdb_id
                final_sum['chain'] = chain
                final_sum['uniprot_id'] = uniprot_id
                final_sum['gene'] = assoc_gene

                final_data = pd.concat([final_data, final_sum])
                
#                final_data.append([gene_symbol, pdb_id, chain, uniprot_id, assoc_gene, resolution, num, row['res_code'], 
#                                row['network_score'], row['SecondOrderIntermodularDegree_AVERAGE'], row['NodeEdgeBetweennessSTRIDE_sidechain_MAX'],
#                                row['LigandMULTIMERCENTROIDSC_MIN'], outside_range, residue_match])
final_data

Multiple residues for the same number detected for 8H1T_L
Error in handling multiple residues for the same number, using all values.
Multiple residues for the same number detected for 1ICF_I
Error in handling multiple residues for the same number, using all values.
Multiple residues for the same number detected for 4AP2_B
Multiple residues for the same number detected for 2WUH_A
Multiple residues for the same number detected for 6V5B_A
Multiple residues for the same number detected for 4UIP_A
Multiple residues for the same number detected for 7MN5_B
Multiple residues for the same number detected for 5H8F_A
Multiple residues for the same number detected for 5E95_A
Multiple residues for the same number detected for 5XCO_A


TypeError: sequence has unexpected type NoneType

In [10]:
final_data = pd.DataFrame(final_data, columns=['gene_symbol', 'pdb_id', 'chain', 'uniprot_id', 'assoc_gene', 'resolution', 'num', 'res_code', 'uniprot_res', 
                                  'network_score', 'SecondOrderIntermodularDegree_AVERAGE', 'NodeEdgeBetweennessSTRIDE_sidechain_MAX',
                                  'LigandMULTIMERCENTROIDSC_MIN', 'outside_range', 'residue_match'])
final_data = final_data.sort_values(["assoc_gene", "pdb_id", "chain", "uniprot_id"], ascending=True)
final_data.head()

Unnamed: 0,gene_symbol,pdb_id,chain,uniprot_id,assoc_gene,resolution,num,res_code,uniprot_res,network_score,SecondOrderIntermodularDegree_AVERAGE,NodeEdgeBetweennessSTRIDE_sidechain_MAX,LigandMULTIMERCENTROIDSC_MIN,outside_range,residue_match
0,EGFR,3POZ,A,P00533,EGFR,1.5,701,Q,Q,-3.791355,-1.246142,-0.541346,2.003868,False,True
1,EGFR,3POZ,A,P00533,EGFR,1.5,702,A,A,-2.634055,-1.088502,-0.455538,1.090015,False,True
2,EGFR,3POZ,A,P00533,EGFR,1.5,703,L,L,-1.231055,-0.643414,-0.105177,0.482465,False,True
3,EGFR,3POZ,A,P00533,EGFR,1.5,704,L,L,0.452193,0.377611,0.205135,0.130553,False,True
4,EGFR,3POZ,A,P00533,EGFR,1.5,705,R,R,-1.086163,0.107137,-0.523559,0.669741,False,True


In [11]:
print(len(final_data))

34998


In [8]:
final_data.to_csv('final_data_aligned.csv', index=False)

In [2]:
final_data = pd.read_csv('final_data.csv')

In [3]:
new = pd.DataFrame()

# create a dictionary to save uniprots, and sequences so don't have to query every time...

In [5]:
tmp = final_data[(final_data['pdb_id'].isin(['3POZ', "8E7A"])) & (final_data['chain'] == "A")]

In [6]:
# count = len(new)

align_finalsum_with_uniprot(tmp)

# for _, row in final_data.iterrows():
#     pdb_id, chain = row['pdb_id'], row['chain']
#     if not os.path.exists(f'../sbna_results/{pdb_id}/{chain}/{pdb_id}_monomer/FinalSum'):
#         continue
#     final_sum = final_data[(final_data['pdb_id'] == pdb_id) & (final_data['chain'] == chain)]
#     new = pd.concat([new, align_finalsum_with_uniprot(final_sum, pdb_id, chain)])
#     count += 1
#     print(f"Processed {count} of {len(final_data)}", end='\r')
#     break
# new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_sum['pdb_chain'] = final_sum['pdb_id'] + "_" + final_sum['chain']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_sum['uniprot_num'] = [item for sublist in uniprot_nums for item in sublist]


Unnamed: 0,gene_symbol,pdb_id,chain,uniprot_id,assoc_gene,resolution,num,res_code,uniprot_res,network_score,SecondOrderIntermodularDegree_AVERAGE,NodeEdgeBetweennessSTRIDE_sidechain_MAX,LigandMULTIMERCENTROIDSC_MIN,outside_range,residue_match,pdb_chain,uniprot_num
0,EGFR,3POZ,A,P00533,EGFR,1.5,701,Q,Q,-3.791355,-1.246142,-0.541346,2.003868,False,True,3POZ_A,701
1,EGFR,3POZ,A,P00533,EGFR,1.5,702,A,A,-2.634055,-1.088502,-0.455538,1.090015,False,True,3POZ_A,702
2,EGFR,3POZ,A,P00533,EGFR,1.5,703,L,L,-1.231055,-0.643414,-0.105177,0.482465,False,True,3POZ_A,703
3,EGFR,3POZ,A,P00533,EGFR,1.5,704,L,L,0.452193,0.377611,0.205135,0.130553,False,True,3POZ_A,704
4,EGFR,3POZ,A,P00533,EGFR,1.5,705,R,R,-1.086163,0.107137,-0.523559,0.669741,False,True,3POZ_A,705
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33465,TP53,8E7A,A,P04637,TP53,1.3,287,E,E,-1.837237,-1.145720,-0.691517,0.000000,False,True,8E7A_A,287
33466,TP53,8E7A,A,P04637,TP53,1.3,288,N,N,-1.837237,-1.145720,-0.691517,0.000000,False,True,8E7A_A,288
33467,TP53,8E7A,A,P04637,TP53,1.3,289,L,L,-1.196778,-0.636678,-0.560100,0.000000,False,True,8E7A_A,289
33468,TP53,8E7A,A,P04637,TP53,1.3,290,R,R,-1.624046,-0.933228,-0.690818,0.000000,False,True,8E7A_A,290


In [10]:
new.to_csv('final_data_aligned.csv', index=False)