# Pandas
Pandas is a data analysis module for Python. It can be used with any type of data, including chemistry and biology data. Data can be stored in tables called dataframes.

# Install necessary libraries
You only need to run this if you do not already have pandas installed.

In [None]:
# Notebook magic command that installs the pandas package; skip this cell if pandas is already installed.
%pip install pandas

# Loading a Dataframe
You can load dataframes from a comma-separated values (CSV) file. CSV files have values separated by commas. Excel can open CSV files as spreadsheets. During debugging, if you want to look at your input CSV file, it can be helpful to open it in Excel. <br/>
We will use data on protein structure from the PDB, taken from this source: https://www.kaggle.com/datasets/alfrandom/protein-secondary-structure

In [1]:
import pandas as pd

# Path of the CSV file to load, relative to the notebook's working directory.
csv_path = "2018-06-06-pdb-intersect-pisces.csv"

# Parse the CSV file into a dataframe.
df = pd.read_csv(csv_path)

# Leaving head() as the cell's last expression makes the notebook display
# the first rows of the table.
df.head()

Unnamed: 0,pdb_id,chain_code,seq,sst8,sst3,len,has_nonstd_aa,Exptl.,resolution,R-factor,FreeRvalue
0,1FV1,F,NPVVHFFKNIVTPRTPPPSQ,CCCCCBCCCCCCCCCCCCCC,CCCCCECCCCCCCCCCCCCC,20,False,XRAY,1.9,0.23,0.27
1,1LM8,H,DLDLEMLAPYIPMDDDFQLR,CCCCCCCCCBCCSCCCEECC,CCCCCCCCCECCCCCCEECC,20,False,XRAY,1.85,0.2,0.24
2,1O06,A,EEDPDLKAAIQESLREAEEA,CCCHHHHHHHHHHHHHHHTC,CCCHHHHHHHHHHHHHHHCC,20,False,XRAY,1.45,0.19,0.22
3,1QOW,D,CTFTLPGGGGVCTLTSECI*,CCTTSCTTCSSTTSSTTCCC,CCCCCCCCCCCCCCCCCCCC,20,True,XRAY,1.06,0.14,1.0
4,1RDQ,I,TTYADFIASGRTGRRNAIHD,CHHHHHHTSSCSSCCCCEEC,CHHHHHHCCCCCCCCCCEEC,20,False,XRAY,1.26,0.13,0.16


# Examples of what you can do with pandas
You can use pandas to identify rows that meet a certain condition or find the minimum or maximum values. 

In [2]:
# Identify rows with proteins longer than 500 amino acids.
# Build the boolean mask first, then use it to select the matching rows.
is_long = df["len"] > 500
df_long_protein = df[is_long]
print(df_long_protein)

# Count how many rows remain in the filtered data frame.
n_long = len(df_long_protein)
print("There are " + str(n_long) + " proteins longer than 500 amino acids")

     pdb_id chain_code                                                seq  \
8518   1WLE          B  MGHHHHHHSSGLVPRGSATERQDRNLLYEHAREGYSALPLLDMESL...   
8519   2QCU          B  METKDLIVIGGGINGAGIAADAAGRGLSVLMLEAQDLACATSSASS...   
8520   3POP          C  GSHMTASVPPFTVGREDPRYIELSHSDNHRFVVEPEEFFLPATPDD...   
8521   4C12          A  MDASTLFKKVKVKRVLGSLEQQIDDITTDSRTAREGSIFVASVGYT...   
8522   4G26          A  GAGHMASPSENLSRKAKKKAIQQSPEALLKQKLDMCSKKGDVLEAL...   
...     ...        ...                                                ...   
9073   4LGY          A  GDGLVPRGSHMMEILRGSPALSAFRINKLLARFQAANLQVHNIYAE...   
9074   5XH6          A  GSHMTQFEGFTNLYQVSKTLRFELIPQGKTLKHIQEQGFIEEDKAR...   
9075   5B2R          B  GSGHMDKKYSIGLAIGTNSVGWAVITDEYKVPSKKFKVLGNTDRHS...   
9076   5WLH          A  SNAMKISKVREENRGAKLTVNAKTAVVSENRSQEGILYNDPSRYGK...   
9077   5B2P          A  GSHMNFKILPIAIDLGVKNTGVFSAFYQKGTSLERLDNKNGKVYEL...   

                                                   sst8  \
8518  CCCCCCCCCC

In [None]:
# Identify the best and worst resolution structures.
# For structure resolution a SMALLER number means a sharper, better-resolved
# structure (values are presumably in angstroms -- confirm against the dataset
# description). The best structure is therefore the row holding the column
# minimum, and the worst is the row holding the column maximum.
print("best resolution structure")
best_res_row = df.loc[df["resolution"].idxmin()]
print(best_res_row)

print("worst resolution structure")
worst_res_row = df.loc[df["resolution"].idxmax()]
print(worst_res_row)