# worksheet8: Pandas dataframes

In [None]:
import pandas as pd
import numpy as np

## DataFrame
- represents a rectangular table of data and contains an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.)
- can be thought of as a collection of Series (columns)
- has both rows and columns, each with their own index
- different ways to create a DataFrame: from series, from dict, from a file 

### df from a series
- ser.to_frame()
- pd.DataFrame

In [None]:
gene_lengths_kb = [2, 4, 4, 5 ]

In [None]:
gene_names = ['BRCA1','BRCA2', 'SMAD2', 'TTN']

In [None]:
# Pair each gene length with its gene symbol so rows can be selected by name.
# The Series is left unnamed, so to_frame() labels its single column 0.
gene_lengths_kb_ser = pd.Series(gene_lengths_kb, index=gene_names)

In [None]:
gene_lengths_kb_df = gene_lengths_kb_ser.to_frame()


In [None]:
gene_lengths_kb_df

In [None]:
pd.DataFrame(gene_lengths_kb_ser)

In [None]:
pd.DataFrame(gene_lengths_kb_ser, index=['BRCA1', 'TTN'])

### row labels (index) of DataFrame

In [None]:
gene_lengths_kb_df.index

### column labels (index) of DataFrame
- default is a range index if no index specified

In [None]:
gene_lengths_kb_df.columns

### df.values

In [None]:
gene_lengths_kb_df.values

In [None]:
gene_lengths_kb_df.values.ndim

### df from a dict
- pd.DataFrame.from_dict()
- pd.DataFrame

#### pd.DataFrame.from_dict()

In [None]:
gene_expression = [2, 4, 4, 5]

In [None]:
# Map each gene symbol to a two-point list: [expression, expression + 1].
d = dict(
    zip(gene_names, ([level, level + 1] for level in gene_expression))
)

In [None]:
d

In [None]:
d.keys()

#### pd.DataFrame.from_dict()
- keys of the dict = col labels (default behaviour)
- if keys of the dict = row labels, set `orient` to index


#### default behaviour (keys = columns)

In [None]:
gene_expression_df = pd.DataFrame.from_dict(d)

In [None]:
gene_expression_df

#### redefine the index if you need
- using .index
- using rename

In [None]:
gene_expression_df.index=['time.0', 'time.1']

In [None]:
gene_expression_df

In [None]:
pd.DataFrame.from_dict(d).rename(index={0:'time.0', 1:'time.1'})

#### assign keys = row labels

In [None]:
d

In [None]:
gene_expression_df = pd.DataFrame.from_dict(d, orient='index')

In [None]:
gene_expression_df

### Q: can you change the colnames?

#### pd.DataFrame 
- takes `index` argument
- constructs default index if none specified
- can transpose this dataframe to make rows the columns, and columns the rows

In [None]:
d

In [None]:
# Label both rows explicitly: one label per value held in each dict entry.
pd.DataFrame(d, index=['time.0', 'time.1'])

In [None]:
pd.DataFrame(d)

In [None]:
pd.DataFrame(d).T

### df.dtypes

In [None]:
gene_expression_df

In [None]:
gene_expression_df.dtypes

### add a new column to the df

In [None]:
# One description per row, listed in the same order as the DataFrame index
# (BRCA1, BRCA2, SMAD2, TTN). The TTN gene encodes titin.
gene_expression_df['gene_desc'] = [
    'breast cancer gene 1',
    'breast cancer gene 2',
    'SMAD family member 2',
    'titin',
]

In [None]:
gene_expression_df

In [None]:
gene_expression_df.dtypes

### change dtype of a specific column

In [None]:
gene_expression_df = gene_expression_df.astype({'gene_desc': 'string'})

In [None]:
gene_expression_df

In [None]:
gene_expression_df.dtypes

### df.axes
-  DataFrame has two axes: one for the rows, one for the columns
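- axis=0 (or 'index'): the row axis; a reduction such as `sum` collapses the rows, giving one result per column
- axis=1 (or 'columns'): the column axis; a reduction collapses the columns, giving one result per row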

In [None]:
gene_expression_df

In [None]:
gene_expression_df.columns = ['time.0', 'time.1', 'gene_desc']

In [None]:
gene_expression_df.axes

In [None]:
len(gene_expression_df.axes)

In [None]:
gene_expression_df.axes[0]

In [None]:
gene_expression_df.axes[1]

In [None]:
gene_expression_df['time.0'].sum(axis=0)

In [None]:
gene_expression_df['time.0'].sum(axis='index')

### Q: calculate total gene expression
- sum time.0 and time.1 along axis=1


### Q: how many axes does a series have?
- can you confirm your answer?


### df.shape

In [None]:
gene_expression_df.shape

### df.size
- num_rows * num_cols

In [None]:
gene_expression_df.size

### df.head

In [None]:
gene_expression_df.head(n=3)

### df.empty

In [None]:
gene_expression_df.empty

### df.describe

In [None]:
gene_expression_df.describe()

### DataFrame from a CSV
- Read in a very simple genomics CSV file
- https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html

In [None]:
gene_lengths_data = pd.read_csv('simple_geno.txt')

In [None]:
gene_lengths_data

In [None]:
gene_lengths_data.index = ['gene1', 'gene2', 'gene3', 'gene4']

In [None]:
gene_lengths_data

### Accessing elements of a dataframe
- df.loc
- df.iloc

### df.loc

#### access `gene1` row by label

In [None]:
gene_lengths_data.loc['gene1']

#### Q: Will this work: gene_lengths_data.loc[0]?

#### Q: access `gene1` and `gene2` rows by label

In [None]:
gene_lengths_data.loc[['gene1', 'gene2']]

### get `gene_lengths_kb` output  for `gene1`

In [None]:
gene_lengths_data

In [None]:
# Select row and column in a single .loc call rather than chaining two
# lookups; this avoids building an intermediate row Series.
gene_lengths_data.loc['gene1', 'gene_lengths_kb']

#### Q: get `gene_lengths_kb` output  for `BRCA1`

In [None]:
# Boolean row mask and column label in one .loc call (no chained indexing);
# the result is still a Series holding the matching rows' gene_lengths_kb.
gene_lengths_data.loc[gene_lengths_data['gene_name'] == 'BRCA1', 'gene_lengths_kb']

In [None]:
gene_lengths_data[gene_lengths_data['gene_name'] == 'BRCA1'].loc['gene1']['gene_lengths_kb']

#### access gene_lengths_kb column 

In [None]:
gene_lengths_data['gene_lengths_kb']

In [None]:
gene_lengths_data['gene_lengths_kb'][2:]

In [None]:
gene_lengths_data['gene_lengths_kb'][-2:]

### access df column as an attribute

In [None]:
gene_lengths_data.columns

In [None]:
# access column as an attribute
gene_lengths_data.gene_lengths_kb

### df.iloc

In [None]:
gene_lengths_data

In [None]:
gene_lengths_data.iloc[0]

In [None]:
gene_lengths_data.iloc[0]['gene_lengths_kb']

In [None]:
gene_lengths_data.iloc[3]['gene_lengths_kb']

In [None]:
gene_lengths_data.iloc[0]['gene_name']

## other operators
- df.at[row, col] (label based)
- df.iat[row, col] (integer based)

#### df.at 

In [None]:
gene_lengths_data.at['gene1', 'gene_name']

In [None]:
gene_lengths_data.at['gene4', 'gene_name']

In [None]:
gene_lengths_data.at['gene2', 'gene_lengths_kb']

#### get gene length of `BRCA1`

In [None]:
gene_lengths_data

In [None]:
# Find the row label of the first record whose gene_name is BRCA1, then read
# the scalar at (row label, column label) with .at.
indexname = gene_lengths_data.loc[
    gene_lengths_data['gene_name'] == 'BRCA1'
].index[0]
colname = 'gene_lengths_kb'
gene_lengths_data.at[indexname, colname]

### Q: will the following work?

In [None]:
# gene_lengths_data.at['gene2', : ]

### df.iat[integer, integer]
- this is analogous to .iloc

In [None]:
gene_lengths_data.iat[0, 1]

In [None]:
gene_lengths_data.iat[0, 0]

### Q: will this work?

In [None]:
# gene_lengths_data.iat[:3, 0]

### Looping and aggregation

#### apply and other methods: 
- Data standardization:
- Scale the data in each numeric column to have a mean of 0 and stddev of 1
- calculate the mean and stddev for each column
- subtract the mean and divide the result by the stddev

In [None]:
gene_expression_df

In [None]:
# Keep every numeric column, not just int64: 'number' also matches other
# integer widths and floats, so the selection does not depend on the
# platform's default integer dtype.
gene_expression_df_numeric = gene_expression_df.select_dtypes(include='number')

In [None]:
gene_expression_df_numeric

In [None]:
# Element-wise comparison on the row selected by label with .loc: the result is
# a boolean Series (one True/False per column), not a filtered DataFrame.
gene_expression_df_numeric.loc['BRCA2'] > 4

In [None]:
gene_expression_df_numeric.mean()

In [None]:
gene_expression_df_numeric.std()

In [None]:
(gene_expression_df_numeric - gene_expression_df_numeric.mean())/gene_expression_df_numeric.std()

In [None]:
# Compute the per-column statistics once, instead of once per row inside the
# lambda.
column_means = gene_expression_df_numeric.mean()
column_stds = gene_expression_df_numeric.std()

# Row-wise standardisation: with axis=1 each row arrives as a Series keyed by
# column label, so it aligns with the per-column mean and standard deviation.
gene_expression_df_numeric.apply(
    lambda row: (row - column_means) / column_stds, axis=1
)

### Other useful functions

#### filter operations
- dropna
- fillna

In [None]:
# Keep values strictly between 3 and 6; every other cell becomes NaN.
filter_result_df = gene_expression_df_numeric.where(
    (gene_expression_df_numeric > 3) & (gene_expression_df_numeric < 6)
)

In [None]:
filter_result_df

In [None]:
# drop rows with missing values
filter_result_df.dropna(axis=0)

In [None]:
# drop columns with missing values
filter_result_df.dropna(axis=1)

In [None]:
filter_result_df.fillna(value=0,axis=0)

In [None]:
filter_result_df.fillna(value=0,axis=1)

## Groupby, melting, stacking

### groupby

In [None]:
gene_expression_df

In [None]:
# A scalar is broadcast to every row, so no row count needs to be hard-coded.
gene_expression_df['gene_type'] = 'coding'

In [None]:
gene_expression_df

In [None]:
# Build one column per non-coding gene (two missing values, a description and
# the gene type), then transpose so that each gene becomes a row.
non_coding_df = pd.DataFrame(
    {
        gene: [np.nan, np.nan, description, 'non-coding']
        for gene, description in [
            ('mirna123', 'micro rna 123'),
            ('mirna456', 'micro rna 456'),
            ('lncRNA1', 'long non-coding rna 1'),
        ]
    }
).T

In [None]:
non_coding_df

In [None]:
non_coding_df.columns=['time.0', 'time.1', 'gene_desc', 'gene_type']

In [None]:
non_coding_df

#### pd.concat 
- stack dataframes on top of one another (or next to each other)
- default is axis=0 (stack on top of one another)
- axis=1 (stack next to each other)

In [None]:
pd.concat([gene_expression_df, non_coding_df])

In [None]:
pd.concat([gene_expression_df, non_coding_df], axis=1)

In [None]:
groupby_df = pd.concat([gene_expression_df, non_coding_df])

In [None]:
groupby_df

In [None]:
groupby_df.groupby('gene_type')

In [None]:
groupby_df.groupby('gene_type')['time.0'].mean()

In [None]:
groupby_df.groupby('gene_type')['time.1'].mean()

In [None]:
groupby_df.groupby('gene_type').count()

In [None]:
groupby_df.groupby('gene_type').gene_desc.count()

In [None]:
groupby_df.groupby('gene_type')['time.0'].agg(['mean', 'min', 'max'])

In [None]:
groupby_df.groupby('gene_type')['time.0'].count()

### named aggregations in groupby

In [None]:
# Named aggregation: each keyword becomes an output column, computed from the
# given input column with the given aggregation function.
groupby_df.groupby('gene_type').agg(
    mean_time_0=pd.NamedAgg(column='time.0', aggfunc='mean'),
    min_time_0=pd.NamedAgg(column='time.0', aggfunc='min'),
    max_time_0=pd.NamedAgg(column='time.0', aggfunc='max'),
)

### df.melt
- reshape a dataframe for analysis or plotting purposes
- transform a df from a wide format to a long format
- fixed columns specified by `id_vars`
- columns to melt specified by `value_vars`

In [None]:
groupby_df

### melt dataframe by 
- gene expression at time points 0 and 1
- retain index (gene_name), gene_desc and gene_type as fixed

In [None]:
df_to_melt = groupby_df.reset_index().rename(columns={'index':'gene_name'})

In [None]:
df_to_melt

In [None]:
df_to_melt.melt(id_vars=['gene_name','gene_type', 'gene_desc'],
                value_vars=['time.0', 'time.1']
               )

In [None]:
# Name the melted columns directly through melt's var_name / value_name
# parameters instead of renaming the default 'variable' / 'value' afterwards.
df_to_melt.melt(
    id_vars=['gene_name', 'gene_type', 'gene_desc'],
    value_vars=['time.0', 'time.1'],
    var_name='time_points',
    value_name='expression_value',
)

### stacking
- pd.concat

### other pandas functions to review
- df.iterrows
- df.itertuples
- df.duplicated
- df.mask
- df.drop_duplicates
- df.nunique
- df.sort_values
- full DataFrame API reference (browse the method list in the left-hand navigation): https://pandas.pydata.org/docs/reference/frame.html

# Collaborative Exercises

### Exercise 1
- Construct a toy EEG signal dataset and normalize it to scale the data between 0 and 1
- Your dataset should have 5 simulated signals recorded in microvolts for 5 different electrodes
```
electrode1: [100, 90, 75, 10, 20]
electrode2: [150, 85, 75, 45, 60]
....
```
- You can choose to implement Min-max normalization using
  `Xnorm = (X - Xmin)/(Xmax - Xmin)`,  where X = df
- Explain results, and why you would want to normalize the data

### Exercise 2
- Build a toy brain MRI metadata dataset from 20 patients
- 10 patients are control, 10 have alzheimers diagnosis (n=20 rows)
- You will write this data to a file in CSV format 
- Introduce duplicate rows and nan values
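- See example structure below (missing values are written as `NaN`, which `pd.read_csv` parses as missing; the literal text `np.nan` would be read as a string):

```
subject_id,subject_age,brain_volume_mm3,hippocampus_volume_mm3,diagnosis
1,55,2200000,3500,Control
2,75,1100000,NaN,alzheimers
3,65,100000,1800,alzheimers
4,45,NaN,3280,Control
....
```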
- Load this data in a pandas dataframe
- drop duplicates (test by row and column), and explain which method you prefer
- find the nan values and fill them with the mean of the corresponding column
- add an additional column for APOE4 status which predicts the risk for alzheimers disease (two values: [positive, negative])
- what is the mean age by diagnosis type? 
- how many distinct values are there for a combination of APOE4 status and diagnosis?
