## Observations and Insights

## Dependencies and starter code

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata = "data/Mouse_metadata.csv"
study_results = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata)
study_results = pd.read_csv(study_results)

# Combine the data into a single dataset
merged = pd.merge(mouse_metadata, study_results, on="Mouse ID")
merged.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


## (SCOTT: Familiarization)

In [8]:
merged.info # only availabile in pandas 1.0 rc

<bound method DataFrame.info of      Mouse ID Drug Regimen   Sex  Age_months  Weight (g)  Timepoint  \
0        k403     Ramicane  Male          21          16          0   
1        k403     Ramicane  Male          21          16          5   
2        k403     Ramicane  Male          21          16         10   
3        k403     Ramicane  Male          21          16         15   
4        k403     Ramicane  Male          21          16         20   
...       ...          ...   ...         ...         ...        ...   
1888     z969     Naftisol  Male           9          30         25   
1889     z969     Naftisol  Male           9          30         30   
1890     z969     Naftisol  Male           9          30         35   
1891     z969     Naftisol  Male           9          30         40   
1892     z969     Naftisol  Male           9          30         45   

      Tumor Volume (mm3)  Metastatic Sites  
0              45.000000                 0  
1              38.825898 

In [9]:
# Familiarize myself w the data/relationships:
merged.describe()

Unnamed: 0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
count,1893.0,1893.0,1893.0,1893.0,1893.0
mean,12.81458,25.662441,19.572108,50.448381,1.021659
std,7.189592,3.921622,14.07946,8.894722,1.137974
min,1.0,15.0,0.0,22.050126,0.0
25%,7.0,25.0,5.0,45.0,0.0
50%,13.0,27.0,20.0,48.951474,1.0
75%,20.0,29.0,30.0,56.2922,2.0
max,24.0,30.0,45.0,78.567014,4.0


In [8]:
merged.dtypes # looks good. All number based column data types are numbers

Mouse ID               object
Drug Regimen           object
Sex                    object
Age_months              int64
Weight (g)              int64
Timepoint               int64
Tumor Volume (mm3)    float64
Metastatic Sites        int64
dtype: object

In [12]:
# What are the drug regimens?
grp = "Drug Regimen"
merged.groupby(grp).groups.keys()

dict_keys(['Capomulin', 'Ceftamin', 'Infubinol', 'Ketapril', 'Naftisol', 'Placebo', 'Propriva', 'Ramicane', 'Stelasyn', 'Zoniferol'])

In [19]:
# How many rows in the groups?
#len(merged.groupby([grp]).groups['Ceftamin'])
merged.groupby(grp).count()

Unnamed: 0_level_0,Mouse ID,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Capomulin,230,230,230,230,230,230,230
Ceftamin,178,178,178,178,178,178,178
Infubinol,178,178,178,178,178,178,178
Ketapril,188,188,188,188,188,188,188
Naftisol,186,186,186,186,186,186,186
Placebo,181,181,181,181,181,181,181
Propriva,161,161,161,161,161,161,161
Ramicane,228,228,228,228,228,228,228
Stelasyn,181,181,181,181,181,181,181
Zoniferol,182,182,182,182,182,182,182


In [13]:
aggColumn = 'Tumor Volume (mm3)'

# Create a new dataframe for each, then merge. I tried using the new pd.NamedAgg to do rename in place 
#     but kept getting errors

# Mean:
agg = 'mean'
newName = "TVol_mean"
aggs = merged.groupby(grp).agg({aggColumn: agg})
aggs.columns = [newName]

# Median
agg = 'median'
newName = "TVol_median"
aggs2 = merged.groupby(grp).agg({aggColumn: agg})
aggs2.columns = [newName]
aggMerge = pd.merge(aggs, aggs2, on=grp)

# Variance
agg = 'var'
newName = "TVol_variance"
aggs2 = merged.groupby(grp).agg({aggColumn: agg})
aggs2.columns = [newName]
aggMerge2 = pd.merge(aggMerge, aggs2, on=grp)

# Standard Deviation
agg = 'std'
newName = "TVol_StdDev"
aggs2 = merged.groupby(grp).agg({aggColumn: agg})
aggs2.columns = [newName]
aggMerge3 = pd.merge(aggMerge2, aggs2, on=grp)
aggMerge3.head()

Unnamed: 0_level_0,TVol_mean,TVol_median,TVol_variance,TVol_StdDev
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774
Ceftamin,52.591172,51.776157,39.290177,6.268188
Infubinol,52.884795,51.820584,43.128684,6.567243
Ketapril,55.235638,53.698743,68.553577,8.279709
Naftisol,54.331565,52.509285,66.173479,8.134708


In [None]:
# Next - scipy.stats.sem
# https://www.programcreek.com/python/example/100333/scipy.stats.sem
# https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/
# https://stackoverflow.com/questions/43233963/how-pandas-calculate-sem
# Module 3 - Activity 3


## Summary statistics

In [17]:
merged.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [12]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# merged.groupby("Drug Regimen").agg({"returns": [np.mean, np.median, np.var, np.std, st.sem]})
merged.groupby("Drug Regimen").groups.keys()

dict_keys(['Capomulin', 'Ceftamin', 'Infubinol', 'Ketapril', 'Naftisol', 'Placebo', 'Propriva', 'Ramicane', 'Stelasyn', 'Zoniferol'])

## Bar plots

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen using pandas

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen using pyplot

## Pie plots

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

## Quartiles, outliers and boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and scatter plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

In [None]:
# Calculate the correlation coefficient and linear regression model for mouse weight and average tumor volume for the Capomulin regimen