## Observations and Insights

## Dependencies and starter code

In [28]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import sem

# Study data files. The paths get their own names so that `mouse_metadata`
# and `study_results` only ever refer to DataFrames, never to plain strings.
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)


In [29]:
# Number of rows in the mouse metadata table
mouse_metadata.shape[0]

249

In [30]:
# Number of rows in the study results table
study_results.shape[0]

1893

In [31]:
# Attach each mouse's metadata to its study measurements. The left join keeps
# every row of study_results and matches on the shared "Mouse ID" column.
complete_study_data = study_results.merge(mouse_metadata, how="left", on="Mouse ID")
complete_study_data.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,f932,0,45.0,0,Ketapril,Male,15,29
2,g107,0,45.0,0,Ketapril,Female,2,29
3,a457,0,45.0,0,Ketapril,Female,11,30
4,c819,0,45.0,0,Ketapril,Male,21,25


In [32]:
# Count the distinct mouse IDs in the merged data (no rows are removed here)
complete_study_data['Mouse ID'].nunique()


249

In [33]:
# Flag rows that repeat an earlier (Mouse ID, Timepoint) pair, then collect
# the distinct IDs of the mice those rows belong to.
is_repeated_row = complete_study_data.duplicated(subset=["Mouse ID", "Timepoint"])
duplicate_mouse_ids = complete_study_data.loc[is_repeated_row, "Mouse ID"].unique()
duplicate_mouse_ids

array(['g989'], dtype=object)

In [34]:
# Show every row belonging to the mice listed in duplicate_mouse_ids, rather
# than hard-coding a single ID, so this view follows the detection step.
complete_study_data.loc[complete_study_data["Mouse ID"].isin(duplicate_mouse_ids)]

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
107,g989,0,45.0,0,Propriva,Female,21,26
137,g989,0,45.0,0,Propriva,Female,21,26
329,g989,5,48.786801,0,Propriva,Female,21,26
360,g989,5,47.570392,0,Propriva,Female,21,26
620,g989,10,51.745156,0,Propriva,Female,21,26
681,g989,10,49.880528,0,Propriva,Female,21,26
815,g989,15,51.325852,1,Propriva,Female,21,26
869,g989,15,53.44202,0,Propriva,Female,21,26
950,g989,20,55.326122,1,Propriva,Female,21,26
1111,g989,20,54.65765,1,Propriva,Female,21,26


In [35]:
# Keep only the rows whose Mouse ID is not among the flagged duplicate IDs
clean_study_data = complete_study_data.loc[
    ~complete_study_data["Mouse ID"].isin(duplicate_mouse_ids)
]
clean_study_data.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,f932,0,45.0,0,Ketapril,Male,15,29
2,g107,0,45.0,0,Ketapril,Female,2,29
3,a457,0,45.0,0,Ketapril,Female,11,30
4,c819,0,45.0,0,Ketapril,Male,21,25


In [36]:
# Number of distinct mouse IDs left in the cleaned data
clean_study_data["Mouse ID"].nunique(dropna=False)

248

## Summary statistics

In [42]:
# Overall summary of the tumor volume column across the cleaned data
tumorvolume = clean_study_data["Tumor Volume (mm3)"]

# Measures of central tendency for tumor volume
mean_tumorvolume = np.mean(tumorvolume)
print(f"The mean tumor volume is {mean_tumorvolume}")

median_tumorvolume = np.median(tumorvolume)
print(f"The median tumor volume is {median_tumorvolume}")

# st.mode returns a ModeResult; report only the modal value. np.atleast_1d
# handles both the array-valued and the scalar-valued form of `.mode`.
mode_tumorvolume = st.mode(tumorvolume)
mode_value = np.atleast_1d(mode_tumorvolume.mode)[0]
print(f"The mode tumor volume is {mode_value}")

# Standard error of the mean
sem_tumorvolume = st.sem(tumorvolume)
print(f"The SEM value for the tumor volume is {sem_tumorvolume}")


The mean tumor volume is 50.43529323225536
The median tumor volume is 48.933453655
The mode tumor volume is ModeResult(mode=array([45.]), count=array([248]))
The SEM value for the tumor volume is 0.20559062186986257
The SEM value for the tumor volume data is 0.20559062186986257


In [2]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

## Bar plots

In [3]:
# Generate a bar plot showing number of data points for each treatment regimen using pandas

In [4]:
# Generate a bar plot showing number of data points for each treatment regimen using pyplot

## Pie plots

In [5]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [6]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

## Quartiles, outliers and boxplots

In [7]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 

In [8]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and scatter plots

In [9]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

In [10]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

In [11]:
# Calculate the correlation coefficient and linear regression model for mouse weight and average tumor volume for the Capomulin regimen