## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
#Methodology: looked at both datasets to see what colun could be used to merge the data. They both share "Mouse ID"
mouse_results = study_results.merge(mouse_metadata, on='Mouse ID')

# Display the data table for preview
mouse_results.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22


In [2]:
# Checking the number of mice.
number_mice = mouse_results['Mouse ID'].nunique()
number_mice

249

In [3]:
#Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 


In [4]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
#Checked the count prior to dropping duplicates -- count as 1,893 and after dropping duplicates it's 1,888
clean_results = mouse_results.drop_duplicates(subset=['Mouse ID', 'Timepoint'], keep='last')

In [5]:
# Checking the number of mice in the clean DataFrame.
clean_results.count()

Mouse ID              1888
Timepoint             1888
Tumor Volume (mm3)    1888
Metastatic Sites      1888
Drug Regimen          1888
Sex                   1888
Age_months            1888
Weight (g)            1888
dtype: int64

In [6]:
clean_results = clean_results.rename(columns={"Tumor Volume (mm3)":"Tumor_volume","Drug Regimen":"Drug"})

In [7]:
clean_results.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor_volume,Metastatic Sites,Drug,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22


## Summary Statistics

In [8]:
#Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

In [12]:
#set up the groupby and the mean and median
drug_group = clean_results.groupby(["Drug"]).Tumor_volume.agg(["mean","median"])

In [13]:
drug_group.head()

Unnamed: 0_level_0,mean,median
Drug,Unnamed: 1_level_1,Unnamed: 2_level_1
Capomulin,40.675741,41.557809
Ceftamin,52.591172,51.776157
Infubinol,52.884795,51.820584
Ketapril,55.235638,53.698743
Naftisol,54.331565,52.509285


In [14]:
#set up dictionaries of tumor size for each drug regimine
capomulin = clean_results.loc[clean_results['Drug']=="Capomulin",['Tumor_volume']]
ceftamin = clean_results.loc[clean_results['Drug']=="Ceftamin",['Tumor_volume']]
infubinol = clean_results.loc[clean_results['Drug']=="Infubinol",['Tumor_volume']]
ketapril = clean_results.loc[clean_results['Drug']=="Ketapril",['Tumor_volume']]
naftisol = clean_results.loc[clean_results['Drug']=="Naftisol",['Tumor_volume']]

In [15]:
# get the variance, standard deviation, and SEM of the tumor volume. 
var_capomulin = np.var(capomulin)
std_capomulin = np.std(capomulin)
sem_capomulin = st.sem(capomulin)

var_ceftamin = np.var(ceftamin)
std_ceftamin = np.std(ceftamin)
sem_ceftamin = st.sem(ceftamin)

var_infubinol = np.var(infubinol)
std_infubinol = np.std(infubinol)
sem_infubinol = st.sem(infubinol)

var_ketapril = np.var(ketapril)
std_ketapril = np.std(ketapril)
sem_ketapril = st.sem(ketapril)

var_naftisol = np.var(naftisol)
std_naftisol = np.std(naftisol)
sem_naftisol = st.sem(naftisol)

In [16]:
# Assemble the resulting series into a single summary dataframe.
stats_summary = {'Drug':['Capomulin', 'Ceftamin' , 'Infubinol', 'Ketapril', 'Naftisol'],
    'Variance':[var_capomulin.values[0], var_ceftamin.values[0], var_infubinol.values[0], var_ketapril.values[0], var_naftisol.values[0]],
                'Std_Dev':[std_capomulin.values[0], std_ceftamin.values[0], std_infubinol.values[0], std_ketapril.values[0], std_naftisol.values[0]],
                 'SEM':[sem_capomulin, sem_ceftamin, sem_infubinol, sem_ketapril, sem_naftisol]}

In [22]:
stats_summary_df = pd.DataFrame(stats_summary)
stats_summary_df.set_index('Drug')


Unnamed: 0_level_0,Variance,Std_Dev,SEM
Drug,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Capomulin,24.839296,4.983904,[0.32934562340083096]
Ceftamin,39.069446,6.250556,[0.46982053275261093]
Infubinol,42.886388,6.54877,[0.4922356938011383]
Ketapril,68.18893,8.257659,[0.6038598237739697]
Naftisol,65.817708,8.112811,[0.5964657512424236]


In [26]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
drug_summary = drug_group.merge(stats_summary_df, on="Drug")
drug_summary = drug_summary.rename(columns={"Drug":"Drug Regimen","mean":"Mean","median":"Median"})
drug_summary

Unnamed: 0,Drug Regimen,Mean,Median,Variance,Std_Dev,SEM
0,Capomulin,40.675741,41.557809,24.839296,4.983904,[0.32934562340083096]
1,Ceftamin,52.591172,51.776157,39.069446,6.250556,[0.46982053275261093]
2,Infubinol,52.884795,51.820584,42.886388,6.54877,[0.4922356938011383]
3,Ketapril,55.235638,53.698743,68.18893,8.257659,[0.6038598237739697]
4,Naftisol,54.331565,52.509285,65.817708,8.112811,[0.5964657512424236]


In [None]:
# Using the aggregation method, produce the same summary statistics in a single line

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.



In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
quartiles = clean_results['Tumor Volume (mm3)'].quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

print(f"The lower quartile of occupancy is: {lowerq}")
print(f"The upper quartile of occupancy is: {upperq}")
print(f"The interquartile range of occupancy is: {iqr}")
print(f"The the median of occupancy is: {quartiles[0.5]} ")

lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

outlier_occupancy = clean_results.loc[(clean_results['Tumor Volume (mm3)'] < lower_bound) | (clean_results['Tumor Volume (mm3)'] > upper_bound)]
outlier_occupancy

print(f"The minimum median income of the potential outliers is {outlier_occupancy['Tumor Volume (mm3)'].min()}")
print(f"The maximum median income of the potential outliers is {outlier_occupancy['Tumor Volume (mm3)'].max()}")

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
plt.scatter(capomulin_results['Weight (g)'], capomulin_results['Tumor Volume (mm3)'])
plt.ylabel("Tumor Volume (mm3)")
plt.xlabel("Weight (g)")
plt.title("Average tumor volume vs. mouse weight for the Capomulin regimen")
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
