## Observations and Insights 

In [44]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st
import scipy.stats as sem

%matplotlib notebook

# Study data files
mouse_metadata_path = "./data/Mouse_metadata.csv"
study_results_path = "./data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
dfmousestudy = pd.merge(mouse_metadata, study_results, on="Mouse ID")

# Display the data table for preview
dfmousestudy.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [45]:
#  Checking the number of mice
mice_total = len(dfmousestudy['Mouse ID'].unique())
mice_total

249

In [46]:
all_mice = len(dfmousestudy['Mouse ID'])
all_mice


1893

In [47]:
unique_dfmousestudy = dfmousestudy.drop_duplicates(subset=['Mouse ID', 'Timepoint'])
unique_dfmousestudy

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [48]:
# unique_dfmousestudy_dup
# duplicate_mice = dfmousestudy.loc[:, ['Mouse ID', 'Timepoint']].

In [49]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
# Optional: Get all the data for the duplicate mouse ID. 
duplicates=dfmousestudy[dfmousestudy.duplicated(["Mouse ID", "Timepoint"], keep=False)]
duplicates

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
908,g989,Propriva,Female,21,26,0,45.0,0
909,g989,Propriva,Female,21,26,0,45.0,0
910,g989,Propriva,Female,21,26,5,48.786801,0
911,g989,Propriva,Female,21,26,5,47.570392,0
912,g989,Propriva,Female,21,26,10,51.745156,0
913,g989,Propriva,Female,21,26,10,49.880528,0
914,g989,Propriva,Female,21,26,15,51.325852,1
915,g989,Propriva,Female,21,26,15,53.44202,0
916,g989,Propriva,Female,21,26,20,55.326122,1
917,g989,Propriva,Female,21,26,20,54.65765,1


In [50]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
unique_dfmousestudy

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [51]:
# Checking the number of mice in the clean DataFrame.
len(unique_dfmousestudy['Mouse ID'].unique())

249

## Summary Statistics

In [52]:
unique_dfmousestudy.columns

Index(['Mouse ID', 'Drug Regimen', 'Sex', 'Age_months', 'Weight (g)',
       'Timepoint', 'Tumor Volume (mm3)', 'Metastatic Sites'],
      dtype='object')

In [58]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# This method is the most straighforward, creating multiple series and putting them all together at the end.
by_regimen = unique_dfmousestudy.loc[:,['Tumor Volume (mm3)', 'Drug Regimen']].groupby('Drug Regimen')
drug_mean = round(by_regimen['Tumor Volume (mm3)'].mean(),2)
drug_median = round(by_regimen['Tumor Volume (mm3)'].median(),2)
drug_variance = round(by_regimen['Tumor Volume (mm3)'].var(),2)
drug_stdev = (round(by_regimen['Tumor Volume (mm3)'].std(),2))
drug_sem = round(by_regimen['Tumor Volume (mm3)'].sem(),4)
drug_data = {"Mean": drug_mean, "Median": drug_median, 
             "Variance": drug_variance, 
             "Std Dev": drug_stdev, "SEM": drug_sem}
summary_regimen = pd.DataFrame(drug_data)
summary_regimen


Unnamed: 0_level_0,Mean,Median,Variance,Std Dev,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.68,41.56,24.95,4.99,0.3293
Ceftamin,52.59,51.78,39.29,6.27,0.4698
Infubinol,52.88,51.82,43.13,6.57,0.4922
Ketapril,55.24,53.7,68.55,8.28,0.6039
Naftisol,54.33,52.51,66.17,8.13,0.5965
Placebo,54.03,52.29,61.17,7.82,0.5813
Propriva,52.39,50.91,43.14,6.57,0.5259
Ramicane,40.22,40.67,23.49,4.85,0.321
Stelasyn,54.23,52.43,59.45,7.71,0.5731
Zoniferol,53.24,51.82,48.53,6.97,0.5164


In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# This method produces everything in a single groupby function

## Bar and Pie Charts

In [66]:
grouped_regimen = unique_dfmousestudy.loc[:,['Mouse ID', 'Drug Regimen']].groupby('Drug Regimen')
mice_by_regimen = grouped_regimen.count()['Mouse ID']
mice_by_regimen

Drug Regimen
Capomulin    230
Ceftamin     178
Infubinol    178
Ketapril     188
Naftisol     186
Placebo      181
Propriva     156
Ramicane     228
Stelasyn     181
Zoniferol    182
Name: Mouse ID, dtype: int64

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas.



In [75]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot. 
grouped_regimen = unique_dfmousestudy.loc[:,['Mouse ID', 'Drug Regimen']].groupby('Drug Regimen')
mice_by_regimen = grouped_regimen.count()['Mouse ID']
x_axis = np.arange(len(mice_by_regimen))
plt.bar(x_axis, mice_by_regimen, color='r', alpha=0.5, align="center")
tick_locations = [value for value in x_axis]
plt.xticks(tick_locations, ('Capomulin', 'Ceftamin', 'Infubinol', 'KIetapril', 'Naftisol', 'Placebo', 'Propriva', 'Ramincane', 'Stelasyn', 'Zoniferol'), rotation=75)
plt.title("Number of Mice by Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Mice")

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Number of Mice')

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
