## Observations and Insights 

In [113]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
#mouse_metadata.head()
#study_results.head()
# the only common column is "Mouse ID"
mergeDF = pd.merge(mouse_metadata, study_results, on="Mouse ID") # merge
mergeDF = mergeDF.sort_values(["Mouse ID", "Timepoint"], ascending=True) # pre clean
mergeDF = mergeDF.reset_index(drop=True) # change up the index numbers so it starts at 0 instead of 400 something

# Display the data table for preview
mergeDF

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,a203,Infubinol,Female,20,23,0,45.000000,0
1,a203,Infubinol,Female,20,23,5,48.508468,0
2,a203,Infubinol,Female,20,23,10,51.852437,1
3,a203,Infubinol,Female,20,23,15,52.777870,1
4,a203,Infubinol,Female,20,23,20,55.173336,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [114]:
# Checking the number of mice, AKA the number of unique Mouse IDs
mouseIDDF = mergeDF.drop_duplicates(subset = "Mouse ID", keep = 'first', inplace = False)
# looks at the column Mouse ID and keeps only the first of every individual value
#mouseIDDF.head()
len(mouseIDDF) # there are 249 individual mice

249

In [115]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
timepointDupeDF = mergeDF[
    mergeDF.duplicated(
        #subset = ["Mouse ID", "Timepoint"],
        keep='first'
    )
]
timepointDupeDF
#print(mergeDF.loc[586])

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
587,g989,Propriva,Female,21,26,0,45.0,0


In [116]:
# Optional: Get all the data for the duplicate mouse ID. 
mergeDF.loc[mergeDF["Mouse ID"]=="g989",:]

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
586,g989,Propriva,Female,21,26,0,45.0,0
587,g989,Propriva,Female,21,26,0,45.0,0
588,g989,Propriva,Female,21,26,5,48.786801,0
589,g989,Propriva,Female,21,26,5,47.570392,0
590,g989,Propriva,Female,21,26,10,51.745156,0
591,g989,Propriva,Female,21,26,10,49.880528,0
592,g989,Propriva,Female,21,26,15,51.325852,1
593,g989,Propriva,Female,21,26,15,53.44202,0
594,g989,Propriva,Female,21,26,20,55.326122,1
595,g989,Propriva,Female,21,26,20,54.65765,1


In [137]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
mergeDF = mergeDF[mergeDF["Mouse ID"] != "g989"]
# validate that the data with this Mouse ID has been removed
#mergeDF.loc[mergeDF["Mouse ID"]=="g989",:]

In [118]:
# Checking the number of mice in the clean DataFrame.
mouseIDCleanDF = mergeDF.drop_duplicates(subset = "Mouse ID", keep = 'first', inplace = False)
len(mouseIDCleanDF) # there are 248 individual mice in the (cleaned) data set now

248

## Summary Statistics

In [119]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# find the names of the regimens
#regimenDF = mergeDF.drop_duplicates(subset = "Drug Regimen", keep = 'first', inplace = False)
#regimenDF
# the names of the regimens are as follows:
# "Infubinol",
# "Placebo",
# "Ceftamin",
# "Stelasyn",
# "Zoniferol",
# "Ramicane",
# "Ketapril",
# "Propriva",
# "Capomulin",

# infubinolStatDF = mergeDF.loc[mergeDF["Drug Regimen"]=="Infubinol", :]
# infubinolStatDF.head()

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume.
drugMeanDF = mergeDF.groupby('Drug Regimen')["Tumor Volume (mm3)"].mean()
drugMedianDF = mergeDF.groupby('Drug Regimen')["Tumor Volume (mm3)"].median()
drugVarianceDF = mergeDF.groupby('Drug Regimen')["Tumor Volume (mm3)"].var()
drugStdDevDF = mergeDF.groupby('Drug Regimen')["Tumor Volume (mm3)"].std()
drugSEMDF = mergeDF.groupby('Drug Regimen')["Tumor Volume (mm3)"].sem()

# Assemble the resulting series into a single summary dataframe.
summDF = pd.DataFrame({
    "Drug Regimen": ["Capomulin", "Ceftamin", "Infubinol", "Ketapril", "Naftisol", "Placebo", "Propriva", "Ramicane", "Stelasyn", "Zoniferol"],
    "Mean of Tumor Volume (mm3)": [drugMeanDF[0], drugMeanDF[1], drugMeanDF[2], drugMeanDF[3], drugMeanDF[4], drugMeanDF[5], drugMeanDF[6], drugMeanDF[7], drugMeanDF[8], drugMeanDF[9]],
    "Median of Tumor Volume (mm3)": [drugMedianDF[0], drugMedianDF[1], drugMedianDF[2], drugMedianDF[3], drugMedianDF[4], drugMedianDF[5], drugMedianDF[6], drugMedianDF[7], drugMedianDF[8], drugMedianDF[9]],
    "Variance of Tumor Volume (mm3)": [drugVarianceDF[0], drugVarianceDF[1], drugVarianceDF[2], drugVarianceDF[3], drugVarianceDF[4], drugVarianceDF[5], drugVarianceDF[6], drugVarianceDF[7], drugVarianceDF[8], drugVarianceDF[9]],
    "Standard Deviation of Tumor Volume (mm3)": [drugStdDevDF[0], drugStdDevDF[1], drugStdDevDF[2], drugStdDevDF[3], drugStdDevDF[4], drugStdDevDF[5], drugStdDevDF[6], drugStdDevDF[7], drugStdDevDF[8], drugStdDevDF[9]],
    "Standard Error of the Mean of Tumor Volume (mm3)": [drugSEMDF[0], drugSEMDF[1], drugSEMDF[2], drugSEMDF[3], drugSEMDF[4], drugSEMDF[5], drugSEMDF[6], drugSEMDF[7], drugSEMDF[8], drugSEMDF[9]],
})
summDF

Unnamed: 0,Drug Regimen,Mean of Tumor Volume (mm3),Median of Tumor Volume (mm3),Variance of Tumor Volume (mm3),Standard Deviation of Tumor Volume (mm3),Standard Error of the Mean of Tumor Volume (mm3)
0,Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
1,Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
2,Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
3,Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
4,Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
5,Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
6,Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
7,Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
8,Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
9,Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [120]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# didn't I just do this?
summDF

# Using the aggregation method, produce the same summary statistics in a single line
aggSummDF = mergeDF.groupby('Drug Regimen').agg({'Tumor Volume (mm3)': ['mean', 'median', 'var', 'std', 'sem']})
aggSummDF
# numbers match up
# that's a good sign lol

Unnamed: 0_level_0,Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3)
Unnamed: 0_level_1,mean,median,var,std,sem
Drug Regimen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


## Bar and Pie Charts

In [136]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.
# the x axis should be the Mouse ID
# the y axis should be the total number of time points for that Mouse ID

xAxis = mergeDF.drop_duplicates(subset = "Mouse ID", keep = 'first', inplace = False)["Mouse ID"]
yAxis = mergeDF["Mouse ID"].count()
mergeDF.plot.bar(x = xAxis, y = yAxis)

KeyError: "None of [Index(['a203', 'a251', 'a262', 'a275', 'a366', 'a401', 'a411', 'a444', 'a457',\n       'a492',\n       ...\n       'y769', 'y793', 'y865', 'z234', 'z314', 'z435', 'z578', 'z581', 'z795',\n       'z969'],\n      dtype='object', length=248)] are in the [columns]"

In [10]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
