## Observations and Insights 

In [104]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "Resources/mouse_metadata.csv"
study_results_path = "Resources/study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)
print(mouse_metadata.columns)
print(study_results.columns)

Index(['Mouse ID', 'Drug Regimen', 'Sex', 'Age_months', 'Weight (g)'], dtype='object')
Index(['Mouse ID', 'Timepoint', 'Tumor Volume (mm3)', 'Metastatic Sites'], dtype='object')


In [105]:
# Combine the data into a single dataset
combined_data_raw = pd.merge(mouse_metadata, study_results, on="Mouse ID", how="outer")
# Display the data table for preview
combined_data_raw

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [140]:
# Returns all the mouse data for each mouse to be stored in a dictionary
def getMouseData(m):
    d = {}
    mouse_subset = combined_data.loc[combined_data["Mouse ID"] == m,]
    d["Drug"] = mouse_subset["Drug Regimen"].values[0]
    d["Sex"] = mouse_subset["Sex"].values[0]
    d["Age(mos)"] = mouse_subset["Age_months"].values[0]
    d["Weight(g)"] = mouse_subset["Weight (g)"].values[0]
    timepoints = mouse_subset["Timepoint"].values
    tumors = mouse_subset["Tumor Volume (mm3)"].values
    d["Data"] = {timepoints[i]: tumors[i] for i in range(len(timepoints))}
    return d

In [141]:
# Checking the number of mice.
mice_unique = combined_data_raw["Mouse ID"].unique()
len(mice_unique)

249

In [142]:
# Removing the duplicate rows that shows up for Mouse ID and Timepoint.
combined_data = combined_data_raw.drop_duplicates(subset=["Mouse ID","Timepoint"])
combined_data = pd.DataFrame(combined_data)
combined_data

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [143]:
# Optional: Get all the data for the duplicate mouse ID. 
mice_timepoint_data = {}
for mouse in mice_unique:
    mice_timepoint_data[mouse] = getMouseData(mouse)

mice_timepoint_data

{'k403': {'Drug': 'Ramicane',
  'Sex': 'Male',
  'Age(mos)': 21,
  'Weight(g)': 16,
  'Data': {0: 45.0,
   5: 38.82589807,
   10: 35.01427146,
   15: 34.22399174,
   20: 32.99772858,
   25: 33.46457735,
   30: 31.09949753,
   35: 26.54699343,
   40: 24.36550471,
   45: 22.05012627}},
 's185': {'Drug': 'Capomulin',
  'Sex': 'Female',
  'Age(mos)': 3,
  'Weight(g)': 17,
  'Data': {0: 45.0,
   5: 43.87849569,
   10: 37.61494768,
   15: 38.17723195,
   20: 36.86687576,
   25: 33.94994037,
   30: 32.95967078,
   35: 28.32853059,
   40: 25.47214326,
   45: 23.34359787}},
 'x401': {'Drug': 'Capomulin',
  'Sex': 'Female',
  'Age(mos)': 16,
  'Weight(g)': 15,
  'Data': {0: 45.0,
   5: 45.47375302,
   10: 39.11389068,
   15: 39.77625003,
   20: 36.06583471,
   25: 36.61712031,
   30: 32.91529186,
   35: 30.20682488,
   40: 28.16739732,
   45: 28.48403281}},
 'm601': {'Drug': 'Capomulin',
  'Sex': 'Male',
  'Age(mos)': 22,
  'Weight(g)': 17,
  'Data': {0: 45.0,
   5: 41.40859145,
   10: 36.825366

In [146]:
# Checking the number of mice in the clean DataFrame.
mice_unique = combined_data["Mouse ID"].unique()
len(mice_unique)

249

## Summary Statistics

In [157]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.
grouped_drug = combined_data.groupby("Drug Regimen")["Tumor Volume (mm3)"]
compare_drug_tumor = pd.DataFrame({})
compare_drug_tumor["mean"] = grouped_drug.mean()
compare_drug_tumor["median"] = grouped_drug.median()
compare_drug_tumor["variance"] = grouped_drug.var()
compare_drug_tumor["Std Dev"] = grouped_drug.std

compare_drug_tumor


Unnamed: 0_level_0,mean,median,variance
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Capomulin,40.675741,41.557809,24.947764
Ceftamin,52.591172,51.776157,39.290177
Infubinol,52.884795,51.820584,43.128684
Ketapril,55.235638,53.698743,68.553577
Naftisol,54.331565,52.509285,66.173479
Placebo,54.033581,52.288934,61.168083
Propriva,52.393463,50.909965,43.138803
Ramicane,40.216745,40.673236,23.486704
Stelasyn,54.233149,52.431737,59.450562
Zoniferol,53.236507,51.818479,48.533355


In [81]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [82]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.



In [83]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.



In [84]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [85]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [86]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [87]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [88]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [89]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [90]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [91]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
