## Observations and Insights 

In [60]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
print(mouse_metadata.columns)
study_results.head(20)

mouse_study_data = pd.merge(study_results,mouse_metadata,on='Mouse ID',how='left')
print(mouse_study_data.columns)


Index(['Mouse ID', 'Drug Regimen', 'Sex', 'Age_months', 'Weight (g)'], dtype='object')
Index(['Mouse ID', 'Timepoint', 'Tumor Volume (mm3)', 'Metastatic Sites',
       'Drug Regimen', 'Sex', 'Age_months', 'Weight (g)'],
      dtype='object')


Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,f932,0,45.000000,0,Ketapril,Male,15,29
2,g107,0,45.000000,0,Ketapril,Female,2,29
3,a457,0,45.000000,0,Ketapril,Female,11,30
4,c819,0,45.000000,0,Ketapril,Male,21,25
...,...,...,...,...,...,...,...,...
1888,r944,45,41.581521,2,Capomulin,Male,12,25
1889,u364,45,31.023923,3,Capomulin,Male,18,17
1890,p438,45,61.433892,1,Ceftamin,Female,11,26
1891,x773,45,58.634971,4,Placebo,Female,21,30


In [56]:
# Checking the number of mice in the DataFrame.
print("Number of unique mice:  " + str(mouse_study_data['Mouse ID'].nunique()))
print("Number of mice in data frame:  " + str(mouse_study_data['Mouse ID'].count()))

Number of unique mice:  249
Number of mice in data frame:  1893


In [85]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 

duplicate_mice = mouse_study_data.duplicated(subset=['Mouse ID','Timepoint'])

if duplicate_mice.any():
        print("Duplicate Mouse ID(s) is(are)  >:" + mouse_study_data.loc[duplicate_mice]['Mouse ID'].unique())
        duplicate_list = mouse_study_data.loc[duplicate_mice]['Mouse ID'].tolist()

['Duplicate Mouse ID(s) is(are)  >:g989']


In [92]:
# Optional: Get all the data for the duplicate mouse ID. 
mouse_study_temp = mouse_study_data.set_index('Mouse ID')
print(mouse_study_temp.loc[duplicate_list].count())
mouse_study_temp.loc[duplicate_list]



Unnamed: 0_level_0,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
Mouse ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
g989,0,45.000000,0,Propriva,Female,21,26
g989,0,45.000000,0,Propriva,Female,21,26
g989,5,48.786801,0,Propriva,Female,21,26
g989,5,47.570392,0,Propriva,Female,21,26
g989,10,51.745156,0,Propriva,Female,21,26
...,...,...,...,...,...,...,...
g989,20,55.326122,1,Propriva,Female,21,26
g989,20,54.657650,1,Propriva,Female,21,26
g989,25,56.045564,1,Propriva,Female,21,26
g989,30,59.082294,1,Propriva,Female,21,26


In [95]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_mouse_data = mouse_study_data.drop_duplicates(subset=['Mouse ID','Timepoint'])
clean_mouse_data


Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,f932,0,45.000000,0,Ketapril,Male,15,29
2,g107,0,45.000000,0,Ketapril,Female,2,29
3,a457,0,45.000000,0,Ketapril,Female,11,30
4,c819,0,45.000000,0,Ketapril,Male,21,25
...,...,...,...,...,...,...,...,...
1888,r944,45,41.581521,2,Capomulin,Male,12,25
1889,u364,45,31.023923,3,Capomulin,Male,18,17
1890,p438,45,61.433892,1,Ceftamin,Female,11,26
1891,x773,45,58.634971,4,Placebo,Female,21,30


In [101]:
# Checking the number of mice in the clean DataFrame.
duplicate_mice2 = clean_mouse_data.duplicated(subset=['Mouse ID','Timepoint'])

if ~duplicate_mice2.any():
        print("No duplicates")

print("Number of unique mice:  " + str(clean_mouse_data['Mouse ID'].nunique()))
print("Number of mice in data frame:  " + str(clean_mouse_data['Mouse ID'].count()))

No duplicates
Number of unique mice:  249
Number of mice in data frame:  1888


## Summary Statistics

In [106]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method is the most straightforward, creating multiple series and putting them all together at the end.
regimen_summary = clean_mouse_data.groupby('Drug Regimen').agg({'Tumor Volume (mm3)' : ['mean','median','var','std']})
regimen_summary

Unnamed: 0_level_0,Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3)
Unnamed: 0_level_1,mean,median,var,std
Drug Regimen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Capomulin,40.675741,41.557809,24.947764,4.994774
Ceftamin,52.591172,51.776157,39.290177,6.268188
Infubinol,52.884795,51.820584,43.128684,6.567243
Ketapril,55.235638,53.698743,68.553577,8.279709
Naftisol,54.331565,52.509285,66.173479,8.134708
Placebo,54.033581,52.288934,61.168083,7.821003
Propriva,52.393463,50.909965,43.138803,6.568014
Ramicane,40.216745,40.673236,23.486704,4.846308
Stelasyn,54.233149,52.431737,59.450562,7.710419
Zoniferol,53.236507,51.818479,48.533355,6.966589


In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function.

regimen_summary = clean_mouse_data.groupby('Drug Regimen').agg({'Tumor Volume (mm3)' : ['mean','median','var','std']})
regimen_summary

## Bar Plots

In [123]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pandas.
x_axis = clean_mouse_data['Drug Regimen']
y_axis = clean_mouse_data['Mouse ID'].value_counts()
print(x_axis)
print(y_axis)
timepoint_plot.plot(kind="bar", y=c, facecolor="red")

0       Capomulin
1        Ketapril
2        Ketapril
3        Ketapril
4        Ketapril
          ...    
1888    Capomulin
1889    Capomulin
1890     Ceftamin
1891      Placebo
1892     Stelasyn
Name: Drug Regimen, Length: 1888, dtype: object
c559    10
a897    10
o331    10
c282    10
l700    10
        ..
u153     1
f932     1
o848     1
b447     1
x336     1
Name: Mouse ID, Length: 249, dtype: int64


NameError: name 'timepoint_plot' is not defined

In [10]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pyplot.

## Pie Plots

In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
mice_gender = 

In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 


In [14]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and Scatter Plots

In [15]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [16]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen



## Correlation and Regression

In [17]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
