## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "Resources/Mouse_metadata.csv"
study_results_path = "Resources/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

In [2]:
mouse_metadata.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16


In [3]:
study_results.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.0,0
1,f932,0,45.0,0
2,g107,0,45.0,0
3,a457,0,45.0,0
4,c819,0,45.0,0


In [4]:
# Combine the data into a single dataset
combined_mouse_df = pd.merge(mouse_metadata, study_results, on="Mouse ID")


# Display the data table for preview
combined_mouse_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [5]:
# Checking the number of mice.
combined_mouse_df.count()

Mouse ID              1893
Drug Regimen          1893
Sex                   1893
Age_months            1893
Weight (g)            1893
Timepoint             1893
Tumor Volume (mm3)    1893
Metastatic Sites      1893
dtype: int64

In [6]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
found_dupes = combined_mouse_df.duplicated(['Mouse ID'])
found_dupes


0       False
1        True
2        True
3        True
4        True
        ...  
1888     True
1889     True
1890     True
1891     True
1892     True
Length: 1893, dtype: bool

In [None]:
# Optional: Get all the data for the duplicate mouse ID. 
#for i in found_dupes:
#    if found_dupes[i] == True:
#        print (found_dupes[i])


In [18]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
no_dupes_df = combined_mouse_df.drop_duplicates(['Mouse ID','Timepoint'])

In [19]:
# Checking the number of mice in the clean DataFrame.
no_dupes_df.count()

Mouse ID              1888
Drug Regimen          1888
Sex                   1888
Age_months            1888
Weight (g)            1888
Timepoint             1888
Tumor Volume (mm3)    1888
Metastatic Sites      1888
dtype: int64

In [21]:
# Checking for mice with duplicate data
no_dupes_df = no_dupes_df.drop_duplicates(['Mouse ID'])
no_dupes_df.value_counts()


Mouse ID  Drug Regimen  Sex     Age_months  Weight (g)  Timepoint  Tumor Volume (mm3)  Metastatic Sites
z969      Naftisol      Male    9           30          0          45.0                0                   1
i177      Ramicane      Male    10          18          0          45.0                0                   1
j365      Zoniferol     Male    24          28          0          45.0                0                   1
j296      Ceftamin      Female  24          30          0          45.0                0                   1
j246      Capomulin     Female  21          21          0          45.0                0                   1
                                                                                                          ..
q633      Zoniferol     Male    12          25          0          45.0                0                   1
q610      Ramicane      Female  18          21          0          45.0                0                   1
q597      Ramicane      

## Summary Statistics

In [46]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
summary_stats_df = pd.DataFrame({
    'Mean',
    'Median',
    'Variance',
    'Standard Deviation',
    'Standard Error of Means'
})
# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.
regimen_grouped = no_dupes_df.groupby(['Drug Regimen'])

regimen_mean = regimen_grouped.mean()

regimen_median = regimen_grouped.median()

regimen_var = regimen_grouped.var()

regimen_stddev = regimen_grouped.std()

standard_errors = [st.sem(regimen_grouped['Tumor Volume (mm3)']]
regimen_sem = standard(regimen_grouped['Tumor Volume (mm3)'])
print(regimen_sem)


TypeError: unsupported operand type(s) for /: 'str' and 'int'

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.



In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
