## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Locations of the two CSV inputs: per-mouse metadata and per-timepoint study results
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Attach the mouse metadata to every study measurement via the shared 'Mouse ID' column
mouse_results_df = mouse_metadata.merge(study_results, on='Mouse ID')

# Preview the first rows of the combined table
mouse_results_df.head()

In [None]:
# Collect the rows whose (Mouse ID, Timepoint) pair repeats an earlier row.
# NOTE: this cell does not count mice; it isolates the duplicated measurements
# so that the offending mouse ID(s) can be identified in the following cells.
dup_records = mouse_results_df[mouse_results_df.duplicated(['Mouse ID','Timepoint'])]
dup_records

In [None]:
# Distinct mouse IDs that appear among the duplicated (Mouse ID, Timepoint) rows;
# the first entry is displayed as the cell output.
dup_record = pd.unique(dup_records['Mouse ID'])
dup_record[0]

In [None]:
# Optional: show every measurement recorded for the first duplicated mouse ID.
is_dup_mouse = mouse_results_df['Mouse ID'].eq(dup_record[0])
mouse_results_df.loc[is_dup_mouse]

In [None]:
# Create a clean DataFrame by removing every mouse that has duplicated
# (Mouse ID, Timepoint) measurements.
# `dup_record` holds all offending mouse IDs; filtering with isin() excludes each
# of them, so the result stays clean whether there is one duplicated mouse or several.
# The original row index is preserved on the remaining rows.
is_duplicated_mouse = mouse_results_df['Mouse ID'].isin(dup_record)
clean_mouse_results_df = mouse_results_df[~is_duplicated_mouse]
clean_mouse_results_df

In [None]:
# Number of distinct mice remaining in the clean DataFrame.
# dropna=False keeps the count identical to len(<column>.unique()).
clean_mouse_results_df['Mouse ID'].nunique(dropna=False)

## Summary Statistics

In [None]:
# Summary statistics table of tumor volume for each drug regimen:
# mean, median, variance, standard deviation and SEM.

# Group the cleaned measurements by regimen (the grouped object is reused by later cells)
drugregimen_df = clean_mouse_results_df.groupby('Drug Regimen')

# Select the tumor-volume column once and derive each statistic from it
tumor_by_regimen = drugregimen_df['Tumor Volume (mm3)']
mean_df = tumor_by_regimen.mean()
median_df = tumor_by_regimen.median()
var_df = tumor_by_regimen.var()
std_df = tumor_by_regimen.std()
sem_df = tumor_by_regimen.sem()

# Place the five series side by side, one labelled column per statistic
summary_drugregimen_df = pd.concat(
    [mean_df, median_df, var_df, std_df, sem_df],
    axis=1,
    keys=[
        'Tumor Volume Mean',
        'Tumor Volume Median',
        'Tumor Volume Variance',
        'Tumor Volume Standard Deviation',
        'Tumor Volume SEM',
    ],
)
summary_drugregimen_df

In [None]:
# Using the aggregation method, produce the same per-regimen summary statistics
# (mean, median, variance, standard deviation, SEM of tumor volume) in a single
# groupby/agg chain. The statistics are computed per 'Drug Regimen' group, not
# over the whole dataset, so the table is comparable with the one built from
# the individual groupby calls.
summary_df = (
    clean_mouse_results_df
    .groupby('Drug Regimen')['Tumor Volume (mm3)']
    .agg(['mean', 'median', 'var', 'std', 'sem'])
    # Give the columns the same descriptive labels used for the manual summary table
    .rename(columns={
        'mean': 'Tumor Volume Mean',
        'median': 'Tumor Volume Median',
        'var': 'Tumor Volume Variance',
        'std': 'Tumor Volume Standard Deviation',
        'sem': 'Tumor Volume SEM',
    })
)
summary_df

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.

# Number of measurement rows per regimen (a Series indexed by regimen name)
regimen_df = drugregimen_df['Drug Regimen'].count()

# A Series plot draws its index on the x axis and its values as bar heights,
# so no x=/y= column selectors are passed (those only apply to DataFrame plots).
bar_plot = regimen_df.plot.bar(align='center', title='Number of Measurements Taken Per Drug Regimen')
bar_plot.set_xlabel('Drug Regimen')
bar_plot.set_ylabel('Number of Measurements Taken')

In [None]:
# Bar plot (pyplot interface) of the total number of measurements taken on each drug regimen.

# Measurement counts per regimen, split into label and height lists
count_df = drugregimen_df['Drug Regimen'].count()
regimen_ls = list(count_df.index)
counts_ls = list(count_df.to_numpy().ravel())

# Title and axis labels
plt.title('Number of Measurements Taken Per Drug Regimen')
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Measurements Taken')

# One tick per regimen, labelled with the regimen name and rotated to avoid overlap
x_axis = np.arange(len(regimen_ls))
tick_locations = list(x_axis)
plt.xticks(tick_locations, regimen_ls, rotation=90)

# Draw the bars centred on their tick positions
plt.bar(x_axis, counts_ls, align='center', width=.5)

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

# Count distinct mice (not measurement rows) for each sex
sex_df = clean_mouse_results_df.groupby('Sex')['Mouse ID'].nunique()

# A Series pie plot uses the Series values as wedge sizes and its index as labels,
# so no y= column selector is passed (that only applies to DataFrame plots).
pie_plot = sex_df.plot.pie(title='Distribution of Mice by Sex', autopct="%1.1f%%")
pie_plot.set_ylabel('Number of Mice')

In [None]:
# Pie plot (pyplot interface) of the distribution of female versus male mice.

# Wedge labels come from the index of the per-sex mouse counts
labels = list(sex_df.index)
plt.pie(sex_df, labels=labels, autopct="%1.1f%%")

# Title and axis label
plt.title('Distribution of Mice by Sex')
plt.ylabel('Number of Mice')

plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Final tumor volume of each mouse (used for the Capomulin, Ramicane, Infubinol
# and Ceftamin comparison in the following cells).

# Greatest timepoint recorded for each mouse, indexed by 'Mouse ID'
last_timepoint = clean_mouse_results_df.groupby('Mouse ID')['Timepoint'].max()

# Attach each mouse's greatest timepoint to all of its rows as 'Timepoint_max'
merged_df = clean_mouse_results_df.merge(last_timepoint, on='Mouse ID', suffixes=('', '_max'))

# Keep only the row(s) measured at that final timepoint
at_final_timepoint = merged_df['Timepoint'] == merged_df['Timepoint_max']
last_timepoint_df = merged_df[at_final_timepoint]
last_timepoint_df

In [None]:
# Treatments to compare (also reused later as box plot labels)
regimen_ls = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Final tumor volumes per regimen, collected in the same order as regimen_ls (for plotting)
tumor_val_ls = []

# For every regimen: compute quartiles and IQR, then count potential outliers.
for regimen in regimen_ls:
    # Final tumor volumes of the mice treated with this regimen
    on_regimen = last_timepoint_df['Drug Regimen'] == regimen
    values = last_timepoint_df.loc[on_regimen, 'Tumor Volume (mm3)']
    tumor_val_ls.append(values)

    # Quartiles and the outlier fences located 1.5 * IQR beyond them
    lower, median, upper = values.quantile([.25, .5, .75]).to_numpy()
    iqr = upper - lower
    lower_bound = lower - (1.5 * iqr)
    upper_bound = upper + (1.5 * iqr)

    # Values on or beyond either fence are counted as outliers
    beyond_fences = (values <= lower_bound) | (values >= upper_bound)
    outliers_ct = len(values[beyond_fences])

    # Report the statistics for this regimen
    output = (
        f'{regimen}:\n '
        f'Lower Quartile = {lower}\n Median = {median}\n Upper Quartile = {upper}\n '
        f'Interquartile Range = {iqr}\n Lower Outlier Boundary = {lower_bound}\n Upper Outlier Boundary = {upper_bound}\n '
        f'Outlier Count = {outliers_ct}\n'
    )
    print(output)

In [None]:
# Box plot of the final tumor volume of each mouse across the four regimens of interest

# Outlier markers: large red triangles
flierprops = dict(markerfacecolor='red', markersize=12, marker='^')

# Draw one box per regimen on a fresh figure
fig, ax = plt.subplots()
ax.boxplot(tumor_val_ls, labels=regimen_ls, flierprops=flierprops)

# Title and axis labels
ax.set_title('Final Tumor Volume by Drug Regimen')
ax.set_ylabel('Final Tumor Volume (mm3)')
ax.set_xlabel('Drug Regimen')

# Render the figure
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin

# The mouse to plot; defined once so the row filter and the chart title always agree
mouse_id = 's185'
mouse_df = clean_mouse_results_df[clean_mouse_results_df['Mouse ID'] == mouse_id]

x_axis = mouse_df['Timepoint']
values = mouse_df['Tumor Volume (mm3)']

plt.plot(x_axis, values)
plt.xlabel('Timepoint')
plt.ylabel('Tumor Volume (mm3)')
plt.title(f'Capomulin Results for Mouse {mouse_id}')

plt.show()

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen

# Capomulin measurements grouped per mouse
Capomulin_df = clean_mouse_results_df[clean_mouse_results_df['Drug Regimen']=='Capomulin'].groupby('Mouse ID')

# Average only the two numeric columns that are plotted. Selecting them before
# calling mean() avoids averaging text columns such as 'Drug Regimen' and 'Sex',
# which raises a TypeError on recent pandas versions.
weight_df = Capomulin_df[['Weight (g)', 'Tumor Volume (mm3)']].mean()

x_axis = weight_df['Weight (g)']
values = weight_df['Tumor Volume (mm3)']

plt.title('Average Tumor Volume by Mouse Weight')
plt.xlabel('Weight (g)')
plt.ylabel('Tumor Volume (mm3)')

plt.scatter(x_axis, values)
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
