## Observations and Insights

## Dependencies and starter code

In [None]:
%matplotlib notebook

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st

# Load the two study inputs straight into DataFrames
# (paths are relative to the notebook's working directory)
mouse_metadata = pd.read_csv("data/Mouse_metadata.csv")
study_results = pd.read_csv("data/Study_results.csv")

# Join the per-mouse metadata with the study measurements on 'Mouse ID';
# the outer join keeps IDs that appear in only one of the two files
metadata_results_combined_df = mouse_metadata.merge(
    study_results,
    on='Mouse ID',
    how='outer',
)

metadata_results_combined_df

## Summary statistics

In [None]:
# Summary statistics of tumor volume for each drug regimen:
# mean, median, variance, standard deviation and standard error of the mean (SEM)
regimen_group = metadata_results_combined_df.groupby(['Drug Regimen'])
tumor_volume_by_regimen = regimen_group['Tumor Volume (mm3)']

volume_summary = tumor_volume_by_regimen.mean()

# Assemble every statistic in one step; all series share the regimen index
volume_summary_df = pd.DataFrame({
    'Mean Tumor Volume (mm3)': volume_summary,
    'Median Tumor Volume (mm3)': tumor_volume_by_regimen.median(),
    'Variance Tumor Volume (mm3)': tumor_volume_by_regimen.var(),
    'Stand. Dev. Tumor Volume (mm3)': tumor_volume_by_regimen.std(),
    'SEM Tumor Volume (mm3)': tumor_volume_by_regimen.sem(),
})
volume_summary_df

## Bar plots

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen using pyplot

# Number of tumor-volume measurements recorded for each regimen
number_of_data_points = regimen_group['Tumor Volume (mm3)'].count()

# Take the tick labels from the grouped result itself so that every label is
# guaranteed to sit under its own bar. Building the labels from a set() of the
# column yields an arbitrary order that does not match the groupby ordering.
drug_regimen = number_of_data_points.index.tolist()
x_axis = np.arange(len(drug_regimen))

plt.bar(x_axis, number_of_data_points, tick_label=drug_regimen)
plt.xticks(rotation=45)
plt.title('Number of Data Points per Drug Regimen')
plt.ylabel('Number of Data Points')
plt.grid()
plt.tight_layout()

In [None]:
# Bar plot of the number of data points per treatment regimen, drawn with the pandas plotting API
data_point_count_df = number_of_data_points.to_frame()

# pandas returns the matplotlib Axes it drew on; label it directly
Count_PDPlot = data_point_count_df.plot.bar(grid=True, legend=False)
Count_PDPlot.set_title('Number of Data Points per Drug Regimen')
Count_PDPlot.set_ylabel('Number of Data Points')
plt.tight_layout()

## Pie plots

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Count rows per sex in the combined table.
# NOTE(review): this counts measurement rows, not distinct mice - confirm that
# is the intended definition of the female/male distribution.
sex_ratio_group = metadata_results_combined_df.groupby(['Sex']).count()
sex_ratio_df = pd.DataFrame(sex_ratio_group['Mouse ID'])
sex_ratio = list(sex_ratio_df['Mouse ID'])
labels = sex_ratio_df.index.values.tolist()

# Start a fresh figure: with the interactive notebook backend the previous
# cell's figure is still the current one, so plt.pie would otherwise be drawn
# on top of the earlier chart.
plt.figure()
Ratio_MPLPlot = plt.pie(sex_ratio, labels=labels, autopct='%1.1f%%', startangle=180)


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

# sex_ratio_df has one row per sex and a single 'Mouse ID' count column.
# Name that column explicitly: `y` expects a column label, not the Index
# object returned by DataFrame.keys().
Ratio_PDPlot = sex_ratio_df.plot(kind='pie', y='Mouse ID')
Ratio_PDPlot.set_ylabel('')  # drop the default column-name label beside the pie
# Set the title before tight_layout() so the layout pass accounts for it
plt.title('Female to Male Ratio')
plt.tight_layout()

## Quartiles, outliers and boxplots

In [None]:
# Goal of this section: calculate the final tumor volume of each mouse across four of the
# most promising treatment regimens, then calculate the IQR and quantitatively determine
# whether there are any potential outliers.
#
# TODO: the final-tumor-volume and IQR/outlier calculations are not implemented yet.
# This cell currently only picks the four regimens with the most recorded rows.

# Rows recorded per regimen (non-null 'Mouse ID' entries in each group)
regimen_df = regimen_group['Mouse ID'].count().to_frame()

# Keep the four regimens with the highest counts, largest first
popular_regimen_df = regimen_df.sort_values(by='Mouse ID', ascending=False).head(4)

popular_regimen_df

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and scatter plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

In [None]:
# Calculate the correlation coefficient and linear regression model for mouse weight and average tumor volume for the Capomulin regimen