## Observations and Insights 

**TODO:** Look across all previously generated figures and tables and write at least three observations or inferences
that can be made from the data. Include these observations at the top of the notebook.

Dependencies and data setup

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from itertools import cycle, islice
import seaborn as sns
from scipy.stats import linregress

# Study data files (paths are relative to the notebook's working directory)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Give the columns used below identifier-friendly names.
# `rename` ignores mapping keys that are not present in a frame.
study_results = study_results.rename(
    columns={
        "Drug Regimen": "Drug_Regimen",
        "Tumor Volume (mm3)": "Tumor_Volume_mm3",
        "Mouse ID": "Mouse_ID",
    }
)
mouse_metadata = mouse_metadata.rename(
    columns={"Drug Regimen": "Drug_Regimen", "Mouse ID": "Mouse_ID"}
)

# Join each measurement row to its mouse's metadata. The key and join type are
# spelled out so the merge cannot silently pick up any other column the two
# files might happen to share.
mouse_study_df = pd.merge(mouse_metadata, study_results, on="Mouse_ID", how="inner")

mouse_study_df

Check for duplicate mouse ID numbers

In [None]:
# Number of distinct mouse IDs in, respectively, the metadata, the study
# results and the merged frame.
a, b, c = (
    frame["Mouse_ID"].nunique()
    for frame in (mouse_metadata, study_results, mouse_study_df)
)
a, b, c

In [None]:
# Find (Mouse ID, Timepoint) pairs that occur more than once in the merged data.
# Reference: https://stackoverflow.com/questions/29276958/identifying-duplicate-pairs-in-python-pandas
rows_per_pair = mouse_study_df.groupby(['Mouse_ID', 'Timepoint']).size()
find_dup = rows_per_pair[rows_per_pair > 1]
find_dup

In [None]:
# Optional: show every row recorded under the duplicated mouse ID.
# One copy of g989 seems to have died... both copies will be removed, although
# whoever collected the data may know the real story.
is_g989 = mouse_study_df['Mouse_ID'] == "g989"
dup = mouse_study_df.loc[is_g989]
dup

Make clean dataframe

In [None]:
# Create a clean DataFrame by dropping every row of the duplicated mouse ID.
# .copy() makes the result an independent frame rather than a filtered view of
# mouse_study_df, so the column assignments made on it in later cells are
# unambiguous and do not raise SettingWithCopyWarning.
mouse_study_clean_df = mouse_study_df[mouse_study_df["Mouse_ID"] != "g989"].copy()
mouse_study_clean_df

In [None]:
# Number of distinct mice that remain in the clean DataFrame.
clean_mouse_ids = mouse_study_clean_df["Mouse_ID"]
clean_mouse_ids.nunique()

Put placebo results first (for science)

In [None]:
# Re-order the rows so the placebo comes first, followed by the treatments in a
# hand-picked (non-alphabetical) order.
# References:
#   https://stackoverflow.com/questions/43266211/sort-a-dataframe-based-on-values-of-another-column-using-integer-data-type
#   https://stackoverflow.com/questions/23279238/custom-sorting-with-pandas

# Regimen names as they occur in the data (handy for checking `order` against).
drugs = mouse_study_clean_df.Drug_Regimen.unique()

# Sort rank of every regimen; a lower rank sorts earlier.
order = {'Placebo':0,'Ramicane':1, 'Capomulin':2, 'Infubinol':3,  'Ceftamin':4, 'Stelasyn':5, 'Zoniferol':6, 'Ketapril':7, 'Propriva':8, 'Naftisol':9}

# Attach the rank as a temporary column, sort on it, then discard it again.
# - assign() builds a new frame instead of writing into a filtered slice.
# - kind='stable' keeps the existing row order within each regimen.
# - dropna() runs while 'rank' is still present, so rows with any missing value
#   (including a regimen that has no entry in `order`) are dropped.
mouse_study_clean_df = (
    mouse_study_clean_df
    .assign(rank=mouse_study_clean_df['Drug_Regimen'].map(order))
    .sort_values('rank', kind='stable')
    .dropna()
    .drop(columns='rank')
)

# Make sure the measurement columns are numeric. Plain column assignment
# replaces the column together with its dtype, whereas `.loc[:, col] = ...`
# may write into the existing column and keep the old dtype.
numeric_columns = ["Age_months", "Weight (g)", "Timepoint", "Tumor_Volume_mm3", "Metastatic Sites"]
for column in numeric_columns:
    mouse_study_clean_df[column] = pd.to_numeric(mouse_study_clean_df[column])

mouse_study_clean_df

## Summary Statistics

Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume 
for each regimen

This method is the most straightforward: it creates a separate series for each statistic and puts them all together at the end.

In [None]:
# Tumor-volume statistics per regimen: each statistic is computed as its own
# Series, then the Series become the columns of one summary table.
mice_by_drugs = mouse_study_clean_df.groupby("Drug_Regimen")
tumor_volume_by_drug = mice_by_drugs["Tumor_Volume_mm3"]

mean = tumor_volume_by_drug.mean()
median = tumor_volume_by_drug.median()
variance = tumor_volume_by_drug.var()
std = tumor_volume_by_drug.std()
sem = tumor_volume_by_drug.sem()

# All five Series share the Drug_Regimen index, so they line up row by row.
mouse_study_stats_df = pd.DataFrame(
    {
        "mean": mean,
        "median": median,
        "variance": variance,
        "std": std,
        "sem": sem,
    }
)

#Put placebo results first
order = pd.Categorical(['Placebo','Ramicane', 'Capomulin', 'Infubinol',  'Ceftamin', 'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol'], ordered=True)
mouse_study_stats_df = mouse_study_stats_df.reindex(order)

mouse_study_stats_df

This method produces everything in a single groupby function

In [None]:
# Same statistics as above, produced by a single groupby/agg call.
# Reference: https://stackoverflow.com/questions/55388610/how-to-calculate-aggregated-summary-statistics-in-pandas-dataframe

statistics = ['mean', 'median', 'var', 'std', 'sem']

#Put placebo results first
order = pd.Categorical(['Placebo','Ramicane', 'Capomulin', 'Infubinol',  'Ceftamin', 'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol'], ordered=True)

summary_table = (
    mouse_study_clean_df
    .groupby(["Drug_Regimen"])
    .agg({"Tumor_Volume_mm3": statistics})
    .reindex(order)
)

summary_table

## Bar and Pie Charts

Use Pandas to generate a bar plot showing the total number of mice in each treatment regimen throughout the course of the study. 

In [None]:
# Regimens in display order: placebo first, then the treatments.
ordered_drugs = ['Placebo','Ramicane', 'Capomulin', 'Infubinol',  'Ceftamin', 'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol']
order = pd.api.types.CategoricalDtype(categories=ordered_drugs, ordered=True)

# Storing the regimen as an ordered categorical makes the grouped result (and
# therefore the bars) follow ordered_drugs rather than alphabetical order.
mouse_study_clean_df['Drug_Regimen'] = mouse_study_clean_df['Drug_Regimen'].astype(order)

# Distinct mice per regimen.
drug_only_group = mouse_study_clean_df.groupby(['Drug_Regimen'])
count_unique_mice = drug_only_group['Mouse_ID'].nunique()

# zorder=3 for the bars and zorder=0 for the grid keep the grid lines behind the bars.
ax = count_unique_mice.plot.bar(figsize=(7,3), legend=False, rot=30, zorder=3)
ax.set_title("Mice per treatment", size=14)
ax.set_xlabel("Drug Regimen", size=12)
ax.set_ylabel("Mice", size=12)
ax.grid(axis='y', zorder=0)
plt.draw()

Use Pyplot to generate a bar plot showing the total number of mice in each treatment regimen throughout
the course of the study.

In [None]:
#Put placebo results first
ordered_drugs = ['Placebo','Ramicane', 'Capomulin', 'Infubinol',  'Ceftamin', 'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol']
order = pd.api.types.CategoricalDtype(categories=ordered_drugs, ordered=True)

# Assign to the COLUMN with df['Drug_Regimen']. Writing df.loc['Drug_Regimen'] = ...
# addresses a row label instead and appends a stray all-NaN row named
# 'Drug_Regimen' to the frame.
mouse_study_clean_df['Drug_Regimen'] = mouse_study_clean_df['Drug_Regimen'].astype(order)

# Distinct mice per regimen, in ordered_drugs order.
drug_only_group = mouse_study_clean_df.groupby(['Drug_Regimen'])
count_unique_mice = drug_only_group['Mouse_ID'].nunique()

# Draw the bars with pyplot directly (the pandas version of this chart is above).
# zorder=3 for the bars and zorder=0 for the grid keep the grid lines behind the bars.
plt.figure(figsize=(7,3))
count_chart = plt.bar(count_unique_mice.index.astype(str), count_unique_mice.values, zorder=3)
plt.xticks(rotation=30)
plt.title("Mice per treatment", size=14)
plt.ylabel("Mice", size=12)
plt.xlabel("Drug Regimen", size=12)
plt.grid(axis='y', zorder=0)
plt.show()

In [None]:
# The "duplicate" mouse was removed from the Propriva group, but what's with Stelasyn?
# List all Stelasyn rows, ordered by mouse and timepoint, to look for an explanation.
# Nothing obvious in the data; probably have to ask the researchers.
Stelasyn_group = mouse_study_df.loc[mouse_study_df['Drug_Regimen'] == "Stelasyn"]
Stelasyn_group_sorted = Stelasyn_group.sort_values(["Mouse_ID","Timepoint"])

# Show up to 1000 rows instead of the truncated default.
# NOTE: this is a global pandas option and stays in effect for later cells.
pd.set_option('display.max_rows', 1000)
Stelasyn_group_sorted

Here are a couple of charts I made having interpreted "throughout the study" as "over the time course".
Too pretty to discard.

In real life I would probably make more user-friendly x-axes, couldn't yet find a pandas method
that worked the way I wanted (and now see I don't have to).

In [None]:
# Reference: https://stackoverflow.com/questions/39275294/sort-by-certain-order-situation-pandas-dataframe-groupby

# Regimens in display order: placebo first, then the treatments.
ordered_drugs = ['Placebo','Ramicane', 'Capomulin', 'Infubinol',  'Ceftamin', 'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol']
order = pd.api.types.CategoricalDtype(categories=ordered_drugs, ordered=True)
mouse_study_clean_df['Drug_Regimen'] = mouse_study_clean_df['Drug_Regimen'].astype(order)

# Number of measurement rows for every (regimen, timepoint) combination.
drug_group = mouse_study_clean_df.groupby(['Drug_Regimen', 'Timepoint'])
count_mice = drug_group['Mouse_ID'].count()

# Ten grey levels, repeated in sequence until there is one entry per group.
grey_levels = ['0', '.07', '0.14', '0.21', '0.28', '0.35', '0.42', '0.49', '0.56', '0.63']
my_colors = [grey_levels[i % len(grey_levels)] for i in range(len(drug_group))]

ax = count_mice.plot.bar(figsize=(20,3), legend=False, title="Survival by treatment", color=my_colors, zorder=3)

ax.set_title("Survival by treatment", size=20)
ax.set_xlabel("Drug Regimen, Timepoint (days)", size=16)
ax.set_ylabel("Surviving mice", size=16)
ax.grid(axis='y', zorder=0)

In [None]:
#Put placebo results first
ordered_drugs = ['Placebo','Ramicane', 'Capomulin', 'Infubinol',  'Ceftamin', 'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol']
order = pd.api.types.CategoricalDtype(categories=ordered_drugs, ordered=True)

# Assign to the COLUMN with df['Drug_Regimen']. Writing df.loc['Drug_Regimen'] = ...
# addresses a row label instead and appends a stray all-NaN row named
# 'Drug_Regimen' to the frame.
mouse_study_clean_df['Drug_Regimen'] = mouse_study_clean_df['Drug_Regimen'].astype(order)

# Number of measurement rows for every (regimen, timepoint) combination.
drug_group = mouse_study_clean_df.groupby(['Drug_Regimen', 'Timepoint'])
count_mice = drug_group['Mouse_ID'].count()

# Ten grey levels, passed as the bar colors.
colors = '0', '.07', '0.14', '0.21', '0.28', '0.35', '0.42', '0.49', '0.56', '0.63'

# zorder=3 for the bars and zorder=0 for the grid keep the grid lines behind the bars.
count_chart = count_mice.plot(kind='bar', color=colors, figsize=(20,3), zorder=3)
plt.title("Survival by treatment", size=20)
plt.ylabel("Surviving mice", size=16)
plt.xlabel("Drug Regimen, Timepoint (days)", size=16)
plt.grid(axis='y', zorder=0)
plt.show()

Generate a pie plot using Pandas's `DataFrame.plot()` that shows the distribution of female or male mice in the study.

In [None]:
# Rows per sex in the clean data, drawn as a pie through the pandas plotting API.
# label='' suppresses the axis label pandas would otherwise add.
sex_counts = mouse_study_clean_df['Sex'].value_counts()
ax = sex_counts.plot.pie(startangle=90, autopct='%1.0f%%', label='')
ax.set_title("Mice by sex")

Generate a pie plot showing the distribution of female versus male mice using pyplot.

In [None]:
# Rows per sex in the clean data, drawn as a pie with pyplot.
sex_counts = mouse_study_clean_df['Sex'].value_counts()

# value_counts() orders its result by frequency, so take the wedge labels from
# its index; a hard-coded ("Male", "Female") would mislabel the wedges whenever
# the counts come out in the other order.
labels = tuple(sex_counts.index)

plt.pie(sex_counts, labels=labels, autopct='%1.0f%%', startangle=90)
plt.title("Mice by sex")
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()         # must be called; a bare `plt.show` only references the function

## Quartiles, Outliers and Boxplots

Calculate the final tumor volume of each mouse across four of the most promising treatment regimens: Capomulin, 
Ramicane, Infubinol, and Ceftamin. 

In [None]:
# Reference: https://stackoverflow.com/questions/23394476/keep-other-columns-when-doing-groupby

# The four regimens of interest; the placebo and the other treatments are left out.
best_drugs = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Rows of those regimens only, without any row that has a missing value.
mouse_study_clean_best_drugs_no_na = (
    mouse_study_clean_df[mouse_study_clean_df['Drug_Regimen'].isin(best_drugs)]
    .dropna()
)

# For each mouse keep the row of its latest timepoint: sort by time, then take
# the last row of every (regimen, mouse) group.
# observed=True matters because Drug_Regimen was converted to a categorical
# above: without it, groupby also emits a group for every category/mouse
# combination that has no rows at all, which shows up as all-NaN rows.
final_timepoints_grouped = (
    mouse_study_clean_best_drugs_no_na
    .sort_values("Timepoint")
    .groupby(['Drug_Regimen', 'Mouse_ID'], as_index=False, observed=True)
    .last()
    .dropna()
)
final_timepoints_grouped["Metastatic Sites"] = final_timepoints_grouped["Metastatic Sites"].astype(int)

# NOTE: filtering rows does not shrink a categorical dtype. When Drug_Regimen is
# categorical it still lists all ten regimens as categories here, so grouping or
# plotting by it without observed=True produces empty entries for the six
# excluded regimens even though no such rows exist in this frame.
# See https://stackoverflow.com/questions/39380389/hoy-can-i-plot-a-group-by-boxplot-in-pandas-dropping-unused-categories

# Show up to 1000 rows (global pandas option; stays in effect for later cells).
pd.set_option('display.max_rows', 1000)
final_timepoints_grouped


Calculate the quartiles and IQR and quantitatively determine if there are any potential outliers across 
all four treatment regimens.

Answer: One outlier (barely) in Infubinol minimum volume.

In [None]:

# Final tumor volumes grouped by regimen.
# observed=True limits the groups to regimens that actually occur in
# final_timepoints_grouped. When Drug_Regimen is categorical, the default
# grouping also returns the unused categories, which is where rows full of NaN
# for the unwanted treatments come from.
final_volume_by_drug = final_timepoints_grouped.groupby("Drug_Regimen", observed=True)['Tumor_Volume_mm3']

Lower_quartile = final_volume_by_drug.quantile(.25)
Median = final_volume_by_drug.quantile(.5)
Upper_quartile = final_volume_by_drug.quantile(.75)

# One row per regimen; the three Series share the Drug_Regimen index.
quartiles_df = pd.DataFrame(
    {
        "Lower_quartile": Lower_quartile,
        "Median": Median,
        "Upper_quartile": Upper_quartile,
    }
)

quartiles_df["IQR"] = quartiles_df["Upper_quartile"] - quartiles_df["Lower_quartile"]

# Outlier fences: 1.5 * IQR below the lower quartile / above the upper quartile.
quartiles_df["Lower_bound"] = quartiles_df["Lower_quartile"] - (1.5*quartiles_df["IQR"])
quartiles_df["Upper_bound"] = quartiles_df["Upper_quartile"] + (1.5*quartiles_df["IQR"])

# Observed extremes: a regimen has a potential outlier when Min_volume is below
# Lower_bound or Max_volume is above Upper_bound.
quartiles_df["Min_volume"] = final_volume_by_drug.min()
quartiles_df["Max_volume"] = final_volume_by_drug.max()

# Safety net: drop any row that still contains NaN.
quartiles_no_NaN = quartiles_df.dropna()
quartiles_no_NaN

# Highlighting the outlier in the table was attempted but gave
# "AttributeError: 'Styler' object has no attribute 'style'". `DataFrame.style`
# already returns a Styler, so that error means `.style` was accessed a second
# time on the Styler itself. Left out as unimportant for now.



Using Matplotlib, generate a box and whisker plot of the final tumor volume for all four treatment regimens 
and highlight any potential outliers in the plot by changing their color and style.

Extra special data cleanup for box plots.

In [None]:
# Reference: https://stackoverflow.com/questions/39380389/hoy-can-i-plot-a-group-by-boxplot-in-pandas-dropping-unused-categories

# Second regimen column whose categorical dtype lists only the four regimens
# shown in the box plots, in plotting order. Grouping by this column therefore
# has no unused categories to contend with.
boxplot_regimens = ['Ramicane', 'Capomulin', 'Infubinol', 'Ceftamin']
final_timepoints_grouped["Drug_Regimen2"] = pd.Categorical(
    final_timepoints_grouped["Drug_Regimen"],
    categories=boxplot_regimens,
)

final_timepoints_grouped

Matplotlib version

In [None]:
# Outliers (fliers) are drawn as large red diamonds.
red_diamond = {'markerfacecolor': 'r', 'marker': 'D', 'markersize': 8}

# One box per regimen, using the four-category helper column.
ax = final_timepoints_grouped.boxplot(
    column=["Tumor_Volume_mm3"],
    by="Drug_Regimen2",
    flierprops=red_diamond,
)
ax.set_title("Final Tumor Volume (mm^3)")
ax.set_xlabel("Drug Regimen")
plt.suptitle("")  # blank out the figure-level title
plt.show() 

Seaborn version - better default axis labeling IMHO, and a simple option for showing all the 
data (my preference) without having to merge plots or some ugly thing.

In [None]:
# Reference: https://seaborn.pydata.org/generated/seaborn.boxplot.html

# Flier style: filled red diamonds ('D'), despite the variable's name.
red_square = {'markerfacecolor': 'r', 'markeredgecolor': 'r', 'marker': 'D', 'markersize': 8}
sns.set(style="whitegrid")

# Arguments shared by both layers: the box plot and the individual data points.
shared_args = dict(x='Drug_Regimen2', y='Tumor_Volume_mm3', data=final_timepoints_grouped)
ax = sns.boxplot(color="skyblue", flierprops=red_square, **shared_args)
ax = sns.swarmplot(color=".25", **shared_args)

ax.set_ylabel("Final Tumor Volume (mm^3)")
ax.set_xlabel("Drug Regimen")
plt.show()

This is a pandas version...

In [None]:
# Reference: https://stackoverflow.com/questions/39297093/change-the-facecolor-of-boxplot-in-pandas
# (doesn't work exactly as in the example... but it does change some colors)

# Fliers as large red diamonds; one color per box-plot element.
red_diamond = {'markerfacecolor': 'r', 'marker': 'D', 'markersize': 8}
props = {'boxes': "DarkGreen", 'whiskers': "DarkOrange", 'medians': "DarkBlue", 'caps': "Gray"}

tumor_size_boxplot = final_timepoints_grouped.boxplot(
    column="Tumor_Volume_mm3",
    by="Drug_Regimen2",
    color=props,
    flierprops=red_diamond,
    figsize=(7, 5),
)
plt.suptitle('') 
plt.show()

## Line and Scatter Plots

Select a mouse that was treated with Capomulin and generate a line plot of tumor volume vs. timepoint for that mouse.

In [None]:
# All measurements of mouse b128 (a Capomulin-treated mouse).
Mouse_b128_df = mouse_study_clean_df.loc[mouse_study_clean_df['Mouse_ID'] == "b128"]

# Sort by time before plotting: mouse_study_clean_df was re-ordered by regimen
# earlier in the notebook, so its rows are not guaranteed to be in timepoint
# order, and a line plot connects points in row order.
Mouse_b128_df_sorted = Mouse_b128_df.sort_values("Timepoint")

plot = plt.plot(Mouse_b128_df_sorted["Timepoint"], Mouse_b128_df_sorted["Tumor_Volume_mm3"])
plt.xlabel('Timepoint (day)', size=12)
plt.ylabel('Tumor volume (mm^3)', size=12)
plt.suptitle('Capomulin treatment: Mouse B128')
plt.show()

Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin treatment regimen.

In [None]:
# Rows of the Capomulin regimen only. .copy() gives an independent frame, so
# the column assignments below do not write into a filtered slice of
# mouse_study_clean_df (which would raise SettingWithCopyWarning).
Capomulin_df = mouse_study_clean_df.loc[mouse_study_clean_df['Drug_Regimen'] == 'Capomulin'].copy()

# Make sure the measurement columns are numeric before averaging.
numeric_columns = ["Age_months", "Weight (g)", "Timepoint", "Tumor_Volume_mm3", "Metastatic Sites"]
for column in numeric_columns:
    Capomulin_df[column] = pd.to_numeric(Capomulin_df[column])

# Per-mouse averages over all of that mouse's rows.
Capomulin_by_mouse = Capomulin_df.groupby("Mouse_ID")
mean_tumor_volume = Capomulin_by_mouse["Tumor_Volume_mm3"].mean()
mean_mouse_weight = Capomulin_by_mouse["Weight (g)"].mean()

# One row per mouse. Column order (weight first, tumor volume second) is relied
# on by the positional .iloc lookups in the correlation cell.
Capomulin_tumor_vol_mouse_weight = pd.concat([mean_mouse_weight, mean_tumor_volume], axis=1)

plot = plt.scatter(Capomulin_tumor_vol_mouse_weight["Weight (g)"], Capomulin_tumor_vol_mouse_weight["Tumor_Volume_mm3"])
plt.xlabel('Average weight (g)', size=12)
plt.ylabel('Average tumor volume (mm^3)', size=12)
plt.suptitle('Capomulin treatment: Average tumor volume vs. Average mouse weight')
plt.show()

Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen.

In [None]:
# Same per-mouse averages as the previous scatter plot with the axes swapped:
# average tumor volume on x, average weight on y.
avg_volume = Capomulin_tumor_vol_mouse_weight["Tumor_Volume_mm3"]
avg_weight = Capomulin_tumor_vol_mouse_weight["Weight (g)"]

plot = plt.scatter(avg_volume, avg_weight)
plt.xlabel('Average tumor volume (mm^3)', size=12)
plt.ylabel('Average weight (g)', size=12)
plt.suptitle('Capomulin treatment: Average mouse weight vs. Average tumor volume')
plt.show()

## Correlation and Regression

Calculate the correlation coefficient and linear regression model between mouse weight and 
average tumor volume for the Capomulin treatment. 

In [None]:
# Pearson correlation between each Capomulin mouse's average weight and its
# average tumor volume. The per-mouse table holds weight in its first column
# and tumor volume in its second.
Average_mouse_weight = Capomulin_tumor_vol_mouse_weight["Weight (g)"]
Average_tumor_volume = Capomulin_tumor_vol_mouse_weight["Tumor_Volume_mm3"]

# pearsonr returns the coefficient first and the p-value second.
correlation = st.pearsonr(Average_mouse_weight, Average_tumor_volume)
coefficient = round(correlation[0],2)
print(f"The correlation between average mouse weight and average tumor volume is {coefficient}")

Calculate the linear regression model between mouse weight and average tumor volume 
for the Capomulin treatment. Plot the linear regression model on top of the previous scatter plot.

In [None]:
# Least-squares line of average tumor volume (y) against average weight (x) for
# the Capomulin mice, drawn over the scatter plot.
x_values = Capomulin_tumor_vol_mouse_weight["Weight (g)"]
y_values = Capomulin_tumor_vol_mouse_weight["Tumor_Volume_mm3"]

slope, intercept, rvalue, pvalue, stderr = linregress(x_values, y_values)

# Fitted value for every observed weight, plus the equation text for the plot.
regress_values = slope * x_values + intercept
line_eq = "y = %sx + %s" % (round(slope,2), round(intercept,2))

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, xy=(20,35), fontsize=15, color="red")  # position is in data coordinates
plt.suptitle('Capomulin treatment: Average tumor volume vs. Average mouse weight', size=14)
plt.xlabel('Average weight (g)', size=12)
plt.ylabel('Average tumor volume (mm^3)', size=12)
print(f"The r-squared is: {rvalue**2}")
plt.show()