## Observations and Insights 

* For this dataset, the number of recorded measurements (one per mouse per timepoint) was greatest for Capomulin and Ramicane and lowest for Propriva.  The measurements were split relatively evenly between male and female mice.
* The data for Capomulin and Ramicane showed the least variability, with the lowest standard deviations and standard errors of the mean.  In addition, these drugs showed the smallest final tumor volumes, indicating effective treatment.
* For Capomulin, there is a strong positive correlation between average tumor volume and mouse weight, with a correlation coefficient of 0.84 and an r-squared value of 0.71.
* Capomulin and Ramicane are the most promising drug regimens in this study given the large sample size, low variability, and greatest tumor volume reduction.

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as sts

# Locations of the two study CSV files (relative to the notebook)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load both tables and show their column names so the shared key is visible
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)
for frame in (mouse_metadata, study_results):
    print(frame.columns)

# Join the study results (left) with the mouse metadata (right) on "Mouse ID".
# An outer join keeps rows from either table even when the other has no match.
master_df = study_results.merge(mouse_metadata, how="outer", on="Mouse ID")
master_df

In [None]:
# Count the distinct mouse IDs present in the merged DataFrame.
master_df["Mouse ID"].nunique()

In [None]:
# Find the rows whose (Mouse ID, Timepoint) pair occurs more than once.
# keep=False flags every copy of a repeated pair, not only the later ones.
is_repeated = master_df.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
duplicate_df = master_df.loc[is_repeated]
duplicate_df

In [None]:
# Create a clean DataFrame by dropping every mouse that has duplicated records.
# A mouse counts as duplicated when any of its (Mouse ID, Timepoint) pairs
# appears more than once; all rows for such a mouse are removed. The IDs are
# derived from the data instead of being hard-coded, so the cleaning step
# stays correct if the input files change.
duplicate_ids = master_df.loc[
    master_df.duplicated(subset=["Mouse ID", "Timepoint"], keep=False),
    "Mouse ID",
].unique()

clean_df = master_df[~master_df["Mouse ID"].isin(duplicate_ids)]

clean_df

In [None]:
# Count the distinct mouse IDs that remain after cleaning.
clean_df["Mouse ID"].nunique()

## Summary Statistics

In [None]:
# Summary statistics of tumor volume per drug regimen:
# mean, median, variance, standard deviation, and SEM.
#
# Approach 1: compute each statistic as its own Series, then assemble the
# Series into a single DataFrame (one column per statistic).

regimen_gb = clean_df.groupby(["Drug Regimen"])

# Select the tumor-volume column once and reuse it for every statistic
tumor_by_regimen = regimen_gb["Tumor Volume (mm3)"]

sumstats = {
    "Tumor Volume Mean": tumor_by_regimen.mean(),
    "Tumor Volume Median": tumor_by_regimen.median(),
    "Tumor Volume Variance": tumor_by_regimen.var(),
    "Tumor Volume Standard Deviation": tumor_by_regimen.std(),
    "Tumor Volume SEM": tumor_by_regimen.sem(),
}

sumstats_df = pd.DataFrame(sumstats)

sumstats_df


In [None]:
# Summary statistics of tumor volume per drug regimen:
# mean, median, variance, standard deviation, and SEM.
#
# Approach 2: a single groupby followed by one agg() call that applies all
# five statistics at once.

regimen_tumor_gb = clean_df.groupby(["Drug Regimen"])

summary_functions = ["mean", "median", "var", "std", "sem"]
regimen_tumor_gb["Tumor Volume (mm3)"].agg(summary_functions)

## Bar Plots

In [None]:
# Bar plot (pandas plotting API) of how many rows -- one per mouse per
# timepoint -- were recorded for each drug regimen over the study.
regimen_df = clean_df[["Drug Regimen", "Mouse ID", "Timepoint"]]
regimen_group = regimen_df.groupby(["Drug Regimen"])
mouse_count = regimen_group["Mouse ID"].count()

mouse_chart = mouse_count.plot.bar()
mouse_chart.set_xlabel("Drug Regimen")
mouse_chart.set_title("Number of Mice Tested for Drug Regimens")
mouse_chart.set_ylabel("Mouse Count")


In [None]:
# Bar plot (pyplot API) of how many rows -- one per mouse per timepoint --
# were recorded for each drug regimen over the study.
regimen_df = regimen_df.sort_values("Drug Regimen")
mouse = regimen_group["Mouse ID"].count()

# Take the bar labels from the counted Series itself so every bar is paired
# with its own regimen, rather than relying on a separately sorted list of
# names happening to be in the same order as the groupby result.
regimen = mouse.index.to_numpy()

fig, ax = plt.subplots()
xlabels = regimen
ax.bar(xlabels, mouse)
# Rotate the tick labels that bar() already created. set_xticklabels() is only
# meant to be used after tick positions have been fixed with set_xticks().
ax.tick_params(axis="x", labelrotation=90)
plt.title("Number of Mice Tested for Drug Regimens")
plt.xlabel("Drug Regimen")
plt.ylabel("Mouse Count")

## Pie Plots

In [None]:
# Pie plot (pandas plotting API) of the female versus male split.
# Note: the counts are rows of clean_df (one per mouse per timepoint).

gender_gb = clean_df.groupby(["Sex"])

# size() gives one count per sex with a plain, single-level index. Grouping by
# "Sex" and then calling value_counts() on "Sex" produced the same numbers but
# with a redundant two-level index.
gender = gender_gb.size()
# Not used by the plot; kept so the notebook's existing variables are unchanged.
index = gender_gb["Sex"].unique()

gender_df = pd.DataFrame({"Gender": gender})

# Label each wedge from the data's own index so a label can never be attached
# to the wrong count.
pie_plot = gender_df["Gender"].plot.pie(labels=gender.index, autopct='%1.1f%%', colors=("b", "g"))
pie_plot.set_title("Mice Gender Ratio")

In [None]:
# Pie plot (pyplot API) of the female versus male split.
# Wedge labels are read from the first index level of the counted Series
# (the sex each count belongs to) instead of being hard-coded, so labels and
# counts cannot get out of step.
plt.pie(gender, labels=gender.index.get_level_values(0), autopct="%1.1f%%", colors=("b", "g"))
plt.title("Mice Gender Ratio")


## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse and rank the drug regimens by
# it, to pick the four most promising treatments. The IQR / outlier analysis
# for those regimens follows in the next cells.

# Greatest (last) timepoint recorded for each mouse
mouse_time = clean_df.groupby(["Mouse ID"])["Timepoint"].max()

# Keep only the rows of the working data that sit at each mouse's last timepoint
mouse_df = pd.merge(mouse_time, clean_df, on=["Mouse ID", "Timepoint"], how="inner")

# Ranking 1: mean tumor volume at each mouse's last timepoint (smallest first)
final_time_regimen_gb = mouse_df.groupby(["Drug Regimen"])

final_time_regimen_df = final_time_regimen_gb["Tumor Volume (mm3)"].describe()
sorted_regimen_df = final_time_regimen_df.sort_values("mean")

first, second, third, fourth = sorted_regimen_df.index[:4]
print(f"\nThe four most promising drug regimens using the maximum timepoint are {first}, {second}, {third}, and {fourth}.")

# Ranking 2: many mice in this dataset did not survive or continue to the final
# timepoint of day 45, so the ranking is repeated using only the day-45
# samples to determine the most promising treatments for the remaining analysis.
time_45_df = clean_df[clean_df["Timepoint"] == 45]
time_45_gb = time_45_df.groupby(["Drug Regimen"])
time_45_final_df = time_45_gb["Tumor Volume (mm3)"].describe()
sort_time_45_df = time_45_final_df.sort_values("mean")

first, second, third, fourth = sort_time_45_df.index[:4]
print(f"\nThe four most promising drug regimens for mice surviving to day 45 are {first}, {second}, {third}, and {fourth}.")

In [None]:
# Quantitatively determine Ramicane outliers with the 1.5 * IQR rule.
# Note: mouse_df holds each mouse's row at its own last recorded timepoint, so
# mice whose last timepoint is earlier than day 45 are included.
Ramicane_df = mouse_df[mouse_df["Drug Regimen"] == "Ramicane"]
ramicane_volumes = Ramicane_df["Tumor Volume (mm3)"]

# Central tendency
print(f"Mean: {ramicane_volumes.mean()}")
print(f"Median: {ramicane_volumes.median()}")
print(f"Mode: {ramicane_volumes.mode()}")

# Quartiles and interquartile range
quartiles = ramicane_volumes.quantile([.25, .5, .75])
lowerq, upperq = quartiles[0.25], quartiles[0.75]
iqr = upperq - lowerq

print(f"Lower quartile: {lowerq}")
print(f"Upper quartile: {upperq}")
print(f"Interquartile range: {iqr}")
print(f"Median Tumor volume: {quartiles[0.5]} ")

# Fences: anything more than 1.5 IQRs outside the quartiles is a candidate outlier
fence_width = 1.5*iqr
lower_bound = lowerq - fence_width
upper_bound = upperq + fence_width
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

too_low = ramicane_volumes < lower_bound
too_high = ramicane_volumes > upper_bound
outlier_tumor = Ramicane_df.loc[too_low | too_high]
outlier_tumor

In [None]:
# Quantitatively determine Capomulin outliers with the 1.5 * IQR rule,
# using each Capomulin mouse's row from mouse_df.
Capomulin_df = mouse_df[mouse_df["Drug Regimen"] == "Capomulin"]
capomulin_volumes = Capomulin_df["Tumor Volume (mm3)"]

# Central tendency
print(f"Mean: {capomulin_volumes.mean()}")
print(f"Median: {capomulin_volumes.median()}")
print(f"Mode: {capomulin_volumes.mode()}")

# Quartiles and interquartile range
quartiles = capomulin_volumes.quantile([.25, .5, .75])
lowerq, upperq = quartiles[0.25], quartiles[0.75]
iqr = upperq - lowerq

print(f"Lower quartile: {lowerq}")
print(f"Upper quartile: {upperq}")
print(f"Interquartile range: {iqr}")
print(f"Median Tumor volume: {quartiles[0.5]} ")

# Fences: anything more than 1.5 IQRs outside the quartiles is a candidate outlier
fence_width = 1.5*iqr
lower_bound = lowerq - fence_width
upper_bound = upperq + fence_width
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

too_low = capomulin_volumes < lower_bound
too_high = capomulin_volumes > upper_bound
outlier_tumor = Capomulin_df.loc[too_low | too_high]
outlier_tumor

In [None]:
# Quantitatively determine Ceftamin outliers with the 1.5 * IQR rule,
# using each Ceftamin mouse's row from mouse_df.
Ceftamin_df = mouse_df[mouse_df["Drug Regimen"] == "Ceftamin"]
ceftamin_volumes = Ceftamin_df["Tumor Volume (mm3)"]

# Central tendency
print(f"Mean: {ceftamin_volumes.mean()}")
print(f"Median: {ceftamin_volumes.median()}")
print(f"Mode: {ceftamin_volumes.mode()}")

# Quartiles and interquartile range
quartiles = ceftamin_volumes.quantile([.25, .5, .75])
lowerq, upperq = quartiles[0.25], quartiles[0.75]
iqr = upperq - lowerq

print(f"Lower quartile: {lowerq}")
print(f"Upper quartile: {upperq}")
print(f"Interquartile range: {iqr}")
print(f"Median Tumor volume: {quartiles[0.5]} ")

# Fences: anything more than 1.5 IQRs outside the quartiles is a candidate outlier
fence_width = 1.5*iqr
lower_bound = lowerq - fence_width
upper_bound = upperq + fence_width
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

too_low = ceftamin_volumes < lower_bound
too_high = ceftamin_volumes > upper_bound
outlier_tumor = Ceftamin_df.loc[too_low | too_high]
outlier_tumor

In [None]:
# Quantitatively determine Infubinol outliers with the 1.5 * IQR rule,
# using each Infubinol mouse's row from mouse_df.
Infubinol_df = mouse_df[mouse_df["Drug Regimen"] == "Infubinol"]
infubinol_volumes = Infubinol_df["Tumor Volume (mm3)"]

# Central tendency
print(f"Mean: {infubinol_volumes.mean()}")
print(f"Median: {infubinol_volumes.median()}")
print(f"Mode: {infubinol_volumes.mode()}")

# Quartiles and interquartile range
quartiles = infubinol_volumes.quantile([.25, .5, .75])
lowerq, upperq = quartiles[0.25], quartiles[0.75]
iqr = upperq - lowerq

print(f"Lower quartile: {lowerq}")
print(f"Upper quartile: {upperq}")
print(f"Interquartile range: {iqr}")
print(f"Median Tumor volume: {quartiles[0.5]} ")

# Fences: anything more than 1.5 IQRs outside the quartiles is a candidate outlier
fence_width = 1.5*iqr
lower_bound = lowerq - fence_width
upper_bound = upperq + fence_width
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

too_low = infubinol_volumes < lower_bound
too_high = infubinol_volumes > upper_bound
outlier_tumor = Infubinol_df.loc[too_low | too_high]
outlier_tumor

In [None]:
# Box plot of the final tumor volume of each mouse across the four regimens
# of interest.

# Subset the final-timepoint data by drug regimen
Ramicane = mouse_df.loc[mouse_df["Drug Regimen"] == "Ramicane"]
Capomulin = mouse_df.loc[mouse_df["Drug Regimen"] == "Capomulin"]
Ceftamin = mouse_df.loc[mouse_df["Drug Regimen"] == "Ceftamin"]
Infubinol = mouse_df.loc[mouse_df["Drug Regimen"] == "Infubinol"]

# Box order and tick labels are defined together so they cannot drift apart
regimen_names = ["Ramicane", "Capomulin", "Ceftamin", "Infubinol"]
Best_Regimen = [
    subset["Tumor Volume (mm3)"]
    for subset in (Ramicane, Capomulin, Ceftamin, Infubinol)
]

# Set plot specifications
fig, ax = plt.subplots()
# notch/sym are passed by keyword for readability; sym="r" draws outlier
# points in red.
ax.boxplot(Best_Regimen, notch=False, sym="r")
# boxplot() places the boxes at x = 1..N by default. The tick labels are set
# on the axes because the `labels=` keyword of boxplot() is deprecated in
# newer Matplotlib releases.
ax.set_xticks(list(range(1, len(regimen_names) + 1)))
ax.set_xticklabels(regimen_names)
plt.title("Tumor Volume by Drug Regimen")
plt.ylabel("Tumor Volume (mm3)")
plt.xlabel("Drug Regimen")
plt.show()


## Line and Scatter Plots

In [None]:
# Line plot of tumor volume against timepoint for one Capomulin-treated mouse.
# Pick the rows for mouse "b128" within the Capomulin regimen
on_capomulin = clean_df["Drug Regimen"] == "Capomulin"
is_b128 = clean_df["Mouse ID"] == "b128"
capomulin_mouse_df = clean_df[on_capomulin & is_b128]

# Draw the series, then label and frame the axes
timepoints = capomulin_mouse_df["Timepoint"]
volumes = capomulin_mouse_df["Tumor Volume (mm3)"]
plt.plot(timepoints, volumes)
plt.title('Mouse "b128" Treatment Results')
plt.xlabel("Timepoint")
plt.ylabel("Tumor Volume (mm3)")
# Fixed axis limits chosen for this mouse's data
plt.ylim(37.5, 46)
plt.xlim(-.1, 45.3)
plt.show()

In [None]:
# Scatter plot of mouse weight versus average tumor volume for the Capomulin
# regimen: one point per mouse, both axes averaged over that mouse's rows.
capomulin_df = clean_df[clean_df["Drug Regimen"] == "Capomulin"]
per_mouse_means = capomulin_df.groupby("Mouse ID")[["Weight (g)", "Tumor Volume (mm3)"]].mean()

plt.scatter(per_mouse_means["Weight (g)"], per_mouse_means["Tumor Volume (mm3)"])
plt.title("Capomulin Treated Mice \nAverage Weight vs. Average Tumor Volume (mm3)")
plt.xlabel("Weight (g)")
plt.ylabel("Tumor Volume (mm3)")
plt.show()


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen.

# Per-mouse average weight and average tumor volume
weight = capomulin_df["Weight (g)"].groupby(capomulin_df["Mouse ID"]).mean()
avg_tumor_vol = capomulin_df["Tumor Volume (mm3)"].groupby(capomulin_df["Mouse ID"]).mean()

# Pearson correlation coefficient (first element of the result is r)
correlation = sts.pearsonr(weight, avg_tumor_vol)

# Least-squares regression line
slope, intercept, r_value, p_value, std_err = sts.linregress(weight, avg_tumor_vol)

# Fitted tumor volume for each mouse's weight
fit = slope * weight + intercept

# Equation text for the plot annotation. The sign is rendered separately so a
# negative intercept reads "x - 1.23" rather than "x +-1.23".
intercept_sign = "+" if intercept >= 0 else "-"
line_eq = f"y = {slope:.2f}x {intercept_sign} {abs(intercept):.2f}"

# Plot average weight vs. average tumor volume together with the fitted line
plt.scatter(weight, avg_tumor_vol, label="Mouse Data")
plt.plot(weight, fit, "r", label="fitted line")
# The annotation position is given in data coordinates
plt.annotate(line_eq, (18.7, 37.8), fontsize=15, color="red")
plt.title("Capomulin Treated Mice with Best Fit Line \nAverage Weight vs. Average Tumor Volume (mm3)")
plt.xlabel("Weight (g)")
plt.ylabel("Tumor Volume (mm3)")
plt.legend()

print(f"The correlation between both factors is {round(correlation[0],2)}")
print(f"The r square value is {round(r_value**2, 2)}.")
plt.show()