## Observations and Insights

## Dependencies and starter code

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st

# Locations of the two study CSV files
mouse_metadata = "data/Mouse_metadata.csv"
study_results = "data/Study_results.csv"

# Load each file into its own DataFrame
study_results_df = pd.read_csv(study_results)
mouse_metadata_df = pd.read_csv(mouse_metadata)

# Left-join the study results onto the mouse metadata, matching on "Mouse ID"
combined_data_df = mouse_metadata_df.merge(
    study_results_df,
    how="left",
    on="Mouse ID",
)

# Preview the first rows of the merged table
combined_data_df.head()

## Summary statistics

In [None]:
# Summary statistics of tumor volume for each drug regimen:
# mean, median, variance, standard deviation, and SEM.

# Group the tumor volumes by regimen once and reuse the grouping below
tumor_vol_by_regimen = combined_data_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]

regimen_avg = tumor_vol_by_regimen.mean()
regimen_median = tumor_vol_by_regimen.median()
regimen_var = tumor_vol_by_regimen.var()
regimen_std = tumor_vol_by_regimen.std()
regimen_sem = tumor_vol_by_regimen.sem()

# Assemble the five statistics into one table with one row per regimen
summary_columns = {
    "Mean": regimen_avg,
    "Median": regimen_median,
    "Variance": regimen_var,
    "Std Dev": regimen_std,
    "SEM": regimen_sem,
}
regimen_df = pd.DataFrame(summary_columns)
regimen_df




## Bar plots

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen using pandas

# Single-column frame of regimen names (also used by the pyplot version of this chart)
regimen_treat = combined_data_df[["Drug Regimen"]]

# Number of rows (data points) recorded for each regimen
data_pts_regimen = regimen_treat["Drug Regimen"].value_counts()

# The x=/y= arguments of plot.bar select DataFrame columns; they do not label
# the axes of a Series plot. Set the axis labels explicitly on the returned Axes.
pand_bplot = data_pts_regimen.plot.bar(rot=90)
pand_bplot.set_xlabel("Drug Treatment")
pand_bplot.set_ylabel("Count")
pand_bplot

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen using pyplot

# Number of rows (data points) recorded for each regimen
drug_regimen = regimen_treat["Drug Regimen"].value_counts()

# Take the tick labels from the counts' own index so every bar is paired with
# its regimen. unique() returns values in order of first appearance while
# value_counts() is ordered by descending count, so mixing the two mislabels bars.
x_axis = drug_regimen.index.to_numpy()

plt.bar(x_axis, drug_regimen.values, color="b", align="center")
plt.xticks(rotation=90)
plt.title("Drug Regimen Volume")
plt.xlabel("Drug Regimen")
plt.ylabel("Count")

# Leave a small margin left/right of the bars and headroom above the tallest one
plt.xlim(-0.65, len(x_axis)-0.35)
plt.ylim(0, max(drug_regimen)+25)




## Pie plots

In [None]:
# Tally how many rows belong to each sex; intended as input for the pandas pie chart
male_female = combined_data_df.loc[:, ["Sex"]]
male_female_ct = male_female.loc[:, "Sex"].value_counts()

# Display the counts
male_female_ct

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

# Count the rows per sex directly from the data instead of hard-coding the
# totals, so the chart stays correct if the underlying data changes
sex_counts = combined_data_df["Sex"].value_counts()

# One "Sex" column of counts, indexed by the sex labels (used as wedge labels)
mvf_df = pd.DataFrame({"Sex": sex_counts.tolist()}, index=list(sex_counts.index))

plot = mvf_df.plot(kind="pie", y="Sex", autopct='%1.1f%%', figsize=(5, 5))



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
mice_sex = combined_data_df[["Sex"]]

# Wedge sizes: number of rows per sex
sizes = mice_sex["Sex"].value_counts()

# Take the labels from the counts' own index so each wedge gets the right label.
# unique() (order of first appearance) is not guaranteed to match the
# descending-count order of value_counts().
labels = list(sizes.index)

colors = ["red", "blue"]
# No wedge is pulled out; one zero offset per wedge
explode = (0,) * len(sizes)
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct="%1.1f%%", startangle=0)

## Quartiles, outliers and boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens.

# Keep only the rows for Capomulin, Ramicane, Infubinol, and Ceftamin
promising_regimens = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]
promising_treatment_df = combined_data_df.loc[combined_data_df["Drug Regimen"].isin(promising_regimens)]

# Final tumor volume per mouse. Rows are ordered by Timepoint first so that
# last() picks the latest measurement regardless of the row order in the file.
tum_vol_promising_treatment = (
    promising_treatment_df.sort_values("Timepoint")
    .groupby(["Mouse ID", "Drug Regimen"])["Tumor Volume (mm3)"]
    .last()
)

# Table of final tumor volume indexed by (Mouse ID, Drug Regimen)
final_tum_vol_mid_df = pd.DataFrame({"Final Tumor Volume (mm3)": tum_vol_promising_treatment})
final_tum_vol_mid_df





In [None]:
#Calculate the IQR and quantitatively determine if there are any potential outliers.

final_tumor_volume = final_tum_vol_mid_df["Final Tumor Volume (mm3)"]

# Quartiles of the final tumor volumes pooled over all four regimens
quartiles = final_tumor_volume.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

print(f"The lower quartile of final tumor volumes for Capomulin, Ramicane, Infubinol, and Ceftamin is: {lowerq}")
print(f"The upper quartile of final tumor volumes for Capomulin, Ramicane, Infubinol, and Ceftamin is: {upperq}")
print(f"The interquartile of final tumor volumes for Capomulin, Ramicane, Infubinol, and Ceftamin is: {iqr}")
print(f"The median of final tumor volumes for Capomulin, Ramicane, Infubinol, and Ceftamin is: {quartiles[0.5]} ")

# 1.5 * IQR rule: values outside these bounds are flagged as potential outliers
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {lower_bound} would be outliers.")
print(f"Values above {upper_bound} would be outliers.")

# Test the data against the bounds instead of asserting the outcome in a comment
outliers = final_tumor_volume[(final_tumor_volume < lower_bound) | (final_tumor_volume > upper_bound)]
print(f"Number of potential outliers across the four regimens combined: {len(outliers)}")

# Pooling the regimens can hide an outlier within a single regimen, so repeat
# the same check separately for each regimen (second level of the index).
for regimen, regimen_volumes in final_tumor_volume.groupby(level="Drug Regimen"):
    regimen_quartiles = regimen_volumes.quantile([.25, .75])
    regimen_iqr = regimen_quartiles[0.75] - regimen_quartiles[0.25]
    regimen_lower = regimen_quartiles[0.25] - (1.5*regimen_iqr)
    regimen_upper = regimen_quartiles[0.75] + (1.5*regimen_iqr)
    regimen_outliers = regimen_volumes[(regimen_volumes < regimen_lower) | (regimen_volumes > regimen_upper)]
    print(f"{regimen}: {len(regimen_outliers)} potential outlier(s) outside [{regimen_lower}, {regimen_upper}]")


In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

# Split the final tumor volumes by regimen (second level of the index) so each
# regimen gets its own box instead of pooling all mice into a single box
volumes_by_regimen = {
    regimen: volumes.values
    for regimen, volumes in final_tumor_volume.groupby(level="Drug Regimen")
}

fig1, ax1 = plt.subplots()
ax1.set_title("Final Tumor Volume across 4 Treatments")
ax1.set_xlabel("Drug Regimen")
ax1.set_ylabel("Tumor Volume (mm3)")
ax1.boxplot(list(volumes_by_regimen.values()))
# Label each box with its regimen name (boxes are drawn in dict order)
ax1.set_xticklabels(list(volumes_by_regimen))
plt.show()

## Line and scatter plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

# All measurements for mice treated with Capomulin (reused by the weight scatter plot)
tum_vol_cap_df = combined_data_df.loc[combined_data_df["Drug Regimen"].isin(["Capomulin"])]

# Pick one Capomulin mouse (the first one listed) and take its measurements in
# time order. x and y must come from the same rows: pairing the study-wide
# unique timepoints with every Capomulin measurement gives mismatched lengths.
capomulin_mouse_id = tum_vol_cap_df["Mouse ID"].iloc[0]
single_mouse_df = tum_vol_cap_df.loc[tum_vol_cap_df["Mouse ID"] == capomulin_mouse_id].sort_values("Timepoint")

x_values = single_mouse_df["Timepoint"]
y_values = single_mouse_df["Tumor Volume (mm3)"]

# Create line plot
plt.plot(x_values, y_values, marker="o")
plt.title(f"Capomulin treatment of mouse {capomulin_mouse_id}")
plt.xlabel("Timepoint")
plt.ylabel("Tumor Volume (mm3) for Capomulin")
plt.show()



In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

# Weight column for every Capomulin measurement row (one row per measurement)
mouse_weight = tum_vol_cap_df[["Weight (g)"]]

# Average weight and tumor volume per mouse, giving one point per mouse.
# NOTE(review): weight is presumably constant per mouse, so its mean is just
# that mouse's weight -- confirm against the metadata file.
capomulin_per_mouse = tum_vol_cap_df.groupby("Mouse ID")[["Weight (g)", "Tumor Volume (mm3)"]].mean()
avg_weight = capomulin_per_mouse["Weight (g)"]
avg_tumor_vol = capomulin_per_mouse["Tumor Volume (mm3)"]

plt.scatter(avg_weight, avg_tumor_vol)
plt.title("Mouse Weight vs. Average Tumor Volume (Capomulin)")
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.show()



In [None]:
# Calculate the correlation coefficient and linear regression model for mouse weight and average tumor volume for the Capomulin regimen