## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy import stats
from sklearn.linear_model import LinearRegression

# Locations of the two study CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset: the two tables are joined on
# their shared "Mouse ID" column (pandas' default inner join).
metadata = pd.DataFrame(mouse_metadata)
results = pd.DataFrame(study_results)
df3 = metadata.merge(results, on="Mouse ID")

# Display the data table for preview
df3

In [None]:
# Strip the spaces from two column names so they can also be reached with
# attribute access (e.g. uf3.MouseID, uf3.DrugRegimen).
uf3 = df3.rename(
    columns={
        "Mouse ID": "MouseID",
        "Drug Regimen": "DrugRegimen",
    }
)
uf3

In [None]:
# Checking the number of mice.
# value_counts() yields one entry per distinct mouse ID (with its row count),
# so the length of the result is the number of mice.
pip = uf3.MouseID.value_counts()
pip

In [None]:
# Optional: get all the data for the duplicate mouse ID.
# 'g989' is the ID this notebook treats as the duplicate (it shows up more
# than once for the same Mouse ID and Timepoint).
two_mouse = uf3[uf3["MouseID"] == "g989"]
two_mouse

In [None]:
# Remove the duplicate mouse's entry from the per-mouse row counts.
dropedpip = pip.drop(index=["g989"])
dropedpip

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# Step 1: move MouseID into the index so rows can be dropped by label.
unf_alt = uf3.set_index(keys="MouseID")
unf_alt

In [None]:
# Step 2: drop every row whose index label is the duplicate mouse 'g989'.
unf_clean = unf_alt.drop("g989", axis="index")
unf_clean

In [None]:
# Preview the cleaned DataFrame. (Calling `.loc()` with no indexer only
# returns the indexer object itself, not any rows.)
unf_clean.head()

In [None]:
# Checking the number of mice in the clean DataFrame.
# Count the distinct mouse IDs directly from the cleaned frame (MouseID is
# its index) instead of from the separately maintained value_counts series.
unf_clean.index.nunique()

## Summary Statistics (Done)

In [None]:
# Generate a summary statistics table of mean, median, variance, standard
# deviation, and SEM of the tumor volume for each regimen.
#
# Use groupby and summary statistical methods to calculate each property of
# each drug regimen, then assemble the resulting series into a single
# summary dataframe.

# Select the tumor-volume column *before* aggregating. Aggregating the whole
# grouped frame also tries to reduce the non-numeric columns (e.g. Sex),
# which raises TypeError on pandas >= 2.0 where numeric_only defaults to False.
fig = unf_clean.groupby("DrugRegimen")
tumor_by_regimen = fig["Tumor Volume (mm3)"]

avg_tumor = tumor_by_regimen.mean()
med_tumor = tumor_by_regimen.median()
var_tumor = tumor_by_regimen.var()
std_tumor = tumor_by_regimen.std()
sem_tumor = tumor_by_regimen.sem()

# One row per drug regimen, one column per statistic.
summary_data = pd.DataFrame({
    "mean": avg_tumor,
    "median": med_tumor,
    "variance": var_tumor,
    "standard deviation": std_tumor,
    "sem": sem_tumor,
})
summary_data



In [None]:
# Generate a summary statistics table of mean, median, variance, standard
# deviation, and SEM of the tumor volume for each regimen.

# Using the aggregation method, produce the same summary statistics with a
# single agg() call on the grouped tumor-volume column.
(
    unf_clean
    .groupby("DrugRegimen")["Tumor Volume (mm3)"]
    .agg(["mean", "median", "var", "std", "sem"])
)

## Bar and Pie Charts (Done)

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice
# tested for each drug regimen using Pandas.
# First build the table behind the plot: the number of Timepoint entries per
# regimen, ordered from the most-observed regimen to the least.
barli = unf_clean.groupby(["DrugRegimen"])
gather3 = barli["Timepoint"].count()
frame3 = gather3.to_frame()
sortframe3 = frame3.sort_values(by="Timepoint", ascending=False)
sortframe3

In [None]:
# Draw the bar chart through the DataFrame's plotting accessor, then label
# the returned axes.
bar_ax = sortframe3.plot.bar(
    title="Total Number of Timepoints for All Mice Tested for Each Drug Regimen"
)
bar_ax.set_ylabel("Total Number of Timepoints")
bar_ax.legend(["Total Number of Timepoints"])
plt.show()

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice
# tested for each drug regimen using pyplot.
plt.bar(x=sortframe3.index, height=sortframe3["Timepoint"])
plt.title("Total Number of Timepoints for All Mice Tested for Each Drug Regimen")
plt.xlabel("DrugRegimen")
plt.ylabel("Total Number of Timepoints")
# Regimen names are long; turn them sideways so they do not overlap.
plt.xticks(rotation=90)
plt.legend(["Total Number of Timepoints"])
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas
# Count the non-null DrugRegimen entries recorded under each sex.
# (The former `unf_clean.reset_index()` call was removed: its result was
# discarded, so it had no effect.)
# NOTE(review): this counts measurement rows, not distinct mice - confirm
# which of the two the chart is meant to show.
pie = unf_clean.groupby("Sex")
gather4 = pie.count()
did = pd.DataFrame(gather4["DrugRegimen"])
did

In [None]:
# The column holds per-sex counts, so label it "Count".
dip = did.rename({"DrugRegimen": "Count"}, axis="columns")
dip

In [None]:
# Pie chart via the DataFrame plotting accessor; subplots=True draws one pie
# per column (here only "Count"), with each slice annotated as a percentage.
dip.plot.pie(
    subplots=True,
    autopct="%1.1f%%",
    title="The Distribution of Female vs. Male Mice",
)
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
plt.pie(x=dip["Count"], labels=dip.index, autopct="%1.1f%%")
plt.title("The Distribution of Female vs. Male Mice")
plt.ylabel("Count")
plt.legend(labels=dip.index, loc="best")
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse.
# The result is a Series named "Timepoint" keyed by MouseID.
bit = unf_clean.groupby("MouseID")
gatherdata = bit["Timepoint"].max()

# Merge this group df with the original dataframe to get the tumor volume at
# the last timepoint. The join keys are passed as a list, the documented form
# for several keys; "MouseID" is expected to resolve to the index level and
# "Timepoint" to the column on both sides.
mergedata = pd.merge(gatherdata, unf_clean, on=["MouseID", "Timepoint"])
mergedata

In [None]:
# Put treatments into a list (also handy later for plot labels)
treatments = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Keep only the final-timepoint rows for those regimens, and only the two
# columns needed for the quartile and box-plot work.
mergedata_isin = mergedata[mergedata["DrugRegimen"].isin(treatments)]
date = mergedata_isin[["DrugRegimen", "Tumor Volume (mm3)"]]
date

# Create empty list to fill with tumor vol data (for plotting)
    

In [None]:
# Split the final tumor volumes by regimen; each Series is sorted from the
# smallest volume to the largest.
date_cap = date[date["DrugRegimen"] == "Capomulin"]
vol_cap = date_cap["Tumor Volume (mm3)"].sort_values()

date_ceft = date[date["DrugRegimen"] == "Ceftamin"]
vol_ceft = date_ceft["Tumor Volume (mm3)"].sort_values()

date_ram = date[date["DrugRegimen"] == "Ramicane"]
vol_ram = date_ram["Tumor Volume (mm3)"].sort_values()

date_inf = date[date["DrugRegimen"] == "Infubinol"]
vol_inf = date_inf["Tumor Volume (mm3)"].sort_values()

# Replace the original row labels with 0, 1, 2, ... so the four Series line
# up side by side by rank rather than by their source labels.
ceft_index = vol_ceft.reset_index(drop=True)
cap_index = vol_cap.reset_index(drop=True)
ram_index = vol_ram.reset_index(drop=True)
inf_index = vol_inf.reset_index(drop=True)

# One column per regimen.
four_drugs = pd.DataFrame({
    "Ceftamin": ceft_index,
    "Capomulin": cap_index,
    "Ramicane": ram_index,
    "Infubinol": inf_index,
})
four_drugs

In [None]:
# Calculate the IQR and quantitatively determine if there are any potential outliers.


def _quartile_stats(volumes):
    """Return the quartile Series and derived statistics for one regimen.

    The returned tuple is, in order: the quantile Series for 25/50/75 %,
    the lower quartile, the median, the upper quartile, the IQR, the lower
    bound (Q1 - 1.5 * IQR), the upper bound (Q3 + 1.5 * IQR), the maximum
    and the minimum of ``volumes``.
    """
    quartiles = volumes.quantile([.25, .5, .75])
    q1 = quartiles[.25]
    q2 = quartiles[.5]
    q3 = quartiles[.75]
    iqr = q3 - q1
    lower_bound = q1 - (1.5*iqr)
    upper_bound = q3 + (1.5*iqr)
    return (quartiles, q1, q2, q3, iqr, lower_bound, upper_bound,
            volumes.max(), volumes.min())


# Unpack into the per-regimen names used by the cells below.
(quartile_cap, lowquart_cap, medquart_cap, upquart_cap, iqrquart_cap,
 lowbquart_cap, upbquart_cap, max_cap, min_cap) = _quartile_stats(vol_cap)

(quartile_ram, lowquart_ram, medquart_ram, upquart_ram, iqrquart_ram,
 lowbquart_ram, upbquart_ram, max_ram, min_ram) = _quartile_stats(vol_ram)

(quartile_inf, lowquart_inf, medquart_inf, upquart_inf, iqrquart_inf,
 lowbquart_inf, upbquart_inf, max_inf, min_inf) = _quartile_stats(vol_inf)

(quartile_ceft, lowquart_ceft, medquart_ceft, upquart_ceft, iqrquart_ceft,
 lowbquart_ceft, upbquart_ceft, max_ceft, min_ceft) = _quartile_stats(vol_ceft)

# Each list is ordered Capomulin, Ceftamin, Infubinol, Ramicane to match the
# row index given to the DataFrame.
data = {
    "up": [upquart_cap, upquart_ceft, upquart_inf, upquart_ram],
    "med": [medquart_cap, medquart_ceft, medquart_inf, medquart_ram],
    "low": [lowquart_cap, lowquart_ceft, lowquart_inf, lowquart_ram],
    "iqr": [iqrquart_cap, iqrquart_ceft, iqrquart_inf, iqrquart_ram],
    "upb": [upbquart_cap, upbquart_ceft, upbquart_inf, upbquart_ram],
    "lowb": [lowbquart_cap, lowbquart_ceft, lowbquart_inf, lowbquart_ram],
    "max": [max_cap, max_ceft, max_inf, max_ram],
    "min": [min_cap, min_ceft, min_inf, min_ram],
}

outlier = pd.DataFrame(data, index=["Capomulin", "Ceftamin", "Infubinol", "Ramicane"])
outlier

In [None]:
# Report the bounds beyond which a final tumor volume counts as a potential
# outlier. The upper threshold is the upper *bound* (Q3 + 1.5 * IQR), not the
# upper quartile itself.
for regimen, upper_bound, lower_bound in [
    ("Capomulin", upbquart_cap, lowbquart_cap),
    ("Ceftamin", upbquart_ceft, lowbquart_ceft),
    ("Infubinol", upbquart_inf, lowbquart_inf),
    ("Ramicane", upbquart_ram, lowbquart_ram),
]:
    print(
        f"The outliers for {regimen} could be higher than {round(upper_bound,2)} "
        f"or lower than {round(lower_bound,2)}"
    )

In [None]:
# Count, for each regimen, the final tumor volumes at or beyond the lower and
# upper outlier bounds. Driving the report from one table keeps every label
# paired with the data it describes (the Capomulin counts were previously
# printed under the name "Ceftamin").
for regimen, volumes, lower_bound, upper_bound in [
    ("Capomulin", vol_cap, lowbquart_cap, upbquart_cap),
    ("Ceftamin", vol_ceft, lowbquart_ceft, upbquart_ceft),
    ("Ramicane", vol_ram, lowbquart_ram, upbquart_ram),
    ("Infubinol", vol_inf, lowbquart_inf, upbquart_inf),
]:
    lower_count = volumes.loc[volumes <= lower_bound].count()
    upper_count = volumes.loc[volumes >= upper_bound].count()
    print(f"Number of lower outliers for {regimen}: {lower_count}")
    print(f"Number of upper outliers for {regimen}: {upper_count}")

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
fig1, ax1 = plt.subplots()
x = [1, 2, 3, 4]
# Boxes are drawn at positions 1..4 in the order the Series are listed, so
# the tick labels below follow that same order.
ax1.boxplot([vol_cap, vol_ram, vol_inf, vol_ceft])
ax1.set_xticks(x)
ax1.set_xticklabels(['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin'])
ax1.set_ylabel("Final Tumor Volume")
ax1.set_title("The Final Tumor Volume of Each Mouse Across Four Regimens of Interest")
plt.show()

## Line and Scatter Plots (Done)

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin

In [None]:
# All rows for mice on the Capomulin regimen.
# NOTE(review): this filters uf3 (before the duplicate mouse was dropped),
# not the cleaned frame - confirm that is intended.
bdb = uf3[uf3["DrugRegimen"] == "Capomulin"]
bdb

In [None]:
# Narrow the Capomulin rows down to a single mouse, s185.
one_mouse = bdb[bdb["MouseID"] == "s185"]
one_mouse

In [None]:

# Line plot of every Capomulin measurement, in row order. Each point sits at
# its row position and is labelled with that row's timepoint.
x_axis = np.arange(len(bdb))
tick_locations = list(x_axis)

# Very wide figure so the per-row tick labels stay legible.
plt.figure(figsize=(70,4))
plt.plot(x_axis, bdb["Tumor Volume (mm3)"], marker="o")
# bdb holds all Capomulin mice, not only s185, so label the axis accordingly.
plt.xlabel("Timepoint (all Capomulin mice, in row order)")
plt.ylabel("Tumor Volume for Each Time Point")

plt.xticks(tick_locations, bdb["Timepoint"])

plt.xlim(-1, len(bdb))
plt.tight_layout()
# Actually call show() (the bare `plt.show` reference did nothing), and do it
# last so the labels and ticks are applied first.
plt.show()

In [None]:
# Line plot of tumor volume vs. time point for mouse s185 (Capomulin).
# Plot against the recorded timepoint values themselves so the x-axis is to
# scale, rather than against evenly spaced row positions.
plt.figure(figsize=(7,4))
plt.plot(one_mouse["Timepoint"], one_mouse["Tumor Volume (mm3)"], marker="o")
plt.xlabel("Timepoint")
plt.ylabel("Tumor Volume for Each Time Point")
# Set the title before tight_layout() so the layout pass leaves room for it.
plt.title("Tumor Volume vs. Time Point for Mouse s185 Treated with Capomulin")
plt.tight_layout()
plt.show()

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
# Group by mouse and weight together so the weight is carried into the
# result's index next to each mouse's mean tumor volume.
dib = bdb.groupby(["MouseID", "Weight (g)"])
gather_data = dib["Tumor Volume (mm3)"].agg("mean")
gather_data

In [None]:
# Drop the MouseID level (the first index level) so the Series is indexed by
# weight alone.
drop_data = gather_data.droplevel("MouseID")
drop_data.to_frame()

In [None]:
# drop_data is indexed by weight and holds the average tumor volumes, so it
# supplies both axes directly.
plt.scatter(drop_data.index, drop_data)
plt.title("Average Tumor Volume vs. Mouse Weight for the Capomulin Regimen")
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume")
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient
# for mouse weight and average tumor volume for the Capomulin regimen
# (the linear regression itself is fitted in the next cell).
# pearsonr returns the coefficient first and the p-value second.
correlation = st.pearsonr(drop_data.index, drop_data)
print(f"The correlation between mouse weight and average tumor volume is {round(correlation[0],2)}")

In [None]:
# Fit a degree-1 polynomial (straight line) of average tumor volume on mouse
# weight, then draw the fitted line over the scatter of the data.
weights = drop_data.index
avg_volumes = drop_data

m, b = np.polyfit(weights, avg_volumes, 1)
plt.plot(weights, m*weights + b, color="red")
line_eq = f"y = {round(m,2)}x + {round(b,2)}"
plt.scatter(weights, avg_volumes)
# Place the fitted equation at data coordinates (20, 39).
plt.annotate(line_eq, (20,39), fontsize=15, color="red")
plt.title("Average Tumor Volume vs. Mouse Weight for the Capomulin Regimen")
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume")
plt.show()