## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Locations of the two study CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the mouse metadata first, then the study results
mouse_data, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Join the two tables on their shared "Mouse ID" key, keeping rows from either side
merged1_df = mouse_data.merge(study_results, how="outer", on="Mouse ID")

# Preview the combined table
merged1_df

In [None]:
# Count the distinct mouse IDs in the merged data.
mice = len(pd.unique(merged1_df["Mouse ID"]))
mice

In [None]:
# Rows whose (Mouse ID, Timepoint) pair repeats an earlier row.
is_repeat = merged1_df.duplicated(subset=["Mouse ID", "Timepoint"])
duplicate = merged1_df.loc[is_repeat]
print(duplicate)

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# A mouse with repeated (Mouse ID, Timepoint) rows has ambiguous measurements,
# so every row belonging to such a mouse is removed, not only the repeated rows.
duplicate_rows = merged1_df.duplicated(["Mouse ID", "Timepoint"], keep=False)
duplicate_mouse_ids = merged1_df.loc[duplicate_rows, "Mouse ID"].unique()
# .copy() keeps clean_df independent of merged1_df, as drop_duplicates did
clean_df = merged1_df.loc[~merged1_df["Mouse ID"].isin(duplicate_mouse_ids)].copy()



In [None]:
# Number of distinct mouse IDs left in the clean DataFrame.
mice = len(pd.unique(clean_df["Mouse ID"]))
mice

In [None]:

# For every (mouse, regimen) pair, count the non-null entries in each remaining
# column, then turn the two group keys back into ordinary columns.
drug_group = clean_df.groupby(["Mouse ID", "Drug Regimen"]).count().reset_index()
drug_group

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# Select the tumor volume column BEFORE aggregating: aggregating the whole frame
# also tries to average the text columns, which newer pandas versions reject.
tumor_by_regimen = clean_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]
mean = tumor_by_regimen.mean()
median = tumor_by_regimen.median()
var = tumor_by_regimen.var()
std = tumor_by_regimen.std()
sem = tumor_by_regimen.sem()

# This method is the most straightforward, creating multiple series and putting them all together at the end.
summary_table = pd.DataFrame({"Mean Tumor Volume": mean,
                              "Median Tumor Volume": median,
                              "Tumor Volume Variance": var,
                              "Tumor Volume Std Dev": std,
                              "Tumor Volume SEM": sem})
summary_table

In [None]:
# Same per-regimen tumor volume summary (mean, median, variance, standard
# deviation, SEM), produced by one aggregation call on a single groupby.
summary_stats = ["mean", "median", "var", "std", "sem"]
clean_df.groupby("Drug Regimen")[["Tumor Volume (mm3)"]].agg(summary_stats)


In [None]:
# DataFrame showing the number of mice per treatment
m_d = drug_group.loc[:, ["Mouse ID", "Drug Regimen"]]
# drug_group holds one row per (mouse, regimen) pair, so counting rows counts mice
mouse_per = m_d.groupby("Drug Regimen").count()
mouse_per.reset_index()

In [None]:
# Flatten the per-regimen counts into a two-column table and give the
# columns display-friendly headers.
per_drug = mouse_per.reset_index()
per_drug.columns = ["Drug", "# of Mice"]
per_drug

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas.
# Name the x and y columns explicitly; otherwise the bars are labeled with the
# integer row index rather than with the drug names.
per_drug.plot(kind='bar', x="Drug", y="# of Mice",
              title="Number of Mice Per Drug", figsize=(15,10))

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.
# One bar position per row of per_drug
x_axis = np.arange(len(per_drug))
tick_locations = list(x_axis)

plt.title("Number of Mice Per Treatment")
plt.xlabel("Treatment")
plt.ylabel("Number of Mice")

plt.bar(x_axis, per_drug["# of Mice"], facecolor='b', alpha=0.75, align='center')
# per_drug's regimen column is named "Drug" (not "Drug Regimen") after its columns were relabeled
plt.xticks(tick_locations, per_drug["Drug"], rotation="vertical")
plt.show()


In [None]:
# Pie chart (pandas) of the female versus male split.
# NOTE(review): this counts rows of clean_df per sex, i.e. measurements rather
# than distinct mice - confirm that is the intended distribution.
s_d = clean_df[["Mouse ID", "Sex"]]
s_per = s_d.groupby("Sex").count()
s_per.plot(kind="pie", y="Mouse ID", figsize=(5, 5))

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
labels = ["Male", "Female"]
colors = ["blue", "pink"]
# Count the rows per sex from the cleaned data instead of hard-coding totals,
# so the chart stays correct whenever the data or the cleaning step changes.
# Reindexing by `labels` keeps counts, labels and colors in the same order.
amount = clean_df["Sex"].value_counts().reindex(labels, fill_value=0).tolist()
plt.pie(amount, labels=labels, colors=colors, autopct="%1.1f%%", shadow=True)
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Final tumor volume of each mouse for four treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Column-wise maximum per mouse; only its "Timepoint" (the last one recorded) is used below
df = clean_df.groupby("Mouse ID").max().reset_index()

# Look up the full measurement row recorded at each mouse's last timepoint
merged_data = pd.merge(
    df[["Mouse ID", "Timepoint"]],
    clean_df,
    how="left",
    on=["Mouse ID", "Timepoint"],
)


def _final_volumes(regimen):
    """Return the largest last-timepoint tumor volume per mouse for one regimen."""
    on_regimen = merged_data[merged_data["Drug Regimen"] == regimen]
    return on_regimen.groupby("Mouse ID")["Tumor Volume (mm3)"].max()


capomulin = _final_volumes("Capomulin")
ramicane = _final_volumes("Ramicane")
infubinol = _final_volumes("Infubinol")
ceftamin = _final_volumes("Ceftamin")

# One column per regimen; a mouse has a value only in its own regimen's column
focused_df = pd.DataFrame(
    {
        "Capomulin": capomulin,
        "Ramicane": ramicane,
        "Infubinol": infubinol,
        "Ceftamin": ceftamin,
    }
).reset_index()
focused_df

In [None]:
# Spot check: list every row for mouse a203 to verify its final Timepoint & Tumor Volume
clean_df[clean_df["Mouse ID"].eq("a203")]

In [None]:
# Put treatments into a list for the for loop (and later for plot labels)
treatments = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Final tumor volumes per treatment, in the same order as `treatments` (for plotting)
tumor_vol = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for treatment in treatments:
    # Each focused_df column is NaN for mice on the other regimens; keep real values only
    volumes = focused_df[treatment].dropna()
    tumor_vol.append(volumes)

    quartiles = volumes.quantile([.25, .5, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq

    print(f"The lower quartile of {treatment} is: {lowerq}")
    print(f"The upper quartile of {treatment} is: {upperq}")
    print(f"The interquartile range of {treatment} is: {iqr}")
    print(f"The median of {treatment} is: {quartiles[0.5]} ")

    # Values more than 1.5 * IQR outside the quartiles are flagged as potential outliers
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)
    print(f"Values below {lower_bound} could be outliers.")
    print(f"Values above {upper_bound} could be outliers.")

    outliers = volumes[(volumes < lower_bound) | (volumes > upper_bound)]
    print(f"Potential outliers for {treatment}: {outliers.tolist()}")

    

In [None]:
# Locate the rows which contain mice on each drug and get the tumor volumes.
# A row belongs to a drug exactly when that drug's column is populated, so test
# for presence directly instead of comparing against an arbitrary threshold.
only_cap = focused_df.loc[focused_df["Capomulin"].notna()]
only_ram = focused_df.loc[focused_df["Ramicane"].notna()]
only_infu = focused_df.loc[focused_df["Infubinol"].notna()]
only_ceft = focused_df.loc[focused_df["Ceftamin"].notna()]

# TODO: Determine outliers using upper and lower bounds

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
boxplot_regimens = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Each focused_df column is NaN for mice on the other regimens; drop those
# placeholders so every box summarizes only that regimen's measurements.
final_volumes = [focused_df[regimen].dropna().to_numpy() for regimen in boxplot_regimens]

fig1, ax1 = plt.subplots()
ax1.set_title('Final Tumor Volume by Regimen')
ax1.set_ylabel('Final Tumor Volume (mm3)')
ax1.boxplot(final_volumes)
# Label the boxes after drawing; they are drawn in the order of boxplot_regimens
ax1.set_xticklabels(boxplot_regimens)
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin
capomulin_rows = clean_df.loc[clean_df["Drug Regimen"] == "Capomulin"]

# NOTE(review): confirm "y973" is a Capomulin mouse ID. If it is not present
# among the Capomulin rows, fall back to the first Capomulin mouse so the
# chart is never silently empty.
mouse_id = "y973"
if mouse_id not in set(capomulin_rows["Mouse ID"]):
    fallback_id = capomulin_rows["Mouse ID"].iloc[0]
    print(f"Mouse {mouse_id} has no Capomulin rows; plotting mouse {fallback_id} instead.")
    mouse_id = fallback_id

# Order by time so the line is drawn left to right
mouse = capomulin_rows.loc[capomulin_rows["Mouse ID"] == mouse_id].sort_values("Timepoint")

plt.plot(mouse["Timepoint"], mouse["Tumor Volume (mm3)"],
         color="green", label=f"Mouse {mouse_id}")
plt.title("Capomulin: Tumor Volume over Time")
# Call the pyplot label functions; assigning to them would replace the functions themselves
plt.xlabel("Timepoint")
plt.ylabel("Tumor Volume (mm3)")
plt.legend()

# Show the chart
plt.show()

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen
# The weight column's exact header is not referenced anywhere else in this
# notebook, so locate it by name instead of guessing it.
weight_columns = [col for col in clean_df.columns if "weight" in str(col).lower()]
if not weight_columns:
    raise KeyError("clean_df has no column whose name contains 'weight'")
weight_column = weight_columns[0]

capomulin_rows = clean_df.loc[clean_df['Drug Regimen'] == "Capomulin"]

# One point per mouse: its mean weight against its mean tumor volume
per_mouse = capomulin_rows.groupby("Mouse ID")[[weight_column, 'Tumor Volume (mm3)']].mean()
x_values = per_mouse[weight_column]
y_values = per_mouse['Tumor Volume (mm3)']

plt.scatter(x_values, y_values)
plt.xlabel('Mouse Weight')
plt.ylabel('Tumor Volume')
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
