## Observations and Insights 

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import linregress
import numpy as np

# Locations of the two input CSV files
mouse_metadata_path = "Resources/Mouse_metadata.csv"
study_results_path = "Resources/Study_results.csv"

# Load the mouse metadata and the study results into DataFrames
mouse_data = pd.read_csv(mouse_metadata_path)
results = pd.read_csv(study_results_path)

# Join the two tables on the shared "Mouse ID" column; the inner join keeps
# only IDs that are present in both files
combined_df = mouse_data.merge(results, how="inner", on="Mouse ID")

# Preview the first rows of the combined table
combined_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [None]:
# Checking the number of mice.
# A mouse appears on several rows (one per recorded timepoint), so count the
# distinct IDs instead of the number of rows.
combined_df["Mouse ID"].nunique()

In [None]:
# Find the rows whose (Mouse ID, Timepoint) pair has already appeared earlier
# in the table; the first occurrence of each pair is not flagged.
repeat_mask = combined_df.duplicated(subset=["Mouse ID", "Timepoint"])
duplicate_mice = combined_df[repeat_mask]
duplicate_mice

In [None]:
# Optional: Get all the data for the duplicate mouse ID. 

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# A mouse with more than one record for the same timepoint has ambiguous
# measurements, so every row belonging to such a mouse is removed rather than
# only the repeated rows.
duplicate_ids = combined_df.loc[
    combined_df.duplicated(subset=["Mouse ID", "Timepoint"]), "Mouse ID"
].unique()
clean_df = combined_df[~combined_df["Mouse ID"].isin(duplicate_ids)]
clean_df

In [None]:
# Checking the number of mice in the clean DataFrame.
# Count distinct IDs: a row count would report measurements, not mice.
clean_df["Mouse ID"].nunique()

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# Use groupby and summary statistical methods to calculate the following properties of each drug regimen:
# mean, median, variance, standard deviation, and SEM of the tumor volume.
# Assemble the resulting series into a single summary dataframe.

# Group the cleaned data (not the raw merge) so this table is computed from
# the same rows as the aggregation-based summary.
groupby_drug = clean_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]
drug_mean = groupby_drug.mean()
drug_median = groupby_drug.median()
drug_variance = groupby_drug.var()
drug_std = groupby_drug.std()
drug_sems = groupby_drug.sem()

# NOTE(review): the SEM column label ends with a trailing space; it is kept
# as-is in case other code looks the column up by this exact name.
summary_df = pd.DataFrame(
    {
        "Mean Tumor Volume": drug_mean,
        "Median Tumor Volume": drug_median,
        "Variance Tumor Volume": drug_variance,
        "Std Tumor Volume": drug_std,
        "SEM Tumor Volume ": drug_sems,
    }
)
summary_df


In [None]:
# Same per-regimen tumor volume summary (mean, median, variance, standard
# deviation, SEM), produced with a single aggregation call.
tumor_stats = ["mean", "median", "var", "std", "sem"]
summary_aggreg = clean_df.groupby("Drug Regimen").agg(
    {"Tumor Volume (mm3)": tumor_stats}
)
summary_aggreg

## Bar and Pie Charts

In [None]:
# Bar chart (pandas plotting) of the number of unique mice tested on each drug regimen.
# Keeping a single row per Mouse ID makes every mouse count exactly once.
unique_df = clean_df.drop_duplicates(subset="Mouse ID", keep="first")
drug_data = unique_df["Drug Regimen"].value_counts()
bar_ax = drug_data.plot(
    kind="bar",
    figsize=(8, 4),
    title="Drug Treatment Count for Unique Mice",
)
bar_ax.set_ylabel("Number of Mice")

In [None]:
# Bar chart (pyplot) of the number of unique mice tested on each drug regimen.
# One bar position per regimen, labelled with the regimen names.
x_axis = np.arange(len(drug_data))
ticks = list(x_axis)

plt.bar(x_axis, drug_data, width=0.5, color="r", alpha=0.5, align="center")
plt.xticks(ticks, drug_data.index, rotation="vertical")

plt.title('Drug Treatment Count for Unique Mice')
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Mice")

plt.tight_layout()
plt.show()

In [None]:
# Pie chart (pandas plotting) of the sex distribution among unique mice.
# One row per Mouse ID so each mouse contributes once to the counts.
unique_df = clean_df.drop_duplicates(subset="Mouse ID", keep="first")
gender_data = unique_df["Sex"].value_counts()
pie_ax = gender_data.plot(
    kind="pie",
    y="Mouse ID",
    title="Gender Percentage of Unique Mice",
    autopct="%1.1f%%",
    startangle=60,
    explode=[0.1, 0],
    legend=True,
)
# Blank out the y-axis label
pie_ax.set_ylabel("")

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
colors = ["Yellow", "Pink"]
explode = [0.1, 0]
# Take the wedge labels from the counts themselves: value_counts() orders the
# categories by frequency, so a hard-coded label list could end up attached to
# the wrong wedges.
plt.pie(
    gender_data,
    labels=list(gender_data.index),
    colors=colors,
    explode=explode,
    autopct="%1.1f%%",
    startangle=60,
)
plt.legend(loc="best")
plt.title("Gender Percentage of Unique Mice")
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Final tumor volume of each mouse (used for the four regimens of interest:
# Capomulin, Ramicane, Infubinol, and Ceftamin).
# Step 1: find the greatest timepoint recorded for every mouse.
timepoints_by_mouse = clean_df.groupby("Mouse ID")["Timepoint"]
max_timepoint = timepoints_by_mouse.max()

# Step 2: join back onto the cleaned data so only each mouse's row at its
# last timepoint remains, carrying the tumor volume with it.
merge_df = pd.merge(
    left=max_timepoint,
    right=clean_df,
    how="inner",
    on=["Timepoint", "Mouse ID"],
)
merge_df

In [None]:
# Put treatments into a list for the for loop (and later for plot labels)
drug_list = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Create empty list to fill with tumor vol data (for plotting)
tumor_vol = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for drug in drug_list:
    # Locate the rows which contain mice on each drug and get the tumor volumes
    drug_df = merge_df.loc[:, "Drug Regimen"] == drug
    tumor_vol_data = merge_df.loc[drug_df, "Tumor Volume (mm3)"]

    # Add the subset to the list used for plotting
    tumor_vol.append(tumor_vol_data)

    # Fences sit 1.5 * IQR beyond the first and third quartiles
    quartiles = tumor_vol_data.quantile([0.25, 0.5, 0.75])
    first_q = quartiles[0.25]
    third_q = quartiles[0.75]
    iqr = third_q - first_q
    lower_fence = first_q - (1.5 * iqr)
    upper_fence = third_q + (1.5 * iqr)

    # Rounded copies are for display only; comparing against rounded bounds
    # could misclassify a value that lies within the rounding error of a fence.
    lower_b = round(lower_fence, 2)
    upper_b = round(upper_fence, 2)

    # Determine outliers using the exact upper and lower bounds
    outlier = (tumor_vol_data > upper_fence) | (tumor_vol_data < lower_fence)
    outlier_count = int(outlier.sum())
    print(f"{drug} has a lower bound of {lower_b} and an upper bound of {upper_b}")
    print(f"{drug} has {outlier_count} outliers")

In [None]:
# Box plot of the final tumor volume of each mouse, one box per regimen of interest.
# Outlier points are drawn as red circles.
outlier_marker = {"markerfacecolor": "r", "marker": "o"}
fig, ax = plt.subplots()
ax.boxplot(tumor_vol, flierprops=outlier_marker)

# Tick positions are the zero-based regimen positions shifted by one
x_axis = np.arange(len(drug_list))
ticks = list(x_axis + 1)
ax.set_xticks(ticks)
ax.set_xticklabels(drug_list)

ax.set_ylabel("Tumor Volume (mm3)")
ax.set_xlabel("Drugs")
ax.set_title("Final Tumor Volume")

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
