## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two CSV inputs for the study
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Outer-join the two tables on their shared "Mouse ID" column
merged_df = mouse_metadata.merge(study_results, how="outer", on="Mouse ID")
# Preview the first rows of the combined table
merged_df.head()

In [None]:
# Count the mice: non-null "Mouse ID" entries (one per row) versus distinct IDs.
mouse_ids = merged_df['Mouse ID']
miceCount = mouse_ids.count()
miceCountunq = mouse_ids.nunique()
for tally in (miceCount, miceCountunq):
    print(tally)

print(f"The number of mice is {miceCountunq}")

In [None]:
# Find the mice that have more than one record for the same Timepoint.
# `duplicated` flags each repeat of a (Mouse ID, Timepoint) pair after its first
# occurrence; the distinct Mouse IDs of those flagged rows are collected.
is_repeat = merged_df.duplicated(subset=['Mouse ID', 'Timepoint'])
duplicateMiceID = merged_df.loc[is_repeat, 'Mouse ID'].unique()
duplicateMiceID

In [None]:
# Optional: Get all the data for the duplicate mouse ID(s).
# Select every row belonging to a mouse listed in duplicateMiceID rather than
# a hard-coded ID, so the selection stays correct if the input data changes.
duplicateMice_df = merged_df.loc[merged_df['Mouse ID'].isin(duplicateMiceID), :]
duplicateMice_df

In [None]:
# Build the clean DataFrame: keep only rows whose mouse is NOT in the duplicate list.
from_duplicate_mouse = merged_df['Mouse ID'].isin(duplicateMiceID)
clean_df = merged_df[~from_duplicate_mouse]
clean_df.head()

In [None]:
# Count the distinct Mouse IDs that remain in the clean DataFrame.
remaining_ids = clean_df['Mouse ID']
cleanMiceCount = remaining_ids.nunique()

print(f" The number of mice (with unique IDs) in the clean DataFrame is {cleanMiceCount}")

## Summary Statistics

In [None]:
# Per-regimen summary of tumor volume: mean, median, variance, standard deviation and SEM.
# Each statistic is computed as its own Series (indexed by drug regimen) and the
# five Series are then assembled into a single summary DataFrame.
tumor_by_regimen = clean_df.groupby('Drug Regimen')['Tumor Volume (mm3)']

mean = tumor_by_regimen.mean()
median = tumor_by_regimen.median()
var = tumor_by_regimen.var()
std = tumor_by_regimen.std()
sem = tumor_by_regimen.sem()

# One column per statistic, one row per drug regimen
summary_columns = {
    "Mean": mean,
    "Median": median,
    "Variance": var,
    "Standard Deviation": std,
    "Standard Error": sem,
}
summaryStats = pd.DataFrame(summary_columns)
# Display
summaryStats


In [None]:
# Same per-regimen tumor-volume summary (mean, median, variance, standard deviation, SEM),
# produced with a single groupby/agg call instead of one call per statistic.
stat_names = ['mean', 'median', 'var', 'std', 'sem']
Summary_df = clean_df.groupby('Drug Regimen').agg({'Tumor Volume (mm3)': stat_names})
Summary_df

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.


In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.


In [None]:
# Pie chart, drawn through the pandas plotting API, of how the rows of clean_df split by sex.
genderData = clean_df["Sex"].value_counts()
plt.title("Sex")
genderData.plot(kind="pie", autopct="%1.1f%%")
plt.show()


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
# Labels and wedge sizes are taken from the genderData counts rather than
# hard-coded, and the chart is drawn with plt.pie so this cell really uses the
# pyplot interface instead of the pandas plotting wrapper.
genderLabels = genderData.index.tolist()
sizes = genderData.tolist()
plt.pie(sizes, labels=genderLabels, autopct="%1.1f%%")
plt.title('Male vs Female Population')
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin
drugList = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']
# Work from the de-duplicated data so rows of the removed mouse cannot contribute.
drug_df = clean_df[clean_df["Drug Regimen"].isin(drugList)]

# Start by getting the last (greatest) timepoint for each mouse
finalTimepoint_df = drug_df.groupby('Mouse ID', as_index=False)['Timepoint'].max()

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint
finalVolume_df = finalTimepoint_df.merge(drug_df, on=['Mouse ID', 'Timepoint'], how='left')

# Reshape to one column per regimen (rows are mice; NaN where a mouse was on a
# different regimen) so a regimen's final volumes can be read as lastTimepoint[drug].
lastTimepoint = finalVolume_df.pivot(index='Mouse ID',
                                     columns='Drug Regimen',
                                     values='Tumor Volume (mm3)')

In [None]:
# Calculate the IQR for each regimen in drugList and quantitatively determine
# whether there are any potential outliers among the final tumor volumes.
# NOTE(review): assumes lastTimepoint exposes one column per regimen holding the
# final tumor volume of each mouse -- confirm against the cell that builds it.
separator = "------------------------------------------------------------"
print(separator)
for drug in drugList:
    # Final tumor volumes recorded for this regimen (missing entries dropped)
    volumes = lastTimepoint[drug].dropna()

    # Keep full precision for the arithmetic; round only when printing so the
    # bounds are not skewed by rounding the quartiles first.
    lowerq = volumes.quantile(0.25)
    upperq = volumes.quantile(0.75)
    iqr = upperq - lowerq

    # Conventional 1.5 * IQR fences
    lower_bound = lowerq - 1.5 * iqr
    upper_bound = upperq + 1.5 * iqr

    # Determine outliers using upper and lower bounds
    outliers = volumes[(volumes < lower_bound) | (volumes > upper_bound)]

    print(f"The lower quartile of {drug} treatments is: {round(lowerq, 2)}")
    print(f"The upper quartile of {drug} treatments is: {round(upperq, 2)}")
    print(f"The interquartile range of {drug} treatments is: {round(iqr, 2)}")
    print(f"Values below {round(lower_bound, 2)} could be {drug} outliers.")
    print(f"Values above {round(upper_bound, 2)} could be {drug} outliers.")
    if outliers.empty:
        print(f"No potential {drug} outliers were found.")
    else:
        print(f"Potential {drug} outliers: {outliers.round(2).tolist()}")
    print(separator)
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
# One list of final tumor volumes per regimen, in drugList order (missing values dropped).
boxplotList = [list(lastTimepoint[drug].dropna()) for drug in drugList]

fig = plt.figure()
plt.boxplot(boxplotList)
# Apply the regimen names after drawing: boxplot adjusts the x tick locations
# and labels to match its box positions, so custom labels are set afterwards.
plt.xticks(list(range(1, len(drugList) + 1)), drugList, rotation=45)
plt.xlabel("Regimen")
plt.ylabel("Tumor Volume")
plt.title("Tumor Volume by Drug Regimen")
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin
# Rows for the Capomulin regimen, taken from the cleaned data (built here so the
# cell does not depend on a name that no other cell defines).
Capomulin_df = clean_df.loc[clean_df["Drug Regimen"] == "Capomulin", :]

# Records for mouse l509, ordered by time so the line is drawn left to right
forline_df = Capomulin_df.loc[Capomulin_df["Mouse ID"] == "l509", :].sort_values("Timepoint")
x_axis = forline_df["Timepoint"]
tumsiz = forline_df["Tumor Volume (mm3)"]

fig1, ax1 = plt.subplots()
plt.title('Capomulin treatment of mouse l509')
plt.plot(x_axis, tumsiz, linewidth=2, markersize=15, marker="o", color="blue")
plt.xlabel('Timepoint (Days)')
plt.ylabel('Tumor Volume (mm3)')
plt.show()

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
