## Observations and Insights 

In [353]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st 
import numpy as np

# Study data files
mouse_metadata_path = "C:\\Users\\me\\Desktop\\BBC\\COPY_but-ind-data-pt-06-2020-u-c\\Unit_05_Matplotlib\\Homework\\Pymaceuticals\\data\\Mouse_metadata.csv"
study_results_path = "C:\\Users\\me\\Desktop\\BBC\\COPY_but-ind-data-pt-06-2020-u-c\\Unit_05_Matplotlib\\Homework\\Pymaceuticals\\data\\Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)


In [354]:
mouse_metadata.head(2)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17


In [355]:
study_results = study_results.rename(columns={'Tumor Volume (mm3)':'Tumor Volume'})

In [356]:
study_results.head(2)

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume,Metastatic Sites
0,b128,0,45.0,0
1,f932,0,45.0,0


In [357]:
# Combine the data into a single dataset
mouse_study = pd.merge(mouse_metadata, study_results, on=["Mouse ID", "Mouse ID"])

# Display the data table for preview
mouse_study.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume,Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [358]:
mouse_study.dtypes

Mouse ID             object
Drug Regimen         object
Sex                  object
Age_months            int64
Weight (g)            int64
Timepoint             int64
Tumor Volume        float64
Metastatic Sites      int64
dtype: object

In [359]:
# Checking the number of mice.
mouse_study["Mouse ID"].count()

1893

In [331]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
dupes = mouse_study[mouse_study.duplicated(['Timepoint', 'Mouse ID'], keep = False)]

In [332]:
# Optional: Get all the data for the duplicate mouse ID. 
print(dupes)

    Mouse ID Drug Regimen     Sex  Age_months  Weight (g)  Timepoint  \
908     g989     Propriva  Female          21          26          0   
909     g989     Propriva  Female          21          26          0   
910     g989     Propriva  Female          21          26          5   
911     g989     Propriva  Female          21          26          5   
912     g989     Propriva  Female          21          26         10   
913     g989     Propriva  Female          21          26         10   
914     g989     Propriva  Female          21          26         15   
915     g989     Propriva  Female          21          26         15   
916     g989     Propriva  Female          21          26         20   
917     g989     Propriva  Female          21          26         20   

     Tumor Volume  Metastatic Sites  
908     45.000000                 0  
909     45.000000                 0  
910     48.786801                 0  
911     47.570392                 0  
912     51.745156

In [333]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
mouse_study = mouse_study.drop([908, 909], axis=0)

In [334]:
mouse_study = mouse_study.drop([910, 911, 912, 913, 914, 915, 917], axis=0)

In [335]:
# Checking the number of mice in the clean DataFrame.
mouse_study["Mouse ID"].count()

1884

## Summary Statistics

In [7]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method is the most straighforward, creating multiple series and putting them all together at the end.

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function

In [336]:
#regimen_mean = mouse_study.groupby(["Drug Regimen"]).mean()["Tumor Volume"]
#regimen_mean

In [337]:
#regimen_median = mouse_study.groupby(["Drug Regimen"]).median()["Tumor Volume"]
#regimen_median

In [338]:
#regimen_variance = mouse_study.groupby(["Drug Regimen"]).var()["Tumor Volume"]
#regimen_variance

In [339]:
#regimen_stdev = mouse_study.groupby(["Drug Regimen"]).std()["Tumor Volume"]
#regimen_stdev

In [340]:
#regimen_SEM = mouse_study.groupby(["Drug Regimen"]).sem()["Tumor Volume"]
#regimen_SEM

In [452]:
regimen_stats = pd.DataFrame({"Mean": mouse_study.groupby(["Drug Regimen"]).mean()["Tumor Volume"],
                             "Median": mouse_study.groupby(["Drug Regimen"]).median()["Tumor Volume"], 
                             "Variance": mouse_study.groupby(["Drug Regimen"]).var()["Tumor Volume"],
                             "Standard Deviation": mouse_study.groupby(["Drug Regimen"]).std()["Tumor Volume"],
                             "SEM": mouse_study.groupby(["Drug Regimen"]).sem()["Tumor Volume"]})   
regimen_stats

Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.322552,50.854632,42.35107,6.50777,0.512884
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


## Bar and Pie Charts

In [342]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas.

In [477]:
per_treatment_mice = mouse_study[["Drug Regimen", "Mouse ID"]]
per_treatment_mice = per_treatment_mice.set_index("Drug Regimen")
per_treatment_mice.head()

Unnamed: 0_level_0,Mouse ID
Drug Regimen,Unnamed: 1_level_1
Ramicane,k403
Ramicane,k403
Ramicane,k403
Ramicane,k403
Ramicane,k403


In [478]:
mice_per_regimen = pd.DataFrame({"Mice Per Regimen": per_treatment_mice.groupby(["Drug Regimen"]).count()["Mouse ID"]})
mice_per_regimen

Unnamed: 0_level_0,Mice Per Regimen
Drug Regimen,Unnamed: 1_level_1
Capomulin,230
Ceftamin,178
Infubinol,178
Ketapril,188
Naftisol,186
Placebo,181
Propriva,161
Ramicane,228
Stelasyn,181
Zoniferol,182


In [510]:
mice_per_regimen.plot(kind="bar", facecolor="b")
plt.title("The Bloodthirst of Kings")
plt.ylabel("Number of Battles Participated In")
plt.xlabel("King")
tick_locations = [value for value in x_axis]
plt.xticks(tick_locations, regimens)
plt.xlim(-0.75, len(x_axis)-0.25)
plt.ylim(0, max(number_of_mice)+20)
plt.title("Total Amount of Mice Per Treatment")
plt.ylabel("Number of Mice")
plt.xlabel("Drug Regimen")
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [10]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.

In [462]:
regimen_mice = mouse_study.groupby(["Drug Regimen"]).count()["Mouse ID"]
regimen_mice

Drug Regimen
Capomulin    230
Ceftamin     178
Infubinol    178
Ketapril     188
Naftisol     186
Placebo      181
Propriva     161
Ramicane     228
Stelasyn     181
Zoniferol    182
Name: Mouse ID, dtype: int64

In [437]:
number_of_mice = regimen_mice.to_list()

In [468]:
plt.bar(x_axis, number_of_mice, color="b", align="center")

<BarContainer object of 10 artists>

In [464]:
number_of_mice = regimen_mice.to_list()
x_axis=np.arange(len(number_of_mice))
plt.bar(x_axis, number_of_mice, color="b", align="center")
tick_locations = [value for value in x_axis]
plt.xticks(tick_locations, regimens)
plt.xlim(-0.75, len(x_axis)-0.25)
plt.ylim(0, max(number_of_mice)+20)
plt.title("Total Amount of Mice Per Treatment")
plt.ylabel("Number of Mice")
plt.xlabel("Drug Regimen")
plt.tight_layout()
plt.show()

#q: does this really need a legend??
#regimen_handle = plt.plot(x_axis, color='blue', label="Drug Regimens")
#plt.legend(loc="best")

In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [517]:
gender_per = mouse_study.groupby(["Sex"]).count()["Mouse ID"]

In [518]:
gender_per

Sex
Female    935
Male      958
Name: Mouse ID, dtype: int64

In [525]:
gender_per_mouse.plot(kind="pie", subplots=True)
labels = ["Female", "Male"]
sizes = gender_per
colors = ["lightcoral", "lightskyblue"]
plt.show()

In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

In [526]:
plt.pie(sizes,  labels=labels, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=140)
plt.show()

## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
