## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two study CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load both files (metadata first, then results)
mouse_metadata, study_results = (
    pd.read_csv(mouse_metadata_path),
    pd.read_csv(study_results_path),
)

# Working frames built from the loaded tables, joined on "Mouse ID".
# how="outer" keeps rows from both tables even when an ID is missing on one side.
mouse_df, study_df = pd.DataFrame(mouse_metadata), pd.DataFrame(study_results)
full_data_df = pd.merge(mouse_df, study_df, on="Mouse ID", how="outer")
# Display the data table for preview
# len(mouse_df) # 249 before dropping duplicate data
# len(full_data_df) # 1893 before dropping duplicate data
# len(study_df)
# study_df.head()
# print(full_data_df.loc[full_data_df["Mouse ID"] == 'g989'])
# full_data_df.head()

In [None]:
# Scratch code to delete for final submission
#
# Toy reproduction of the duplicate-removal logic: mouse "C" has two rows at
# Time 0, so every row belonging to "C" has to be removed.
mice = {
    "Mouse": ['A', 'B', 'C', 'D', 'E'],
    "Sex": ['Male', 'Female', 'Male', 'Female', 'Male'],
}
times = {
    "Mouse": ['A', 'A', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'E', 'E'],
    "Time": [0, 5, 0, 5, 0, 0, 5, 10, 0, 5, 0, 5],
}
micedf = pd.DataFrame(mice)
timesdf = pd.DataFrame(times)
fulldf = micedf.merge(timesdf, on="Mouse", how="outer")

# Not used: fulldf.drop_duplicates(["Mouse", "Time"], keep=False) removes only
# the repeated rows themselves, leaving the mouse's other rows in place.

# True for every row that repeats an earlier (Mouse, Time) pair.
s = fulldf.duplicated(["Mouse", "Time"])

# Mouse of each flagged row, selected with the boolean mask instead of looping
# over Series.iteritems() (that method no longer exists in pandas 2.x).
dupes = fulldf.loc[s, "Mouse"].tolist()

# Drop every row of every flagged mouse, not just the first one found.
# isin() also copes with an empty `dupes` list (nothing is dropped).
indeces = fulldf[fulldf["Mouse"].isin(dupes)].index
fulldf.drop(indeces, inplace=True)
fulldf

In [2]:
# Check the number of mice. Delete: g989
# The below (commented) method worked to remove duplicate timepoints, 
# but there were still 3 rows for mouse g989 because it had 3 unique
# timepoints. The instructions say to remove ALL DATA associated with 
# these mice so I am not using the below approach (drop_duplicates() function)
#f = full_data_df.drop_duplicates(["Mouse ID", "Timepoint"], keep=False)
#print(full_data_df.loc[full_data_df["Mouse ID"] == 'g989'])


In [3]:
# Get the duplicate mice by ID number that shows up for Mouse ID 
# and Timepoint. 

# Optional: Get all the data for the duplicate mouse ID.

# We will get the Mouse IDs that have duplicate timepoints and then 
# delete ALL DATA associated with that Mouse ID.

# Boolean Series: True for each row that repeats an earlier
# (Mouse ID, Timepoint) pair.
timepoint_dup_series = full_data_df.duplicated(["Mouse ID", "Timepoint"])

# Mouse ID of every flagged row (one entry per duplicated row, so an ID may
# appear more than once). Selecting with the boolean mask replaces the loop
# over Series.iteritems(), which was removed in pandas 2.0.
timepoint_dups = full_data_df.loc[timepoint_dup_series, "Mouse ID"].tolist()



In [4]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.

# Collapse repeated IDs while keeping first-seen order, so the result is
# deterministic (a plain set() has arbitrary ordering).
timepoint_dups = list(dict.fromkeys(timepoint_dups))

# Indices of ALL rows belonging to ANY mouse with duplicated timepoints.
# isin() handles several duplicated mice, and an empty list simply selects
# nothing instead of raising IndexError on timepoint_dups[0].
indeces_to_drop = full_data_df[full_data_df['Mouse ID'].isin(timepoint_dups)].index

# Drop these rows from the full_data_df
full_data_df.drop(indeces_to_drop, inplace=True)


In [5]:
# Count the distinct Mouse IDs remaining in the clean DataFrame.
full_data_df["Mouse ID"].nunique()

248

## Summary Statistics

In [None]:
# Method 1:  Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method is the most straightforward, creating multiple series and putting them all together at the end.



In [None]:
# Method 2(optional): Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function. (https://pandas.pydata.org/pandas-docs/version/0.22.0/generated/pandas.core.groupby.DataFrameGroupBy.agg.html)


## Bar and Pie Charts

In [None]:
# Use Pandas to generate a bar plot showing the total number of mice in each treatment regimen throughout the course of the study. 

## Note: this plot will be identical to the one that uses Pyplot

In [None]:
# Use Pyplot to generate a bar plot showing the total number of mice in each treatment regimen throughout the course of the study.

##  Note: this plot will be identical to the one that uses Pandas

In [None]:
# Use Pandas to generate a pie plot showing the distribution of female versus male mice

## Note: this plot will be identical to the one that uses Pyplot

In [None]:
# Use Pyplot to generate a pie plot showing the distribution of female versus male mice

##  Note: this plot will be identical to the one that uses Pandas

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the drug regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (latest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put the four drug regimens into a list that can be iterated over in a for-loop 
# (and later used for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. timepoint for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen. 
# Note: this means mouse weight goes on the x-axis, with average tumor volume on the y-axis. 


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen.
