# Place Analysis Here

In [None]:
#Analysis, link file?

# Collecting and Processing Data

In [32]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)


In [6]:
# Combine the data into a single dataset
combined_study_data = pd.merge(mouse_metadata, study_results, how ="outer", on ="Mouse ID")


# Display the data table for preview
combined_study_data.head(15)


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
5,k403,Ramicane,Male,21,16,25,33.464577,1
6,k403,Ramicane,Male,21,16,30,31.099498,1
7,k403,Ramicane,Male,21,16,35,26.546993,1
8,k403,Ramicane,Male,21,16,40,24.365505,1
9,k403,Ramicane,Male,21,16,45,22.050126,1


In [3]:
# Checking the number of mice.
number_of_mice = combined_study_data["Mouse ID"].nunique()
print("There are", number_of_mice, "mice.")

There are 249 mice.


In [10]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicate_mouse_id = combined_study_data.loc[combined_study_data.duplicated(["Mouse ID", "Timepoint"]),"Mouse ID"].unique()
print(duplicate_mouse_id)

['g989']


In [11]:
# Optional: Get all the data for the duplicate mouse ID. 
duplicate_mouse = combined_study_data[combined_study_data.duplicated()]
print(duplicate_mouse)

    Mouse ID Drug Regimen     Sex  Age_months  Weight (g)  Timepoint  \
909     g989     Propriva  Female          21          26          0   

     Tumor Volume (mm3)  Metastatic Sites  
909                45.0                 0  


array(['g989'], dtype=object)

In [18]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.

clean_df = combined_study_data[combined_study_data["Mouse ID"].isin(duplicate_mouse_id) == False]



#Verify duplicate drop by seeing difference in length of column


print(clean_df["Mouse ID"].nunique())

# #Renaming cleaned DataFrame

combined_study_data_cln = clean_df


248


# Summary Statistics

In [31]:
# # Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen


summary_mean = combined_study_data_cln.groupby("Drug Regimen").mean()["Tumor Volume (mm3)"]
print(summary_mean)

summary_median = combined_study_data_cln.groupby("Drug Regimen").median()["Tumor Volume (mm3)"]
print(summary_median)

summary_variance = combined_study_data_cln.groupby("Drug Regimen").var()["Tumor Volume (mm3)"]
print(summary_variance)

summary_stdev = combined_study_data_cln.groupby("Drug Regimen").std()["Tumor Volume (mm3)"]
print(summary_stdev)

summary_SEM = combined_study_data_cln.groupby("Drug Regimen").sem()["Tumor Volume (mm3)"]
print(summary_SEM)


summary_statistics_df = pd.DataFrame ({"Mean Tumor Volume" : summary_mean, "Median Tumor Volume" : summary_median, "Median Tumor Variance" :  summary_variance, "ST. DEV of Tumor Volume" :  summary_stdev, "SEM of Tumor Volume" :  summary_SEM})
summary_statistics_df.head()

Drug Regimen
Capomulin    40.675741
Ceftamin     52.591172
Infubinol    52.884795
Ketapril     55.235638
Naftisol     54.331565
Placebo      54.033581
Propriva     52.320930
Ramicane     40.216745
Stelasyn     54.233149
Zoniferol    53.236507
Name: Tumor Volume (mm3), dtype: float64
Drug Regimen
Capomulin    41.557809
Ceftamin     51.776157
Infubinol    51.820584
Ketapril     53.698743
Naftisol     52.509285
Placebo      52.288934
Propriva     50.446266
Ramicane     40.673236
Stelasyn     52.431737
Zoniferol    51.818479
Name: Tumor Volume (mm3), dtype: float64
Drug Regimen
Capomulin    24.947764
Ceftamin     39.290177
Infubinol    43.128684
Ketapril     68.553577
Naftisol     66.173479
Placebo      61.168083
Propriva     43.852013
Ramicane     23.486704
Stelasyn     59.450562
Zoniferol    48.533355
Name: Tumor Volume (mm3), dtype: float64
Drug Regimen
Capomulin    4.994774
Ceftamin     6.268188
Infubinol    6.567243
Ketapril     8.279709
Naftisol     8.134708
Placebo      7.821003
Pro

Unnamed: 0_level_0,Mean Tumor Volume,Median Tumor Volume,Median Tumor Variance,ST. DEV of Tumor Volume,SEM of Tumor Volume
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466


In [None]:
 # Using the aggregation method, produce the same summary statistics in a single line.
summary_stats_aggregated_df = combined_study_data_cln


# Pie and Bar Charts

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.

#x_axis = np.arange...
#plt.bar(x_axis, ..., color = , alpha =, align = "center)

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas

#plt.pie

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Quartiles, Outliers, and Box Plots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original DataFrame to get the tumor volume at the last timepoint

In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)



    

In [None]:
# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds


# Reference

In [None]:
# # Determine if there are any potential outliers in the average occupancy in California
# quartiles = california_data['AveOccup'].quantile([.25,.5,.75])
# lowerq = quartiles[0.25]
# upperq = quartiles[0.75]
# iqr = upperq-lowerq

# print(f"The lower quartile of occupancy is: {lowerq}")
# print(f"The upper quartile of occupancy is: {upperq}")
# print(f"The interquartile range of occupancy is: {iqr}")
# print(f"The the median of occupancy is: {quartiles[0.5]} ")

# lower_bound = lowerq - (1.5*iqr)
# upper_bound = upperq + (1.5*iqr)
# print(f"Values below {lower_bound} could be outliers.")
# print(f"Values above {upper_bound} could be outliers.")

# outlier_occupancy = california_data.loc[(california_data['AveOccup'] < lower_bound) | (california_data['AveOccup'] > upper_bound)]
# outlier_occupancy

In [None]:
# Generate a box plot that shows the distrubution of the tumor volume for each treatment group.

# Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


# Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen