## Observations and Insights 

In [10]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files

mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"


# Read the mouse data and the study results
 
Mousedataframe=pd.read_csv(mouse_metadata_path)
  
Studyresultsdataframe= pd.read_csv(study_results_path)
              

In [11]:
Mousedataframe.head() 

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16


In [12]:
Studyresultsdataframe.head() 

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.0,0
1,f932,0,45.0,0
2,g107,0,45.0,0
3,a457,0,45.0,0
4,c819,0,45.0,0


In [101]:
# Combine the data into a single dataset
# Display the data table for preview
Combine_df = pd.merge(Mousedataframe, Studyresultsdataframe, on="Mouse ID")
Combine_df.head(15)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
5,k403,Ramicane,Male,21,16,25,33.464577,1
6,k403,Ramicane,Male,21,16,30,31.099498,1
7,k403,Ramicane,Male,21,16,35,26.546993,1
8,k403,Ramicane,Male,21,16,40,24.365505,1
9,k403,Ramicane,Male,21,16,45,22.050126,1


In [82]:
# Checking the number of mice.
len(Combine_df)




1883

In [45]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
Combine_df[Combine_df.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False)].head(40)





Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites


In [4]:
# Optional: Get all the data for the duplicate mouse ID. 



In [42]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
Combine_df.drop_duplicates(subset=['Mouse ID','Timepoint'], keep=False, inplace=True)



In [93]:
# Checking the number of mice in the clean DataFrame.
#len(Combine_df)
print(Combine_df["Mouse ID"].value_counts())

z578    10
s337    10
y769    10
h246    10
l733    10
        ..
b447     1
x226     1
x336     1
t573     1
h428     1
Name: Mouse ID, Length: 249, dtype: int64


## Summary Statistics

In [96]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and 
#SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.

drugr_group =Combine_df.groupby(["Drug Regimen"])

drugraverage = drugr_group.mean()
print (drugraverage)
drugrmed = drugr_group.median()
print (drugrmed)
drugrstandarddeviation = drugr_group.std()
print(drugrstandarddeviation)
drugrvarience=drugr_group.var()
print(drugrvarience)
drugrSEM=drugr_group.sem()
print(drugrSEM)

              Age_months  Weight (g)  Timepoint  Tumor Volume (mm3)  \
Drug Regimen                                                          
Capomulin      13.456522   19.965217  21.565217           40.675741   
Ceftamin       13.247191   27.398876  19.747191           52.591172   
Infubinol      16.230337   27.196629  18.174157           52.884795   
Ketapril       15.659574   27.861702  19.707447           55.235638   
Naftisol       12.000000   27.166667  19.623656           54.331565   
Placebo        10.734807   27.928177  18.674033           54.033581   
Propriva       10.225166   27.112583  17.317881           52.458254   
Ramicane       10.684211   19.679825  21.425439           40.216745   
Stelasyn       12.784530   27.856354  19.226519           54.233149   
Zoniferol      12.598901   27.692308  19.368132           53.236507   

              Metastatic Sites  
Drug Regimen                    
Capomulin             0.713043  
Ceftamin              1.179775  
Infubinol      

In [79]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and 
#SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line
drugregimen_group = Combine_df.groupby(["Drug Regimen"])
drugregimen_group.head(20)
statistics=drugregimen_group.agg(['mean','median','var','std','sem'])["Tumor Volume (mm3)"]
statistics

Unnamed: 0_level_0,mean,median,var,std,sem
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.458254,50.854632,44.053659,6.637293,0.540135
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


## Bar and Pie Charts

In [104]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pandas.
mice_unique=Combine_df("Mouse ID").unique()

mice_drugregimen=pd.DataFrame(mice_unique["Mouse ID"].value_counts())

mice_drugregiment.columns=["Drug Regiment", "Mouse ID"]
mice_drugregiment.head()


TypeError: 'DataFrame' object is not callable

In [10]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
