## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np


# Locations of the two study CSV files (relative paths)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame, in the same order as the paths above
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Preview the first rows of the mouse metadata
mouse_metadata.head()




In [None]:
# Preview the first five rows of the study results
study_results.head(n=5)


In [None]:
# Join the mouse metadata with the study results on the shared "Mouse ID"
# column; the outer join keeps IDs that appear in only one of the two tables.
merged_data_df = mouse_metadata.merge(study_results, how="outer", on="Mouse ID")


In [None]:
# Display the combined data table (mouse metadata joined with study results) for preview
merged_data_df



In [None]:
# Count the distinct mice in the combined data. A Mouse ID can appear on
# more than one row, so unique IDs are counted rather than rows.
len(merged_data_df["Mouse ID"].unique())


In [None]:
# Rows whose (Mouse ID, Timepoint) pair repeats an earlier row; the first
# occurrence of each pair is not flagged.
duplicate_mice = merged_data_df.loc[
    merged_data_df.duplicated(subset=["Mouse ID", "Timepoint"], keep="first")
]


In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# `duplicate_mice` holds only the repeated (Mouse ID, Timepoint) rows, so look
# up every row in the combined data that belongs to the affected Mouse ID(s).
merged_data_df[merged_data_df["Mouse ID"].isin(duplicate_mice["Mouse ID"].unique())]



In [None]:
# Create a clean DataFrame by dropping every mouse that has duplicated records.
# The affected IDs are derived from the data (any repeated Mouse ID /
# Timepoint pair) instead of being hard-coded, and all rows for those mice
# are removed. The frame is modified in place.
duplicate_ids = merged_data_df.loc[
    merged_data_df.duplicated(subset=["Mouse ID", "Timepoint"]), "Mouse ID"
].unique()
merged_data_df.drop(
    index=merged_data_df.index[merged_data_df["Mouse ID"].isin(duplicate_ids)],
    inplace=True,
)

In [None]:

# The number of rows decreased by 13 after removing the duplicate mouse (1893 to 1880 rows)
merged_data_df

In [None]:
# Checking the number of mice in the clean DataFrame.
# Count distinct Mouse IDs: a mouse can appear on several rows, so the
# frame's length is the number of rows, not the number of mice.
merged_data_df["Mouse ID"].nunique()

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
#merged_data_df.describe() stats overall


In [None]:
# Group the data by drug regimen so per-regimen statistics can be computed
merged_data_regimen_df = merged_data_df.groupby(by="Drug Regimen")

# Non-null entry count of every column for each regimen (at most 20 rows shown)
merged_data_regimen_df.count().head(n=20)

In [None]:
# mean, median, variance, standard deviation, and SEM of the tumor volume.
# Select the tumor-volume column *before* aggregating: aggregating the whole
# frame first would also try to reduce the non-numeric columns, which raises
# a TypeError on pandas >= 2.0 and does needless work on older versions.
tumor_vol_by_regimen = merged_data_df.groupby('Drug Regimen')['Tumor Volume (mm3)']

mean_tumor_vol = tumor_vol_by_regimen.mean()
median_tumor_vol = tumor_vol_by_regimen.median()
var_tumor_vol = tumor_vol_by_regimen.var()
std_tumor_vol = tumor_vol_by_regimen.std()
SEM_tumor_vol = tumor_vol_by_regimen.sem()

# Assemble the resulting series into a single summary dataframe
# (one row per drug regimen, one column per statistic).
summary_df = pd.DataFrame(
    {
        "Mean": mean_tumor_vol,
        "Median": median_tumor_vol,
        "Variance": var_tumor_vol,
        "Standard Deviation": std_tumor_vol,
        "SEM": SEM_tumor_vol,
    }
)


In [None]:
# Display the summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
summary_df

In [None]:
# Using the aggregation method, produce the same summary statistics in a single statement.
# String aliases are used so pandas applies its own groupby reductions
# (sample variance/std/SEM, NaN-skipping). Passing NumPy/SciPy callables
# here is deprecated in recent pandas and leaves the ddof/NaN behaviour
# dependent on the pandas version.
merged_data_regimen_df.agg(
    Mean=("Tumor Volume (mm3)", "mean"),
    Median=("Tumor Volume (mm3)", "median"),
    Variance=("Tumor Volume (mm3)", "var"),
    Standard_Deviation=("Tumor Volume (mm3)", "std"),
    SEM=("Tumor Volume (mm3)", "sem"),
)



## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.

# Drug regimen names, taken from the summary table's index
Regimen_list = list(summary_df.index)

Regimen_list

In [None]:
# Number of non-null Timepoint entries for each drug regimen, as a plain list
Timepoints = merged_data_df.groupby("Drug Regimen")["Timepoint"].count().tolist()
Timepoints

In [None]:
# bar_plot = Regimen_list, merged_data_df["Timepoint"].count().plot.bar(width=0.7, zorder=3)

# # Set labels for axes
# bar_plot.set_xlabel("Drug Regimen")
# bar_plot.set_ylabel("Number of Timepoints")
# bar_plot.set_title("Number of Timepoints for Each Treatment Regimen")


#pandas_bp_data = merged_data_df.groupby('Drug Regimen').nunique()
# pandas_bp = pandas_bp_data.plot.bar(title='Mice per Drug Regimen Tested')

# Create the figure first so the bars, title and axis labels all land on the
# same wide canvas; calling plt.figure() after plt.bar() opens a new, empty
# figure and the labels would be attached to that one instead.
plt.figure(figsize=(20, 5))

# One bar per drug regimen, height = number of timepoints recorded for it
plt.bar(Regimen_list, Timepoints, color='b', alpha=0.5, align="center")

plt.title("Number of Timepoints per Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Timepoints")

plt.show()


In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for the for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
