## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two study CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Join the two tables on their shared "Mouse ID" column; the outer join keeps
# IDs that appear in only one of the two files
combined_data_df = mouse_metadata.merge(study_results, how='outer', on="Mouse ID")

# Preview the first rows of the merged table
combined_data_df.head()

In [None]:
# Checking the number of mice: count the distinct "Mouse ID" values in the
# merged data (nunique ignores missing values by default).
combined_data_df['Mouse ID'].nunique()

In [None]:
# Number of non-missing entries in each column of the merged data
combined_data_df.count()

In [None]:
# Flag every row whose (Mouse ID, Timepoint) pair occurs more than once;
# keep=False marks all occurrences rather than only the repeats
duplicate_mask = combined_data_df.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False)

# Pull out the full records for the flagged rows
duplicates_df = combined_data_df.loc[duplicate_mask]

duplicates_df

In [None]:
# Create a clean DataFrame by dropping every mouse that has duplicated
# (Mouse ID, Timepoint) records. The IDs are derived from the data rather than
# hard-coded, so the cleaning stays correct if the input files change.
duplicate_mouse_ids = combined_data_df.loc[
    combined_data_df.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False),
    'Mouse ID',
].unique()

# Remove all rows (not just the repeated ones) belonging to those mice
clean_combined_data_df = combined_data_df[~combined_data_df['Mouse ID'].isin(duplicate_mouse_ids)]

clean_combined_data_df

In [None]:
# Checking the number of mice in the clean DataFrame: count the distinct
# "Mouse ID" values that remain after the duplicate mouse was removed.
clean_combined_data_df['Mouse ID'].nunique()

## Summary Statistics

In [None]:
# Summary statistics of the tumor volume for each drug regimen: mean, median,
# variance, standard deviation and standard error of the mean (SEM)
tumor_volume_by_regimen = clean_combined_data_df.groupby('Drug Regimen')["Tumor Volume (mm3)"]

# One series per statistic, each indexed by drug regimen
mean = tumor_volume_by_regimen.mean()
median = tumor_volume_by_regimen.median()
variance = tumor_volume_by_regimen.var()
std = tumor_volume_by_regimen.std()
sem = tumor_volume_by_regimen.sem()

# Assemble the five series as the columns of a single table
statistics_df = pd.DataFrame({
    'Mean': mean,
    'Median': median,
    'Variance': variance,
    'Standard Deviation': std,
    'SEM': sem,
})

statistics_df

# This method is the most straightforward: it creates one series per statistic and combines them all at the end.

In [None]:
# Generate the same summary statistics table (mean, median, variance, standard
# deviation, and SEM of the tumor volume for each regimen) with one aggregation
# call instead of five separate ones.
summary_statistics_df = (
    clean_combined_data_df
    .groupby('Drug Regimen')["Tumor Volume (mm3)"]
    .agg(['mean', 'median', 'var', 'std', 'sem'])
    # Map the aggregation names onto the display column names
    .rename(columns={
        'mean': 'Mean',
        'median': 'Median',
        'var': 'Variance',
        'std': 'Standard Deviation',
        'sem': 'SEM',
    })
)

summary_statistics_df

# This method produces everything in a single groupby aggregation

## Bar and Pie Charts

In [None]:
# Group the cleaned data by Drug Regimen and count the distinct mice
# ("Mouse ID" values) that appear under each regimen
mice_drug_group = clean_combined_data_df.groupby('Drug Regimen')
number_mice_drug = mice_drug_group["Mouse ID"].nunique()

# Display the per-regimen counts
number_mice_drug

In [None]:
# Turn the per-regimen counts into a one-column DataFrame
number_mice_drug_df = number_mice_drug.to_frame(name="Number of Mice")

number_mice_drug_df

In [None]:
# Bar chart drawn directly from the per-regimen series
mice_drug_bar = number_mice_drug.plot.bar()

# Label the chart through the Axes object returned by pandas
mice_drug_bar.set_title("Number of Mice for Each Treatment")
mice_drug_bar.set_xlabel("Drug Regimen")
mice_drug_bar.set_ylabel("Number of Mice")

plt.tight_layout()

plt.show()

In [None]:
# Specify x and y axis values for alternate bar plot creation:
# x is the DataFrame index and y is the "Number of Mice" column as a plain list
x_axis = number_mice_drug_df.index
y_axis = number_mice_drug_df["Number of Mice"].tolist()

In [None]:
# Bar plot (pandas) of the total number of mice for each treatment throughout
# the course of the study; the legend is hidden because there is one column
regimen_bar_ax = number_mice_drug_df.plot.bar(legend=False)

# Label the chart through the Axes object returned by pandas
regimen_bar_ax.set_title("Number of Mice for Each Treatment")
regimen_bar_ax.set_xlabel("Drug Regimen")
regimen_bar_ax.set_ylabel("Number of Mice")

plt.tight_layout()

plt.show()

In [None]:
# Bar plot (pyplot) of the total number of mice for each treatment throughout
# the course of the study
plt.bar(x=x_axis, height=y_axis)

# Rotate the tick labels so they are written vertically
plt.xticks(rotation=90)

plt.xlabel("Drug Regimen")
plt.ylabel("Number of Mice")
plt.title("Number of Mice for Each Treatment")

plt.tight_layout()

plt.show()

In [None]:
# Group the cleaned data by Sex and count the distinct mice ("Mouse ID"
# values) in each group to find the distribution of female versus male mice
sex_group = clean_combined_data_df.groupby('Sex')
sex_count = sex_group["Mouse ID"].nunique()

# Display the per-sex counts
sex_count

In [None]:
# Turn the per-sex counts into a one-column DataFrame
sex_count_df = sex_count.to_frame(name="Number of Mice")

sex_count_df

In [None]:
# Specify labels, values and colours for pie plot creation:
# labels come from the DataFrame index and values from its "Number of Mice"
# column; colours are applied to the wedges in the same order as the labels
labels = sex_count_df.index
values = sex_count_df["Number of Mice"].tolist()
colors = ["pink", "lightblue"]

In [None]:
# Pie plot (pandas) of the distribution of female versus male mice; the empty
# ylabel suppresses the default axis label
pie_plot = sex_count.plot(
    kind='pie',
    autopct="%1.1f%%",
    ylabel='',
    colors=colors,
    shadow=True,
    startangle=90,
    title='Distribution by Sex',
)

plt.show()

In [None]:
# Pie plot (pyplot) of the distribution of female versus male mice
plt.pie(
    x=values,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)

plt.title("Distribution by Sex")

plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
