## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Locations of the two study CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the mouse metadata and the study results, in that order
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Outer-join both tables on "Mouse ID" into a single dataset
combined_data_df = mouse_metadata.merge(study_results, how="outer", on="Mouse ID")

# Preview the first rows of the combined table
combined_data_df.head()

In [None]:
# Number of distinct mice in the combined data, i.e. the count of unique
# 'Mouse ID' values (each mouse has one row per recorded timepoint).
combined_data_df['Mouse ID'].nunique()

In [None]:
# Count the non-null entries in every column of the combined DataFrame
combined_data_df.count()

In [None]:
# Flag every row whose (Mouse ID, Timepoint) pair occurs more than once.
# keep=False marks all occurrences, not just the repeats after the first.
is_repeated_measurement = combined_data_df.duplicated(
    subset=['Mouse ID', 'Timepoint'],
    keep=False,
)

# Keep all columns of the flagged rows
duplicates_df = combined_data_df.loc[is_repeated_measurement]

# Show dataframe
duplicates_df

In [None]:
# Create a clean DataFrame by dropping every mouse that has duplicated data.
# The affected Mouse IDs are derived from the data (any ID with more than one
# row for the same Timepoint) instead of being hard-coded, so this cell stays
# correct if the input files change.
duplicate_mouse_ids = combined_data_df.loc[
    combined_data_df.duplicated(subset=['Mouse ID', 'Timepoint']), 'Mouse ID'
].unique()

# Drop ALL rows of the affected mice, not only the repeated rows, because it is
# unknown which of the conflicting measurements is the valid one.
clean_combined_data_df = combined_data_df[
    ~combined_data_df['Mouse ID'].isin(duplicate_mouse_ids)
]

# Show dataframe
clean_combined_data_df

In [None]:
# Number of distinct mice left in the clean DataFrame (unique 'Mouse ID' values)
clean_combined_data_df['Mouse ID'].nunique()

## Summary Statistics

In [None]:
# Summary statistics (mean, median, variance, standard deviation, SEM) of the
# tumor volume for each drug regimen -- approach 1: one Series per statistic.

# Select the tumor-volume column per regimen once and reuse it for each statistic
tumor_by_regimen = clean_combined_data_df.groupby('Drug Regimen')["Tumor Volume (mm3)"]

mean = tumor_by_regimen.mean()
median = tumor_by_regimen.median()
variance = tumor_by_regimen.var()
std = tumor_by_regimen.std()
sem = tumor_by_regimen.sem()

# Assemble the Series into one table, using the final column headings directly
statistics_df = pd.DataFrame({
    "Mean Tumor Volume": mean,
    "Median Tumor Volume": median,
    "Tumor Volume Variance": variance,
    "Tumor Volume Std. Dev": std,
    "Tumor Volume Std. Err.": sem,
})

# Show dataframe
statistics_df

# This method is the most straightforward: it creates one Series per statistic
# and puts them all together at the end.

In [None]:
# Summary statistics of the tumor volume for each drug regimen -- approach 2:
# a single groupby/agg call.

# Map the column to aggregate onto the list of statistics to compute for it
tumor_volume_aggregations = {
    "Tumor Volume (mm3)": ["mean", "median", "var", "std", "sem"],
}
regimen_groups = clean_combined_data_df.groupby("Drug Regimen")
summary_statistics_df = regimen_groups.agg(tumor_volume_aggregations)

# Show dataframe
summary_statistics_df

# This method produces everything in a single groupby/agg call.

In [None]:
# Summary statistics of the tumor volume for each drug regimen -- approach 3:
# one groupby object reused for every statistic.
regimen_tumor_volume = clean_combined_data_df.groupby('Drug Regimen')["Tumor Volume (mm3)"]

# Calculate statistics
tumor_mean = regimen_tumor_volume.mean()
tumor_median = regimen_tumor_volume.median()
tumor_var = regimen_tumor_volume.var()
tumor_std = regimen_tumor_volume.std()
tumor_sem = regimen_tumor_volume.sem()

# Short working names -> descriptive column headings for the final table
column_headings = {
    "Mean": "Mean Tumor Volume",
    "Median": "Median Tumor Volume",
    "Variance": "Tumor Volume Variance",
    "Standard Deviation": "Tumor Volume Std. Dev",
    "SEM": "Tumor Volume Std. Err.",
}

# Collect the statistics into one DataFrame and apply the final headings
summary_statistics2_df = pd.DataFrame({
    'Mean': tumor_mean,
    'Median': tumor_median,
    'Variance': tumor_var,
    'Standard Deviation': tumor_std,
    'SEM': tumor_sem,
}).rename(columns=column_headings)

# Show dataframe
summary_statistics2_df

# This is another method: it builds every statistic from a single groupby object.

## Bar and Pie Charts

In [None]:
# Number of unique mice treated with each drug regimen (cleaned data)
mice_drug_group = clean_combined_data_df.groupby('Drug Regimen')
mouse_ids_by_regimen = mice_drug_group["Mouse ID"]
number_mice_drug = mouse_ids_by_regimen.nunique()

# Show series
number_mice_drug

In [None]:
# Turn the per-regimen mouse counts into a one-column DataFrame
number_mice_drug_df = number_mice_drug.to_frame(name="Number of Mice")

# Show dataframe
number_mice_drug_df

In [None]:
# Bar chart of the per-regimen mouse counts, drawn straight from the Series
mice_drug_bar = number_mice_drug.plot.bar()

# Main title and axis titles, set on the axes returned by the plot call
mice_drug_bar.set_title("Number of Mice for Each Treatment")
mice_drug_bar.set_xlabel("Drug Regimen")
mice_drug_bar.set_ylabel("Number of Mice")

# Adjust the layout so the tick labels fit inside the figure
plt.tight_layout()

# Show bar plot
plt.show()

In [None]:
# x values (regimen names from the index) and y values (mouse counts as a
# plain list) for the alternate, pyplot-based bar plot
mice_counts = number_mice_drug_df["Number of Mice"]
x_axis, y_axis = mice_counts.index, mice_counts.tolist()

In [None]:
# Bar plot of the number of mice for each treatment, using the pandas plotting API
mice_bar_axes = number_mice_drug_df.plot.bar(legend=False)

# Main title and axis titles in a single call on the returned axes
mice_bar_axes.set(
    title="Number of Mice for Each Treatment",
    xlabel="Drug Regimen",
    ylabel="Number of Mice",
)

# Adjust the layout so the tick labels fit inside the figure
plt.tight_layout()

# Show bar plot
plt.show()

In [None]:
# Bar plot of the number of mice for each treatment, using the pyplot API
plt.bar(x_axis, y_axis)

# Main title and axis titles, applied to the current axes
bar_axes = plt.gca()
bar_axes.set_title("Number of Mice for Each Treatment")
bar_axes.set_xlabel("Drug Regimen")
bar_axes.set_ylabel("Number of Mice")

# Rotate the regimen names so they do not overlap, then tidy the layout
plt.xticks(rotation=90)
plt.tight_layout()

# Show bar plot
plt.show()

In [None]:
# Number of unique female and male mice in the cleaned data
sex_group = clean_combined_data_df.groupby('Sex')
mouse_ids_by_sex = sex_group["Mouse ID"]
sex_count = mouse_ids_by_sex.nunique()

# Show series
sex_count

In [None]:
# Turn the per-sex mouse counts into a one-column DataFrame
sex_count_df = sex_count.to_frame(name="Number of Mice")

# Show dataframe
sex_count_df

In [None]:
# Inputs for the pie plots: wedge labels (from the index), wedge sizes
# (mouse counts as a plain list) and one colour per wedge
mice_per_sex = sex_count_df["Number of Mice"]
labels = mice_per_sex.index
values = mice_per_sex.tolist()
colors = ["pink", "lightblue"]

In [None]:
# Pie plot of the female/male distribution, using the pandas plotting API.
# The empty ylabel suppresses the default axis label.
pie_plot = sex_count.plot(
    kind="pie",
    autopct="%1.1f%%",
    ylabel="",
    colors=colors,
    shadow=True,
    startangle=90,
    title="Distribution by Sex",
)

# Show pie plot
plt.show()

In [None]:
# Pie plot of the female/male distribution, using the pyplot API
pie_format = {
    "autopct": "%1.1f%%",   # percentage shown on each wedge, one decimal place
    "colors": colors,
    "shadow": True,
    "startangle": 90,
}
plt.pie(values, labels=labels, **pie_format)

# Set pie plot title
plt.title("Distribution by Sex")

# Show pie plot
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Last (greatest) timepoint recorded for each mouse, as a one-column DataFrame
last_timepoint = clean_combined_data_df.groupby('Mouse ID')['Timepoint'].max()
max_timepoint_df = last_timepoint.sort_values().to_frame()

# Attach each mouse's last timepoint to every one of its rows. Both inputs have
# a 'Timepoint' column, so the merge suffixes them with _x/_y; rename those to
# meaningful headings.
merged_df = pd.merge(clean_combined_data_df, max_timepoint_df, on='Mouse ID').rename(
    columns={'Timepoint_x': 'Timepoint', 'Timepoint_y': 'Max Timepoint'}
)

merged_df.head()

In [None]:
# Put treatments into a list for the for loop (and later for plot labels)
regimens = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Final tumor volumes, one Series per regimen in the same order as `regimens`
# (used later for plotting)
tumor_vol = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for regimen in regimens:

    # Rows of the mice treated with this regimen
    regimen_df = merged_df.loc[merged_df['Drug Regimen'] == regimen]

    # Keep only each mouse's last measurement (row whose timepoint is its maximum)
    final_volume_df = regimen_df.loc[regimen_df['Timepoint'] == regimen_df['Max Timepoint']]

    # Final tumor volumes for this regimen; collect them for the box plot
    final_vol = final_volume_df['Tumor Volume (mm3)']
    tumor_vol.append(final_vol)

    # Quartiles and interquartile range of the final volumes
    quartiles = final_vol.quantile([.25, .5, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq
    print(f'The IQR for {regimen}: {iqr}')

    # Outlier fences: 1.5 * IQR below the lower and above the upper quartile
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)
    print(f'Lower Bound for {regimen}: {lower_bound}')
    print(f'Upper Bound for {regimen}: {upper_bound}')

    # Count the values on or beyond the fences. The mask is built from
    # final_vol itself so it shares final_vol's index; a mask built from the
    # full merged_df only works through implicit index alignment.
    is_outlier = (final_vol >= upper_bound) | (final_vol <= lower_bound)
    outliers = final_vol.loc[is_outlier].count()
    print(f'Number of {regimen} outliers: {outliers}')
    print("---------------------------------------------")

In [None]:
# Identify Infubinol outlier

# Locate the rows which contain mice on Infubinol
regimen_df = merged_df.loc[merged_df['Drug Regimen'] == 'Infubinol']

# Keep only each mouse's last measurement (row whose timepoint is its maximum)
final_volume_df = regimen_df.loc[regimen_df['Timepoint'] == regimen_df['Max Timepoint']]

# Final tumor volumes for Infubinol.
# Do NOT append this to tumor_vol: the regimen loop already stored one series
# per regimen there, and an extra Infubinol series would show up as a
# duplicate box in the box plot.
final_vol = final_volume_df['Tumor Volume (mm3)']

# Calculate the IQR for Infubinol
quartiles = final_vol.quantile([.25, .5, .75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq - lowerq

# Outlier fences: 1.5 * IQR below the lower and above the upper quartile
lower_bound = lowerq - (1.5 * iqr)
upper_bound = upperq + (1.5 * iqr)

# Select the outlying values. The mask is built from final_vol itself so it
# shares final_vol's index instead of relying on alignment with merged_df.
outlier = final_vol.loc[(final_vol >= upper_bound) | (final_vol <= lower_bound)]

# Row labels of the outlying measurements
index = list(outlier.index)
index



In [None]:
# Look up the Mouse ID of the (first) Infubinol outlier via the row labels
# computed above instead of a hard-coded label, which depends on the row order
# produced by the merges. None when no outlier was found.
mouse_id = final_volume_df.at[index[0], 'Mouse ID'] if index else None
mouse_id

In [None]:
# Report the Mouse ID found for the Infubinol outlier
print(f'The Mouse ID of the Infubinol outlier is: {mouse_id}')

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
fig1, ax1 = plt.subplots()

# tumor_vol is filled in the same order as `regimens`. Plot exactly one series
# per regimen so the boxes and their tick labels stay aligned even if extra
# series were appended to tumor_vol afterwards.
ax1.boxplot(tumor_vol[:len(regimens)])

# Label each box with its regimen and describe the chart
ax1.set_xticklabels(regimens)
ax1.set_title('Final Tumor Volume by Drug Regimen')
ax1.set_xlabel('Drug Regimen')
ax1.set_ylabel('Final Tumor Volume (mm3)')

plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
