## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two study CSV files (relative paths)
mouse_metadata_path = "data/mouse_metadata.csv"
study_results_path = "data/study_results.csv"

# Load each CSV into its own DataFrame: metadata first, then results
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)




In [None]:
# Inner-join the mouse metadata with the study results on the shared 'Mouse ID' column
combined_data = mouse_metadata.merge(study_results, on='Mouse ID', how='inner')

# Preview the first rows of the merged table
combined_data.head()

In [None]:
# Count the distinct mouse IDs present in the merged data
mouse_ids = combined_data['Mouse ID']
mouse_ids.nunique()

In [None]:
# Find every row whose (Mouse ID, Timepoint) pair occurs more than once.
# keep=False marks all members of a duplicated pair, not just the repeats.
dup_mask = combined_data.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False)
duplicate_mouseid_timepoint = combined_data.loc[dup_mask]
duplicate_mouseid_timepoint

In [None]:
# Build the clean DataFrame by removing every row recorded for mouse 'g989'
is_g989 = combined_data['Mouse ID'] == 'g989'
dropped = combined_data.loc[is_g989].index
clean_df = combined_data.drop(index=dropped)

In [None]:
# Count the distinct mouse IDs remaining in the clean DataFrame
clean_mouse_ids = clean_df['Mouse ID']
clean_mouse_ids.nunique()

## Summary Statistics

In [None]:
# Summary statistics table (mean, median, variance, standard deviation, SEM)
# of the tumor volume for each regimen, assembled from describe() plus var()/sem().
tumor_by_regimen = clean_df.groupby('Drug Regimen')['Tumor Volume (mm3)']

# describe() supplies mean, std and the 50% quantile (the median)
regimen_stats = tumor_by_regimen.describe()
regiment_var = tumor_by_regimen.var()
regimen_sem = tumor_by_regimen.sem()
regimen_stats['variance'] = regiment_var
regimen_stats['SEM'] = regimen_sem

# Drop the describe() columns that are not part of the requested summary
unwanted_columns = ['count', 'min', '25%', '75%', 'max']
reg_stats = regimen_stats.drop(columns=unwanted_columns)

# Give the remaining columns readable names and a fixed column order
readable_names = {'50%': 'median', 'std': 'standard deviation'}
column_order = ['mean', 'median', 'variance', 'standard deviation', 'SEM']
reg_final = reg_stats.rename(columns=readable_names)
reg_final = reg_final[column_order]
reg_final

In [None]:
# Same per-regimen tumor-volume summary (mean, median, variance, standard
# deviation, SEM), produced with a single groupby aggregation.
summary_functions = ['mean', 'median', 'var', 'std', 'sem']
reg_agg = clean_df.groupby('Drug Regimen').agg(
    {'Tumor Volume (mm3)': summary_functions}
)
reg_agg

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.
# Count the rows (measurements) recorded for each regimen.
total_regimen = clean_df['Drug Regimen'].value_counts()

# Name the count column 'Total' explicitly. The name value_counts() gives its
# result differs between pandas versions ('Drug Regimen' before 2.0, 'count'
# from 2.0 on), so renaming a column called 'Drug Regimen' is not reliable.
regimen_count_df = total_regimen.to_frame(name='Total')
regimen_count_df.index.name = 'Drug Regimen'

In [None]:
# Draw the per-regimen measurement counts as a bar chart and label the y-axis
reg_bar = regimen_count_df.plot(kind='bar', stacked=True)
reg_bar.set_ylabel('Total')

In [None]:
# Bar plot of the total number of measurements per drug regimen, drawn with pyplot.
# x positions are the regimen names; bar heights are the matching totals.
drug_regimen = regimen_count_df.index
drug_total = regimen_count_df['Total'].tolist()
plt.bar(x=drug_regimen, height=drug_total, align='center', alpha=0.5, color='b')

# Rotate the regimen names so they do not overlap, then label both axes
plt.xticks(rotation=90)
plt.xlabel('Drug Regimen')
plt.ylabel('Total')

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
# Count the rows recorded for each sex.
gender_distribution = clean_df['Sex'].value_counts()

# Name the count column 'Total' explicitly. The name value_counts() gives its
# result differs between pandas versions ('Sex' before 2.0, 'count' from 2.0
# on), so renaming a column called 'Sex' is not reliable.
gend_dis = gender_distribution.to_frame(name='Total')
gend_dis.index.name = 'Gender'

# `stacked` is a bar/area option with no meaning for a pie chart and is not a
# matplotlib pie() parameter, so it is not passed here.
gend_pie = gend_dis.plot.pie(y='Total', autopct='%1.1f%%', colors=['blue', 'orange'])
gend_dis

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
# Take labels and counts from the same value_counts() result so each slice is
# paired with its own label. value_counts() orders by frequency while unique()
# orders by first appearance, so mixing the two could swap the labels.
gender_totals = clean_df['Sex'].value_counts()
gender_label = gender_totals.index.tolist()
gender_count = gender_totals.values.tolist()
plt.pie(gender_count, labels=gender_label, autopct='%1.1f%%', colors=['blue', 'orange'])

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# For every row, look up the greatest timepoint recorded for that row's mouse,
# then keep only the rows taken at that timepoint. The string 'max' is used
# because passing the builtin max to transform() is deprecated in recent pandas.
last_timepoint = clean_df.groupby('Mouse ID')['Timepoint'].transform('max')
timepoint_max = clean_df[clean_df['Timepoint'] == last_timepoint]

# Select the four regimens of interest by name rather than excluding all the
# others, so an unexpected regimen in the data cannot slip through.
regimens_of_interest = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']
clean_final = timepoint_max[timepoint_max['Drug Regimen'].isin(regimens_of_interest)]
clean_final[['Mouse ID', 'Drug Regimen', 'Timepoint', 'Tumor Volume (mm3)']]

In [None]:
# Calculate the quartiles and IQR and quantitatively determine if there are any potential outliers across all four treatment regimens.

In [None]:
# Create separate dataframe for Capomulin
# Group by the column name itself rather than a one-element list, so that
# get_group() accepts the plain regimen label (the list form expects a tuple
# key in recent pandas).
capomulin = clean_final.groupby('Drug Regimen')
capomulin_df = capomulin.get_group('Capomulin')

In [None]:
# Create separate dataframe for Ceftamin
# Group by the column name itself rather than a one-element list, so that
# get_group() accepts the plain regimen label.
ceftamin = clean_final.groupby('Drug Regimen')
# Read the group from this cell's own groupby object, not the Capomulin one
ceftamin_df = ceftamin.get_group('Ceftamin')

In [None]:
# Create separate dataframe for Infubinol
# Group by the column name itself rather than a one-element list, so that
# get_group() accepts the plain regimen label.
infubinol = clean_final.groupby('Drug Regimen')
# Read the group from this cell's own groupby object, not the Capomulin one
infubinol_df = infubinol.get_group('Infubinol')


In [None]:
# Create separate dataframe for Ramicane
# Group by the column name itself rather than a one-element list, so that
# get_group() accepts the plain regimen label.
ramicane = clean_final.groupby('Drug Regimen')
# Read the group from this cell's own groupby object, not the Capomulin one
ramicane_df = ramicane.get_group('Ramicane')

In [None]:
# Create separate dataframe for Ramicane
# NOTE(review): this cell rebuilds the same ramicane / ramicane_df pair as the
# preceding cell; it is kept so the notebook's cell layout is unchanged.
ramicane = clean_final.groupby('Drug Regimen')
# Read the group from this cell's own groupby object, not the Capomulin one
ramicane_df = ramicane.get_group('Ramicane')

In [None]:
# Lower quartile (25th percentile) of tumor volume in clean_final, one value per drug
volume_by_regimen = clean_final.groupby('Drug Regimen')['Tumor Volume (mm3)']
q1 = volume_by_regimen.quantile(q=0.25)
q1

In [None]:
# Upper quartile (75th percentile) of tumor volume in clean_final, one value per drug
volume_by_regimen = clean_final.groupby('Drug Regimen')['Tumor Volume (mm3)']
q3 = volume_by_regimen.quantile(q=0.75)
q3

In [None]:
# Interquartile range of every drug: upper quartile minus lower quartile
IQR = q3.sub(q1)
IQR

In [None]:
# Determine if there are any potential outliers in Capomulin
# Fences sit 1.5 * IQR below the lower quartile and above the upper quartile.
# The quartiles are looked up by regimen label, not by position in the Series.
capomulin_outlier_lower = q1['Capomulin'] - (1.5 * IQR['Capomulin'])
capomulin_outlier_upper = q3['Capomulin'] + (1.5 * IQR['Capomulin'])

# Boolean mask: True for every Capomulin row outside the fences
capomulin_volumes = capomulin_df['Tumor Volume (mm3)']
capomulin_filt = (capomulin_volumes < capomulin_outlier_lower) | (capomulin_volumes > capomulin_outlier_upper)

# Summing the boolean column counts the flagged rows. The frame is built from
# capomulin_filt; the previous code referenced an undefined name here.
cap_count_df = pd.DataFrame(capomulin_filt)
cap_out_count = (cap_count_df['Tumor Volume (mm3)']).values.sum()

In [None]:
# Report the Capomulin outlier fences, then the number of flagged values
print(
    f'Values below {capomulin_outlier_lower} could be outliers.',
    f'Values above {capomulin_outlier_upper} could be outliers.',
    sep='\n',
)
print(f'There could be {cap_out_count} outlier(s) in this dataset')

In [None]:
# Determine if there are any potential outliers in Ceftamin
# Fences sit 1.5 * IQR below the lower quartile and above the upper quartile.
# The quartiles are looked up by regimen label, not by position in the Series.
ceftamin_outlier_lower = q1['Ceftamin'] - (1.5 * IQR['Ceftamin'])
ceftamin_outlier_upper = q3['Ceftamin'] + (1.5 * IQR['Ceftamin'])

# Boolean mask: True for every Ceftamin row outside the fences
ceftamin_volumes = ceftamin_df['Tumor Volume (mm3)']
ceftamin_filt = (ceftamin_volumes < ceftamin_outlier_lower) | (ceftamin_volumes > ceftamin_outlier_upper)

# Summing the boolean column counts the flagged rows
cef_count_df = pd.DataFrame(ceftamin_filt)
cef_out_count = (cef_count_df['Tumor Volume (mm3)']).values.sum()

In [None]:
# Report the Ceftamin outlier fences, then the number of flagged values
print(
    f'Values below {ceftamin_outlier_lower} could be outliers.',
    f'Values above {ceftamin_outlier_upper} could be outliers.',
    sep='\n',
)
print(f'There could be {cef_out_count} outlier(s) in this dataset')

In [None]:
# Determine if there are any potential outliers in Infubinol
# Fences sit 1.5 * IQR below the lower quartile and above the upper quartile.
# The quartiles are looked up by regimen label, not by position in the Series.
infubinol_outlier_lower = q1['Infubinol'] - (1.5 * IQR['Infubinol'])
infubinol_outlier_upper = q3['Infubinol'] + (1.5 * IQR['Infubinol'])

# Boolean mask: True for every Infubinol row outside the fences
infubinol_volumes = infubinol_df['Tumor Volume (mm3)']
infubinol_filt = (infubinol_volumes < infubinol_outlier_lower) | (infubinol_volumes > infubinol_outlier_upper)

# Summing the boolean column counts the flagged rows
inf_count_df = pd.DataFrame(infubinol_filt)
inf_out_count = (inf_count_df['Tumor Volume (mm3)']).values.sum()


In [None]:
# Report the Infubinol outlier fences and the number of flagged values,
# then display the flagged rows themselves
print(
    f'Values below {infubinol_outlier_lower} could be outliers.',
    f'Values above {infubinol_outlier_upper} could be outliers.',
    sep='\n',
)
print(f'There could be {inf_out_count} outlier(s) in this dataset')
infubinol_df.loc[infubinol_filt]

In [None]:
# Determine if there are any potential outliers in Ramicane
# Fences sit 1.5 * IQR below the lower quartile and above the upper quartile.
# The quartiles are looked up by regimen label, not by position in the Series.
ramicane_outlier_lower = q1['Ramicane'] - (1.5 * IQR['Ramicane'])
ramicane_outlier_upper = q3['Ramicane'] + (1.5 * IQR['Ramicane'])

# Boolean mask: True for every Ramicane row outside the fences. It is kept as
# a plain Series (no list wrapping / transpose) like the other regimen cells.
ramicane_volumes = ramicane_df['Tumor Volume (mm3)']
ramicane_outlier = (ramicane_volumes < ramicane_outlier_lower) | (ramicane_volumes > ramicane_outlier_upper)

# Summing the boolean column counts the flagged rows
ram_count_df = pd.DataFrame(ramicane_outlier)
ram_out_count = (ram_count_df['Tumor Volume (mm3)']).values.sum()
print(f'Values below {ramicane_outlier_lower} could be outliers.')
print(f'Values above {ramicane_outlier_upper} could be outliers.')
print(f'There could be {ram_out_count} outlier(s) in this dataset')


In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
