# Pymaceuticals Inc.
---

### Analysis

- Mouse weight in the Capomulin treatment showed a strong positive correlation (r-value = 0.84) with average tumor volume, meaning the heavier mice tended to have larger tumors.


- Mice in the Capomulin and Ramicane treatments ended treatment with the lowest observed tumor volumes.


- There was only one observed outlier across all of the treatments: a mouse from the Infubinol trial that ended the study with a tumor volume comparable to the mean final tumor volume in the Ramicane and Capomulin treatments.
 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import linregress
import numpy as np

# Locations of the two CSV inputs for the study
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Outer-join the two tables on 'Mouse ID' so an ID present in either file is kept
mouse_study_complete = mouse_metadata.merge(study_results, on='Mouse ID', how='outer')

# Preview the first rows of the merged table
mouse_study_complete.head()


In [None]:
# Checking the number of mice.
# nunique() counts distinct IDs, so a mouse that appears on several rows is
# counted once (count() would only tally non-null rows).
mouse_count = mouse_study_complete['Mouse ID'].nunique()

mouse_count

In [None]:
# Each (Mouse ID, Timepoint) pair is expected to occur exactly once.
# Flag every row whose pair is repeated (keep=False marks all copies, not just the later ones),
# then collect the distinct Mouse IDs of the flagged rows.
is_repeated = mouse_study_complete.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False)
duplicate_mice = mouse_study_complete['Mouse ID'][is_repeated].unique()
duplicate_mice

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# Select by the IDs found in `duplicate_mice` rather than a hard-coded ID, so the
# rows shown always match the duplicates that were actually detected.
duplicates = mouse_study_complete.loc[mouse_study_complete['Mouse ID'].isin(duplicate_mice)]
duplicates

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# `~` negates the boolean mask, keeping only rows whose Mouse ID is not in `duplicate_mice`.
clean_df = mouse_study_complete[~mouse_study_complete["Mouse ID"].isin(duplicate_mice)]
clean_df

In [None]:
# Count the distinct mice remaining in the clean DataFrame.
mice = clean_df.loc[:, 'Mouse ID'].unique()
mouse_count = mice.size
mouse_count



## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary DataFrame.


In [None]:
# Summary statistics of tumor volume per drug regimen, built with one named aggregation.
# Keys are the output column names; values are the statistic computed on
# 'Tumor Volume (mm3)' for that column.
summary_stats = {
    'Mean Tumor Volume': 'mean',
    'Median Tumor Volume': 'median',
    'Tumor Volume Variance': 'var',
    'Tumor Volume Std. Dev.': 'std',
    'Tumor Volume Std. Err.': 'sem',
}

# The column names are not valid identifiers, so they are passed by dict unpacking.
drug_df = clean_df.groupby(['Drug Regimen']).agg(
    **{
        label: pd.NamedAgg(column='Tumor Volume (mm3)', aggfunc=stat)
        for label, stat in summary_stats.items()
    }
)
drug_df

## Bar and Pie Charts

In [None]:
# Bar chart (pandas plotting) of how many rows (Mouse ID/Timepoint observations)
# each drug regimen has, ordered from most to fewest.
timepoint_count = (
    clean_df.groupby(['Drug Regimen'])['Timepoint']
    .count()
    .sort_values(ascending=False)
)
observation_df = timepoint_count.to_frame(name='Timepoint Count')

print(timepoint_count)

observation_plot = observation_df.plot.bar(
    figsize=(7, 4),
    xlabel='Drug Regimen',
    ylabel='# of Observed Mouse Timepoints',
)


In [None]:
# Generate a bar plot showing the total number of rows (Mouse ID/Timepoints) for each drug regimen using pyplot.

# Count the rows per regimen, largest first. The tick labels are taken from this same
# Series' index so every bar is labelled with the regimen it actually represents
# (clean_df['Drug Regimen'].unique() is in order of appearance, which does not
# follow the count-sorted bar order).
regimen_counts = (
    clean_df.groupby('Drug Regimen')['Timepoint']
    .count()
    .sort_values(ascending=False)
)
drugs = regimen_counts.index
x_axis = np.arange(len(drugs))
tick_locations = list(x_axis)

plt.figure(figsize=(7,4))
plt.bar(x_axis, regimen_counts, color='b', alpha=0.5, align='center')
plt.xticks(tick_locations, drugs, rotation='vertical')

# Leave a little room on either side of the bars and above the tallest one
plt.xlim(-0.75, len(x_axis))
plt.ylim(0, regimen_counts.max() + 20)

plt.xlabel("Drug Regimen")
plt.ylabel("# of Observed Mouse Timepoints")

plt.tight_layout()

plt.show()

In [None]:
# Pie chart (pandas plotting) of the share of rows in clean_df recorded for each sex.
sex_counts = clean_df['Sex'].value_counts()

sex_plot = sex_counts.plot(kind='pie', y='Sex', autopct='%1.1f%%', figsize=(5, 5))
sex_plot

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Take the labels from the counts' own index so each wedge is named after the
# category it represents, whatever order value_counts() produced.
labels = sex_counts.index
plt.figure(figsize=(5,5))
plt.ylabel('Sex')
pie_plot = plt.pie(sex_counts, labels=labels, autopct='%1.1f%%')

pie_plot

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse.
# Work from clean_df (not the raw merge) so the duplicate mouse stays excluded.
greatest_timepoint = clean_df.groupby('Mouse ID')
timepoint_max = greatest_timepoint['Timepoint'].max()

# Merge the per-mouse maxima back onto the cleaned data to get the tumor volume at the
# last timepoint. reset_index() turns 'Mouse ID' into a regular column so both merge
# keys are plain columns on the left side.
greatest_timepoint_filtered = pd.merge(
    timepoint_max.reset_index(),
    clean_df,
    on=['Mouse ID', 'Timepoint'],
    how='left',
)
greatest_timepoint_filtered

In [None]:
# The four regimens to compare (also reused later as plot labels)
treatment_regimen = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# One Series of final tumor volumes per regimen, in the same order as treatment_regimen
tumor_vol_data = []

# For each regimen, report values lying more than 1.5 * IQR beyond the quartiles.
regimen_column = greatest_timepoint_filtered['Drug Regimen']
for drug in treatment_regimen:
    # Final tumor volumes of the mice on this drug
    final_tumor_vol = greatest_timepoint_filtered.loc[regimen_column == drug, "Tumor Volume (mm3)"]
    tumor_vol_data.append(final_tumor_vol)

    # Quartiles and interquartile range
    quartiles = final_tumor_vol.quantile([0.25, 0.5, 0.75])
    lowerq, _median, upperq = quartiles.to_numpy()
    iqr = upperq - lowerq

    # Fences beyond which a value counts as a potential outlier
    lowerbound = lowerq - (1.5 * iqr)
    upperbound = upperq + (1.5 * iqr)

    is_outlier = (final_tumor_vol < lowerbound) | (final_tumor_vol > upperbound)
    outliers = final_tumor_vol[is_outlier].tolist()

    print(f"{drug}'s potential outliers : {outliers} ")


In [None]:
# Box plot of the tumor volume distribution for each treatment group;
# outlier points are drawn as large red markers.
flierprops = {'markerfacecolor': 'r', 'markersize': 10}
plt.boxplot(tumor_vol_data, labels=treatment_regimen, flierprops=flierprops)
plt.ylabel('Final Tumor Volume (mm3)')


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a single mouse treated with Capomulin
capomulin_df = clean_df.loc[clean_df['Drug Regimen'] == 'Capomulin',:]

# Name the mouse once so the data selection and the plot title cannot drift apart
mouse_id = 'i557'
mouse_df = capomulin_df.loc[capomulin_df['Mouse ID'] == mouse_id]

x_data = mouse_df["Timepoint"]
y_data = mouse_df["Tumor Volume (mm3)"]

plt.plot(x_data,y_data)
plt.ylabel('Tumor Volume (mm3)')
plt.xlabel('Timepoint(days)')
plt.title(f'Capomulin Treatment of mouse {mouse_id}')


capomulin_df

In [None]:
# Generate a scatter plot of mouse weight vs. the average observed tumor volume for the entire Capomulin regimen

# Per-mouse mean of weight and tumor volume. The columns are selected with a list:
# selecting several groupby columns with a bare tuple (['a', 'b'] without the inner
# list) is rejected by current pandas.
grouped_mice = capomulin_df.groupby(['Mouse ID'])[['Weight (g)', 'Tumor Volume (mm3)']].mean()

weights = grouped_mice['Weight (g)']
volumes = grouped_mice['Tumor Volume (mm3)']


plt.scatter(weights,volumes,marker='o',facecolors = 'blue')
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.title("Capomulin Treatment: Average Tumor Volume vs Mouse Weight")



## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and a linear regression model
# for mouse weight and average observed tumor volume for the entire Capomulin regimen
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(weights,volumes)

# Fitted tumor volume for each observed weight
regress_values = slope * weights + intercept

# Equation of the fitted line, y = slope * x + intercept, rounded for display
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"

plt.scatter(weights,volumes)
plt.plot(weights,regress_values,"r-")
# Place the equation in the upper-left corner using axes-relative coordinates
# so it stays visible whatever the data range is.
plt.annotate(line_eq, xy=(0.05, 0.9), xycoords='axes fraction', fontsize=12, color='red')
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.title("Capomulin Treatment: Average Tumor Volume vs Mouse Weight")
print(f'The correlation between mouse weight and the average tumor volume is {round(rvalue,2)}')
