## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
combined_set = pd.merge(mouse_metadata, study_results, how = "outer", on= "Mouse ID" )
# Display the data table for preview
combined_set

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [18]:
# Checking the number of mice.
combined_set["Mouse ID"].count()

1893

In [19]:
#unique number of mice
combined_set["Mouse ID"].nunique()

249

In [2]:
# Optional: Get all the data for the duplicate mouse ID. 
duplicated_data = combined_set[combined_set.duplicated(["Mouse ID", "Timepoint"])]
duplicated_data

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0
911,g989,Propriva,Female,21,26,5,47.570392,0
913,g989,Propriva,Female,21,26,10,49.880528,0
915,g989,Propriva,Female,21,26,15,53.44202,0
917,g989,Propriva,Female,21,26,20,54.65765,1


In [2]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
deleted_dups = combined_set.drop_duplicates(["Mouse ID", "Timepoint"])
deleted_dups

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [4]:
# Checking the number of mice in the clean DataFrame.
deleted_dups["Mouse ID"].count() 

1888

## Summary Statistics

In [3]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# This method is the most straighforward, creating multiple series and putting them all together at the end.

drug_groups = deleted_dups.groupby("Drug Regimen")
summary_df = pd.DataFrame({
    "Mean" : drug_groups["Tumor Volume (mm3)"].mean(),
    "Median" : drug_groups["Tumor Volume (mm3)"].median(),
    "Variance" :  drug_groups["Tumor Volume (mm3)"].var(),
    "Standard Deviation" : drug_groups["Tumor Volume (mm3)"].std(),
    "Standard Error" : drug_groups["Tumor Volume (mm3)"].sem()
    
})


summary_df

Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,Standard Error
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.393463,50.909965,43.138803,6.568014,0.525862
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [9]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# This method produces everything in a single groupby function
summary_df_2 =  deleted_dups.groupby("Drug Regimen")["Tumor Volume (mm3)"].agg(["mean", "median", "var", "std", "sem"])
summary_df_2

Unnamed: 0_level_0,mean,median,var,std,sem
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.393463,50.909965,43.138803,6.568014,0.525862
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [10]:
summary_df.index

Index(['Capomulin', 'Ceftamin', 'Infubinol', 'Ketapril', 'Naftisol', 'Placebo',
       'Propriva', 'Ramicane', 'Stelasyn', 'Zoniferol'],
      dtype='object', name='Drug Regimen')

## Bar and Pie Charts

In [11]:
regimen_mice_count = drug_groups["Mouse ID"].nunique().sort_values()
regimen_mice_count

Drug Regimen
Stelasyn     24
Capomulin    25
Ceftamin     25
Infubinol    25
Ketapril     25
Naftisol     25
Placebo      25
Propriva     25
Ramicane     25
Zoniferol    25
Name: Mouse ID, dtype: int64

In [4]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 
%matplotlib widget

# drug_groups is the Drug Regimen groupby object
regimen_mice_count = drug_groups["Mouse ID"].nunique().sort_values()
label = summary_df.index

# bar plot
bar_plot = regimen_mice_count.plot(kind = "bar", figsize = (8,4), title = "Mice Count per Drug Regimen")

# changing y axis limits
bar_plot.set_ylim(0,max(regimen_mice_count) +3)
#bar_plot.set_xlim(-, len(label))

# name axis
bar_plot.set_xticklabels(labels = label, rotation=45)
bar_plot.set_ylabel("Frequency")
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [5]:
 #if I don't include it in the cell then it graphs on the previous grapgh 
%matplotlib widget

# get base color like graph above
import seaborn as sb 
base_color = sb.color_palette()[0]

# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.
x_axis = np.arange(len(drug_groups))
tick_locations = [value for value in x_axis]

label = summary_df.index

plt.bar(x_axis, regimen_mice_count, color = base_color, alpha = 1, align="center")
plt.xticks(tick_locations, label, rotation="vertical")

#set title for the chart 
plt.title("Mice Count per Drug Regimen")

#labelling axis 
plt.ylabel("Frequency") 
plt.xlabel("Drug Regimen")
plt.xticks(rotation = 45)
plt.ylim(0, max(regimen_mice_count)+ 3)

plt.tight_layout()
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [7]:
#prepare gender distributed data from clean datafrane: deleted_dups
gender_data = deleted_dups.groupby('Sex')['Mouse ID'].nunique()
gender_data

Sex
Female    124
Male      125
Name: Mouse ID, dtype: int64

In [18]:
gender_data.index

Index(['Female', 'Male'], dtype='object', name='Sex')

In [8]:
%matplotlib widget
# Generate a pie plot showing the distribution of female versus male mice using pandas
explode = [0.01, 0]
pie_chart = gender_data.plot(kind = "pie", startangle = 90, autopct="%1.1f%%", 
                             explode = explode, title = "Mice Gender Distribution")

#removing "Mouse ID" from graph display
pie_chart.set_ylabel("")
pie_chart

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x21517f4c3c8>

In [9]:
%matplotlib widget
# Generate a pie plot showing the distribution of female versus male mice using pyplot
explode = [0.01, 0]
plt.pie(gender_data, labels = gender_data.index, explode = explode, autopct="%1.1f%%", 
        startangle= 90)
# Tells matplotlib that we want a pie chart with equal axes
plt.axis("equal")
plt.title("Mice Gender Distribution")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 1.0, 'Mice Gender Distribution')

## Quartiles, Outliers and Boxplots

In [206]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

In [207]:
deleted_dups["Drug Regimen"].unique()

array(['Ramicane', 'Capomulin', 'Infubinol', 'Placebo', 'Ceftamin',
       'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol'],
      dtype=object)

In [10]:
#creating a dataframe with this drug regimens: Capomulin, Ramicane, Infubinol, and Ceftamin and this columns:"Mouse ID", "Drug Regimen", "Timepoint", "Tumor Volume (mm3)"
reduced_df = deleted_dups[deleted_dups["Drug Regimen"].isin(["Capomulin", "Ramicane", "Infubinol", "Ceftamin"])][["Mouse ID", "Drug Regimen", "Timepoint", "Tumor Volume (mm3)"]]

#creating a 'filter' 
f =  {"Timepoint": ["max"] , "Tumor Volume (mm3)": ["last"]}

#grouping by drug regimen and mouse and applying filter
filtered_reduced = reduced_df.groupby(["Drug Regimen", "Mouse ID"]).agg(f)

#undoing the groupby for easier data handling later
filtered_reseted = filtered_reduced.reset_index()

In [209]:
filtered_reseted

Unnamed: 0,Drug Regimen,Mouse ID,Timepoint,Tumor Volume (mm3)
0,Capomulin,b128,45,38.982878
1,Capomulin,b742,45,38.939633
2,Capomulin,f966,20,30.485985
3,Capomulin,g288,45,37.074024
4,Capomulin,g316,45,40.159220
...,...,...,...,...
95,Ramicane,s508,45,30.276232
96,Ramicane,u196,45,40.667713
97,Ramicane,w678,5,43.166373
98,Ramicane,y449,15,44.183451


In [11]:
#preparing panda series for plotting the Tumor Volumes corresponding to each Drug Regimen 
capomulin_series = filtered_reseted[filtered_reseted["Drug Regimen"].isin(["Capomulin"])]["Tumor Volume (mm3)"]['last']
ramicane_series =  filtered_reseted[filtered_reseted["Drug Regimen"].isin(["Ramicane"])]["Tumor Volume (mm3)"]['last']
infubinol_series =  filtered_reseted[filtered_reseted["Drug Regimen"].isin(["Infubinol"])]["Tumor Volume (mm3)"]['last']
ceftamin_series=  filtered_reseted[filtered_reseted["Drug Regimen"].isin(["Ceftamin"])]["Tumor Volume (mm3)"]['last']

In [12]:
#IQR Capomulin

cap_quartiles = capomulin_series.quantile([.25,.5,.75])
cap_lowerq = round(cap_quartiles[0.25], 2)
cap_upperq = round(cap_quartiles[0.75], 2)
cap_iqr = cap_upperq - cap_lowerq

print(f"The lower quartile of Capomuling is: {cap_lowerq}")
print(f"The upper quartile of Capomuling is: {cap_upperq}")
print(f"The interquartile range of Capomuling is: {round(cap_iqr, 2)}")
print(f"The the median of Capomuling is: {round(cap_quartiles[0.5],2)} ")

cap_lower_bound = cap_lowerq - (1.5 * cap_iqr)
cap_upper_bound = cap_upperq + (1.5 * cap_iqr)
print(f"Values below {round(cap_lower_bound, 2)} could be outliers for Capomulin.")
print(f"Values above {round(cap_upper_bound, 2)} could be outliers for Capomulin.")


The lower quartile of Capomuling is: 32.38
The upper quartile of Capomuling is: 40.16
The interquartile range of Capomuling is: 7.78
The the median of Capomuling is: 38.13 
Values below 20.71 could be outliers for Capomulin.
Values above 51.83 could be outliers for Capomulin.


In [13]:
#IQR Ramicane

ram_quartiles = ramicane_series.quantile([.25,.5,.75])
ram_lowerq = round(ram_quartiles[0.25], 2)
ram_upperq = round(ram_quartiles[0.75], 2)
ram_iqr = ram_upperq - ram_lowerq

print(f"The lower quartile of Ramicane is: {ram_lowerq}")
print(f"The upper quartile of Ramicane is: {ram_upperq}")
print(f"The interquartile range of Ramicane is: {round(ram_iqr, 2)}")
print(f"The the median of Ramicane is: {round(ram_quartiles[0.5],2)} ")

ram_lower_bound = ram_lowerq - (1.5*ram_iqr)
ram_upper_bound = ram_upperq + (1.5*ram_iqr)
print(f"Values below {round(ram_lower_bound, 2)} could be outliers for Ramicane.")
print(f"Values above {round(ram_upper_bound, 2)} could be outliers for Ramicane.")


The lower quartile of Ramicane is: 31.56
The upper quartile of Ramicane is: 40.66
The interquartile range of Ramicane is: 9.1
The the median of Ramicane is: 36.56 
Values below 17.91 could be outliers for Ramicane.
Values above 54.31 could be outliers for Ramicane.


In [14]:
#IQR Infubinol

inf_quartiles = infubinol_series.quantile([.25,.5,.75])
inf_lowerq = round(inf_quartiles[0.25], 2)
inf_upperq = round(inf_quartiles[0.75], 2)
inf_iqr = inf_upperq- inf_lowerq

print(f"The lower quartile of Infubinol is: {inf_lowerq}")
print(f"The upper quartile of Infubinol is: {inf_upperq}")
print(f"The interquartile range of Infubinol is: {round(inf_iqr, 2)}")
print(f"The the median of Infubinol is: {round(inf_quartiles[0.5], 2)} ")

inf_lower_bound = inf_lowerq - (1.5*inf_iqr)
inf_upper_bound = inf_upperq + (1.5*inf_iqr)
print(f"Values below {round(inf_lower_bound, 2)} could be outliers for Infubinol.")
print(f"Values above {round(inf_upper_bound, 2)} could be outliers for Infubinol.")


The lower quartile of Infubinol is: 54.05
The upper quartile of Infubinol is: 65.53
The interquartile range of Infubinol is: 11.48
The the median of Infubinol is: 60.17 
Values below 36.83 could be outliers for Infubinol.
Values above 82.75 could be outliers for Infubinol.


In [15]:
#IQR Ceftamin

quartiles = ceftamin_series.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

print(f"The lower quartile of Ceftamin is: {round(lowerq, 2)}")
print(f"The upper quartile of Ceftamin is: {round(upperq, 2)}")
print(f"The interquartile range of Ceftamin is: {round(iqr, 2)}")
print(f"The the median of Ceftamin is: {round(quartiles[0.5], 2)} ")

lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {round(lower_bound, 2)} could be outliers for Ceftamin.")
print(f"Values above {round(upper_bound, 2)} could be outliers for Ceftamin.")

The lower quartile of Ceftamin is: 48.72
The upper quartile of Ceftamin is: 64.3
The interquartile range of Ceftamin is: 15.58
The the median of Ceftamin is: 59.85 
Values below 25.36 could be outliers for Ceftamin.
Values above 87.67 could be outliers for Ceftamin.


In [16]:
#labels 
treatment_names = np.sort(reduced_df["Drug Regimen"].unique())
treatment_names

array(['Capomulin', 'Ceftamin', 'Infubinol', 'Ramicane'], dtype=object)

In [17]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
%matplotlib widget

plt.title('Drug Regimens Effect on Tumors')
plt.ylabel('Tumor Volume (mm3)')
plt.boxplot([capomulin_series,ceftamin_series, infubinol_series, ramicane_series], labels = treatment_names)
plt.ylim(15, 75)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Line and Scatter Plots

In [18]:
#data frame that contains all timepoint data
reduced_df

Unnamed: 0,Mouse ID,Drug Regimen,Timepoint,Tumor Volume (mm3)
0,k403,Ramicane,0,45.000000
1,k403,Ramicane,5,38.825898
2,k403,Ramicane,10,35.014271
3,k403,Ramicane,15,34.223992
4,k403,Ramicane,20,32.997729
...,...,...,...,...
1868,z581,Infubinol,25,54.316407
1869,z581,Infubinol,30,56.286750
1870,z581,Infubinol,35,58.628399
1871,z581,Infubinol,40,60.053740


In [19]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

#Creating a dataframe with only capomulin data 
capomulin_df = reduced_df[reduced_df["Drug Regimen"].isin(["Capomulin"])]

#data frame for a singlie mouse treated with capomulin
capomulin_mouse_df = capomulin_df[capomulin_df['Mouse ID'] == 's185']

#capomulin mouse required data points
mouse_timepoint = capomulin_mouse_df['Timepoint']
mouse_tumor = capomulin_mouse_df['Tumor Volume (mm3)']


In [20]:
#line plot
%matplotlib widget
plt.plot(mouse_timepoint, mouse_tumor, color = base_color)
plt.title("Tumor Evolution for Mouse ID s185 Treated with Capomulin")
plt.xlabel('Timepoint')
plt.ylabel("Tumor Volume (mm3)")
plt.grid()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [21]:
capomulin_weights = deleted_dups[deleted_dups["Drug Regimen"].isin(["Capomulin"])][["Mouse ID", "Weight (g)"  ,"Tumor Volume (mm3)"]]
capomulin_weight_tumor_means = capomulin_weights.groupby('Mouse ID').agg({"Weight (g)": "mean", "Tumor Volume (mm3)":  "mean"}).reset_index()
capomulin_weight_tumor_means["Weight (g)"]

0     22
1     21
2     17
3     19
4     22
5     24
6     20
7     23
8     21
9     21
10    19
11    17
12    19
13    25
14    17
15    25
16    17
17    23
18    17
19    17
20    21
21    23
22    21
23    15
24    17
Name: Weight (g), dtype: int64

In [22]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

#creating a data frame for Capomulin with Weight and Tummor volume
capomulin_weights = deleted_dups[deleted_dups["Drug Regimen"].isin(["Capomulin"])][["Mouse ID", "Weight (g)"  ,"Tumor Volume (mm3)"]]
capomulin_weight_tumor_means = capomulin_weights.groupby('Mouse ID').agg({"Weight (g)": "mean", "Tumor Volume (mm3)":  "mean"}).reset_index()

#preparing x and y data
weight_means = capomulin_weight_tumor_means['Weight (g)']
tumor_means = capomulin_weight_tumor_means['Tumor Volume (mm3)']

%matplotlib widget
plt.scatter(weight_means, tumor_means)
plt.title("Average Tumor vs Average Weight for Capomulin Treated Mice")
plt.xlabel('Weight (g)')
plt.ylabel("Tumor Volume (mm3)")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0, 0.5, 'Tumor Volume (mm3)')

## Correlation and Regression

In [23]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen

#correlation coefficient
correlation = round(st.pearsonr(weight_means, tumor_means)[0], 2)
print(f'The correlation between mouse weight and average tumor volume for the Capomulin regimen is {correlation}')

correlation_coefficient_conclusion = "This value indicates that there is a strong positive correlation between weight and tumor volume. The heavier the mouse, the bigger the tumor"
print(correlation_coefficient_conclusion)

The correlation between mouse weight and average tumor volume for the Capomulin regimen is 0.84
This value indicates that there is a strong positive correlation between weight and tumor volume. The heavier the mouse, the bigger the tumor


In [24]:
#linear regression
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(weight_means, tumor_means)

# Get regression values
regress_values = weight_means * slope + intercept

# Create line equation string
line_eq = "y = " + str(round(slope,2)) + "x +" + str(round(intercept,2))

#plotting
%matplotlib widget
plt.scatter(weight_means, tumor_means)
plt.plot(weight_means,regress_values,"orange")

#labels
plt.title("Average Tumor vs Average Weight for Capomulin Treated Mice")
plt.xlabel('Weight (g)')
plt.ylabel("Tumor Volume (mm3)")
plt.annotate(line_eq,(17.8,37.8),fontsize=15,color="orange")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(17.8, 37.8, 'y = 0.95x +21.55')

In [25]:
print(line_eq)

y = 0.95x +21.55
