## Observations and Insights 

In [165]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
import numpy as np

# Locations of the two CSV inputs (relative to the notebook's working directory)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the per-measurement study results and the per-mouse metadata
study_results = pd.read_csv(study_results_path)
mouse_metadata = pd.read_csv(mouse_metadata_path)

# Join the metadata onto every study row via the shared 'Mouse ID' key
# (default inner join), then preview the combined table.
combined_data_df = study_results.merge(mouse_metadata, on='Mouse ID')
combined_data_df

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22
...,...,...,...,...,...,...,...,...
1888,m601,25,33.118756,1,Capomulin,Male,22,17
1889,m601,30,31.758275,1,Capomulin,Male,22,17
1890,m601,35,30.834357,1,Capomulin,Male,22,17
1891,m601,40,31.378045,1,Capomulin,Male,22,17


In [166]:
# Count the distinct mouse IDs present in the combined data.
mouse_ids = combined_data_df["Mouse ID"].unique()
total_mice = mouse_ids.size
print(f"Total number of mice: {total_mice}")

Total number of mice: 249


In [167]:
# Find the mouse whose (Mouse ID, Timepoint) pair appears more than once.
# The check is restricted to those two key columns: comparing whole rows would
# miss repeated timepoints that carry different tumor-volume readings.
duplicate = combined_data_df[combined_data_df.duplicated(subset=['Mouse ID', 'Timepoint'])]
dup_mouse = duplicate['Mouse ID']
# Take the ID of the first duplicated row as the mouse to exclude.
dupmouse = dup_mouse.iloc[0]
dupmouse

'g989'

In [168]:
# Optional: show every row recorded for the duplicated mouse ID.
is_dup_mouse = combined_data_df['Mouse ID'] == dupmouse
dup = combined_data_df[is_dup_mouse]
dup

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
860,g989,0,45.0,0,Propriva,Female,21,26
861,g989,0,45.0,0,Propriva,Female,21,26
862,g989,5,48.786801,0,Propriva,Female,21,26
863,g989,5,47.570392,0,Propriva,Female,21,26
864,g989,10,51.745156,0,Propriva,Female,21,26
865,g989,10,49.880528,0,Propriva,Female,21,26
866,g989,15,51.325852,1,Propriva,Female,21,26
867,g989,15,53.44202,0,Propriva,Female,21,26
868,g989,20,55.326122,1,Propriva,Female,21,26
869,g989,20,54.65765,1,Propriva,Female,21,26


In [169]:
# Create a clean DataFrame by dropping every row of the duplicated mouse.
# An exact equality test is used: str.contains() would treat the ID as a regex
# and also drop any other mouse whose ID merely contains it as a substring.
clean_data_df = combined_data_df[combined_data_df["Mouse ID"] != dupmouse]
clean_data_df

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22
...,...,...,...,...,...,...,...,...
1888,m601,25,33.118756,1,Capomulin,Male,22,17
1889,m601,30,31.758275,1,Capomulin,Male,22,17
1890,m601,35,30.834357,1,Capomulin,Male,22,17
1891,m601,40,31.378045,1,Capomulin,Male,22,17


In [170]:
# Number of distinct mouse IDs left after removing the duplicated mouse.
new_mice_total = clean_data_df['Mouse ID'].unique().size
new_mice_total

248

## Summary Statistics

In [203]:
# Generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each regimen.
# This cell computes the mean and the median; each is a Series indexed by 'Drug Regimen'.
regimean = clean_data_df.groupby('Drug Regimen')['Tumor Volume (mm3)'].mean()
regimed = clean_data_df.groupby('Drug Regimen')['Tumor Volume (mm3)'].median()

In [204]:
#regi_median = clean_data_df.groupby('Drug Regimen').median()
#regimedian = regi_median['Tumor Volume (mm3)']
#regimedian

In [205]:
# Variance of tumor volume per drug regimen.
# The tumor-volume column is selected before aggregating so that string columns
# (Mouse ID, Sex) are never passed to var(), which raises TypeError on pandas >= 2.0.
regi_var = clean_data_df.groupby('Drug Regimen')[['Tumor Volume (mm3)']].var()
regivar = regi_var['Tumor Volume (mm3)']
regivar

Drug Regimen
Capomulin    24.947764
Ceftamin     39.290177
Infubinol    43.128684
Ketapril     68.553577
Naftisol     66.173479
Placebo      61.168083
Propriva     43.852013
Ramicane     23.486704
Stelasyn     59.450562
Zoniferol    48.533355
Name: Tumor Volume (mm3), dtype: float64

In [206]:
# Standard deviation of tumor volume per drug regimen.
# The tumor-volume column is selected before aggregating so that string columns
# (Mouse ID, Sex) are never passed to std(), which raises TypeError on pandas >= 2.0.
regi_sd = clean_data_df.groupby('Drug Regimen')[['Tumor Volume (mm3)']].std()
regisd = regi_sd['Tumor Volume (mm3)']
regisd

Drug Regimen
Capomulin    4.994774
Ceftamin     6.268188
Infubinol    6.567243
Ketapril     8.279709
Naftisol     8.134708
Placebo      7.821003
Propriva     6.622085
Ramicane     4.846308
Stelasyn     7.710419
Zoniferol    6.966589
Name: Tumor Volume (mm3), dtype: float64

In [207]:
# Standard error of the mean of tumor volume per drug regimen.
# The tumor-volume column is selected before aggregating so that string columns
# (Mouse ID, Sex) are never passed to sem(), which raises TypeError on pandas >= 2.0.
regi_sem = clean_data_df.groupby('Drug Regimen')[['Tumor Volume (mm3)']].sem()
regisem = regi_sem['Tumor Volume (mm3)']
regisem

Drug Regimen
Capomulin    0.329346
Ceftamin     0.469821
Infubinol    0.492236
Ketapril     0.603860
Naftisol     0.596466
Placebo      0.581331
Propriva     0.544332
Ramicane     0.320955
Stelasyn     0.573111
Zoniferol    0.516398
Name: Tumor Volume (mm3), dtype: float64

In [208]:
# Combine the per-regimen statistic Series into one summary table,
# one column per statistic, aligned on the shared 'Drug Regimen' index.
summary_labels = [
    'Tumor Volume Mean',
    'Tumor Volume Median',
    'Tumor Volume Variance',
    'Tumor Volume Standard Deviation',
    'Tumor Volume SEM',
]
summary_series = [regimean, regimed, regivar, regisd, regisem]
describe_summary_df = pd.DataFrame(dict(zip(summary_labels, summary_series)))
describe_summary_df

Unnamed: 0_level_0,Tumor Volume Mean,Tumor Volume Median,Tumor Volume Variance,Tumor Volume Standard Deviation,Tumor Volume SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [None]:
# Using the aggregation method, produce the same summary statistics in a single line.
# Only the tumor-volume column is aggregated: applying "mean"/"var"/etc. to the
# string columns (Mouse ID, Sex) raises TypeError on pandas >= 2.0.
# The double brackets keep the (column, statistic) MultiIndex so that
# agg_df['Tumor Volume (mm3)'] still selects the table of five statistics.
agg_df = clean_data_df.groupby('Drug Regimen')[['Tumor Volume (mm3)']].agg(["mean", "median", "var", "std", "sem"])
agg_df['Tumor Volume (mm3)']

## Bar and Pie Charts

In [None]:
# Build the data for the pandas bar plot: entries per drug regimen, largest first.
# NOTE(review): count() tallies non-null 'Mouse ID' rows per regimen (one row per
# recorded measurement), not distinct mice -- use nunique() if unique mice are intended.
mice_per_regi = clean_data_df.groupby('Drug Regimen').count().reset_index()
# Keep only the 'Mouse ID' tally, indexed by regimen so the bars get regimen labels.
mice_perdrug = mice_per_regi.set_index('Drug Regimen')[['Mouse ID']]
sorted_mice_perdrug = mice_perdrug.sort_values(by='Mouse ID', ascending=False)
sorted_mice_perdrug

In [None]:
# Use DataFrame.plot() in order to create a bar chart of the data
sorted_mice_perdrug.plot(kind="bar", figsize=(7,3))

# Set a title and y-axis label for the chart
plt.title("Mice per Drug Regimen")
plt.ylabel("Number of mice")

# tight_layout() must run before show(): once the figure has been shown,
# adjusting the layout no longer affects the rendered output.
plt.tight_layout()
plt.show()

In [None]:
# Build the data for the pyplot bar chart: 'Mouse ID' row counts per drug regimen,
# with 'Drug Regimen' kept as a regular column for use as tick labels.
mice_per_regi2 = clean_data_df.groupby('Drug Regimen').count().reset_index()
# Select from the frame computed in this cell (mice_per_regi2) rather than
# relying on mice_per_regi left over from an earlier cell.
mice_perdrug2 = mice_per_regi2[['Drug Regimen', 'Mouse ID']]
sorted_mice_perdrug2 = mice_perdrug2.sort_values(by='Mouse ID', ascending=False)
sorted_mice_perdrug2

In [None]:
# Prepare the x positions for the pyplot bar chart: one integer slot per regimen row.
x_axis = np.arange(len(sorted_mice_perdrug2))
# Tick marks sit at exactly the same positions as the bars.
tick_locations = list(x_axis)
x_axis

In [None]:
# Draw the bar chart with pyplot: one bar per regimen, labelled with the regimen name
plt.figure(figsize=(7,3))
plt.bar(x_axis, sorted_mice_perdrug2["Mouse ID"], alpha=1, width=.55, align="center")
plt.xticks(tick_locations, sorted_mice_perdrug2["Drug Regimen"], rotation="vertical")

# Set a title and axis labels for the chart
plt.title("Mice per Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of mice")

# Set x and y limits (a little padding around the bars and above the tallest one)
plt.xlim(-0.75, len(x_axis))
plt.ylim(0, max(sorted_mice_perdrug2["Mouse ID"])+10)

# tight_layout() must run before show(): once the figure has been shown,
# adjusting the layout no longer affects the rendered output.
plt.tight_layout()
plt.show()

In [None]:
# Reduce the data to a single row per mouse (its first occurrence),
# then tally how many mice there are of each sex.
all_mice = clean_data_df.drop_duplicates(subset=['Mouse ID'])
male_female = all_mice['Sex'].value_counts()
male_female

In [None]:
# Wrap the per-sex counts in a one-column DataFrame for plotting with pandas.
male_female_df = male_female.to_frame(name="Mice per Gender")
male_female_df

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas.
# The first wedge is pulled slightly away from the centre.
explode = (0.1, 0)
male_female_df.plot(
    kind="pie",
    y='Mice per Gender',
    explode=explode,
    autopct="%1.1f%%",
    shadow=True,
    startangle=360,
)
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot.
# Labels and wedge sizes are taken from the per-sex counts computed above instead of
# being hard-coded, so the chart stays correct if the underlying data changes.
labels = male_female.index.tolist()

# The values of each section of the pie chart
sizes = male_female.tolist()

# Tells matplotlib to separate the first wedge from the other one
explode = (0.1, 0)

In [None]:
# Draw the pyplot pie chart from the prepared sizes, labels and explode offsets.
pie_options = dict(
    explode=explode,
    labels=labels,
    autopct="%1.1f%%",
    shadow=True,
    startangle=360,
)
plt.pie(sizes, **pie_options)

# Equal axis scaling keeps the pie circular
plt.axis("equal")
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

drugs = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]
fin_tumvol = []

# The last recorded timepoint of every mouse does not depend on the regimen,
# so it is computed once here instead of on every loop iteration.
second = clean_data_df.groupby("Mouse ID")["Timepoint"].max().reset_index()
# Keep only the row of each mouse that matches its last timepoint.
final_tp = pd.merge(clean_data_df, second, how="inner", on=["Mouse ID", "Timepoint"])

for drug in drugs:
    # Final tumor volumes of the mice treated with this regimen
    tp_perdrug = final_tp[final_tp['Drug Regimen'] == drug]
    drug_tumvol = tp_perdrug['Tumor Volume (mm3)']

    # Collected per regimen, in the order of `drugs`, for the box plot
    fin_tumvol.append(drug_tumvol.to_numpy())

    # Interquartile range and the 1.5*IQR fences used to flag potential outliers
    quartiles = drug_tumvol.quantile([.25, .5, .75])
    Q1 = quartiles[0.25]
    Q3 = quartiles[0.75]
    IQR = Q3 - Q1
    LB = Q1 - (1.5*IQR)
    UB = Q3 + (1.5*IQR)

    outliers = drug_tumvol.loc[(drug_tumvol < LB) | (drug_tumvol > UB)]
    print(f"{drug}'s potential outliers: {outliers}")

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
fig1, ax1 = plt.subplots()
ax1.set_title('Drug Regimens')
ax1.set_ylabel('Tumor Volume (mm3)')
ax1.boxplot(fin_tumvol)

# Boxes are drawn at positions 1..N in the order the volumes were collected,
# so the tick labels reuse the same `drugs` list rather than repeating the names.
plt.xticks(range(1, len(drugs) + 1), drugs)

plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin
selected_mouse = clean_data_df.loc[clean_data_df['Mouse ID'] == "l509"]
tps = selected_mouse["Timepoint"]
tv = selected_mouse["Tumor Volume (mm3)"]

plt.plot(tps, tv)
plt.title("Capomulin treatment of mouse l509")
plt.xlabel("Timepoint (days)")
plt.ylabel("Tumor Volume (mm3)")
# Render the figure explicitly, as the other plotting cells do; this also keeps the
# Text(...) repr of the last call out of the cell output.
plt.show()

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen.
# Restrict to Capomulin rows and the columns needed, then average per mouse.
capomulin_rows = clean_data_df.loc[
    clean_data_df["Drug Regimen"] == 'Capomulin',
    ["Mouse ID", "Weight (g)", "Tumor Volume (mm3)"],
]
scat_df = capomulin_rows.groupby("Mouse ID").mean()
weight = scat_df["Weight (g)"]
# Same data as `weight`; the name is retained in case other cells reference it.
scat_weight = weight
avgtumvol = scat_df["Tumor Volume (mm3)"]

plt.scatter(weight, avgtumvol)
plt.title("Avg tumor volume vs mouse weight for Capomulin treatment")
plt.xlabel("Weight (g)")
plt.ylabel("Average tumor volume (mm3)")
# Render the figure explicitly, as the other plotting cells do; this also keeps the
# Text(...) repr of the last call out of the cell output.
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen
vc_slope, vc_int, vc_r, vc_p, vc_std_err = stats.linregress(weight, avgtumvol)
# Fitted tumor volume at each observed weight, used to draw the regression line
vc_fit = vc_slope * weight + vc_int

# linregress returns the Pearson correlation coefficient as its r-value
print(f"Correlation coefficient between mouse weight and average tumor volume: {vc_r:.2f}")

plt.scatter(weight, avgtumvol)
plt.plot(weight, vc_fit, "--", color='red')
# Axis labels describe the plotted quantities (mouse weight vs. average tumor volume)
plt.xlabel('Weight (g)')
plt.ylabel('Average tumor volume (mm3)')
plt.show()