## Observations and Insights

## Dependencies and starter code

In [1]:
%matplotlib notebook

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as st

# Study data files
mouse_metadata = "data/Mouse_metadata.csv"
study_results = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata)
study_results = pd.read_csv(study_results)

# Combine the data into a single dataset
merged_df = study_results.merge(mouse_metadata, on = 'Mouse ID')

#Cleaning up the table for data header readability
merged_df = merged_df.rename(columns={"Tumor Volume (mm3)":"Tumor Volume","Weight (g)":"Weight"})

merged_df.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume,Metastatic Sites,Drug Regimen,Sex,Age_months,Weight
0,b128,0,45.0,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22


## Summary statistics

In [3]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

#Using groupby "Drug Regimen" and calculating the reqs
regimen=merged_df.groupby(["Drug Regimen"])["Tumor Volume"].agg(mean, median, var, std, sem, count)

#Creating the DF
#Use the calculated data and create the columns with labels
regimen_df= pd.DataFrame({"Drug": reg_drug, "Mean": reg_mean, "Median": reg_median,"Variance":reg_var,"STD": reg_std,"SEM": reg_sem,"Data Pts":reg_data})


#Format the columns with two decimal place format
reg_cols=['Mean','Median', 'Variance','STD','SEM']
regimen_df[reg_cols]=regimen_df[reg_cols].applymap(lambda x:'{:.2f}'.format(x))

regimen_df.head(20)

NameError: name 'mean' is not defined

## Bar plots

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen using Pandas

In [None]:
# Filter the DataFrame down only to those columns to chart
drug_data = regimen_df[["Drug","Data Pts"]]

drug_data.head()

In [None]:
# Set the index to be "State" so they will be used as labels
panda_data = drug_data.set_index("Drug")

# Use DataFrame.plot() in order to create a bar chart of the data
panda_data.plot(kind="bar", figsize=(8,5))

# Set a Title and labels
plt.title("Data Points per Drug Regimen")
plt.xlabel("Drug")
plt.ylabel("No. of Data Points")

# Save our graph and show the grap
plt.tight_layout()
plt.savefig("../Images/drug_data_pda.png")
plt.show()

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen using pyplot

In [None]:
# Set x axis and tick locations
x_axis = np.arange(len(drug_data))
tick_locations = [1,2,3,4,5,6,7,8,9,10]

In [None]:
# Create a list indicating where to write x labels and set figure size to adjust for space
plt.figure(figsize=(8,4))
plt.bar(x_axis, drug_data["Data Pts"], color='b', alpha=0.75, align="center")
plt.xticks(tick_locations, drug_data["Drug"], rotation="vertical")

# Set a Title and labels
plt.title("Data Points per Drug Regimen")
plt.xlabel("Drug")
plt.ylabel("No. of Data Points")

# Save our graph and show the grap
plt.tight_layout()
plt.savefig("../Images/drug_data_plt.png")
plt.show()

## Pie plots

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [None]:
# Split up our data into groups based upon 'Sex'
gender_grp = merged_df.groupby(['Drug Regimen','Sex'])

gender_grp.head()

In [None]:
# Find out how many data points in each gender/ sex
gender_data = gender_grp['Tumor Volume'].count()

gender_data.head(30)

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
#Creating a pie plot for a specifi regimen

# Make a variable called reg_name amd store a Drug Regimen in this
reg_name = "Capomulin"

# Collect the trips of the 'Drug Regimen' above
specific_reg = gender_data.loc[reg_name]

# Place the Sex type data into a list
regimen_list = specific_reg.keys()

# Create a pie chart based upon the trip duration of that single bike
regimen_pie = specific_reg.plot(kind="pie", y=regimen_list, title=("Distribution of female versus male mice using " + reg_name))
regimen_pie.set_ylabel("No. of Mice")

plt.show()
plt.tight_layout()
plt.axis("equal")



## Quartiles, outliers and boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 

# Creating a pivot table to summarize our data so to verify what are the four most promising treatment regimens based on the final voume at the last time point
# Here we can see the average tumor volume per time point for each drug
tumor_response = pd.pivot_table(merged_df, index=["Timepoint"], values="Tumor Volume", columns=["Drug Regimen"], aggfunc=np.mean)
tumor_response

In [4]:
# Based on the table above, using the final tumor volume at the last timepoint, the top 4 are Ramicane, Capomulin, Ceftamin and Infubinol
#Droping the not needed drugs from the already pivot table
top_four = tumor_response.drop(['Ketapril','Naftisol','Placebo','Propriva','Stelasyn','Zoniferol'], axis=1
top_four

NameError: name 'tumor_response' is not defined

In [None]:
plt.boxplot(top_four[Capomulin])

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

# Setting the lists of drugs, colors and markers respectively
drugs=["Capomulin","Ceftamin","Infubinol","Ramicane"]
colors=["red","blue","green","yellow"]
markers=["o","^","s","d"]

for drug in drugs:
    # Setting the x_axis data,y_axis data and standard error of the mean for the graph
    x_data=top_four.index.values
    y_data=top_four[drug] 
    
    # Setting the x_axis and y_axis limits
    plt.xlim(0,45)
    plt.ylim(30,70)
    
    # Finding the index of each drug to get its respective color and marker
    index=drugs.index(drug)
    
    # Setting up the plot
    plt.boxplot(x_data,y_data)
    
    # Setting the legend
    plt.legend(loc="upper left", fancybox=True,numpoints=2,edgecolor="black")


# Setting the title,x_axis and y_axis labels
plt.title("Tumor Response to Treatment", fontsize=10)
plt.xlabel("Time(Days)")
plt.ylabel("Tumor Volume (mm3)")

# Display the graph
plt.show()

## Line and scatter plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

In [None]:
# Calculate the correlation coefficient and linear regression model for mouse weight and average tumor volume for the Capomulin regimen