In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two study CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the mouse metadata and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Join every study result with the metadata of its mouse on "Mouse ID"
# (outer join, so an ID present in only one of the files is still kept).
combined_data = study_results.merge(mouse_metadata, on="Mouse ID", how="outer")

# Preview the first rows of the combined table
combined_data.head()

In [None]:
# Count the distinct mouse IDs present in the combined data.
number_of_mice = combined_data["Mouse ID"].unique()
mouse_count = len(number_of_mice)
print(mouse_count)


In [None]:
# Each row should be uniquely identified by its (Mouse ID, Timepoint) pair.
# Flag the rows that repeat an earlier pair, then collect the IDs of the mice they belong to.
is_repeated_row = combined_data.duplicated(subset=["Mouse ID", "Timepoint"])
duplicated_mice = combined_data.loc[is_repeated_row, "Mouse ID"].unique()
duplicated_mice

In [None]:
# Optional: show all the data for the duplicate mouse ID.
# The duplicated mouse is taken to be g989, so select every row recorded for it.
is_g989 = combined_data["Mouse ID"] == "g989"
g989_mouse = combined_data.loc[is_g989]
g989_mouse

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# A mouse is "duplicate" when it has more than one row for the same
# (Mouse ID, Timepoint) pair; ALL rows of such a mouse are removed.
# Note: drop_duplicates(subset="Mouse ID") must not be used here, because it
# keeps only one row per mouse and throws away every later timepoint.
is_repeated_row = combined_data.duplicated(subset=["Mouse ID", "Timepoint"])
duplicate_mouse_ids = combined_data.loc[is_repeated_row, "Mouse ID"].unique()
unique_combined_data = combined_data.loc[~combined_data["Mouse ID"].isin(duplicate_mouse_ids)]
unique_combined_data


In [None]:
# Confirm how many distinct mouse IDs remain in the clean DataFrame.
number_of_mice2 = unique_combined_data["Mouse ID"].unique()
clean_mouse_count = len(number_of_mice2)
print(clean_mouse_count)


In [None]:
# Summary statistics table of the tumor volume for each drug regimen:
# mean, median, variance, standard deviation, and SEM.
grouped_drug = unique_combined_data.groupby(["Drug Regimen"])

# Select the tumor volume column once; each statistic below is computed per regimen.
tumor_volume_by_drug = grouped_drug["Tumor Volume (mm3)"]
mean_tumor_volume = tumor_volume_by_drug.mean()
median_tumor_volume = tumor_volume_by_drug.median()
variance_tumor_volume = tumor_volume_by_drug.var()
std_tumor_volume = tumor_volume_by_drug.std()
sem_tumor_volume = tumor_volume_by_drug.sem()

# Assemble the resulting series into a single summary DataFrame
# (one row per regimen, one column per statistic; each column listed exactly once).
summary_statics = pd.DataFrame({
    "Mean Tumor Volume": mean_tumor_volume,
    "Median Tumor Volume": median_tumor_volume,
    "Tumor Volume Variance": variance_tumor_volume,
    "Tumor Volume Std. Dev.": std_tumor_volume,
    "Tumor Volume Std. Err.": sem_tumor_volume,
})
summary_statics

In [None]:
# Bar plot (pandas) of the total number of rows (Mouse ID/Timepoints) for each drug regimen,
# ordered from the regimen with the most rows to the one with the fewest.
info = grouped_drug["Mouse ID"].size().sort_values(ascending=False)
ax = info.plot(kind="bar")
ax.set_xlabel("Drug Regimen")
ax.set_ylabel("# of observed Mouse Timepoints")

# Compute the layout only after the labels exist so they are included in the spacing.
plt.tight_layout()
plt.show()

In [None]:
# Bar plot (pyplot) of the total number of rows (Mouse ID/Timepoints) for each drug regimen.
# Row counts per regimen, sorted from highest to lowest.
y_axis = grouped_drug["Mouse ID"].size().sort_values(ascending=False)
# Take the regimen names from the SORTED counts so each bar keeps its own label;
# plt.bar pairs x and y by position, not by index.
x_axis = y_axis.index

bar_chart = plt.bar(x_axis, y_axis)

# Rotate the tick labels so the regimen names fit under the bars.
plt.xticks(rotation=45)
plt.xlabel("Drug Regimen")
plt.ylabel("# of observed Mouse Timepoints")

# Compute the layout after labels and rotated ticks are in place.
plt.tight_layout()
plt.show()

In [None]:
# Pie plot (pandas) showing the distribution of female versus male mice.
# Count every mouse once, no matter how many timepoint rows it has.
sex = unique_combined_data.drop_duplicates(subset="Mouse ID")["Sex"].value_counts()

# Series.plot labels each wedge with the matching index value of `sex`.
ax = sex.plot(kind="pie", autopct="%1.1f%%", title="Sex")
# Remove the y-label pandas adds automatically; the title already names the chart.
ax.set_ylabel("")
plt.show()


In [None]:
# Pie plot (pyplot) showing the distribution of female versus male mice.
# Count every mouse once, no matter how many timepoint rows it has.
sex = unique_combined_data.drop_duplicates(subset="Mouse ID")["Sex"].value_counts()

# Use the index of the counts as labels: value_counts() orders by frequency,
# so hard-coded labels could end up on the wrong wedge.
# plt.pie normalises the counts itself, so no manual percentage step is needed.
plt.pie(sex, labels=sex.index, autopct="%1.1f%%")
plt.title("Sex")
plt.show()

In [None]:
# Final tumor volume of each mouse (later narrowed to four treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin).

# Step 1: the last (greatest) timepoint recorded for each mouse.
timepoints_by_mouse = unique_combined_data.groupby(["Mouse ID"], as_index=False)["Timepoint"]
timepoint45 = timepoints_by_mouse.max()

# Step 2: right-merge on (Mouse ID, Timepoint) so that only the row at each
# mouse's last timepoint, and with it the tumor volume at that time, is kept.
merge_timepoint = unique_combined_data.merge(
    timepoint45,
    how="right",
    on=["Mouse ID", "Timepoint"],
)
merge_timepoint.head()

In [None]:

# Treatment regimens to analyse (reused later as plot labels).
treatments = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Tumor volume data for plotting: one Series per treatment, in the same order
# as `treatments` (index 0 is Capomulin, and so on).
tumor_vol_data = [
    merge_timepoint.loc[merge_timepoint["Drug Regimen"] == drug, "Tumor Volume (mm3)"]
    for drug in treatments
]

# The same data as a DataFrame with one column per treatment. The Series keep
# their original row labels, so each column is NaN on rows of other treatments.
Capomulin, Ramicane, Infubinol, Ceftamin = tumor_vol_data
vol_data_df = pd.DataFrame(dict(zip(treatments, tumor_vol_data)))

# Calculate the IQR per treatment and report potential outliers: values more
# than 1.5 * IQR below the lower quartile or above the upper quartile.
for drug in treatments:
    volumes = vol_data_df[drug]
    quartiles = volumes.quantile([0.25, 0.75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)
    is_outlier = (volumes < lower_bound) | (volumes > upper_bound)
    outliers = vol_data_df.loc[is_outlier, drug]
    print(f"{drug} potential outliers {outliers}")

In [None]:
# Box plots showing the distribution of the tumor volume for each treatment group.
# First with the pandas plotting API, one box per treatment column.
vol_data_df.boxplot(column=treatments)

# Then with matplotlib directly, on a new figure, from the list of per-treatment Series.
fig1, ax1 = plt.subplots()
ax1.boxplot(tumor_vol_data, labels=treatments)

In [None]:
# Line plot of tumor volume vs. time point for a single mouse treated with Capomulin (mouse l509).
# Select the rows of mouse l509 once and read both axes from that selection.
l509_rows = unique_combined_data.loc[unique_combined_data["Mouse ID"] == "l509"]
x_axis = l509_rows["Timepoint"]
y_axis = l509_rows["Tumor Volume (mm3)"]

plt.plot(x_axis, y_axis)
plt.xlabel("Timepoints (Days)")
plt.ylabel("Tumor Volume (mm3)")
plt.title("Capomulin treatment of mouse l509")

In [None]:
# Scatter plot of mouse weight vs. the average observed tumor volume for the entire Capomulin regimen.
# Both axes come from ONE per-mouse aggregation, so every point is guaranteed to pair
# the weight and the average tumor volume of the same mouse.
capomulin_rows = unique_combined_data.loc[unique_combined_data["Drug Regimen"] == "Capomulin"]
capomulin_by_mouse = capomulin_rows.groupby("Mouse ID")[["Weight (g)", "Tumor Volume (mm3)"]].mean()

# NOTE(review): assumes a mouse's weight is the same on all of its rows, so the
# per-mouse mean is simply that weight — confirm against the data.
x_axis = capomulin_by_mouse["Weight (g)"]
y_axis = capomulin_by_mouse["Tumor Volume (mm3)"]

plt.scatter(x_axis, y_axis)
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.show()

In [None]:
# Correlation coefficient and linear regression model for mouse weight and
# average observed tumor volume for the entire Capomulin regimen.
# `x_axis` (weight) and `y_axis` (average tumor volume) come from the scatter-plot cell.
correlation = st.pearsonr(x_axis, y_axis)[0]
print(f"The correlation between mouse weight and the average tumor volume is  {round(correlation,2)}")

# Fit the regression line and evaluate it at every observed weight.
regression = st.linregress(x_axis, y_axis)
pe_slope = regression.slope
pe_int = regression.intercept
pe_fit = pe_slope * x_axis + pe_int

# Observed points, with the fitted line drawn on top as a dashed red line.
plt.scatter(x_axis, y_axis)
plt.plot(x_axis, pe_fit, "--", color="red")
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")