## Observations and Insights

## 1. For mice on the Capomulin regimen, average tumor volume is positively correlated with mouse weight.
## 2. In the line plot for mouse b128 (Mouse ID = b128), the tumor volume both rises and falls across timepoints rather than changing steadily in one direction.


## Dependencies and starter code

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
from scipy.stats import linregress

# Locations of the two input CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each file into its own DataFrame
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)


# Join every study measurement to the metadata of the mouse it belongs to,
# matching rows on the shared "Mouse ID" column.
Mouse_Study_combined_df = study_results.merge(mouse_metadata, on="Mouse ID")
Mouse_Study_combined_df



## Summary statistics

In [None]:
# Summary statistics table: mean, standard deviation, variance, median and
# SEM of the tumor volume for each drug regimen.
#
# Named aggregation (output_name='reducer') is used instead of passing a
# {name: function} dict to SeriesGroupBy.agg: the dict-as-renamer form is
# rejected by pandas 1.0+ with a SpecificationError. The pandas reducers
# 'std', 'var' and 'sem' use the sample (ddof=1) definition.
drug_summary_df = (
    Mouse_Study_combined_df
    .groupby('Drug Regimen')['Tumor Volume (mm3)']
    .agg(Mean='mean', SD='std', Variance='var', Median='median', SEM='sem')
    .reset_index()  # expose 'Drug Regimen' as a regular column for plotting
)
drug_summary_df


## Bar plots

In [None]:
# Bar plot (pandas plotting) of the tumor-volume summary statistics held in
# drug_summary_df: one cluster of bars per drug regimen, one bar per
# statistic column.
drug_list = drug_summary_df['Drug Regimen']
my_colors = ['b', 'r', 'g', 'y', 'k']  # one colour per plotted statistic column
multi_plot = drug_summary_df.plot(kind="bar", figsize=(15, 10), color=my_colors)
# The frame has a default integer index, so label the ticks with regimen names.
multi_plot.set_xticklabels(drug_list)
multi_plot.set_ylabel("Tumor Volume (mm3)")
multi_plot.set_xlabel("Drug Regimen")
multi_plot.set_title("Tumor Volume Statistics")
# Adjust the layout BEFORE showing the figure; calling tight_layout() after
# show() is too late to affect what was displayed.
plt.tight_layout()
plt.show()


In [None]:
# Grouped bar chart (pyplot) of the tumor-volume summary statistics held in
# drug_summary_df: one cluster of bars per drug regimen, one bar per statistic.

# Left-most bar position of every regimen cluster, and the width of one bar
pos = list(range(len(drug_summary_df['Mean'])))
width = 0.1

# Statistic column -> bar colour, in the order the bars appear in a cluster
stat_colors = [
    ('Mean', 'b'),
    ('SD', 'r'),
    ('Variance', 'g'),
    ('Median', 'y'),
    ('SEM', 'k'),
]

fig, ax = plt.subplots(figsize=(15, 10))

# Draw one bar series per statistic, each shifted right by one bar width.
# Every series is labelled with the statistic it shows so the legend is built
# from the bars themselves instead of from a separately maintained list.
for slot, (stat_name, bar_color) in enumerate(stat_colors):
    ax.bar(
        [p + width * slot for p in pos],
        drug_summary_df[stat_name],
        width,
        alpha=0.5,
        color=bar_color,
        label=stat_name,
    )

# Axis labels and chart title
ax.set_ylabel('Tumor Volume (mm3)')
ax.set_xlabel('Drug Regimen')
ax.set_title('Tumor Volume Statistics')

# Put each tick under the middle of its cluster: the bar centres span
# (number of bars - 1) * width, so the midpoint is half of that.
cluster_center = width * (len(stat_colors) - 1) / 2
ax.set_xticks([p + cluster_center for p in pos])
ax.set_xticklabels(drug_summary_df['Drug Regimen'])

# Legend, grid state, and display.
# The earlier version called grid(b=None) and later grid() with no arguments;
# both are bare toggles that cancel each other, and newer Matplotlib releases
# no longer accept the `b` keyword. The resulting state (grid hidden) is now
# requested explicitly.
ax.legend(loc='upper right')
ax.grid(False)
plt.show()





## Pie plots

In [None]:
# Pie chart (pandas plotting) of the female/male split of the mice.
# Count the 'Mouse ID' entries recorded for each value of 'Sex'.
mouse_sex_group = (
    mouse_metadata
    .groupby('Sex', as_index=False)['Mouse ID']
    .count()
    .rename(columns={'Mouse ID': 'Number Of Mice by Sex'})
)
mouse_sex_group.plot.pie(
    y='Number Of Mice by Sex',
    labels=mouse_sex_group['Sex'],
    autopct='%1.1f%%',
    startangle=40,
    shadow=True,
    legend=False,
    fontsize=14,
)


In [None]:
# Pie chart (pyplot) of the female/male split of the mice.
# Wedge labels and wedge sizes are taken from the per-sex count table.
mouse_sex_list = mouse_sex_group["Sex"].tolist()
mouse_number_list = mouse_sex_group["Number Of Mice by Sex"].tolist()
plt.pie(
    mouse_number_list,
    labels=mouse_sex_list,
    autopct="%1.1f%%",
    shadow=True,
    startangle=40,
)
plt.title("Number Of Mice by Sex")

## Quartiles, outliers and boxplots

In [None]:
# Final tumor volume of each mouse treated with one of the four selected
# regimens, followed by the quartiles, IQR and the 1.5*IQR outlier bounds of
# those final volumes (all four regimens pooled together).
selected_regimens = ['Capomulin', 'Propriva', 'Ketapril', 'Ramicane']
in_selected_regimen = Mouse_Study_combined_df['Drug Regimen'].isin(selected_regimens)
Mouse_Study_combined_subset = Mouse_Study_combined_df.loc[
    in_selected_regimen, ['Mouse ID', 'Tumor Volume (mm3)']
]

# Take each mouse's last measurement. Rows are ordered by Timepoint first
# (stable sort) so "last" means the latest timepoint rather than whatever row
# happens to come last in the merged frame. A direct .last() reduction
# replaces agg({'Last': 'last'}): the dict-as-renamer form is rejected by
# pandas 1.0+.
final_volume_by_mouse = (
    Mouse_Study_combined_df.loc[in_selected_regimen]
    .sort_values('Timepoint', kind='mergesort')
    .groupby('Mouse ID')['Tumor Volume (mm3)']
    .last()
    .rename('Last')
)

# NOTE: the "average_vol" names are kept because the box-plot cell reads
# them; the values are final volumes, not averages.
Mouse_Study_combined_subset_average_vol = final_volume_by_mouse.reset_index(drop=True).to_frame()
Mouse_Study_combined_subset_average_vol_list = Mouse_Study_combined_subset_average_vol['Last']

quartiles = Mouse_Study_combined_subset_average_vol_list.quantile([.25, .5, .75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq - lowerq

print(f"The lower quartile of Tumor Volume (mm3) is: {lowerq}")
print(f"The upper quartile of Tumor Volume (mm3) is: {upperq}")
print(f"The interquartile range of Tumor Volume (mm3) is: {iqr}")
print(f"The  median of Tumor Volume (mm3) is: {quartiles[0.5]} ")

# Values more than 1.5 * IQR outside the quartiles are flagged as potential outliers
lower_bound = lowerq - (1.5 * iqr)
upper_bound = upperq + (1.5 * iqr)
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")




In [None]:
# Box plot of the per-mouse final tumor volumes computed in the quartile cell
# (a single box covering all four selected regimens).
fig1, ax1 = plt.subplots()
ax1.boxplot(Mouse_Study_combined_subset_average_vol_list)
ax1.set(
    title='final tumor volume of each mouse across Drug Regimens - Capomulin,Propriva,Ketapril,Ramicane ',
    ylabel='Final Tumor Volume in (mm3) ',
)
plt.show()


## Line and scatter plots

In [None]:
# Line plot of tumor volume against timepoint for mouse b128, restricted to
# its rows recorded under the Capomulin regimen.
is_b128 = Mouse_Study_combined_df['Mouse ID'] == 'b128'
Mouse_Study_combined_df_b128 = Mouse_Study_combined_df.loc[is_b128]
on_capomulin = Mouse_Study_combined_df_b128['Drug Regimen'] == 'Capomulin'
Mouse_Study_combined_df_b128.loc[on_capomulin].plot.line(
    x='Timepoint',
    y='Tumor Volume (mm3)',
    color='red',
    title="mouseid = b128",
)


In [None]:
%matplotlib notebook
# Scatter plot of mouse weight versus average tumor volume for the Capomulin
# regimen, plus the linear-regression values used by the following cell.
#
# The per-mouse reductions use .first() / .mean() directly instead of
# agg({'Weight': 'first'}) / agg({'Mean': np.mean}); the dict-as-renamer form
# is rejected by pandas 1.0+.
is_capomulin = Mouse_Study_combined_df['Drug Regimen'] == 'Capomulin'

# x: the first weight recorded for each Capomulin mouse
cap_weight = Mouse_Study_combined_df.loc[is_capomulin, ['Weight (g)', 'Mouse ID']]
x_axis = (
    cap_weight.groupby('Mouse ID')['Weight (g)']
    .first()
    .rename('Weight')
    .reset_index(drop=True)
)

# y: the mean tumor volume of each Capomulin mouse. Both groupbys key on
# 'Mouse ID' with the default key sorting, so x and y line up row by row.
cap_vol = Mouse_Study_combined_df.loc[is_capomulin, ['Tumor Volume (mm3)', 'Mouse ID']]
y_axis = (
    cap_vol.groupby('Mouse ID')['Tumor Volume (mm3)']
    .mean()
    .rename('Mean')
    .reset_index(drop=True)
)

# Least-squares fit of average volume on weight; the fitted values and the
# equation text are consumed by the regression-line cell.
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_axis, y_axis)
regress_values = x_axis * slope + intercept
line_eq = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))
plt.scatter(x_axis, y_axis)

plt.xlabel('Mouse Weight')
plt.ylabel('Tumor Volume (mm3)')
plt.show()




In [None]:
# Draw the fitted regression line in red, write its equation at the fixed
# data coordinates (19, 38), and report the correlation coefficient returned
# by linregress.
plt.plot(x_axis, regress_values, color="red", linestyle="-")
plt.annotate(line_eq, xy=(19, 38), fontsize=15, color="red")
print(f"The correlation coefficient is: {rvalue}")
plt.show()
