## Observations and Insights

## Dependencies and starter code

In [None]:
# Dependencies and Setup
import os
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files (kept in their own names so the paths are not overwritten
# by the DataFrames that are loaded from them)
mouse_metadata_path = "Resources/Mouse_metadata copy.csv"
study_results_path = "Resources/Study_results copy.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset: a left join keeps every row of
# mouse_metadata and attaches the study results sharing its 'Mouse ID'.
# The key is named once; listing it twice added nothing to the join.
mouse_study = pd.merge(mouse_metadata, study_results, on='Mouse ID', how='left')
mouse_study.head()

## Summary statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Select the tumor volume column BEFORE aggregating: aggregating the whole
# frame would also try to reduce the text columns (e.g. 'Drug Regimen' is
# compared against strings elsewhere in this notebook), which raises
# TypeError on pandas >= 2.0 and is wasted work on older versions.
tumor_by_regimen = mouse_study.groupby('Drug Regimen')['Tumor Volume (mm3)']

mean = tumor_by_regimen.mean()
median = tumor_by_regimen.median()
variance = tumor_by_regimen.var()
sd = tumor_by_regimen.std()
SEM = tumor_by_regimen.sem()

# One row per regimen, one column per statistic
Summary_Regimen = pd.DataFrame({
    'Average': mean,
    'Median': median,
    'Variance': variance,
    'Standard Deviation': sd,
    'Standard Error': SEM,
})

Summary_Regimen

## Bar plots

In [None]:
# Generate a bar plot showing number of data points of tumor volume for each treatment regimen using pandas
# Count the non-null tumor volume measurements recorded under each regimen
data_points = mouse_study.groupby('Drug Regimen')['Tumor Volume (mm3)'].count()
data_points.plot(kind='bar', figsize=(10, 5))
plt.xlabel('Drug Regimen')
# The bar heights are counts of measurements, not volumes, so label them as such
plt.ylabel('Number of Data Points')
plt.title('Data Points by Drug')

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen using pyplot
# Regimen names (the index of data_points) become the bar positions
x_axis = data_points.index.tolist()
plt.figure(figsize=(10, 5))
plt.bar(x_axis, data_points, color='r', alpha=0.5, align="center")
plt.xlabel('Drug Regimen')
# The bar heights are counts of measurements, not volumes, so label them as such
plt.ylabel('Number of Data Points')
plt.title('Data Points by Drug')

## Pie plots

In [None]:
# Pie chart (pandas plotting) of how the rows of mouse_study split between the sexes
sex_group = mouse_study.groupby('Sex')
# Number of rows recorded under each value of 'Sex'
count_sex = sex_group['Sex'].count()

pie_options = {
    'figsize': (17, 7),
    'autopct': "%1.1f%%",
    'shadow': True,
    'startangle': 90,
}
count_sex.plot.pie(**pie_options)
# Equal axis scaling keeps the pie circular
plt.axis('equal')

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Label every wedge with its group name (the index of count_sex); without
# labels the two slices cannot be told apart.
plt.pie(count_sex, labels=count_sex.index, autopct="%1.1f%%", shadow=True, startangle=90)
# Equal axis scaling keeps the pie circular
plt.axis('equal')

## Quartiles, outliers and boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers.
# Drug Regimens Ramicane, Capomulin, Infubinol and Ceftamin are the most promising
# Drug Regimen Ramicane

Ramicane = mouse_study.loc[mouse_study['Drug Regimen'] == 'Ramicane', :]
# "Final" tumor volume is the measurement at each mouse's last timepoint.
# Summing the volumes over all timepoints (as was done before) inflates the
# value in proportion to how many measurements a mouse has.
last_rows = Ramicane.groupby('Mouse ID')['Timepoint'].idxmax()
Ramicane_tumor_group = Ramicane.loc[last_rows].set_index('Mouse ID')['Tumor Volume (mm3)']

quartiles = Ramicane_tumor_group.quantile([.25, .5, .75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq - lowerq

# Outlier fences: 1.5 * IQR beyond the lower and upper quartiles
lower_bound = lowerq - (1.5 * iqr)
upper_bound = upperq + (1.5 * iqr)

print(f"The interquartile range for Ramicane is: {iqr}")
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers.
# Drug Regimen Capomulin
Capomulin = mouse_study.loc[mouse_study['Drug Regimen'] == 'Capomulin', :]
# "Final" tumor volume is the measurement at each mouse's last timepoint.
# Summing the volumes over all timepoints (as was done before) inflates the
# value in proportion to how many measurements a mouse has.
last_rows = Capomulin.groupby('Mouse ID')['Timepoint'].idxmax()
Capomulin_tumor_group = Capomulin.loc[last_rows].set_index('Mouse ID')['Tumor Volume (mm3)']

quartiles = Capomulin_tumor_group.quantile([.25, .5, .75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq - lowerq

# Outlier fences: 1.5 * IQR beyond the lower and upper quartiles
lower_bound = lowerq - (1.5 * iqr)
upper_bound = upperq + (1.5 * iqr)

print(f"The interquartile range for Capomulin is: {iqr}")
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers.
# Drug Regimen Infubinol

Infubinol = mouse_study.loc[mouse_study['Drug Regimen'] == 'Infubinol', :]
# "Final" tumor volume is the measurement at each mouse's last timepoint.
# Summing the volumes over all timepoints (as was done before) inflates the
# value in proportion to how many measurements a mouse has.
last_rows = Infubinol.groupby('Mouse ID')['Timepoint'].idxmax()
Infubinol_tumor_group = Infubinol.loc[last_rows].set_index('Mouse ID')['Tumor Volume (mm3)']

quartiles = Infubinol_tumor_group.quantile([.25, .5, .75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq - lowerq

# Outlier fences: 1.5 * IQR beyond the lower and upper quartiles
lower_bound = lowerq - (1.5 * iqr)
upper_bound = upperq + (1.5 * iqr)

print(f"The interquartile range for Infubinol is: {iqr}")
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers.
# Drug Regimen Ceftamin

Ceftamin = mouse_study.loc[mouse_study['Drug Regimen'] == 'Ceftamin', :]
# "Final" tumor volume is the measurement at each mouse's last timepoint.
# Summing the volumes over all timepoints (as was done before) inflates the
# value in proportion to how many measurements a mouse has.
last_rows = Ceftamin.groupby('Mouse ID')['Timepoint'].idxmax()
Ceftamin_tumor_group = Ceftamin.loc[last_rows].set_index('Mouse ID')['Tumor Volume (mm3)']

quartiles = Ceftamin_tumor_group.quantile([.25, .5, .75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq - lowerq

# Outlier fences: 1.5 * IQR beyond the lower and upper quartiles
lower_bound = lowerq - (1.5 * iqr)
upper_bound = upperq + (1.5 * iqr)

print(f"The interquartile range for Ceftamin is: {iqr}")
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
# One series per regimen, in the same order as the tick labels set below
data = [Ramicane_tumor_group, Capomulin_tumor_group, Infubinol_tumor_group, Ceftamin_tumor_group]
# Draw outlier points as yellow triangles
yellow_triangle = dict(markerfacecolor='y', marker='^')
fig1, ax1 = plt.subplots()
ax1.set_title('Final Tumor Volume of Each Mouse')
# Axis label spelling corrected ('Tummor' -> 'Tumor')
ax1.set_ylabel('Final Tumor Volume (mm3)')
ax1.set_xlabel('Treatment Regimen')
ax1.boxplot(data, flierprops=yellow_triangle)

# Name each box after its regimen
Treatments = ["Ramicane", "Capomulin", 'Infubinol', 'Ceftamin']
ax1.set_xticklabels(Treatments, rotation=45, fontsize=8)

plt.show()

## Line and scatter plots

In [None]:
# Generate a line plot of time point versus tumor volume for the Capomulin regimen
# (the tumor volume is averaged over every Capomulin row at each timepoint)
Cap_df = mouse_study.loc[mouse_study['Drug Regimen'] == 'Capomulin', :]
# Select the tumor volume column before averaging: taking the mean of the
# whole frame would also try to average the text columns, which raises
# TypeError on pandas >= 2.0.
Cap_df = Cap_df.groupby('Timepoint')['Tumor Volume (mm3)'].mean()
Cap_df.plot(kind='line', figsize=(17, 7))
plt.xlabel('Timepoint')
plt.ylabel('Tumor Volume (mm3)')
plt.title('Average Tumor Volume per Timepoint')

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen
Tumor_Weight = mouse_study.loc[mouse_study['Drug Regimen'] == 'Capomulin', :]
# Average per mouse (one point per mouse), matching the grouping used for the
# correlation / regression on the same data. Grouping by weight alone would
# pool every mouse of a given weight into a single point. The tumor volume
# column is selected before averaging so text columns are never averaged
# (TypeError on pandas >= 2.0).
Tumor_Weight = Tumor_Weight.groupby(['Mouse ID', 'Weight (g)'])['Tumor Volume (mm3)'].mean()
Tumor_Weight_df = pd.DataFrame({'Tumor Volume': Tumor_Weight})
# The weight of each mouse is the second level of the (Mouse ID, Weight) index
x_axis = Tumor_Weight_df.index.get_level_values('Weight (g)').tolist()
y_values = Tumor_Weight_df['Tumor Volume']
plt.scatter(x_axis, y_values)
plt.xlabel('Mouse Weight (g)')
plt.ylabel('Tumor Volume (mm3)')
plt.title('Mouse Weight vs Tumor Volume')

In [None]:
# Calculate the correlation coefficient and linear regression model for mouse weight and average tumor volume for the Capomulin regimen
Tumor_Weight = mouse_study.loc[mouse_study['Drug Regimen'] == 'Capomulin', :]
# Average tumor volume per mouse; the column is selected before averaging so
# text columns are never averaged (TypeError on pandas >= 2.0).
Tumor_Weight = Tumor_Weight.groupby(['Mouse ID', 'Weight (g)'])['Tumor Volume (mm3)'].mean()
Tumor_Weight_df = pd.DataFrame({'Tumor Volume': Tumor_Weight})
# The independent variable is the mouse WEIGHT, read from the index level.
# Using np.arange(len(...)) (as was done before) regressed tumor volume on
# the row position, which has nothing to do with weight.
x_axis = Tumor_Weight_df.index.get_level_values('Weight (g)').to_numpy()
y_values = Tumor_Weight_df['Tumor Volume']
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(x_axis, y_values)
# Fitted tumor volume for each observed weight
regress_values = x_axis * slope + intercept
line_eq = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))
correlation = st.pearsonr(x_axis, y_values)
print(f"The correlation between both factors is {round(correlation[0],2)}")
print(f'The Linear Regression Model is {line_eq}')