## Observations and Insights

## Dependencies and starter code

In [None]:
# %matplotlib notebook

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st

# Read the mouse data and the study results from their CSV files.
# Both names end up bound to the loaded DataFrames.
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in ("data/Mouse_metadata.csv", "data/Study_results.csv")
)

# Combine both tables into a single dataset, joining on the shared 'Mouse ID'
# column; the outer join keeps IDs that appear in either table.
metadata_results_combined_df = mouse_metadata.merge(
    study_results,
    how='outer',
    on='Mouse ID',
)

# Show the DataFrame
metadata_results_combined_df

## Summary statistics

In [None]:
# Summary statistics table: mean, median, variance, standard deviation and SEM
# of the tumor volume for each drug regimen.

# Group the combined data by regimen and pick out the tumor volume column once,
# so every statistic below is computed from the same grouped selection.
regimen_group = metadata_results_combined_df.groupby(['Drug Regimen'])
tumor_volume_by_regimen = regimen_group['Tumor Volume (mm3)']

# Per-regimen mean, kept as its own Series
volume_summary = tumor_volume_by_regimen.mean()

# Assemble the table in one step: one column per statistic, one row per regimen
volume_summary_df = pd.DataFrame({
    'Mean Tumor Volume (mm3)': volume_summary,
    'Median Tumor Volume (mm3)': tumor_volume_by_regimen.median(),
    'Variance Tumor Volume (mm3)': tumor_volume_by_regimen.var(),
    'Stand. Dev. Tumor Volume (mm3)': tumor_volume_by_regimen.std(),
    'SEM Tumor Volume (mm3)': tumor_volume_by_regimen.sem(),
})

# Show the summary table
volume_summary_df

## Bar plots

In [None]:
# Bar plot (pyplot) of the number of data points for each treatment regimen.

# Count the tumor volume observations per regimen, reusing the earlier groupby
number_of_data_points = regimen_group['Tumor Volume (mm3)'].count()

# The regimen names come from the index of the summary statistics table;
# the bars are placed at integer positions 0..n-1 along the x axis.
drug_regimen = list(volume_summary_df.index)
x_axis = np.arange(len(drug_regimen))

# Draw one bar per regimen, labelled with the regimen name
plt.bar(x_axis, number_of_data_points, width=0.5, tick_label=drug_regimen)

# Titles and axis labels
plt.title('Number of Data Points per Drug Regimen')
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Data Points')

# Rotate the regimen names so they do not overlap, then tidy up the figure
plt.xticks(rotation=90)
plt.grid()
plt.tight_layout()

In [None]:
# Bar plot (pandas) of the number of data points for each treatment regimen.

# Turn the per-regimen counts from the cell above into a one-column DataFrame
data_point_count_df = number_of_data_points.to_frame()

# Let pandas draw the bars, then add the title and y-axis label on the
# returned axes and tighten the layout
Count_PDPlot = data_point_count_df.plot.bar(legend=False, grid=True)
Count_PDPlot.set_title('Number of Data Points per Drug Regimen')
Count_PDPlot.set_ylabel('Number of Data Points')
plt.tight_layout()

## Pie plots

In [None]:
# Pie plot (pyplot) of the distribution of female versus male mice.

# Keep a single row per mouse (the last one in the combined data) so that
# every mouse is counted once, then count the rows per sex.
unique_mouse = metadata_results_combined_df.drop_duplicates(subset='Mouse ID', keep='last')
sex_ratio_group = unique_mouse.groupby('Sex').count()

# Only the 'Mouse ID' count column is needed: one row per sex
sex_ratio_df = sex_ratio_group[['Mouse ID']]
labels = list(sex_ratio_df.index)
sex_ratio = list(sex_ratio_df['Mouse ID'])

# Draw the pie with each share printed as a percentage
Ratio_MPLPlot = plt.pie(sex_ratio, autopct='%1.1f%%', labels=labels)


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
# sex_ratio_df (built in the pyplot pie cell) is indexed by 'Sex' and has a
# single 'Mouse ID' column holding the number of mice of that sex.

# The wedge labels are the index values of sex_ratio_df (the sexes).
gender_list = sex_ratio_df.index.tolist()

# `y` must be a single column label. The previous code passed the whole
# columns Index (sex_ratio_df.keys()), which is not a valid label and is
# rejected by recent pandas versions.
Ratio_PDPlot = sex_ratio_df.plot(
    kind='pie',
    y='Mouse ID',
    labels=gender_list,
    legend=False,
    autopct='%1.1f%%',
)

# pandas puts the column name on the y axis; blank it out for a clean pie.
# The assignment keeps the notebook from echoing the returned Text object.
a = Ratio_PDPlot.set_ylabel('')

## Quartiles, outliers and boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens.
# Capomulin, Ramicane, Infubinol, and Ceftamin.
promising_regimens = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']
prom_regimens = metadata_results_combined_df[
    metadata_results_combined_df['Drug Regimen'].isin(promising_regimens)
]

# Order the rows by Timepoint before taking the last row per mouse, so the
# "final" volume is the latest measurement regardless of the row order left
# behind by the merge. A stable sort keeps the existing order for ties.
final_volume = (
    prom_regimens
    .sort_values(by=['Timepoint'], kind='mergesort')
    .groupby(['Mouse ID'])
    .tail(1)
)
final_volume = final_volume.sort_values(by=['Tumor Volume (mm3)'])

# Calculate the IQR and quantitatively determine if there are any potential outliers.
# Iterating the explicit list (instead of a set) keeps the report order stable
# from run to run.
for drug in promising_regimens:
    drug_volumes = final_volume.loc[final_volume['Drug Regimen'] == drug, 'Tumor Volume (mm3)']
    if drug_volumes.empty:
        # Regimen not present in the data: nothing to report
        continue

    quartiles = drug_volumes.quantile([0.25, 0.5, 0.75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq

    # Values beyond 1.5 * IQR from the quartiles are flagged as potential outliers
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)

    # Only the extremes need checking: if any value lies outside the bounds,
    # the smallest or the largest one does. min()/max() return scalars and do
    # not depend on the frame's sort order or on float(Series) conversion.
    max_tumor_volume = drug_volumes.max()
    min_tumor_volume = drug_volumes.min()

    if max_tumor_volume > upper_bound or min_tumor_volume < lower_bound:
        print(f"{drug} has at least one potential outlier.")
    else:
        print(f"{drug} does not have any outliers.")


In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

# Using Matplotlib, generate a box and whisker plot of the final tumor volume for all four
# treatment regimens and highlight any potential outliers in the plot by changing their
# color and style. All four box plots are drawn within the same figure.

# One list drives both the data and the tick labels so they cannot get out of step.
regimen_labels = ["Ramicane", "Capomulin", "Infubinol", "Ceftamin"]
bp_data = [
    final_volume.loc[final_volume['Drug Regimen'] == regimen, 'Tumor Volume (mm3)']
    for regimen in regimen_labels
]

# Marker style for the fliers (points beyond the whiskers) so that potential
# outliers stand out from the boxes.
outlier_style = dict(marker='D', markerfacecolor='red', markeredgecolor='black', markersize=8)

fig, ax = plt.subplots()
bp = ax.boxplot(bp_data, flierprops=outlier_style)

# Name the boxes only after boxplot() has created one tick per box; setting
# the tick labels beforehand depends on how boxplot manages the ticks.
ax.set_xticklabels(regimen_labels)

ax.set_title("Final Tumor Volume per Drug Regimen")
ax.set_ylabel("Final Tumor Volume (mm3)")
ax.set_xlabel("Drug Regimen")
ax.grid(False)
fig.tight_layout()
plt.show()

## Line and scatter plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin
# The mouse is chosen in one place; the filter and the plot title both use it.
mouse_id = 'f966'

# Rows for that mouse under Capomulin, ordered by Timepoint so the line is
# drawn left to right regardless of the row order in the combined data.
Capomulinf966_df = metadata_results_combined_df.loc[
    (metadata_results_combined_df['Drug Regimen'] == 'Capomulin')
    & (metadata_results_combined_df['Mouse ID'] == mouse_id)
].sort_values(by=['Timepoint'], kind='mergesort')

capomulin_line_plot = Capomulinf966_df.plot.line(x='Timepoint', y='Tumor Volume (mm3)', legend=False)

# Assigning the return values keeps the notebook from echoing the Text objects
title = plt.title(f"For Mouse with ID {mouse_id}")
ylabel = plt.ylabel("Tumor Volume (mm3)")

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen
Capomulin_df = metadata_results_combined_df.loc[metadata_results_combined_df['Drug Regimen'] == 'Capomulin']

# Average every numeric column per mouse. numeric_only=True is required on
# pandas >= 2.0, where averaging a frame that still contains string columns
# (e.g. 'Drug Regimen') raises a TypeError instead of dropping them.
# NOTE(review): 'Weight (g)' is presumably constant per mouse, so its mean is
# simply the mouse's weight - confirm against the data.
AverageTumorVolme_perMouse_df = Capomulin_df.groupby(['Mouse ID']).mean(numeric_only=True)

scatter = AverageTumorVolme_perMouse_df.plot.scatter(x='Tumor Volume (mm3)', y='Weight (g)')

# The x values are per-mouse averages, so say so on the axis
xlabel = scatter.set_xlabel("Average Tumor Volume (mm3)")
title = plt.title("Mouse Weight vs Average Tumor Volume")

In [None]:
# Pearson correlation coefficient between average tumor volume and mouse weight
# for the Capomulin regimen, rounded to two decimals and printed.
# NOTE(review): only the correlation is computed in this cell; no linear
# regression model is fitted here.
average_tumor_volume = AverageTumorVolme_perMouse_df['Tumor Volume (mm3)']
mouse_weight = AverageTumorVolme_perMouse_df['Weight (g)']

# pearsonr returns the coefficient first, followed by the p-value
pearson_result = st.pearsonr(average_tumor_volume, mouse_weight)
CorrelationCoefficient = round(pearson_result[0], 2)

print(f"The correlation coefficient for mouse weight and average tumor volume for the Capomulin regimen is {CorrelationCoefficient}.")
