In [456]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [457]:
# Relative paths to the two input CSV files.
mouse_metedata = 'Resources/mouse_metadata.csv'
study_results = 'Resources/data_Study_results.csv'

# Read each file into its own DataFrame.
mouse_df = pd.read_csv(mouse_metedata)
study_df = pd.read_csv(study_results)

In [458]:
# Tally how many rows share each (Mouse ID, Timepoint) pair and save the tally
# for inspection. Any pair with a count above 1 is a duplicated measurement.
dup_ids = study_df[['Mouse ID', 'Timepoint']].value_counts()
dup_ids.to_csv('Resources/duplicate_data.csv')

# Derive the affected mice from the tally instead of hard-coding an ID.
# NOTE(review): the original dropped only 'g989'; presumably that is the only
# mouse with duplicated pairs in this dataset -- confirm against the CSV above.
dup_mouse_ids = dup_ids[dup_ids > 1].index.get_level_values('Mouse ID').unique()

# Remove every row belonging to those mice. Reassigning (rather than dropping
# in place) keeps the original row labels and makes the cell safe to re-run.
study_df = study_df[~study_df['Mouse ID'].isin(dup_mouse_ids)]

In [459]:
# Attach each mouse's metadata to its study rows, matching on 'Mouse ID'
# (default inner join).
merged_df = study_df.merge(mouse_df, on='Mouse ID')

In [460]:
# Preview the first five merged rows (head() defaults to five).
merged_df.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22


In [461]:
# Generate a summary statistics table consisting of the mean, median, variance,
# standard deviation, and SEM of the tumor volume for each drug regimen.

In [462]:
# Select the tumor-volume column BEFORE aggregating. Aggregating the whole
# frame also tries to reduce string columns such as 'Mouse ID' and 'Sex',
# which raises TypeError on pandas >= 2.0, and it does five full-frame
# aggregations only to keep a single column from each.
tumor_by_regimen = merged_df.groupby('Drug Regimen')['Tumor Volume (mm3)']

drug_regimen_mean = tumor_by_regimen.mean()
drug_regimen_median = tumor_by_regimen.median()
drug_regimen_var = tumor_by_regimen.var()
drug_regimen_std = tumor_by_regimen.std()
drug_regimen_sem = tumor_by_regimen.sem()

# One row per drug regimen, one column per statistic.
drug_regimen_df = pd.DataFrame({'Mean': drug_regimen_mean,
                                'Median': drug_regimen_median,
                                'Variance': drug_regimen_var,
                                'Stand. Dev': drug_regimen_std,
                                'SEM': drug_regimen_sem})
drug_regimen_df

Unnamed: 0_level_0,Mean,Median,Variance,Stand. Dev,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [463]:
# Bar plots using pandas and Matplotlib to show the total number of mice per treatment

In [464]:
# Number of rows recorded for each drug regimen.
totalmice_pds = merged_df.loc[:, ['Drug Regimen', 'Mouse ID']]
totalmice_pds_2 = totalmice_pds.loc[:, 'Drug Regimen'].value_counts()

In [465]:
# bar graph using pandas
# Create the figure explicitly and hand its axes to pandas, so the chart never
# lands on a figure left over from another cell.
fig, ax = plt.subplots(figsize=(6, 4))
totalmice_pds_2.plot(kind='bar', ax=ax)

# aesthetics
# NOTE(review): the counts are rows per regimen (value_counts on the merged
# frame), not distinct mice -- confirm the title says what is intended.
ax.set_title('Total Mice by Regimen')
ax.set_xlabel('Regimen')
ax.set_ylabel('Quantity')
# The original referenced plt.tight_layout without calling it, which did nothing.
fig.tight_layout()
plt.show()

In [466]:
# bargraph using pyplot
# Take BOTH the bar heights and the tick labels from one value_counts() result.
# The original paired hard-coded counts (in descending order) with labels from
# unique() (order of first appearance), so labels did not match their bars.
regimen_counts = merged_df['Drug Regimen'].value_counts()
unique_regimen = regimen_counts.index.to_numpy()

# find x and y axis
total_mice = regimen_counts.tolist()
x_axis = np.arange(len(total_mice))
regimens = list(unique_regimen)

# bar graph
# Start a fresh figure so the bars are not drawn onto whichever figure is
# still current from an earlier cell.
plt.figure()
plt.bar(x_axis, total_mice, color = 'lightblue', alpha = 0.2, align = 'center')

# Rotate the labels so the regimen names do not overlap.
tick_locations = list(x_axis)
plt.xticks(tick_locations, regimens, rotation=90)

# aesthetics
plt.title('Total Mice by Regimen')
plt.xlabel('Regimen')
plt.ylabel('Quantity')
# The original referenced plt.tight_layout without calling it, which did nothing.
plt.tight_layout()
plt.show()

In [467]:
# Generate a pie plot using both Pandas's DataFrame.plot() and Matplotlib's 
# pyplot that shows the distribution of female or male mice in the study.

In [468]:
# Show the first five rows again as a reminder of the available columns.
merged_df.head(5)

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22


In [469]:
# Collapse to one entry per (Mouse ID, Sex) pair so each mouse is counted once.
group_bysex = merged_df.groupby(['Mouse ID', 'Sex'])
male_female_df = group_bysex.size().to_frame()

# Count the number of mice in each sex.
male_female = male_female_df.groupby('Sex').count()
male_female.columns = ["Total Distribution"]

# Express each sex as a percentage of all mice.
sex_totals = male_female["Total Distribution"]
male_female["Percentage"] = 100 * (sex_totals / sex_totals.sum())

male_female

In [470]:
# Pie chart of the sex split, drawn through the pandas plotting API.
colors = ['lightblue', 'lightgreen']
plot = male_female.plot(
    kind='pie',
    y='Total Distribution',
    figsize=(5, 5),
    colors=colors,
    explode=(0, .1),  # pull the second wedge slightly out of the pie
    autopct="%1.1f%%",
    startangle=140,
    shadow=True,
)

plt.title('Male v Female')
plt.axis('equal')

In [471]:
# Pie chart of the sex split, drawn directly with pyplot.
# Read the slice sizes and labels from the male_female table instead of
# hard-coding them, so the chart stays correct if the underlying data changes.
sizes = male_female["Percentage"].tolist()
labels = male_female.index.tolist()
explode = (0.3, 0)  # pull the first wedge out of the pie
colors = ['lightblue', 'lightgreen']

# Start a fresh figure so the pie is not drawn onto whichever figure is still
# current from an earlier cell.
plt.figure()
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct="%1.1f%%",
        shadow=True, startangle=140)

# Same title as the pandas version of this chart.
plt.title('Male v Female')
plt.axis("equal")

In [472]:
# Display the merged frame (study rows joined with mouse metadata).
merged_df

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22
...,...,...,...,...,...,...,...,...
1875,m601,25,33.118756,1,Capomulin,Male,22,17
1876,m601,30,31.758275,1,Capomulin,Male,22,17
1877,m601,35,30.834357,1,Capomulin,Male,22,17
1878,m601,40,31.378045,1,Capomulin,Male,22,17


In [473]:
# get all four regimens given to us: Capomulin, Ramicane, Infubinol, and Ceftamin
selected_regimens = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]
in_selected = merged_df["Drug Regimen"].isin(selected_regimens)

# Keep only those rows, ordered by ascending timepoint.
four_regimens = merged_df.loc[in_selected].sort_values("Timepoint")

# Narrow to the columns needed for the final-tumor-volume analysis.
four_regimens_df = four_regimens.loc[
    :, ["Drug Regimen", "Mouse ID", "Timepoint", "Tumor Volume (mm3)"]
]

four_regimens_df.head()

In [474]:
# Final tumor volume per mouse: the volume at each mouse's last timepoint.
# Sort by Timepoint here (stable sort) so .last() does not silently depend on
# the row order left behind by another cell.
four_regimens_sorted = (
    four_regimens_df.sort_values('Timepoint', kind='mergesort')
    .groupby(['Drug Regimen', 'Mouse ID'])
    .last()['Tumor Volume (mm3)']
)
four_regimens_df2 = four_regimens_sorted.to_frame()

# Order in which the regimens appear on the x-axis.
the_four_regimens = ['Capomulin', 'Ramicane', 'Infubinol','Ceftamin']

# Build one list of final volumes per regimen, in the order above.
four_regimens_df3 = four_regimens_df2.reset_index()
tumor_lists = four_regimens_df3.groupby('Drug Regimen')['Tumor Volume (mm3)'].apply(list)
tumor_list_df = pd.DataFrame(tumor_lists)
tumor_list_df = tumor_list_df.reindex(the_four_regimens)
tumor_vols = list(tumor_list_df['Tumor Volume (mm3)'])

# Start a fresh figure so the boxes are not drawn onto whichever figure is
# still current from an earlier cell.
plt.figure()
plt.boxplot(tumor_vols)
# Boxes sit at positions 1..N by default. Labelling through xticks avoids the
# boxplot `labels=` keyword, which is deprecated in recent Matplotlib releases.
plt.xticks(range(1, len(the_four_regimens) + 1), the_four_regimens)
plt.title('Final Tumor Volume by Regimen')
plt.xlabel('Drug Regimen')
plt.ylabel('Final Tumor Volume (mm3)')
plt.ylim(10, 80)

In [475]:
# Select a mouse that was treated with Capomulin

In [476]:
# Tumor volume over time for one mouse (b128; the previews of merged_df show
# it on the Capomulin regimen).
mouse_wcap = merged_df[merged_df["Mouse ID"].isin(["b128"])]
mouse_wcap_df = mouse_wcap[["Mouse ID", "Timepoint", "Tumor Volume (mm3)"]]

# Order by time so the line is drawn left to right.
single_line_plot_df = mouse_wcap_df.sort_values("Timepoint").reset_index()
single_line_plot_2 = single_line_plot_df[["Mouse ID", "Timepoint", "Tumor Volume (mm3)"]]

# Plot volume against timepoint. The original called plot.line() with no x/y,
# which drew Timepoint and Tumor Volume as two separate lines against the row
# index instead of one against the other.
lines = single_line_plot_2.plot.line(x="Timepoint", y="Tumor Volume (mm3)")

# aesthetics
# The original axis labels were swapped: time is on x, volume on y.
lines.set_title('Mouse treated with Capomulin')
lines.set_xlabel('Timepoint')
lines.set_ylabel('Tumor Volume (mm3)')
# The original referenced plt.tight_layout without calling it, which did nothing.
lines.figure.tight_layout()
plt.show()

In [477]:
# capomulin treatment regimen

In [478]:
# Keep only the rows recorded under the Capomulin regimen.
is_capomulin = merged_df['Drug Regimen'].eq('Capomulin')
cap_df = merged_df[is_capomulin]
cap_df.head(5)

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22


In [479]:
# Per-mouse averages for the Capomulin rows. numeric_only=True restricts the
# mean to numeric columns; without it pandas >= 2.0 raises TypeError on the
# string columns ('Drug Regimen', 'Sex').
cap_treatment = cap_df.groupby('Mouse ID').mean(numeric_only=True)

# Start a fresh figure so the points are not drawn onto whichever figure is
# still current from an earlier cell.
plt.figure()
plt.scatter(cap_treatment['Weight (g)'], cap_treatment['Tumor Volume (mm3)'])
plt.title('Capomulin: Mouse Weight vs. Average Tumor Volume')
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')
plt.show()

<IPython.core.display.Javascript object>