In [26]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Make IPython display the value of every expression statement in a cell,
# not only the last one. Cells below that contain several bare expressions
# will therefore show several outputs.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Study data files (paths are relative to the notebook's working directory)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset.
# The left join keeps every study-result row and attaches that mouse's
# metadata columns to it.
# validate='many_to_one' makes pandas raise a MergeError if 'Mouse ID' is
# not unique in mouse_metadata. Without the check, a repeated metadata row
# would silently multiply the study rows and show up later as spurious
# (Mouse ID, Timepoint) duplicates.
DF = pd.merge(
    study_results,
    mouse_metadata,
    how='left',
    on='Mouse ID',
    validate='many_to_one',
)

In [39]:
# Checking the number of mice in the DataFrame.
# nunique() counts the distinct non-null Mouse IDs directly, instead of
# building a full frequency table with value_counts() and counting its rows.
DF['Mouse ID'].nunique()

249

In [57]:
# Find the mice that have more than one record for the same
# (Mouse ID, Timepoint) pair. Per the study description, each mouse should
# have exactly one entry per timepoint.
# keep=False flags every row of a repeated pair, not just the later repeats.
duplicate_mice_ids = list(
    DF.loc[
        DF.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False),
        'Mouse ID',
    ].unique()
)
duplicate_mice_ids

['g989']

In [66]:
# Optional: show every row belonging to the duplicate mouse ID(s), i.e. the
# mouse's full history rather than only the conflicting rows.
# (To see just the conflicting rows, filter with
#  DF.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False) instead.)
# Author's note from the original run: 13 rows for this mouse.
DF.loc[DF['Mouse ID'].isin(duplicate_mice_ids)]

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
107,g989,0,45.0,0,Propriva,Female,21,26
137,g989,0,45.0,0,Propriva,Female,21,26
329,g989,5,48.786801,0,Propriva,Female,21,26
360,g989,5,47.570392,0,Propriva,Female,21,26
620,g989,10,51.745156,0,Propriva,Female,21,26
681,g989,10,49.880528,0,Propriva,Female,21,26
815,g989,15,51.325852,1,Propriva,Female,21,26
869,g989,15,53.44202,0,Propriva,Female,21,26
950,g989,20,55.326122,1,Propriva,Female,21,26
1111,g989,20,54.65765,1,Propriva,Female,21,26


In [79]:
# Clean the DataFrame by removing the duplicate mouse entirely: every row
# whose Mouse ID is in duplicate_mice_ids is dropped, not only the repeated
# timepoints. DF is modified in place, so cells below see the cleaned data.
# Author's note from the original run: 1880 rows remain afterwards.
DF.drop(
    index=DF.index[DF['Mouse ID'].isin(duplicate_mice_ids)],
    inplace=True,
)

In [84]:
# Checking the number of mice in the clean DataFrame.
# nunique() counts the distinct non-null Mouse IDs directly, instead of
# building a full frequency table with value_counts() and counting its rows.
DF['Mouse ID'].nunique()

248

In [85]:
# Preview the first five rows of the cleaned DataFrame.
DF.head(n=5)

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,f932,0,45.0,0,Ketapril,Male,15,29
2,g107,0,45.0,0,Ketapril,Female,2,29
3,a457,0,45.0,0,Ketapril,Female,11,30
4,c819,0,45.0,0,Ketapril,Male,21,25
