In [151]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two study CSV files (relative paths)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load both files into DataFrames: mouse metadata first, then the study results
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

In [152]:
## Preview the first five rows of the mouse metadata
mouse_metadata.head(n=5)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16


In [153]:
## Preview the first five rows of the study results
study_results.head(n=5)

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.0,0
1,f932,0,45.0,0
2,g107,0,45.0,0
3,a457,0,45.0,0
4,c819,0,45.0,0


In [154]:
# Join each study measurement to its mouse's metadata on the shared "Mouse ID" key;
# the outer join keeps IDs that appear in only one of the two tables
combined_df = mouse_metadata.merge(study_results, on="Mouse ID", how="outer")

# Preview the first rows of the merged table
combined_df.head(n=5)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [155]:
# Count the distinct mice in the merged data
## nunique(dropna=False) counts every distinct Mouse ID value, a missing one included
number_mice = combined_df["Mouse ID"].nunique(dropna=False)
number_mice

249

In [163]:
## Total number of rows in the merged table
combined_df.shape[0]

1893

In [166]:
## Group the rows by mouse, then count how many rows each Mouse ID contributes
ID_grouped = combined_df.groupby(by="Mouse ID")
ID_count = ID_grouped["Mouse ID"].agg("count")
ID_count

Mouse ID
a203    10
a251    10
a262    10
a275    10
a366     7
        ..
z435     3
z578    10
z581    10
z795    10
z969    10
Name: Mouse ID, Length: 249, dtype: int64

In [167]:
# Find the (Mouse ID, Timepoint) pairs that occur more than once.
## Work on just the two key columns; duplicated() flags every repeat after the first occurrence
ID_TP = combined_df.loc[:, ["Mouse ID", "Timepoint"]]
duplicates = ID_TP.loc[ID_TP.duplicated(keep="first")]
duplicates

Unnamed: 0,Mouse ID,Timepoint
909,g989,0
911,g989,5
913,g989,10
915,g989,15
917,g989,20


In [168]:
# Optional: show the full rows (all columns) for the repeated Mouse ID / Timepoint readings.
## Only the second and later occurrences are flagged, so the first reading of each pair is not listed
duplicates_complete = combined_df.loc[
    combined_df.duplicated(subset=["Mouse ID", "Timepoint"], keep="first")
]
duplicates_complete

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0
911,g989,Propriva,Female,21,26,5,47.570392,0
913,g989,Propriva,Female,21,26,10,49.880528,0
915,g989,Propriva,Female,21,26,15,53.44202,0
917,g989,Propriva,Female,21,26,20,54.65765,1


In [169]:
# Create a clean DataFrame by removing the repeated Mouse ID / Timepoint readings.
## drop_duplicates() keeps the first reading of each (Mouse ID, Timepoint) pair and drops
## later repeats, so the rows to remove are derived from the data instead of being
## hard-coded as row positions (909, 911, ...), which would silently drop the wrong rows
## if the input files or the merge order changed.
## NOTE: this removes only the repeated readings; the mouse itself stays in the data.
clean_df = combined_df.drop_duplicates(subset=["Mouse ID", "Timepoint"], keep="first")
clean_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [170]:
# Count the distinct mice remaining in the clean DataFrame.
total_mice = clean_df["Mouse ID"].nunique(dropna=False)
total_mice

249

In [162]:
# Row count of the clean DataFrame: five fewer rows than the merged table
clean_df.shape[0]

1888

In [172]:
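## Alternative (left disabled): exclude every row for mouse g989 rather than only its repeated readings
## clean_2_df = combined_df[combined_df["Mouse ID"] != "g989"]
## total = len(clean_2_df["Mouse ID"].unique())
## print(total)
## print(len(clean_2_df))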
