# Pymaceuticals Inc.
---

### Analysis

- Add your analysis here.
 

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import linregress
import numpy as np


# Study data files
mouse_metadata_path = "data/Mouse_metadata"
study_results_path = "data/Study_results"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single DataFrame
merged_data = pd.merge(mouse_metadata, study_results, on = "Mouse ID")
merged_data.head()


FileNotFoundError: [Errno 2] No such file or directory: 'data/Mouse_metadata'

In [None]:
# Checking the number of mice.
mouse_count = merged_data["Mouse ID"].nunique()
mouse_count

In [None]:

# Our data should be uniquely identified by Mouse ID and Timepoint
# Get the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
duplicates = merged_data[merged_data.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False)]
duplicates[['Mouse ID', 'Timepoint']].drop_duplicates()

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
duplicate_mouse_data = merged_data[merged_data['Mouse ID'] == 'g989']
duplicate_mouse_data

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_data = merged_data.drop(merged_data[merged_data['Mouse ID'] == 'g989'].index)

# Display the first few rows of the clean DataFrame
clean_data.head()

In [None]:
# Checking the number of mice in the clean DataFrame.
clean_count = clean_data["Mouse ID"].nunique()
clean_count

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

summary_stats = merged_data.groupby('Drug Regimen')['Tumor Volume (mm3)'].agg(
    Mean_Tumor_Volume='mean',
    Median_Tumor_Volume='median',
    Tumor_Volume_Variance='var',
    Tumor_Volume_Std_Dev='std',
    Tumor_Volume_Std_Error='sem'
).reset_index()
summary_stats

In [None]:
# A more advanced method to generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each regimen (only one method is required in the solution)

# Using the aggregation method, produce the same summary statistics in a single line

summary_stats = merged_data.groupby('Drug Regimen')['Tumor Volume (mm3)'].agg(
    mean='mean',
    median='median',
    var='var',
    std='std',
    sem='sem'
)

# Adjust the column layout to match the desired style
summary_stats.columns.name = "Tumor Volume (mm3)"

# Display the summary statistics table
summary_stats

## Bar and Pie Charts

In [None]:

# Generate a bar plot showing the total number of rows (Mouse ID/Timepoints) for each drug regimen using Pandas.
timepoint_counts = merged_data['Drug Regimen'].value_counts()

plt.figure(figsize=(10, 6))
timepoint_counts.plot(kind='bar', color='skyblue')
plt.title('# of Observed Mouse Timepoints for Each Drug Regimen')
plt.xlabel('Drug Regimen')
plt.ylabel('# of Observed Mouse Timepoints')
plt.xticks(rotation=90) 
plt.tight_layout()
plt.show()

In [None]:
# Generate a bar plot showing the total number of rows (Mouse ID/Timepoints) for each drug regimen using pyplot.
# Count the number of timepoints for each drug regimen
timepoint_counts = merged_data['Drug Regimen'].value_counts()

plt.bar(timepoint_counts.index, timepoint_counts.values, color='skyblue')
plt.title('# of Observed Mouse Timepoints')
plt.xlabel('Drug Regimen')
plt.ylabel('# of Observed Mouse Timepoints')
plt.xticks(rotation=90) 
plt.tight_layout()
plt.show()

In [None]:
# Generate a pie chart, using Pandas, showing the distribution of unique female versus male mice used in the study

# Get the unique mice with their gender
gender_counts = merged_data["Sex"].value_counts()

gender_counts.plot.pie(
    autopct='%1.1f%%', 
    colors=['royalblue', 'orange'],  
    title='Distribution of Male vs Female Mice'
)
plt.tight_layout()
plt.show()

In [None]:
# Generate a pie chart, using pyplot, showing the distribution of unique female versus male mice used in the study

# Get the unique mice with their gender
gender_counts = merged_data["Sex"].value_counts()

plt.figure(figsize=(6, 6))
plt.pie(
    gender_counts.values, 
    labels=gender_counts.index, 
    autopct='%1.1f%%',    
    colors=['royalblue', 'orange']  
)
plt.title('Distribution of Male vs Female Mice')
plt.tight_layout()
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Merge this group df with the original DataFrame to get the tumor volume at the last timepoint
selected_regimens = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']
filtered_data = merged_data[merged_data['Drug Regimen'].isin(selected_regimens)]

# Get the last (greatest) timepoint for each mouse
last_timepoints = filtered_data.groupby('Mouse ID')['Timepoint'].max().reset_index()

# Merge this group DataFrame with the original DataFrame to get the tumor volume at the last timepoint
final_tumor_volume = pd.merge(last_timepoints, filtered_data, on=['Mouse ID', 'Timepoint'], how='left')
final_tumor_volume.head()

In [None]:
# Put treatments into a list for for loop (and later for plot labels)
treatments = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Create empty list to fill with tumor vol data (for plotting)
tumor_vol_data = []

# Loop through each treatment to calculate IQR and determine potential outliers
for treatment in treatments:
    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    final_tumor_volumes = final_tumor_volume[final_tumor_volume['Drug Regimen'] == treatment]['Tumor Volume (mm3)']
    
    # Add subset to the list
    tumor_vol_data.append(final_tumor_volumes)
    
    # Calculate the IQR
    quartiles = final_tumor_volumes.quantile([0.25, 0.5, 0.75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq
    
    # Calculate the lower and upper bounds for outliers
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)
    
    # Determine outliers using the bounds (keep original index)
    outliers = final_tumor_volumes[(final_tumor_volumes < lower_bound) | (final_tumor_volumes > upper_bound)]
    
    # Print the potential outliers for each treatment
    print(f"{treatment}'s potential outliers: {outliers}")

In [None]:
# Generate a box plot that shows the distribution of the tumor volume for each treatment group.
selected_regimens = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']
final_tumor_data = merged_data[merged_data['Drug Regimen'].isin(selected_regimens)]

# Get the final tumor volume for each mouse
final_tumor_volume = final_tumor_data.groupby(['Drug Regimen', 'Mouse ID'])['Tumor Volume (mm3)'].last().reset_index()

# Separate the data by regimen
capomulin = final_tumor_volume[final_tumor_volume['Drug Regimen'] == 'Capomulin']['Tumor Volume (mm3)']
ramicane = final_tumor_volume[final_tumor_volume['Drug Regimen'] == 'Ramicane']['Tumor Volume (mm3)']
infubinol = final_tumor_volume[final_tumor_volume['Drug Regimen'] == 'Infubinol']['Tumor Volume (mm3)']
ceftamin = final_tumor_volume[final_tumor_volume['Drug Regimen'] == 'Ceftamin']['Tumor Volume (mm3)']

plt.figure(figsize=(8, 6))
plt.boxplot(
    [capomulin, ramicane, infubinol, ceftamin],
    labels=['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin'],
    flierprops=dict(marker='o', markerfacecolor='red', markersize=12)  
)
plt.title('Final Tumor Volume (mm3)')
plt.ylabel('Final Tumor Volume (mm3)')
plt.tight_layout()
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a single mouse treated with Capomulin
# Filter the data for mouse I509 under the Capomulin regimen
mouse_data = merged_data[(merged_data['Drug Regimen'] == 'Capomulin') & (merged_data['Mouse ID'] == 'l509')]

plt.figure(figsize=(8, 6))
plt.plot(mouse_data['Timepoint'], mouse_data['Tumor Volume (mm3)'], marker='o')
plt.title('Capomulin treatment of mouse l509')
plt.xlabel('Timepoint (days)')
plt.ylabel('Tumor Volume (mm3)')
plt.tight_layout()
plt.show()

In [None]:
# Generate a scatter plot of mouse weight vs. the average observed tumor volume for the entire Capomulin regimen
capomulin_data = merged_data[merged_data['Drug Regimen'] == 'Capomulin']

# Group by Mouse ID and calculate the average tumor volume
avg_tumor_volume = capomulin_data.groupby('Mouse ID')[['Weight (g)', 'Tumor Volume (mm3)']].mean()

plt.figure(figsize=(8, 6))
plt.scatter(avg_tumor_volume['Weight (g)'], avg_tumor_volume['Tumor Volume (mm3)'])
plt.title('Mouse Weight vs. Average Tumor Volume (Capomulin Regimen)')
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')
plt.tight_layout()
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and a linear regression model
# for mouse weight and average observed tumor volume for the entire Capomulin regimen
capomulin_data = merged_data[merged_data['Drug Regimen'] == 'Capomulin']

# Group by Mouse ID to get average weight and tumor volume
avg_tumor_volume = capomulin_data.groupby('Mouse ID').mean(numeric_only=True)

# Calculate the correlation coefficient
correlation = np.corrcoef(avg_tumor_volume['Weight (g)'], avg_tumor_volume['Tumor Volume (mm3)'])[0, 1]
print(f"Correlation Coefficient: {correlation:.2f}")

# Perform linear regression
slope, intercept, r_value, p_value, std_err = linregress(avg_tumor_volume['Weight (g)'], avg_tumor_volume['Tumor Volume (mm3)'])

# Generate the regression line values
regression_values = avg_tumor_volume['Weight (g)'] * slope + intercept

plt.figure(figsize=(8, 6))
plt.scatter(avg_tumor_volume['Weight (g)'], avg_tumor_volume['Tumor Volume (mm3)'], marker='o')
plt.plot(avg_tumor_volume['Weight (g)'], regression_values, color='red')  # Regression line
plt.title('Mouse Weight vs. Average Tumor Volume (Capomulin Regimen)')
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')
plt.tight_layout()
plt.show()