In [None]:
import pandas as pd
import plotly.offline as pyo
import plotly.express as px
import plotly.graph_objects as go

# Switch Plotly into offline notebook mode before any figure is built
pyo.init_notebook_mode()

# Pull the penguins table and keep only rows that have a flipper measurement
penguins = pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv")
penguins = penguins.dropna(subset=['flipper_length_mm'])

# Distinct species, in order of first appearance in the table
species_list = penguins['species'].unique()

# Build one annotated histogram per species
for species in species_list:
    # Rows belonging to the current species, and the single column we summarise
    species_data = penguins[penguins['species'] == species]
    flipper = species_data['flipper_length_mm']

    # Location summaries
    mean, median = flipper.mean(), flipper.median()
    # Spread summaries (pandas' std is the sample standard deviation)
    std = flipper.std()
    q1, q3 = flipper.quantile(0.25), flipper.quantile(0.75)
    iqr = q3 - q1
    min_value, max_value = flipper.min(), flipper.max()

    # Base histogram of flipper lengths for this species
    fig = px.histogram(species_data, x='flipper_length_mm', nbins=20, title=f"{species} - Flipper Length")

    # Dashed vertical markers: (x position, colour, label, label placement)
    for x_pos, colour, text, placement in (
        (mean, "blue", "Mean", "top left"),
        (median, "green", "Median", "top right"),
    ):
        fig.add_vline(x=x_pos, line_dash="dash", line_color=colour, annotation_text=text, annotation_position=placement)

    # Shaded bands: (left edge, right edge, fill colour, opacity, label)
    for left, right, colour, alpha, text in (
        (min_value, max_value, "red", 0.1, "Range"),
        (q1, q3, "yellow", 0.2, "IQR"),
        (mean - 2*std, mean + 2*std, "blue", 0.1, "±2 Std Dev"),
    ):
        fig.add_vrect(x0=left, x1=right, fillcolor=colour, opacity=alpha, line_width=0, annotation_text=text, annotation_position="top left")

    # Render as a static PNG
    fig.show(renderer="png")

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the penguins dataset and drop rows without a flipper measurement
penguins = pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv")
penguins = penguins.dropna(subset=['flipper_length_mm'])

# Distinct species, in order of first appearance
species_list = penguins['species'].unique()
n_species = len(species_list)

# Set Seaborn style
sns.set_style("whitegrid")

# One panel per species in a single row. The panel count is derived from the
# data instead of being hard-coded, so the grid always matches the number of
# species. squeeze=False keeps `axes` two-dimensional even when there is only
# one species; taking row 0 then always yields a 1-D sequence of Axes.
fig, axes = plt.subplots(1, n_species, figsize=(6 * n_species, 6), sharey=True, squeeze=False)
axes = axes[0]

# Draw one annotated KDE per species
for ax, species in zip(axes, species_list):
    # Filter data for this species
    species_data = penguins[penguins['species'] == species]
    flipper = species_data['flipper_length_mm']

    # Summary statistics used for the overlays
    mean = flipper.mean()
    median = flipper.median()
    std = flipper.std()
    q1 = flipper.quantile(0.25)
    q3 = flipper.quantile(0.75)

    # Kernel density estimate of flipper length
    sns.kdeplot(data=species_data, x='flipper_length_mm', ax=ax, fill=True, color="skyblue", label=f'{species}')

    # Vertical lines for mean and median
    ax.axvline(mean, color='blue', linestyle='--', label='Mean')
    ax.axvline(median, color='green', linestyle='-.', label='Median')

    # Shaded areas for ±2 standard deviations and the interquartile range
    ax.axvspan(mean - 2*std, mean + 2*std, alpha=0.2, color='lightblue', label='±2 Std Dev')
    ax.axvspan(q1, q3, alpha=0.3, color='yellow', label='IQR')

    # Titles, labels and legend for this panel
    ax.set_title(f"{species} - Flipper Length KDE")
    ax.set_xlabel("Flipper Length (mm)")
    ax.legend()

# Set overall title for the figure
plt.suptitle("Flipper Length Distribution Across Penguin Species", fontsize=16)

# Leave room at the top for the suptitle, then display
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

3.) Search online for some images of box plots, histograms, and kernel density estimators (perhaps for the same data set); describe to a ChatBot what you think the contrasting descriptions of these three "data distribution" visualization methods are; and then see if the ChatBot agrees and what "pros and cons" list of these three "data distribution" visualization methods your ChatBot can come up with; finally, describe your preference for one or the other and your rationale for this preference

I prefer the histogram because it is easy to spot jumps in the data, whereas the kernel density estimators smooth out the data so it is not as precise. Additionally, I am not a fan of the box plot. I feel I can get the same information from a histogram with even more detail. 

ChatBot Link: https://chatgpt.com/share/66ef8d6f-6ff4-8004-a636-31724ec2c5e5

4.) Run the code below and look at the resulting figure of distributions and then answer the following questions
1. Which datasets have similar means and similar variances
A,C
2. Which datasets have similar means but quite different variances
A,B
3. Which datasets have similar variances but quite different means
none
4. Which datasets have quite different means and quite different variances
C,D

In [None]:
from scipy import stats
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# Seed NumPy's global random state so the simulated samples (and therefore the
# figure) are identical on every Restart & Run All. scipy.stats `rvs` draws
# from this global state when no `random_state` argument is passed.
SEED = 130
np.random.seed(SEED)

n = 1500
data1 = stats.uniform.rvs(0, 10, size=n)      # uniform on [0, 10]
data2 = stats.norm.rvs(5, 1.5, size=n)        # normal, mean 5, sd 1.5
# Half the points around 2 and half around 8 (two-component mixture)
data3 = np.r_[stats.norm.rvs(2, 0.25, size=n // 2), stats.norm.rvs(8, 0.5, size=n // 2)]
data4 = stats.norm.rvs(6, 0.5, size=n)        # normal, mean 6, sd 0.5

# (panel label, sample, requested number of bins), one entry per subplot column
panels = [('A', data1, 30), ('B', data2, 15), ('C', data3, 45), ('D', data4, 15)]

fig = make_subplots(rows=1, cols=len(panels))

# Add each histogram to its own column and label that column's x-axis
for col, (label, sample, nbins) in enumerate(panels, start=1):
    fig.add_trace(go.Histogram(x=sample, name=label, nbinsx=nbins, marker=dict(line=dict(color='black', width=1))), row=1, col=col)
    fig.update_xaxes(title_text=label, row=1, col=col)

fig.update_layout(height=300, width=750, title_text="Row of Histograms")
# Same x-range on every panel so the four distributions are visually comparable
fig.update_xaxes(range=[-0.5, 10.5])

# Give every trace the same bin start/end
for trace in fig.data:
    trace.xbins = dict(start=0, end=10)
    
# This code was produced by just making requests to Microsoft Copilot
# https://github.com/pointOfive/stat130chat130/blob/main/CHATLOG/wk3/COP/SLS/0001_concise_makeAplotV1.md

fig.show(renderer="png") # USE `fig.show(renderer="png")` FOR ALL GitHub and MarkUs SUBMISSIONS

In [None]:
from scipy import stats
import numpy as np

# Recreate the datasets.
# Seed NumPy's global random state first: scipy.stats `rvs` draws from it when
# no `random_state` is given, so without a seed every run would summarise a
# different random sample. The numbers only describe the same samples as
# another cell if that cell uses the same seed and the same draw order.
SEED = 130
np.random.seed(SEED)

n = 1500
data1 = stats.uniform.rvs(0, 10, size=n)
data2 = stats.norm.rvs(5, 1.5, size=n)
data3 = np.r_[stats.norm.rvs(2, 0.25, size=n // 2), stats.norm.rvs(8, 0.5, size=n // 2)]
data4 = stats.norm.rvs(6, 0.5, size=n)

# Calculate means and variances.
# ddof=1 gives the sample variance (divides by n - 1), matching the convention
# of pandas' .std()/.var(); NumPy's default ddof=0 is the population variance.
datasets = [data1, data2, data3, data4]
means = [np.mean(sample) for sample in datasets]
variances = [np.var(sample, ddof=1) for sample in datasets]

# Print the results, one line per dataset (numbered from 1)
for i, (mean, var) in enumerate(zip(means, variances), 1):
    print(f"Dataset {i}: Mean = {mean:.2f}, Variance = {var:.2f}")

ChatBot Link: https://chatgpt.com/share/66f60f87-7ed8-8004-a736-03134dca4652

In [None]:
from scipy import stats 
import pandas as pd
import numpy as np
  
# Draw 1000 values from a gamma distribution (shape a=2, scale=2) and build
# a histogram of them
sample1 = stats.gamma(a=2,scale=2).rvs(size=1000)
fig1 = px.histogram(pd.DataFrame({'data': sample1}), x="data")

# Name and print the mean and median: a bare expression in the middle of a
# cell is evaluated and then discarded, so these values were never displayed.
sample1_mean = sample1.mean()
sample1_median = np.quantile(sample1, 0.5)
print(f"sample1: mean = {sample1_mean:.3f}, median = {sample1_median:.3f}")

# Negated gamma draws (the mirror image of the distribution above)
sample2 = -stats.gamma(a=2,scale=2).rvs(size=1000)
fig1.show(renderer='png')

In [None]:
import plotly.express as px

# Frozen gamma distribution (shape a=2, scale=2) shared by both samples
gamma_dist = stats.gamma(a=2, scale=2)

# --- Sample 1: right-skewed (positive skew) ---
sample1 = gamma_dist.rvs(size=1000)
right_frame = pd.DataFrame({'data': sample1})
fig1 = px.histogram(right_frame, x="data", title="Right-Skewed Distribution")
fig1.show(renderer="png")

# Mean and median (0.5 quantile) of the right-skewed sample
mean1, median1 = sample1.mean(), np.quantile(sample1, 0.5)
print(f"Right-Skewed: Mean = {mean1}, Median = {median1}")

# --- Sample 2: left-skewed (negative skew), obtained by negating gamma draws ---
sample2 = -gamma_dist.rvs(size=1000)
left_frame = pd.DataFrame({'data': sample2})
fig2 = px.histogram(left_frame, x="data", title="Left-Skewed Distribution")
fig2.show(renderer="png")

# Mean and median (0.5 quantile) of the left-skewed sample
mean2, median2 = sample2.mean(), np.quantile(sample2, 0.5)
print(f"Left-Skewed: Mean = {mean2}, Median = {median2}")

Gamma distributions are right-skewed, which means the mean is greater than the median. Negating the values drawn from the gamma distribution flips the sample horizontally (mirrors it about zero), creating a left-skewed dataset in which the mean is less than the median.

ChatBot Link: https://chatgpt.com/share/66f1ecdd-7c0c-8004-a303-d23430fe453b

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the COVID-19 export from the working directory
df = pd.read_csv('covid19-download.csv')

# Coerce the two columns we need; values that cannot be parsed become NaT / NaN
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['totalcases'] = pd.to_numeric(df['totalcases'], errors='coerce')

# Discard rows where either the date or the case count is missing
df = df.dropna(subset=['date', 'totalcases'])

# Bucket rows by calendar month and add up 'totalcases' within each bucket.
# NOTE(review): if 'totalcases' is a running (cumulative) total, or the file
# holds several regions per date, this sum overstates the counts. Confirm
# against the data dictionary.
# NOTE(review): the 'M' alias is deprecated in pandas >= 2.2 in favour of 'ME'.
monthly_totals = df.resample('M', on='date')['totalcases'].sum()
monthly_cases = monthly_totals.reset_index()

# Bar chart of the monthly sums; width is expressed in days on a date axis
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(monthly_cases['date'], monthly_cases['totalcases'], width=20, color='blue', alpha=0.7)
ax.set_xlabel('Date')
ax.set_ylabel('Total Cases')
ax.set_title('Total COVID-19 Cases Over Time (Monthly)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

ChatBot Link: I'm so sorry... it was deleted and disappeared. I can narrate our conversation. I asked the chatbot to make a histogram with this dataset comparing the dates with the frequency of COVID-19 cases. It gave me really thin bins, so I demanded it give me thicker bins, and after multiple back-and-forths, it came up with this.

In [None]:
import plotly.express as px

# Gapminder sample data bundled with Plotly Express
df = px.data.gapminder()

# Figure template applied to the chart
template = "plotly_white"

# Bubble chart of life expectancy against GDP per capita (log-scaled x-axis),
# with one animation frame per year and countries tracked across frames.
scatter_options = dict(
    x="gdpPercap",
    y="lifeExp",
    animation_frame="year",
    animation_group="country",
    size="pop",
    color="continent",
    hover_name="country",
    log_x=True,
    size_max=55,
    range_x=[100, 100000],
    range_y=[25, 90],
    template=template,
)
fig = px.scatter(df, **scatter_options)

# Rendered as a static PNG
fig.show(renderer="png")

In [None]:
import plotly.express as px
import pandas as pd

# Read the baby-names table
bn = pd.read_csv('https://raw.githubusercontent.com/hadley/data-baby-names/master/baby-names.csv')

# Append the sex to each name so a name shared by boys and girls becomes two distinct labels
bn['name'] = bn['name'].str.cat(bn['sex'], sep=" ")

# Within each year, rank names by 'percent' (largest percent gets rank 1)
bn['rank'] = bn.groupby('year')['percent'].rank(ascending=False)

# Order rows by name, then year, so consecutive rows of a name are consecutive years on record
bn = bn.sort_values(['name', 'year'])

# Row-to-row difference in 'percent'; valid wherever the previous row is the same name
bn['percent change'] = bn['percent'].diff()

# The first row of each name has no previous row of its own: flag it
# (name differs from the row above) and use its 'percent' as the change.
new_name = bn['name'].ne(bn['name'].shift())
bn.loc[new_name, 'percent change'] = bn.loc[new_name, 'percent']

# Keep only the more common names
bn = bn.loc[bn['percent'] > 0.001]

# Chronological order so animation frames are produced in year order
bn = bn.sort_values('year')

# Animated scatter of rank against percent change; each keyword is explained inline
scatter_options = dict(
    x="percent change",        # horizontal axis: percent change
    y="rank",                  # vertical axis: rank within the year
    animation_frame="year",    # one frame per year
    animation_group="name",    # follow each name across frames
    size="percent",            # bubble area from percent
    color="sex",               # colour by sex
    hover_name="name",         # hover label shows the name
    size_max=50,               # largest bubble size
    range_x=[-0.005, 0.005],   # fixed horizontal range
)
fig = px.scatter(bn, **scatter_options)

# Flip the vertical axis so rank 1 sits at the top
fig.update_yaxes(autorange='reversed')

# Use fig.show(renderer="png") for GitHub/MarkUs submissions
fig.show(renderer="png")

ChatBot Link: https://chatgpt.com/share/66f22be4-5dac-8004-8d76-71908cbbf2de