# Descriptive Statistics and Summaries - Solutions

Worked solutions covering summary statistics, correlation analysis, and value_counts() operations.

## Question 1
Create a DataFrame with numerical data and calculate basic descriptive statistics (mean, median, std, min, max).

In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so the simulated scores are identical on every run.
np.random.seed(42)
# Three score columns, each holding 100 draws from a normal distribution.
df_stats = pd.DataFrame({
    'Score1': np.random.normal(75, 10, 100),
    'Score2': np.random.normal(80, 15, 100),
    'Score3': np.random.normal(70, 12, 100)
})
print("Sample data (first 5 rows):")
print(df_stats.head())

# Each reduction returns a Series holding one value per score column.
basic_stats = {
    "Mean": df_stats.mean(),
    "Median": df_stats.median(),
    "Standard deviation": df_stats.std(),
    "Minimum": df_stats.min(),
    "Maximum": df_stats.max(),
}

print("\nBasic statistics:")
# Print each label on its own line: interpolating a multi-line Series into an
# f-string puts the label on the same line as the first row, which breaks the
# column alignment of the printed values.
for stat_label, stat_values in basic_stats.items():
    print(f"\n{stat_label}:")
    print(stat_values)

## Question 2
Use the describe() method to get a comprehensive statistical summary of a DataFrame.

In [None]:
# Build the per-column summary first, then print it under its heading.
summary_table = df_stats.describe()
print("Comprehensive statistical summary:")
print(summary_table)

## Question 3
Calculate the correlation matrix between numerical columns in a DataFrame.

In [None]:
# Pairwise correlation between the columns of df_stats; the name is kept at
# module level because the covariance comparison cell reads it again.
correlation_matrix = df_stats.corr()
print("Correlation matrix:", correlation_matrix, sep="\n")

## Question 4
Use value_counts() to count the frequency of categorical values in a column.

In [None]:
# Two categorical columns with ten observations each.
grades = ['A', 'B', 'A', 'C', 'B', 'A', 'C', 'B', 'A', 'B']
departments = ['Math', 'Science', 'Math', 'English', 'Science', 'Math', 'English', 'Science', 'Math', 'Science']
df_categorical = pd.DataFrame({'Grade': grades, 'Department': departments})
print("Sample categorical data:")
print(df_categorical)

# value_counts() tallies how often each distinct label occurs in a column.
grade_counts = df_categorical['Grade'].value_counts()
department_counts = df_categorical['Department'].value_counts()

print("\nGrade frequencies:")
print(grade_counts)

print("\nDepartment frequencies:")
print(department_counts)

## Question 5
Calculate quantiles (25th, 50th, 75th percentiles) for numerical columns.

In [None]:
# Fractions matching the 25th, 50th and 75th percentiles named in the heading.
percentile_levels = [0.25, 0.5, 0.75]
quantiles = df_stats.quantile(percentile_levels)
print("Quantiles (25th, 50th, 75th percentiles):", quantiles, sep="\n")

## Question 6
Find the mode (most frequent value) in each column of a DataFrame.

In [None]:
# Most frequent label(s) in each categorical column.
categorical_modes = df_categorical.mode()
print("Mode for categorical data:")
print(categorical_modes)

# The mode is rarely meaningful for continuous data, so the scores are rounded
# to one decimal first; only the first three rows of the result are shown.
df_rounded = df_stats.round(1)
top_numeric_modes = df_rounded.mode().head(3)
print("\nMode for numerical data (rounded to 1 decimal):")
print(top_numeric_modes)

## Question 7
Calculate skewness and kurtosis for numerical columns to understand data distribution.

In [None]:
# Compute both shape statistics up front, one value per column of df_stats.
skewness_by_column = df_stats.skew()
kurtosis_by_column = df_stats.kurtosis()

print("Skewness (measure of asymmetry):", skewness_by_column, sep="\n")
print("\nKurtosis (measure of tail heaviness):", kurtosis_by_column, sep="\n")

## Question 8
Create a covariance matrix and compare it with the correlation matrix.

In [None]:
# Covariance between every pair of columns in df_stats.
covariance_matrix = df_stats.cov()
print("Covariance matrix:", covariance_matrix, sep="\n")

# correlation_matrix was computed in the earlier correlation cell and is
# printed again here so the two tables can be read side by side.
print("\nCorrelation matrix (for comparison):", correlation_matrix, sep="\n")

print("\nNote: Correlation is standardized covariance (values between -1 and 1)")

## Question 9
Use nunique() to count the number of unique values in each column.

In [None]:
# Distinct-value count for each categorical column.
categorical_unique_counts = df_categorical.nunique()
print("Number of unique values in categorical data:")
print(categorical_unique_counts)

# Round the scores to one decimal before counting distinct values.
rounded_unique_counts = df_stats.round(1).nunique()
print("\nNumber of unique values in numerical data (rounded):")
print(rounded_unique_counts)

## Question 10
Create a summary table showing count, mean, and standard deviation grouped by a categorical variable.

In [None]:
# Small dataset: one grouping column plus two numeric measurement columns.
category_labels = ['A', 'B', 'A', 'B', 'C', 'A', 'B', 'C', 'A', 'C']
first_values = [10, 15, 12, 18, 20, 11, 16, 22, 13, 19]
second_values = [25, 30, 28, 35, 40, 27, 32, 38, 29, 36]
df_combined = pd.DataFrame({
    'Category': category_labels,
    'Value1': first_values,
    'Value2': second_values,
})
print("Combined dataset:")
print(df_combined)

# Apply the same three aggregations to both value columns, grouped by category.
aggregations = ['count', 'mean', 'std']
summary_by_category = (
    df_combined
    .groupby('Category')
    .agg({column: aggregations for column in ('Value1', 'Value2')})
)
print("\nSummary statistics by category:")
print(summary_by_category)