# Statistical Learning with Python

A comprehensive introduction to statistical learning using Python, covering data manipulation, visualization, and statistical analysis.

## Contents
1. Setup and Imports
2. Vectors and Matrices
3. Random Variables
4. Data Visualization
5. Advanced Graphics
6. Indexing and Data Manipulation
7. Loading and Analyzing Real Data
8. Data Visualization with Real Data
9. Summary Statistics

## Setup and Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# Global plotting defaults for the notebook.
# NOTE(review): the 'seaborn-v0_8-*' style names are expected to require
# matplotlib >= 3.6 — confirm against the target environment.
plt.style.use('seaborn-v0_8-whitegrid')
# Use seaborn's "husl" palette as the default colour cycle.
sns.set_palette("husl")
# IPython magics: draw figures inline and request the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## Vectors and Matrices

In [None]:
# Element-wise addition of two equal-length vectors
x1 = np.array([1, 2, 3, 4])
x2 = np.array([1, 0, 1, 0])
y = np.add(x1, x2)  # identical to x1 + x2

# Report both operands, their sum, and the number of elements in the sum
for label, value in (("x1", x1), ("x2", x2), ("y = x1 + x2", y)):
    print(f"{label}: {value}")
print(f"Length: {y.size}")

In [None]:
# A 2x2 matrix and its element-wise square root
x = np.array([[1, 3],
              [2, 4]])

# sep="\n" puts each heading on its own line directly above its matrix
print("Matrix x:", x, sep="\n")
print("\nSquare root:", np.sqrt(x), sep="\n")

## Random Variables

In [None]:
# Draw 10 standard-normal values from a seeded generator so the cell is reproducible
np.random.seed(123)
z1 = np.random.randn(10)

# Sample mean and sample variance (ddof=1 divides by n - 1)
mu = z1.mean()
s2 = z1.var(ddof=1)

print(f"Random values: {z1}")
print(f"Mean: {mu:.4f}")
print(f"Variance: {s2:.4f}")

## Data Visualization

In [None]:
# Scatter plot of two independent samples of 100 standard-normal draws
x1 = np.random.randn(100)
x2 = np.random.randn(100)

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x1, x2, alpha=0.6)
ax.set_xlabel('x-axis')
ax.set_ylabel('y-axis')
ax.set_title('Scatter Plot')
fig.tight_layout()
plt.show()

## Advanced Graphics

In [None]:
# Evaluate f(x, y) = cos(y) / (1 + x^2) on a 50 x 50 grid over [-pi, pi] x [-pi, pi]
lo, hi, n_points = -np.pi, np.pi, 50
x = np.linspace(lo, hi, n_points)
y = np.linspace(lo, hi, n_points)
# Default 'xy' indexing: X varies along columns, Y along rows
X, Y = np.meshgrid(x, y, indexing='xy')
f = np.cos(Y) / (1 + np.square(X))

In [None]:
# Contour plot of f over the (X, Y) grid, using levels=15
fig, ax = plt.subplots(figsize=(10, 8))
contour = ax.contour(X, Y, f, levels=15)
fig.colorbar(contour, ax=ax)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Contour Plot')
fig.tight_layout()
plt.show()

In [None]:
# Same surface with a denser set of contour levels (levels=45) and the viridis colormap
fig, ax = plt.subplots(figsize=(10, 8))
contour = ax.contour(X, Y, f, levels=45, cmap='viridis')
fig.colorbar(contour, ax=ax)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Detailed Contour Plot')
fig.tight_layout()
plt.show()

In [None]:
# Antisymmetric part of f: half the difference between f and its transpose
fa = np.subtract(f, f.transpose()) / 2

In [None]:
# Heatmap of fa; extent maps array indices onto [-pi, pi] on both axes and
# origin='lower' puts the first row at the bottom of the image
fig, ax = plt.subplots(figsize=(10, 8))
bounds = [-np.pi, np.pi, -np.pi, np.pi]
image = ax.imshow(fa, extent=bounds, origin='lower', cmap='RdBu_r')
fig.colorbar(image, ax=ax)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Heatmap')
fig.tight_layout()
plt.show()

In [None]:
# 3D surface of fa over the (X, Y) grid
fig = plt.figure(figsize=(12, 9))
ax = fig.add_subplot(1, 1, 1, projection='3d')
surf = ax.plot_surface(X, Y, fa, cmap='viridis', alpha=0.9)
ax.set(xlabel='X', ylabel='Y', zlabel='Z', title='3D Surface Plot')
# Camera position: 20 degrees elevation, 30 degrees azimuth
ax.view_init(elev=20, azim=30)
fig.colorbar(surf, ax=ax)
fig.tight_layout()
plt.show()

## Indexing and Data Manipulation

In [None]:
# Build a 4x4 matrix holding 1..16, filled row by row
A = np.arange(1, 17).reshape((4, 4))
print("Matrix A:", A, sep="\n")

In [None]:
# Basic (zero-based) indexing and slicing on A
single_element = A[0][1]
top_block = A[:2, 1:3]
first_row = A[0]
second_column = A[:, 1]

print(f"Element at [0,1]: {single_element}")
print(f"\nSubmatrix [0:2, 1:3]:\n{top_block}")
print(f"\nFirst row: {first_row}")
print(f"\nSecond column: {second_column}")
print(f"\nDimensions: {np.shape(A)}")

In [None]:
# Advanced (integer-array) indexing
# np.ix_ crosses rows [0, 2] with columns [1, 3] in a single indexing step,
# instead of chaining two fancy-index operations (each of which copies).
print(f"Rows [0,2], columns [1,3]:\n{A[np.ix_([0, 2], [1, 3])]}")
# np.delete actually excludes rows 0 and 2 rather than hand-listing the
# complement, so the example stays correct if A gains or loses rows.
print(f"\nAll rows except [0,2]:\n{np.delete(A, [0, 2], axis=0)}")

## Loading and Analyzing Real Data

We'll work with the Auto MPG dataset.

In [None]:
# Load the Auto dataset into the DataFrame `Auto`.
# Download from: http://www-bcf.usc.edu/~gareth/ISL/Auto.csv
# Or use the UCI ML Repository version
#
# A local "Auto.csv" is preferred; if it is missing, the whitespace-delimited
# UCI file is read from its URL instead. In both sources '?' marks a missing
# value, and rows containing missing values are dropped afterwards.

try:
    # Keep only the call that can raise FileNotFoundError inside the try body.
    Auto = pd.read_csv("Auto.csv", na_values='?')
except FileNotFoundError:
    print("Auto.csv not found. Downloading from UCI...")
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
    column_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin', 'name']
    # Raw string: '\s' in a normal string literal is an invalid escape sequence
    # and triggers a SyntaxWarning on recent Python versions.
    Auto = pd.read_csv(url, sep=r'\s+', names=column_names, na_values='?')
    status = "Dataset downloaded and loaded!"
else:
    status = "Dataset loaded successfully!"

# Shared post-processing for both sources.
Auto = Auto.dropna()
print(status)
print(f"Shape: {Auto.shape}")

In [None]:
# Preview the first five rows of the dataset
Auto.head(n=5)

In [None]:
# List the column labels of the dataset
column_labels = list(Auto.columns)
print(column_labels)

## Data Visualization with Real Data

In [None]:
# Scatter plot of fuel economy against cylinder count
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(Auto['cylinders'], Auto['mpg'], alpha=0.5)
ax.set_xlabel('Cylinders')
ax.set_ylabel('MPG')
ax.set_title('MPG vs Cylinders')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Box plot of MPG, one box per cylinder count
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=Auto, x='cylinders', y='mpg', ax=ax)
ax.set_xlabel('Cylinders')
ax.set_ylabel('MPG')
ax.set_title('MPG Distribution by Cylinders')
fig.tight_layout()
plt.show()

In [None]:
# Histogram of MPG with 20 bins; grid lines are drawn on the y axis only
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(Auto['mpg'], bins=20, color='steelblue', edgecolor='black', alpha=0.7)
ax.set_xlabel('MPG')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of MPG')
ax.grid(True, alpha=0.3, axis='y')
fig.tight_layout()
plt.show()

In [None]:
# Pairwise scatter plots (KDE on the diagonal) for a subset of columns
subset_vars = ['mpg', 'displacement', 'horsepower', 'weight', 'acceleration']
pair_data = Auto.loc[:, subset_vars]
sns.pairplot(pair_data, diag_kind='kde', plot_kws=dict(alpha=0.6))
# pairplot creates its own figure, which is the current figure at this point
pair_fig = plt.gcf()
pair_fig.suptitle('Pairplot of Auto Dataset Variables', y=1.02)
pair_fig.tight_layout()
plt.show()

## Summary Statistics

In [None]:
# Descriptive statistics for the dataset (shown as the cell's output)
overall_summary = Auto.describe()
overall_summary

In [None]:
# Descriptive statistics for the mpg column only (shown as the cell's output)
mpg_summary = Auto.loc[:, 'mpg'].describe()
mpg_summary

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
numeric_cols = Auto.select_dtypes(include=np.number).columns
correlation = Auto.loc[:, numeric_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix')
fig.tight_layout()
plt.show()

In [None]:
# MPG statistics within each cylinder-count group (shown as the cell's output)
mpg_by_cylinders = Auto.groupby('cylinders')['mpg']
mpg_by_cylinders.agg(['mean', 'std', 'min', 'max', 'count'])