In [None]:
# ==========================
# 📦 Install Scikit-learn (Colab only)
# ==========================
!pip install scikit-learn pandas matplotlib numpy seaborn

# Classical Machine Learning with Scikit-learn: A Beginner's Guide

This notebook introduces fundamental concepts in classical machine learning using Scikit-learn, specifically for materials science applications. We will explore data loading, preprocessing, classification, regression, principal component analysis (PCA), and high-throughput screening approaches.

**Learning Path**: Data Loading → Preprocessing → Classification → Regression → PCA → High-throughput Screening

Let's start by setting up our machine learning environment for materials informatics.

In [None]:
# Import necessary libraries for classical ML and materials science
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR, SVC
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, classification_report
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import warnings
# NOTE: this suppresses every Python warning from this point on, for the
# whole kernel session.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility.
# SEED is a named constant so later cells can pass the same value explicitly
# (e.g. as `random_state=SEED`) instead of repeating the literal; an explicit
# seed does not depend on how many random draws earlier cells have made.
SEED = 42
np.random.seed(SEED)

print("Scikit-learn ready for materials informatics!")
print(f"NumPy: {np.__version__}, Pandas: {pd.__version__}")

# ✅ 1. Data Loading - Materials Dataset Handling

The first step in any materials informatics project is loading and understanding your dataset. Materials data comes in various formats and often requires careful preprocessing to extract meaningful features.

## Common Materials Data Sources

- **Experimental databases**: ICSD, NIST
- **Computational databases**: Materials Project, AFLOW, OQMD, C2DB
- **Literature mining**: Automated extraction from papers
- **High-throughput calculations**: DFT, MD simulations

## Key Considerations

- **Data quality**: Missing values, outliers, measurement errors
- **Feature engineering**: Converting chemical formulas to descriptors
- **Target variables**: Property of interest (bandgap, formation energy, etc.)
- **Data imbalance**: Uneven representation of materials classes

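This section builds a small, hand-curated example dataset in memory and explores it with pandas; the same exploration steps apply to data loaded from the sources above.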

In [None]:
# Create a small, hand-curated materials dataset for demonstration
def create_materials_dataset():
    """Assemble the hand-entered demonstration table of materials.

    Returns:
        pandas.DataFrame: one row per material (15 rows). The entered columns
        are 'Material', 'Formula', 'Space_Group', 'a', 'b', 'c',
        'Bandgap_eV', 'Formation_Energy_eV' and 'Class', followed by three
        derived columns:

        - 'Volume': the plain product a * b * c (no cell-angle term is applied).
        - 'Average_Lattice': the arithmetic mean of a, b and c.
        - 'Is_Cubic': True when |a - b| < 0.01 and |b - c| < 0.01.
    """
    field_names = [
        'Material', 'Formula', 'Space_Group',
        'a', 'b', 'c',
        'Bandgap_eV', 'Formation_Energy_eV', 'Class',
    ]

    # One record per material, values in the same order as field_names.
    records = [
        ("Silicon", "Si", "Fd-3m", 5.431, 5.431, 5.431, 1.12, -5.45, "Semiconductor"),
        ("Gallium Arsenide", "GaAs", "F-43m", 5.653, 5.653, 5.653, 1.42, -0.74, "Semiconductor"),
        ("Molybdenum Disulfide", "MoS2", "P63/mmc", 3.16, 3.16, 12.3, 1.8, -1.23, "2D Material"),
        ("Tungsten Diselenide", "WSe2", "P63/mmc", 3.28, 3.28, 12.96, 1.6, -1.45, "2D Material"),
        ("Graphene", "C", "P6/mmm", 2.46, 2.46, 6.7, 0.0, 0.0, "2D Material"),
        ("Hexagonal Boron Nitride", "BN", "P63/mmc", 2.50, 2.50, 6.66, 5.9, -2.51, "Insulator"),
        ("Titanium Dioxide", "TiO2", "P42/mnm", 4.594, 4.594, 2.959, 3.2, -9.45, "Insulator"),
        ("Zinc Oxide", "ZnO", "P63mc", 3.25, 3.25, 5.207, 3.37, -3.48, "Semiconductor"),
        ("Cadmium Telluride", "CdTe", "F-43m", 6.482, 6.482, 6.482, 1.5, -0.92, "Semiconductor"),
        ("Indium Phosphide", "InP", "F-43m", 5.869, 5.869, 5.869, 1.34, -0.88, "Semiconductor"),
        ("Gallium Nitride", "GaN", "P63mc", 3.189, 3.189, 5.185, 3.4, -1.09, "Semiconductor"),
        ("Aluminum Nitride", "AlN", "P63mc", 3.112, 3.112, 4.982, 6.2, -3.29, "Insulator"),
        ("Silicon Carbide", "SiC", "F-43m", 4.359, 4.359, 4.359, 2.36, -0.73, "Semiconductor"),
        ("Tin Dioxide", "SnO2", "P42/mnm", 4.737, 4.737, 3.186, 3.6, -5.81, "Semiconductor"),
        ("Copper Oxide", "Cu2O", "Pn-3m", 4.27, 4.27, 4.27, 2.17, -1.68, "Semiconductor"),
    ]

    base = pd.DataFrame(records, columns=field_names)

    # Descriptors derived from the a, b, c columns.
    a_len, b_len, c_len = base['a'], base['b'], base['c']
    ab_match = (a_len - b_len).abs() < 0.01
    bc_match = (b_len - c_len).abs() < 0.01

    # assign() appends the new columns in keyword order, after the entered ones.
    return base.assign(
        Volume=a_len * b_len * c_len,
        Average_Lattice=(a_len + b_len + c_len) / 3,
        Is_Cubic=ab_match & bc_match,
    )

# Build the dataset and print a short exploratory report
print("🔬 Loading Materials Dataset")
print("=" * 35)

df = create_materials_dataset()

# Shape and schema, then a preview of the first rows
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("\n📊 First 5 materials:")
print(df.head(5))

# Row count and how the rows are spread over the class labels
class_labels = df['Class']
print("\n📋 Dataset Information:")
print(f"Total materials: {df.shape[0]}")
print(f"Material classes: {class_labels.unique()}")
print("Class distribution:")
print(class_labels.value_counts())

# Descriptive statistics for the numeric properties, rounded to 2 decimals
print("\n📈 Property Statistics:")
numeric_cols = ['Bandgap_eV', 'Formation_Energy_eV', 'Volume', 'Average_Lattice']
property_summary = df[numeric_cols].describe()
print(property_summary.round(2))

# Basic integrity checks: null cells, fully repeated rows, distinct space groups
n_missing = df.isna().sum().sum()
n_duplicates = df.duplicated().sum()
n_space_groups = df['Space_Group'].nunique()
print("\n🔍 Data Quality Check:")
print(f"Missing values: {n_missing}")
print(f"Duplicate entries: {n_duplicates}")
print(f"Unique space groups: {n_space_groups}")