In [None]:
# Uncomment the line below to run dataset_generator.py as a shell command from this notebook.
# NOTE(review): presumably this script produces the toy_dataset.csv read in section 2 — confirm.
# !python3 dataset_generator.py

# TODO: Feature Extraction and Transfer Learning

This notebook covers:
- Feature Extraction and Fine-tuning using PyTorch
- Feature Extraction in Spark
- Fine-tuning using Orca
- Instance and Mapping-based Transfer Learning using `adapt`

Complete the TODOs in each section.

## 1. Feature Extraction and Fine-tuning using PyTorch

In [None]:
# TODO: Load a pretrained model and freeze its feature layers
# - Load CIFAR-100
# - Preprocess the dataset
# - Replace final layer
# - Train only the classifier layer
# - Print the F1 score, precision, and recall
# - Plot a confusion matrix

# List of models (Choose any one)
  # AlexNet
  # ConvNeXt
  # DenseNet
  # EfficientNet
  # EfficientNetV2
  # GoogLeNet
  # Inception V3
  # MaxVit
  # MNASNet
  # MobileNet V2
  # MobileNet V3
  # RegNet
  # ResNet
  # ResNeXt
  # ShuffleNet V2
  # SqueezeNet
  # SwinTransformer
  # VGG
  # VisionTransformer
  # Wide ResNet

# YOUR CODE HERE


## 2. Feature Extraction in Spark

In [None]:
# Use PySpark to load and process the toy_dataset.csv
# - Perform feature encoding and vectorization
# - Apply PCA to reduce dimensions
# - Visualize or print PCA components

from pyspark.ml import Pipeline
from pyspark.ml.feature import PCA, StringIndexer, VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

# Obtain the session inside this cell so it runs on a fresh kernel
# (Restart & Run All) and can be re-run after the spark.stop() at the bottom.
# getOrCreate() reuses an active session if one exists, otherwise starts one.
spark = SparkSession.builder.appName("FeatureExtraction").getOrCreate()

# Load the dataset
# Assuming toy_dataset.csv is in the current directory or a path you specify
file_path = "/content/toy_dataset.csv" # Replace with the actual path to your file
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Show the schema and some data
df.printSchema()
df.show(5)

# Identify categorical and numerical columns.
# String-typed columns are treated as categorical; everything else except
# 'label' is treated as numerical (assuming 'label' is the target column if any).
categorical_cols = [name for name, dtype in df.dtypes if dtype == 'string']
numerical_cols = [name for name, dtype in df.dtypes if dtype != 'string' and name != 'label']

# Index categorical columns; rows with invalid (e.g. null) values are skipped.
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_indexed", handleInvalid="skip")
    for name in categorical_cols
]

# Assemble all feature columns into a single vector
assembler_inputs = [indexer.getOutputCol() for indexer in indexers] + numerical_cols
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features", handleInvalid="skip")

# Create a pipeline to apply transformations
pipeline = Pipeline(stages=indexers + [assembler])

# Fit and transform the data
pipeline_model = pipeline.fit(df)
df_transformed = pipeline_model.transform(df)

# Show the transformed data with the features column
df_transformed.select("features").show(5, truncate=False)

# Apply PCA to reduce dimensions.
# NOTE(review): features are not standardized before PCA, so columns with a
# large numeric range may dominate the components — consider scaling first.

# Set the number of principal components
# You can choose a suitable number based on your analysis or requirements.
# PCA cannot extract more components than there are input features, so the
# example value of 3 is capped at the number of assembled columns.
max_components = 3 # Example: reduce to 3 dimensions
k = min(max_components, len(assembler_inputs))

pca = PCA(k=k, inputCol="features", outputCol="pcaFeatures")

# Fit the PCA model and transform the data
pca_model = pca.fit(df_transformed)
df_pca = pca_model.transform(df_transformed)

# Show the PCA results
df_pca.select("pcaFeatures").show(5, truncate=False)

# Print the explained variance ratio for each component
print("Explained Variance Ratio:", pca_model.explainedVariance)

# Print the principal components (loadings)
print("Principal Components (Loadings):")
print(pca_model.pc)

# Stop the SparkSession
spark.stop()

In [None]:
# Bind the notebook's SparkSession to `spark`.
# getOrCreate() hands back the active session when one exists and only
# builds a new one (application name "FeatureExtraction") otherwise.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("FeatureExtraction")
    .getOrCreate()
)