# Setup.

## Requirements.

In [None]:
%pip install -r ../requirements.txt

## Imports.

In [3]:
import pandas as pd
import numpy as np

## Sample Dataset.

In [19]:
df = pd.read_csv('data/podcast.csv')
df_num = df.select_dtypes(include=[np.number])

df.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031


# 1. Null Handling.

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           750000 non-null  int64  
 1   Podcast_Name                 750000 non-null  object 
 2   Episode_Title                750000 non-null  object 
 3   Episode_Length_minutes       662907 non-null  float64
 4   Genre                        750000 non-null  object 
 5   Host_Popularity_percentage   750000 non-null  float64
 6   Publication_Day              750000 non-null  object 
 7   Publication_Time             750000 non-null  object 
 8   Guest_Popularity_percentage  603970 non-null  float64
 9   Number_of_Ads                749999 non-null  float64
 10  Episode_Sentiment            750000 non-null  object 
 11  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), int64(1), object(6)
memory usage: 68.7+ MB


In [21]:
# 'Number_of_Ads' : Only one -> just drop the row.
print(df['Number_of_Ads'].isnull().sum())
df = df.dropna(subset=['Number_of_Ads'])

# 'Episode_Length_minutes' and 'Guest_Popularity_percentage' : Large number of nulls.
# Let's briefly check the feature importance.
from sklearn.ensemble import RandomForestRegressor
X = df_num.drop(columns=['Listening_Time_minutes'])
y = df_num['Listening_Time_minutes']
features = X.columns

model = RandomForestRegressor(max_leaf_nodes    = 10, 
                              max_depth         = 3, 
                              n_estimators      = 5,
                              random_state      = 42,)
model.fit(X, y)

from sklearn.inspection import permutation_importance
perm = permutation_importance(model, X, y, n_repeats=2, random_state=42)
importance_df = pd.DataFrame({'Feature': features, 'Importance': perm.importances_mean})

display(importance_df)

display(df_num.corr()['Listening_Time_minutes'])

0


Unnamed: 0,Feature,Importance
0,id,0.0
1,Episode_Length_minutes,1.481068
2,Host_Popularity_percentage,0.0
3,Guest_Popularity_percentage,0.0
4,Number_of_Ads,0.0


id                            -0.000876
Episode_Length_minutes         0.916749
Host_Popularity_percentage     0.050870
Guest_Popularity_percentage   -0.016014
Number_of_Ads                 -0.118337
Listening_Time_minutes         1.000000
Name: Listening_Time_minutes, dtype: float64

In [None]:
# 'Episode_Length_minutes' seems to be important -> delete rows with nulls.
df = df.dropna(subset=['Episode_Length_minutes'])

# 'Guest_Popularity_percentage' seems to be not important -> fill nulls with mean.
df['Guest_Popularity_percentage'] = df['Guest_Popularity_percentage'].fillna(df['Guest_Popularity_percentage'].mean())

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 662906 entries, 1 to 749999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           662906 non-null  int64  
 1   Podcast_Name                 662906 non-null  object 
 2   Episode_Title                662906 non-null  object 
 3   Episode_Length_minutes       662906 non-null  float64
 4   Genre                        662906 non-null  object 
 5   Host_Popularity_percentage   662906 non-null  float64
 6   Publication_Day              662906 non-null  object 
 7   Publication_Time             662906 non-null  object 
 8   Guest_Popularity_percentage  662906 non-null  float64
 9   Number_of_Ads                662906 non-null  float64
 10  Episode_Sentiment            662906 non-null  object 
 11  Listening_Time_minutes       662906 non-null  float64
dtypes: float64(5), int64(1), object(6)
memory usage: 65.7+ MB


: 

# 2. Feature Scaling.

# 3. Encoding.

# 4. Dimensionality Reduction

Dimensionality reduction maps features from a high-dimensional space to a lower-dimensional one while preserving as much task-relevant structure as possible. It helps with:
- Curse of dimensionality and collinearity reduction.
- Speeding up training/inference and mitigating overfitting.
- Visualization and exploratory analysis.
- Noise suppression by projecting onto informative subspaces.


## 4.1 What & When to Use
Use dimensionality reduction when you have many features relative to samples, strong feature correlations, sparse/high-cardinality encodings, or you need 2D/3D visualization.

**Families:**
- **Linear projections:** PCA, TruncatedSVD, RandomProjection (Gaussian/Sparse). Good first-line choices; fast and scalable.
- **Manifold learning:** UMAP, t-SNE. Nonlinear; preserve local neighborhoods for visualization; not ideal inside production pipelines that require stable transforms across fits (especially t-SNE).
- **Clustering of features:** FeatureAgglomeration merges similar features.
- **Neural:** Autoencoders learn non-linear compressions; require more engineering and compute.

**Choosing tips:**
- Dense numeric data → Standardize → PCA.
- Sparse text/categorical (e.g., TF-IDF, One-Hot) → TruncatedSVD.
- Very high dimensional with limited time → RandomProjection.
- Visualization or local structure → UMAP (often) or t-SNE.
- Want human interpretation of components → PCA (loadings, explained variance).
- Need end-to-end learnable compression → Autoencoder.


## 4.2 Methods at a Glance
- **PCA (Principal Component Analysis):** Orthogonal linear components maximizing variance. Sensitive to scaling; use after StandardScaler. Outputs `explained_variance_ratio_` to pick `n_components` (e.g., 90–99%).
- **TruncatedSVD:** PCA-like on sparse matrices without centering. Works with TF-IDF / One-Hot; choose components via held-out downstream score.
- **Random Projections (Gaussian / Sparse):** Johnson–Lindenstrauss lemma for distance preservation with O(ndk). No fitting cost; set `n_components` to hundreds–thousands depending on samples.
- **UMAP:** Fast manifold learner; preserves local neighborhoods. Good for visualization and as a preprocessor if you freeze a fitted model.
- **t-SNE:** High-quality 2D/3D visualization; not typically used inside ML pipelines (non-parametric, expensive, sensitive to hyperparameters).
- **FeatureAgglomeration:** Clusters features (columns) using hierarchical clustering; outputs averaged clusters as features.
- **Autoencoders:** Neural network encoder-decoder minimizing reconstruction error; flexible but requires tuning and GPU for scale.


## 4.3 How to Evaluate
- **Supervised objective:** Compare cross-validated downstream scores *with vs. without* reduction. Choose `n_components` that maximizes validation score.
- **Unsupervised structural faithfulness:**
  - **Trustworthiness** / **Continuity** (neighborhood preservation) for embeddings.
  - **Reconstruction error** for PCA/Autoencoders.
- **Stability & reproducibility:** Fix `random_state`, record seeds, version transforms, and persist fitted reducers.
- **Runtime & memory:** Measure transform time and peak memory to balance accuracy and efficiency.


## 4.4 Practical Tips & Pitfalls
- Always place reducers **inside** scikit-learn Pipelines to avoid leakage.
- Standardize dense numeric features before PCA; **do not center** sparse matrices before TruncatedSVD.
- For UMAP/t-SNE, start with `n_neighbors≈min(15, sqrt(n_samples))` and check trustworthiness.
- For interpretability, inspect PCA loadings (top absolute coefficients per component) and plot cumulative explained variance.
- Keep the reducer fitted on training data and reuse it for validation/test to ensure consistency.


In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import TSNE, trustworthiness
from sklearn.cluster import FeatureAgglomeration
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, silhouette_score
from scipy.sparse import csr_matrix

RANDOM_STATE = 42

# Example dense dataset.
X_dense = np.random.RandomState(RANDOM_STATE).randn(1000, 50)
y = (X_dense[:, 0] + 0.5*X_dense[:, 1] + 0.1*np.random.RandomState(RANDOM_STATE).randn(1000) > 0).astype(int)

# Example sparse dataset (simulating TF-IDF-like inputs).
X_sparse = csr_matrix(np.random.RandomState(RANDOM_STATE).poisson(0.05, size=(1000, 5000)))

Xtr, Xte, ytr, yte = train_test_split(X_dense, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

# 1) PCA pipeline on dense numeric data.
pipe_pca = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("pca", PCA(random_state=RANDOM_STATE)),
    ("clf", LogisticRegression(max_iter=200, n_jobs=None, random_state=RANDOM_STATE))
])

param_pca = {"pca__n_components": [0.8, 0.9, 0.95, 0.99, 10, 20, 30]}  # ratio or integer.
g_pca = GridSearchCV(pipe_pca, param_grid=param_pca, cv=5, n_jobs=-1, scoring="roc_auc")
g_pca.fit(Xtr, ytr)
pca_auc = roc_auc_score(yte, g_pca.predict_proba(Xte)[:, 1])

# 2) TruncatedSVD for sparse high-dimensional inputs.
pipe_tsvd = Pipeline([
    ("tsvd", TruncatedSVD(random_state=RANDOM_STATE)),
    ("clf", LogisticRegression(max_iter=200, n_jobs=None, random_state=RANDOM_STATE))
])
param_tsvd = {"tsvd__n_components": [50, 100, 200, 300]}
g_tsvd = GridSearchCV(pipe_tsvd, param_grid=param_tsvd, cv=5, n_jobs=-1, scoring="roc_auc")
g_tsvd.fit(X_sparse[:800], y[:800])
tsvd_auc = roc_auc_score(y[800:], g_tsvd.predict_proba(X_sparse[800:])[:, 1])

# 3) Random projection (Gaussian) as a fast baseline.
pipe_rp = Pipeline([
    ("rp", GaussianRandomProjection(n_components=50, random_state=RANDOM_STATE)),
    ("clf", LogisticRegression(max_iter=200, n_jobs=None, random_state=RANDOM_STATE))
])
pipe_rp.fit(Xtr, ytr)
rp_auc = roc_auc_score(yte, pipe_rp.predict_proba(Xte)[:, 1])

# 4) FeatureAgglomeration to merge similar columns.
agg = FeatureAgglomeration(n_clusters=20)  # Choose clusters based on CV score.
Xtr_agg = agg.fit_transform(Xtr)
Xte_agg = agg.transform(Xte)

# 5) UMAP / t-SNE for visualization-like embeddings.
# Note: UMAP is not in sklearn; if available, use: from umap import UMAP
# Here we use t-SNE to produce a 2D embedding and measure trustworthiness.
# Fit on a subset for speed.
subset = np.random.RandomState(RANDOM_STATE).choice(X_dense.shape[0], size=500, replace=False)
X_sub = X_dense[subset]

tsne = TSNE(n_components=2, perplexity=30, learning_rate="auto", init="pca", random_state=RANDOM_STATE)
X_emb = tsne.fit_transform(X_sub)

tw = trustworthiness(X_sub, X_emb, n_neighbors=5)

# Summarize results.
summary = pd.DataFrame({
    "method": ["PCA+LR", "TruncatedSVD+LR", "GaussianRP+LR", "t-SNE (trustworthiness)"],
    "score": [pca_auc, tsvd_auc, rp_auc, tw]
})
summary


Unnamed: 0,method,score
0,PCA+LR,0.990491
1,TruncatedSVD+LR,0.433172
2,GaussianRP+LR,0.972876
3,t-SNE (trustworthiness),0.795987


# 5. Pipeline.