<a href="https://colab.research.google.com/github/cacress/CSC442/blob/main/Rithik_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

### Load the cleaned and prepared dataset straight from its raw GitHub URL
DATASET_URL = 'https://raw.githubusercontent.com/cacress/CSC442/refs/heads/rithik_eda/datasets/rithik_eda_dataset.csv'
df = pd.read_csv(DATASET_URL)

In [None]:
### Derive a simple neutrality score from each sentiment column: 1 - |sentiment|.
### Sentiment points in one of two opposite directions, whereas neutrality measures
### closeness to 0, which is more useful for regression analysis.
### (Assumes sentiment scores lie in [-1, 1] -- TODO confirm against the dataset.)
for sentiment_col, neutrality_col in (
    ('title_sentiment', 'title_neutrality'),
    ('description_sentiment', 'description_neutrality'),
):
    df[neutrality_col] = 1 - df[sentiment_col].abs()

Principal Component Analysis (PCA) and t-SNE embeddings of various text metric features

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Metrics to analyze
features = [
    'description_sentiment', 'title_sentiment',
    'description_neutrality', 'title_neutrality',
    'description_ari', 'title_ari',
    # I've excluded the word and character counts of titles/descriptions for this analysis
    # I did this because I want to focus on metrics that are not obvious at first glance, such as how long a title is
    # Additionally, word and character counts are highly correlated to each other, which may confound our t-SNE inferences
]

# Drop rows that are missing any feature or the genre, then standardize the features.
# .copy() makes `data` an independent frame, so the embedding columns added below are
# written to it directly and pandas does not emit a SettingWithCopyWarning.
data = df[features + ['top_one_genre']].dropna().copy()
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data[features])

# ----- PCA -----
pca = PCA(n_components=2)
data_pca = pca.fit_transform(data_scaled)
explained_variance = pca.explained_variance_ratio_
# Axis labels report the share of variance captured by each component
pc1_label = f"PC1 ({explained_variance[0]*100:.1f}% var)"
pc2_label = f"PC2 ({explained_variance[1]*100:.1f}% var)"
data['pca1'] = data_pca[:, 0]
data['pca2'] = data_pca[:, 1]

# ----- t-SNE -----
# Fixed random_state keeps the embedding reproducible between runs
tsne = TSNE(n_components=2, random_state=42)
data_tsne = tsne.fit_transform(data_scaled)
data['tsne1'] = data_tsne[:, 0]
data['tsne2'] = data_tsne[:, 1]

# Get the top 18 most frequent genres from top_one_genre and reorganize them into six subsets (3 per group)
genre_counts = data['top_one_genre'].value_counts()
top_genres = list(genre_counts.head(18).index)

# Take a subset of the data that only includes these top 18 genres
data_top = data[data['top_one_genre'].isin(top_genres)]

# Split into further subsets of three genres each (six subsets when 18 genres are available)
genre_groups = [top_genres[i:i+3] for i in range(0, len(top_genres), 3)]

# PCA and t-SNE plots: one figure per genre group, PCA on the left and t-SNE on the right
for idx, group in enumerate(genre_groups):
    subset_data = data_top[data_top['top_one_genre'].isin(group)]
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))

    # PCA plot
    sns.scatterplot(data=subset_data, x='pca1', y='pca2', hue='top_one_genre',
                    palette="deep", alpha=0.7, ax=axes[0])
    axes[0].set_title(f"PCA: Subset {idx+1} - Genres: {', '.join(group)}")
    axes[0].set_xlabel(pc1_label)
    axes[0].set_ylabel(pc2_label)

    # t-SNE plot
    sns.scatterplot(data=subset_data, x='tsne1', y='tsne2', hue='top_one_genre',
                    palette="deep", alpha=0.7, ax=axes[1])
    axes[1].set_title(f"t-SNE: Subset {idx+1} - Genres: {', '.join(group)}")
    axes[1].set_xlabel("t-SNE Component 1")
    axes[1].set_ylabel("t-SNE Component 2")

    # Cleaner plot legend: both panels use the same hue column, so keep only the PCA legend
    tsne_legend = axes[1].get_legend()
    if tsne_legend is not None:
        tsne_legend.remove()

    plt.tight_layout()
    plt.show()


In [None]:
import math
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

# Included attributes
features = [
    'description_sentiment', 'title_sentiment',
    'description_ari', 'title_ari',
    'description_word_count', 'title_word_count',
    'description_character_count', 'title_character_count'
]

# Keep only rows that have every feature and a 'top_one_genre' value
filtered_data = df.dropna(subset=features + ['top_one_genre']).copy()

# Get the top 18 genres (value_counts sorts by frequency; head() also copes with fewer than 18)
genre_counts = filtered_data['top_one_genre'].value_counts()
top_genres = list(genre_counts.head(18).index)
# .copy() so the t-SNE columns assigned below go to an independent frame
# (avoids pandas' SettingWithCopyWarning on a filtered selection)
filtered_data = filtered_data[filtered_data['top_one_genre'].isin(top_genres)].copy()

# Standardize data to prep for clustering
scaler = StandardScaler()
data_scaled = scaler.fit_transform(filtered_data[features])

# Apply t-SNE to reduce dimensions (fixed random_state for reproducibility)
tsne = TSNE(n_components=2, random_state=42)
tsne_components = tsne.fit_transform(data_scaled)
filtered_data['tsne1'] = tsne_components[:, 0]
filtered_data['tsne2'] = tsne_components[:, 1]

# Grid layout: six panels per row, as many rows as needed
n_genres = len(top_genres)
n_cols = 6
n_rows = math.ceil(n_genres / n_cols)

# Visualize
# squeeze=False always returns a 2-D array of axes, so flatten() is valid for any grid shape
fig, axes = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows),
                         sharex=True, sharey=True, squeeze=False)
axes = axes.flatten()

# One t-SNE plot per genre
for i, genre in enumerate(top_genres):
    genre_data = filtered_data[filtered_data['top_one_genre'] == genre]
    sns.scatterplot(data=genre_data, x='tsne1', y='tsne2', ax=axes[i], alpha=0.7)
    axes[i].set_title(genre, fontsize=10)
    axes[i].set_xlabel("t-SNE Component 1", fontsize=8)
    axes[i].set_ylabel("t-SNE Component 2", fontsize=8)

# Hide the unused panels at the end of the grid. Indexing from n_genres (rather than
# the loop variable left over from above) stays correct even if no genre was plotted.
for j in range(n_genres, len(axes)):
    axes[j].set_visible(False)

plt.suptitle("t-SNE of Text Metrics by Genre", fontsize=16, y=1.02)
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()


In [None]:
##### Let's do some analysis into how likely there is a pattern within each genre based on the t-SNE clustering.

# I've created a "pattern metric" that combines two things: how far the centroid of a genre's t-SNE points is
# from the trivial center, [0,0], and how asymmetric those points are along tsne1 (|mean - median|). This yields
# an interval attribute we can use to compare the "pattern-ness" of genres based on their text metrics.

import numpy as np
from scipy.stats import skew

def compute_pattern_metric(df, x_col='tsne1', y_col='tsne2', genre_col='top_one_genre', min_points=50,
                           displacement_weight=0.65, asymmetry_weight=0.35):
    """Compute a per-genre "pattern metric" from 2-D embedding coordinates.

    For every genre with at least ``min_points`` rows, the metric is a weighted
    sum of:

    * the Euclidean distance of the genre's centroid from the origin (0, 0), and
    * the asymmetry along ``x_col``, measured as ``|mean - median|``.

    Genre matching is case-insensitive, so labels that differ only by case
    share the same set of rows. Rows with a missing genre are ignored.

    Args:
        df: DataFrame holding the coordinate columns and the genre column.
        x_col: Name of the first embedding coordinate column.
        y_col: Name of the second embedding coordinate column.
        genre_col: Name of the column with string genre labels.
        min_points: Genres with fewer matching rows than this are skipped.
        displacement_weight: Weight applied to the centroid displacement.
        asymmetry_weight: Weight applied to the asymmetry term.

    Returns:
        A dict mapping each retained genre label (as it appears in
        ``genre_col``) to a dict with the keys ``'n_points'``,
        ``'centroid_displacement'``, ``'asymmetry_tsne1'`` and
        ``'pattern_metric'``.
    """
    pattern_metrics = {}

    # Missing labels cannot be matched case-insensitively, so leave them out.
    genres = df[genre_col].dropna().unique()
    if len(genres) == 0:
        return pattern_metrics

    # Lower-case the genre column once rather than once per genre.
    genre_keys = df[genre_col].str.lower()

    for genre in genres:
        # Pull a single genre (current genre)
        genre_data = df[genre_keys == genre.lower()]
        if len(genre_data) < min_points:
            continue

        # Compute centroid of the cluster based on the embedding dimensions
        centroid_x = np.mean(genre_data[x_col])
        centroid_y = np.mean(genre_data[y_col])
        centroid_displacement = np.sqrt(centroid_x**2 + centroid_y**2)

        # Compute asymmetry along x_col: gap between the mean (already computed
        # as centroid_x) and the median
        median_x = np.median(genre_data[x_col])
        asymmetry = abs(centroid_x - median_x)

        # Calculate our final pattern metric (weighted sum of displacement and asymmetry)
        # By default centroid displacement is weighted more because a global shift in data is likely
        # more significant in telling a pattern for text metrics during clustering than asymmetric data is
        pattern_metric = centroid_displacement * displacement_weight + asymmetry * asymmetry_weight

        pattern_metrics[genre] = {
            'n_points': len(genre_data),
            'centroid_displacement': centroid_displacement,
            'asymmetry_tsne1': asymmetry,
            'pattern_metric': pattern_metric
        }
    return pattern_metrics

# Score every genre in filtered_data using the function's default column names
pattern_metrics = compute_pattern_metric(filtered_data)

# Per-genre report, one multi-line entry per genre
print("Pattern Metric for each genre (higher means more shifted from center and asymmetric):\n")
for genre, metrics in pattern_metrics.items():
    report_lines = [
        f"Genre: {genre}",
        f"  Number of points: {metrics['n_points']}",
        f"  Centroid displacement from (0,0): {metrics['centroid_displacement']:.3f}",
        f"  Asymmetry (|mean - median| on tsne1): {metrics['asymmetry_tsne1']:.3f}",
        f"  Composite Pattern Metric: {metrics['pattern_metric']:.3f}\n",
    ]
    print("\n".join(report_lines))

# Average of the composite metric over all reported genres
all_scores = [genre_stats['pattern_metric'] for genre_stats in pattern_metrics.values()]
mean_pattern_metric = np.mean(all_scores)
print(f"Mean Pattern Metric across all genres: {mean_pattern_metric:.3f}")

In [None]:
##### Horror and Reality yielded the highest pattern values, so plot those two genres on their own.
genre_key = filtered_data['top_one_genre'].str.lower()
horror_data = filtered_data[genre_key == 'horror']
reality_data = filtered_data[genre_key == 'reality']

fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharex=True, sharey=True)

# One panel per genre: (axis, points, title, marker color)
panels = (
    (axes[0], horror_data, "t-SNE: Horror", 'purple'),
    (axes[1], reality_data, "t-SNE: Reality", 'green'),
)
for ax, points, title, color in panels:
    sns.scatterplot(data=points, x='tsne1', y='tsne2', ax=ax, alpha=0.7, color=color)
    ax.set_title(title)
    ax.set_xlabel("t-SNE Component 1")
    ax.set_ylabel("t-SNE Component 2")

plt.tight_layout()
plt.show()
