# Marvel Universe: Character Analysis

This notebook analyzes the Marvel character dataset to explore powers, roles, and affiliations using TF-IDF text features, K-means clustering, a random-forest classifier, and network analysis.

## Objectives

1. Explore and clean the dataset
2. Perform NLP on character powers
3. Cluster similar characters
4. Predict character roles based on powers
5. Create network visualizations of character affiliations
6. Estimate power levels


## Setup and Data Loading

In [None]:
# Import necessary libraries
import json
from itertools import combinations

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import spacy
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud

# Set plotting style
# NOTE(review): plt.style.use() runs after sns.set(), so wherever both touch the
# same rcParams the 'fivethirtyeight' values presumably win — confirm that this
# is the intended look before reordering these two lines.
sns.set(style='whitegrid')
plt.style.use('fivethirtyeight')

# Display settings
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)


In [None]:
# Read the raw character table from disk (path is relative to the notebook)
raw_csv_path = '../data/marvel_characters_dataset.csv'
df = pd.read_csv(raw_csv_path)

# Preview the leading rows as the cell output to confirm the load worked
df.head()

## Data Exploration and Cleaning

In [None]:
# Size and schema overview of the loaded frame
dataset_shape = df.shape
print(f'Dataset shape: {dataset_shape}')
df.info()

# Per-column count of missing entries (shown as the cell output)
df.isna().sum()

In [None]:
# Clean the data
# Normalise character names: trim stray whitespace first so that ' Iron Man'
# and 'Iron Man' collapse to the same key, then title-case for consistency.
df['Character'] = df['Character'].str.strip().str.title()

# Check for duplicates (rows whose name repeats an earlier row)
duplicates = df[df.duplicated('Character')]
print(f'Number of duplicate characters: {len(duplicates)}')
if len(duplicates) > 0:
    print(duplicates)
    # Keep the first occurrence of each name. The index is rebuilt because later
    # cells pair rows positionally with the TF-IDF matrix (df.iloc, array
    # assignment), so index labels should match row positions.
    df = df.drop_duplicates('Character').reset_index(drop=True)

# Standardize role categories (trim + title-case so 'hero ' and 'Hero' match)
df['Role'] = df['Role'].str.strip().str.title()

# Display cleaned data
df.head()

## Exploratory Data Analysis

In [None]:
# Tally how many characters fall under each role (most frequent first)
role_counts = df['Role'].value_counts()

# Bar chart of the role frequencies
bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=role_counts.index, y=role_counts.values, ax=bar_ax)
bar_ax.set_title('Distribution of Character Roles', fontsize=16)
bar_ax.set_xlabel('Role')
bar_ax.set_ylabel('Count')
plt.xticks(rotation=0)
plt.show()

# Pie chart of the same counts, one viridis colour per role
pie_fig, pie_ax = plt.subplots(figsize=(8, 8))
role_palette = sns.color_palette('viridis', len(role_counts))
pie_ax.pie(
    role_counts,
    labels=role_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=role_palette,
)
pie_ax.set_title('Proportion of Character Roles', fontsize=16)
# Equal aspect ratio keeps the pie circular
pie_ax.axis('equal')
plt.show()

In [None]:
# Ten most common affiliations (value_counts is already sorted descending)
top_n_affiliations = 10
affiliation_counts = df['Affiliation'].value_counts().head(top_n_affiliations)

# Horizontal bars: counts on the x-axis, affiliation names on the y-axis
affil_fig, affil_ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=affiliation_counts.values, y=affiliation_counts.index, ax=affil_ax)
affil_ax.set_title('Top 10 Character Affiliations', fontsize=16)
affil_ax.set_xlabel('Count')
affil_ax.set_ylabel('Affiliation')
plt.show()

## Power Analysis with NLP

In [None]:
# Concatenate every non-missing power description into one corpus string
all_powers = ' '.join(df['Powers'].dropna())

# Word cloud rendering options, collected in one place
cloud_options = dict(
    width=800,
    height=400,
    background_color='white',
    max_words=100,
    contour_width=3,
    contour_color='steelblue',
)
wordcloud = WordCloud(**cloud_options).generate(all_powers)

# Show the rendered cloud as an image without axes
cloud_fig, cloud_ax = plt.subplots(figsize=(16, 8))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')
cloud_ax.set_title('Word Cloud of Marvel Character Powers', fontsize=20)
plt.show()

In [None]:
# TF-IDF vectorization of the power descriptions.
# Missing descriptions become empty strings so every row still gets a vector.
powers_corpus = df['Powers'].fillna('')
tfidf = TfidfVectorizer(min_df=2, stop_words='english')
powers_tfidf = tfidf.fit_transform(powers_corpus)

# Vocabulary terms; position i is used below as the name of matrix column i
feature_names = tfidf.get_feature_names_out()

# Display the top terms for each character
def display_top_terms(character_idx, top_n=5):
    """Print the highest-weighted TF-IDF terms for one character.

    Reads the notebook-level ``df``, ``powers_tfidf`` and ``feature_names``.

    Args:
        character_idx: Positional row index into ``df`` / ``powers_tfidf``.
        top_n: Maximum number of terms to print.

    Terms whose TF-IDF weight is zero are skipped: they do not occur in the
    character's description, so listing them to pad the output up to
    ``top_n`` would be misleading. Fewer than ``top_n`` lines may be printed.
    """
    character_name = df.iloc[character_idx]['Character']
    tfidf_scores = powers_tfidf[character_idx].toarray()[0]
    # Reverse first, then slice: unlike [-top_n:], this yields nothing for
    # top_n == 0 instead of the whole vocabulary.
    top_indices = tfidf_scores.argsort()[::-1][:top_n]
    top_terms = [
        (feature_names[i], tfidf_scores[i])
        for i in top_indices
        if tfidf_scores[i] > 0
    ]
    print(f'Top terms for {character_name}:')
    for term, score in top_terms:
        print(f'  - {term}: {score:.4f}')

# Show the top terms for a handful of sample rows.
# NOTE(review): the original note says rows 0, 5 and 10 are Iron Man, Spider-Man
# and Scarlet Witch — that depends on the dataset's row order; verify.
sample_rows = (0, 5, 10)
for row_position in sample_rows:
    display_top_terms(row_position)
    print()

## Character Clustering

In [None]:
# Cluster characters based on powers using K-means
n_clusters = 5  # You can adjust this
# n_init is pinned explicitly: scikit-learn changed its default (10 -> 'auto'),
# so leaving it implicit makes cluster assignments differ between versions.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(powers_tfidf)

# Reduce dimensions for visualization (the sparse TF-IDF matrix is densified
# before it is handed to PCA)
pca = PCA(n_components=2, random_state=42)
powers_2d = pca.fit_transform(powers_tfidf.toarray())

# Create a DataFrame for plotting. Columns are taken as plain arrays so the
# rows line up positionally with powers_2d regardless of df's index labels.
plot_df = pd.DataFrame({
    'x': powers_2d[:, 0],
    'y': powers_2d[:, 1],
    'character': df['Character'].to_numpy(),
    'role': df['Role'].to_numpy(),
    'cluster': df['Cluster'].to_numpy(),
})

# Plot clusters: colour encodes the cluster, marker shape encodes the role
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=plot_df, x='x', y='y', hue='cluster', style='role', s=100, ax=ax)

# Add character labels next to each point
for point in plot_df.itertuples(index=False):
    ax.annotate(point.character, (point.x, point.y), fontsize=8)

ax.set_title('Character Clusters Based on Powers', fontsize=16)
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.legend(title='Cluster')
plt.show()

## Role Prediction

In [None]:
# Predict character role based on powers
X = powers_tfidf
y = df['Role']

# Split the data (30% held out for evaluation)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Make predictions
y_pred = rf.predict(X_test)

# Evaluate the model
# The newline must be an escape sequence: a raw line break inside a
# single-quoted string literal is a syntax error.
print('Classification Report:\n')
print(classification_report(y_test, y_pred))

# Confusion Matrix
# labels=rf.classes_ fixes the matrix rows/columns to the classifier's classes,
# so it always matches the tick labels below even when a class happens to be
# absent from the test split.
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred, labels=rf.classes_)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=rf.classes_, yticklabels=rf.classes_)
plt.title('Confusion Matrix', fontsize=16)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Function to predict role from powers
def predict_role(powers_text):
    """Predict a role for a free-text power description.

    Uses the notebook-level fitted ``tfidf`` vectorizer and ``rf`` classifier.

    Args:
        powers_text: Power description to classify.

    Returns:
        A ``(predicted_role, proba_dict)`` tuple, where ``proba_dict`` maps
        each entry of ``rf.classes_`` to its predicted probability.
    """
    # Vectorise the single description with the already-fitted vocabulary
    features = tfidf.transform([powers_text])

    class_probabilities = rf.predict_proba(features)[0]
    best_role = rf.predict(features)[0]

    return best_role, dict(zip(rf.classes_, class_probabilities))

# Example power descriptions to run through the classifier
test_powers = [
    "Superhuman strength, Flight, Energy projection",
    "Mind control, Telekinesis, Illusion creation",
    "Regeneration, Enhanced senses, Combat skills"
]

for powers in test_powers:
    role, probas = predict_role(powers)
    print(f'Powers: {powers}')
    print(f'Predicted Role: {role}')
    print('Probabilities:')
    # Most likely role first
    ranked_roles = sorted(probas.items(), key=lambda item: item[1], reverse=True)
    for candidate_role, prob in ranked_roles:
        print(f'  - {candidate_role}: {prob:.2f}')
    print()

## Affiliation Network Analysis

In [None]:
# Create a network graph of characters based on shared affiliations
G = nx.Graph()

# Add nodes (characters), carrying role and affiliation as node attributes
for character, role, affiliation in zip(df['Character'], df['Role'], df['Affiliation']):
    G.add_node(character, role=role, affiliation=affiliation)

# Add edges (shared affiliations): connect every pair of characters within the
# same affiliation. A single groupby pass replaces re-filtering the frame once
# per affiliation; sort=False keeps first-appearance order, and rows with a
# missing affiliation form no group, so they get no edges.
for affiliation, members in df.groupby('Affiliation', sort=False)['Character']:
    for first, second in combinations(members.tolist(), 2):
        G.add_edge(first, second, affiliation=affiliation)

# Visualize the network
plt.figure(figsize=(16, 12))

# Set node colors based on role; roles missing from the mapping use the fallback
role_colors = {'Hero': 'blue', 'Villain': 'red', 'Antihero': 'purple'}
fallback_color = 'gray'
node_colors = [role_colors.get(G.nodes[node]['role'], fallback_color) for node in G.nodes()]

# Set node sizes based on degree (number of connections)
node_sizes = [300 * (1 + G.degree(node)) for node in G.nodes()]

# Draw the network
pos = nx.spring_layout(G, seed=42)  # Position nodes using force-directed layout
nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=node_sizes, alpha=0.8)
nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.5)
nx.draw_networkx_labels(G, pos, font_size=10, font_weight='bold')

# Add a legend; include the fallback colour only when some node actually uses it,
# so gray nodes are never left unexplained
legend_patches = [mpatches.Patch(color=color, label=role) for role, color in role_colors.items()]
if fallback_color in node_colors:
    legend_patches.append(mpatches.Patch(color=fallback_color, label='Other'))
plt.legend(handles=legend_patches, title='Character Role')

plt.title('Marvel Character Affiliation Network', fontsize=20)
plt.axis('off')
plt.tight_layout()
plt.show()

## Power Level Estimation

In [None]:
# Define a function to estimate power level based on powers
def estimate_power_level(powers_text):
    """Classify a free-text power description as 'High', 'Medium' or 'Low'.

    The text is lower-cased and scanned for keyword substrings. High-power
    keywords take precedence over medium-power ones; anything else is 'Low'.
    Matching is by substring, so e.g. 'super' also matches 'superhuman'.

    Args:
        powers_text: Power description. Non-string input (NaN or None from a
            missing cell) is treated as having no matching keywords.

    Returns:
        One of the strings 'High', 'Medium' or 'Low'.
    """
    # Define power keywords for each level
    high_power_keywords = ('cosmic', 'reality', 'god', 'manipulation', 'telekinesis', 'magic', 'energy projection')
    medium_power_keywords = ('superhuman strength', 'regeneration', 'flight', 'enhanced', 'super', 'control')

    # Missing values reach .apply() as float NaN / None, which have no .lower();
    # fall back to the default level instead of raising AttributeError.
    if not isinstance(powers_text, str):
        return 'Low'

    powers_lower = powers_text.lower()

    # High is checked first so it wins when both tiers match
    if any(keyword in powers_lower for keyword in high_power_keywords):
        return 'High'

    if any(keyword in powers_lower for keyword in medium_power_keywords):
        return 'Medium'

    # Default to low
    return 'Low'

# Apply the function to estimate power levels.
# Missing descriptions are replaced with '' first (as in the TF-IDF cell), so
# the estimator always receives a string.
df['Estimated_Power_Level'] = df['Powers'].fillna('').apply(estimate_power_level)

# Levels are ordinal: show them Low -> Medium -> High rather than alphabetically
level_order = ['Low', 'Medium', 'High']

# Display the distribution of estimated power levels
fig, ax = plt.subplots(figsize=(10, 6))
power_level_counts = (
    df['Estimated_Power_Level']
    .value_counts()
    .reindex(level_order, fill_value=0)
)
sns.barplot(x=power_level_counts.index, y=power_level_counts.values, palette='viridis', ax=ax)
ax.set_title('Distribution of Estimated Power Levels', fontsize=16)
ax.set_xlabel('Power Level')
ax.set_ylabel('Count')
plt.show()

# Compare power levels across roles.
# DataFrame.plot() opens a figure of its own unless it is given an Axes, which
# would leave the explicitly sized figure empty, so the Axes is passed in.
fig, ax = plt.subplots(figsize=(12, 8))
role_power_counts = (
    pd.crosstab(df['Role'], df['Estimated_Power_Level'])
    .reindex(columns=level_order, fill_value=0)
)
role_power_counts.plot(kind='bar', stacked=True, colormap='viridis', ax=ax)
ax.set_title('Power Level Distribution by Role', fontsize=16)
ax.set_xlabel('Role')
ax.set_ylabel('Count')
ax.legend(title='Power Level')
plt.show()

## Export Processed Data for React App

In [None]:
# Export the processed data with estimated power levels
df.to_csv('../data/processed_marvel_characters.csv', index=False)


def _json_safe(value):
    """Map a missing value (NaN/None/NA) to None so it serialises as JSON null."""
    return None if pd.isna(value) else value


# Export network data for visualization in React
# One-off lookup table instead of scanning the whole frame for every node;
# drop_duplicates keeps the first row per character, matching a first-match scan.
power_level_by_character = (
    df.drop_duplicates('Character')
    .set_index('Character')['Estimated_Power_Level']
    .to_dict()
)

# Nodes
# Role / affiliation can be missing in the source data; json.dump would write a
# bare NaN token for them, which is not valid JSON and breaks strict parsers
# such as JSON.parse, so they are converted to null.
nodes_data = [
    {
        'id': node,
        'role': _json_safe(attrs['role']),
        'affiliation': _json_safe(attrs['affiliation']),
        'power_level': power_level_by_character[node],
    }
    for node, attrs in G.nodes(data=True)
]

# Edges
edges_data = [
    {
        'source': source,
        'target': target,
        'affiliation': data['affiliation'],
    }
    for source, target, data in G.edges(data=True)
]

# Save to JSON
network_data = {
    'nodes': nodes_data,
    'links': edges_data
}

with open('../data/marvel_network.json', 'w', encoding='utf-8') as f:
    json.dump(network_data, f, indent=2)

print('Data exported successfully!')

## Conclusion

In this notebook, we've analyzed the Marvel character dataset using various data science techniques:

1. Explored and cleaned the dataset
2. Performed NLP analysis on character powers
3. Clustered characters based on their powers
4. Built a model to predict character roles
5. Created a network visualization of character affiliations
6. Estimated power levels based on character abilities

The processed data and visualizations can now be used in the React frontend application to create an interactive experience for exploring the Marvel universe.