In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import ast

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/superheroes-nlp-dataset/superheroes_nlp_dataset.csv


# Load dataset and pre-process

In [35]:
# Load the dataset
df = pd.read_csv("/kaggle/input/superheroes-nlp-dataset/superheroes_nlp_dataset.csv")
df.isnull().sum()

# Convert the column to numeric type
df['overall_score'] = pd.to_numeric(df['overall_score'], errors='coerce').astype(pd.Int64Dtype())

df['history_text'] = df['history_text'].astype(str)

# Find the second maximum value
second_max_value = df['overall_score'].nlargest(2).iloc[-1] * 2

# Replace non-integer values with np.inf
df['overall_score'].replace([value for value in df['overall_score'] if not isinstance(value, np.int64)], second_max_value, inplace=True)
selected_columns = ['name', 'overall_score']
df[selected_columns].sort_values(['overall_score'], ascending=[False]).head(3)

Unnamed: 0,name,overall_score
333,Curse,458
1387,War Machine II,458
1313,Thunderbird III,458


# Q1. The most powerfull hero

`combined_score = score(intelligence + strength + speed + durability + power + combat) `

In [36]:

df['superpower_count']=df.apply(lambda r: len(ast.literal_eval(r['superpowers'])), axis=1)
df['combined_score']=df.intelligence_score + df.strength_score + df.speed_score + df.durability_score + df.power_score + df.combat_score
df.alignment.fillna('not available', inplace=True)
df.name.fillna('not available', inplace=True)

df[df.overall_score==second_max_value].sort_values(['combined_score','superpower_count'], ascending=[False,False])
selected_columns = ['name', 'combined_score', 'superpower_count']
df[selected_columns].sort_values(['combined_score','superpower_count'], ascending=[False,False]).head(1)


Unnamed: 0,name,combined_score,superpower_count
526,Golden Master's Mech,600,106


# Q1.A The most powerful superhero from each creator

In [4]:
# Get the row with the highest power value for each group
max_power_rows = df.groupby('creator').apply(lambda x: x[x['combined_score'] == x['combined_score'].max()]).sort_values(by='combined_score', ascending=False)

selected_columns = ['creator', 'name', 'combined_score']

max_power_rows[selected_columns].head(5)

# Group the data by category and find the maximum price
max_price = df.groupby('creator')['combined_score'].max()

# Create a new DataFrame with the maximum price and count for each category
result = pd.DataFrame({
    'creator': max_price.index,
    'combined_score': max_price.values,
#     'name': df.iloc[max_price.index]
})

result.sort_values(by='combined_score', ascending=False).head(10)


Unnamed: 0,creator,combined_score
5,DC Comics,600
22,Marvel Comics,600
21,Lego,600
10,George R. R. Martin,600
19,J. R. R. Tolkien,600
17,Image Comics,600
31,Shueisha,590
20,Konami,575
6,Dark Horse Comics,575
13,Hasbro,565


# Q2. Top 5 Superpowers 

In [5]:
# Load the dataset
df_1 = pd.read_csv("/kaggle/input/superheroes-nlp-dataset/superheroes_nlp_dataset.csv")
df_1.isnull().sum()

# Convert the column to numeric type
df_1['overall_score'] = pd.to_numeric(df_1['overall_score'], errors='coerce').astype(pd.Int64Dtype())

# Find the second maximum value
second_max_value = df_1['overall_score'].nlargest(2).iloc[-1] * 2

# Replace non-integer values with np.inf
df_1['overall_score'].replace([value for value in df_1['overall_score'] if not isinstance(value, np.int64)], second_max_value, inplace=True)
selected_columns = ['name', 'overall_score']
df_1[selected_columns].sort_values(['overall_score'], ascending=[False]).head(3)

df_1['superpowers']=df_1.apply(lambda r: ast.literal_eval(r['superpowers']), axis=1)
df_1_superpowers = (
    df_1["superpowers"].apply(pd.Series).stack().pipe(pd.get_dummies).groupby(level=0).sum()
)
df_1_superpowers.columns = df_1_superpowers.columns.str.lower().str.replace(" ", "_")

selected_columns=['name', 'overall_score','intelligence_score','strength_score','speed_score','durability_score','power_score','combat_score']
heropower=df_1[selected_columns].join(df_1_superpowers)
heropower.fillna(0, inplace=True)
heropower.isnull().sum()
infinite=heropower[heropower.overall_score==second_max_value]
non_infinite=heropower[heropower.overall_score!=second_max_value]
infinite_sums=infinite.drop(selected_columns, axis=1).sum(axis=0)
non_infinite_sums=non_infinite.drop(selected_columns, axis=1).sum(axis=0)
infinite_power=infinite_sums[(infinite_sums>0) & (non_infinite_sums==0)]
infinite_power

apotheosis                1.0
omnipotent                4.0
orbing                    1.0
salvation                 1.0
willpower_manipulation    1.0
dtype: float64

# Q3. Which race has the most immortal superheroes?

In [6]:
# 
data_race = df.groupby(['type_race'])['combined_score'].mean().to_frame(name = 'mean_power_score').reset_index()
data_race = data_race.sort_values(by='mean_power_score', ascending=False)

data_race.head(1)

Unnamed: 0,type_race,mean_power_score
53,Saiyan,581.0


# Q4. Name the creator having most superheroes of type “Parademon”.

# Q. 5) Which comic creator has most superhero teams?

In [7]:
# group by creator and count the number of teams
creator_teams = df.groupby('creator')['teams'].count()

# get the creator with the most teams
most_teams_creator = creator_teams.idxmax()

print(f'{most_teams_creator} has the most teams with {creator_teams[most_teams_creator]} teams.')

Marvel Comics has the most teams with 615 teams.


**Q5 A. Find names, real names and alias of superhero who is part of most teams.**

In [8]:
df_count = df.groupby('creator').count()

# sort by number of teams in descending order
df_sorted = df_count.sort_values('teams', ascending=False)

# get the creator with the most teams
creator = df_sorted.index[0]

# get the real name and name of the creator
real_name = df.loc[df['creator'] == creator, 'real_name'].iloc[0]
name = df.loc[df['creator'] == creator, 'name'].iloc[0]

print(f"{real_name} ({name}) has the most teams ({df_sorted.iloc[0]['teams']}).")

Delroy Garrett, Jr. (3-D Man) has the most teams (615).


**Q5 B. Are there any crossovers between creators and teams?**

In [9]:
df_count_2 = df.groupby('teams')['creator'].nunique()

# get the teams with more than one creator
teams_with_multiple_creators = df_count_2[df_count_2 > 1].index.tolist()

print(f"The following teams belong to multiple creators: {', '.join(teams_with_multiple_creators)}")

The following teams belong to multiple creators: ['Incredible Family'], []


# 6.What are the characteristics that can predict a superhero alignment.

In [10]:
from textblob import TextBlob

# Define a function to classify the hero as good or bad based on their alignment and history text
def classify_hero(row):
    # Convert history_text to string and perform sentiment analysis
    history_sentiment = TextBlob(str(row['history_text'])).sentiment.polarity

    # Classify the hero as good or bad based on their alignment and history text sentiment
    row['alignment'] = str(row['alignment']).lower()
    
    if row['alignment'] == 'good': # and history_sentiment > 0:
        return 'good'
    elif row['alignment'] == 'bad' and history_sentiment < 0:
        return 'bad'
    else:
        return 'neutral'

# Apply the function to each row of the dataset
df['hero_type'] = df.apply(classify_hero, axis=1)

# Print the resulting dataframe with the new hero_type column
selected_columns_q6 = ['name', 'hero_type']
df[selected_columns_q6].head(5)

Unnamed: 0,name,hero_type
0,3-D Man,good
1,514A (Gotham),neutral
2,A-Bomb,good
3,Aa,good
4,Aaron Cash,good


# Q7. From history of superheroes, 
**A. Find list of superheroes having negative past but now aligned positively.**


In [11]:
my_list = []
from textblob import TextBlob

# Define a function to classify the hero as good or bad based on their alignment and history text
def classify_hero(row):
    # Convert history_text to string and perform sentiment analysis
    history_sentiment = TextBlob(str(row['history_text'])).sentiment.polarity

    # Classify the hero as good or bad based on their alignment and history text sentiment
    row['alignment'] = str(row['alignment']).lower()
    
    if row['alignment'] == 'bad' and history_sentiment > 0:
        my_list.append(row['name'])

# Apply the function to each row of the dataset
df['hero_type'] = df.apply(classify_hero, axis=1)

# Print the resulting dataframe with the new hero_type column
selected_columns_q6 = ['name', 'hero_type']
my_list[:5]

['Abomination', 'Abra Kadabra (CW)', 'Abraxas', 'Air-Walker', 'Ajax']

**7 B. Extract patterns from superhero history for each creator.**

In [12]:
# Importing necessary libraries
import pandas as pd
import nltk
from gensim import corpora, models

# Reading the dataset

# Collecting history_text of each superhero
history_text = df['history_text'].tolist()

# Tokenizing and stemming the history_text
tokenizer = nltk.RegexpTokenizer(r'\w+')
stemmer = nltk.stem.PorterStemmer()
texts = [[stemmer.stem(word) for word in tokenizer.tokenize(document.lower())] for document in history_text]

# Creating a dictionary from the texts
dictionary = corpora.Dictionary(texts)

# Creating a corpus from the dictionary
corpus = [dictionary.doc2bow(text) for text in texts]

# Applying LDA to the corpus
lda_model = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)

# Printing the topics identified by LDA
for topic in lda_model.print_topics():
    print(topic)


(0, '0.047*"the" + 0.022*"of" + 0.022*"a" + 0.021*"to" + 0.020*"and" + 0.012*"he" + 0.010*"that" + 0.010*"wa" + 0.010*"s" + 0.009*"in"')
(1, '0.057*"the" + 0.039*"to" + 0.036*"and" + 0.025*"a" + 0.022*"of" + 0.016*"hi" + 0.015*"he" + 0.015*"wa" + 0.014*"in" + 0.011*"that"')
(2, '0.061*"the" + 0.032*"to" + 0.026*"of" + 0.022*"and" + 0.021*"a" + 0.019*"in" + 0.013*"hi" + 0.013*"he" + 0.012*"s" + 0.012*"wa"')
(3, '0.043*"the" + 0.024*"and" + 0.021*"to" + 0.020*"a" + 0.018*"of" + 0.016*"hi" + 0.013*"in" + 0.012*"he" + 0.010*"s" + 0.010*"with"')
(4, '0.069*"the" + 0.040*"to" + 0.032*"and" + 0.023*"of" + 0.022*"a" + 0.019*"he" + 0.015*"hi" + 0.015*"wa" + 0.015*"in" + 0.011*"with"')
(5, '0.059*"the" + 0.031*"to" + 0.026*"and" + 0.022*"a" + 0.020*"her" + 0.018*"of" + 0.017*"in" + 0.014*"s" + 0.011*"hi" + 0.011*"wa"')
(6, '0.063*"the" + 0.037*"to" + 0.027*"and" + 0.024*"of" + 0.018*"a" + 0.016*"in" + 0.015*"wa" + 0.014*"hi" + 0.013*"s" + 0.012*"with"')
(7, '0.044*"to" + 0.033*"the" + 0.026*"and

# 8. Report on the 10 superheroes with most relatives, status of those relatives where possible, and the alignment of those superheroes.

In [129]:
import re
import ast

# Load the dataset
df = pd.read_csv("/kaggle/input/superheroes-nlp-dataset/superheroes_nlp_dataset.csv")
df['relatives'] = df['relatives'].astype(str)

# define the regular expression pattern
pattern1 = r'\(([^)]+)\)'
pattern2 = r'\([^)]*\)'

# extract the values within round brackets from the column
df['relative_name'] = df['relatives'].apply(lambda x: re.findall(pattern1, str(x)))
df['relative_type'] = df['relatives'].apply(lambda x: re.findall(pattern2, str(x)))
df = df[['relative_name', 'relative_type', 'alignment']]

# convert string column into list
df['relative_name'] = df['relative_name'].apply(lambda x: ast.literal_eval(str(x)))

df['relative_count'] = df['relative_name'].apply(len)
df = df.sort_values('relative_count', ascending=False).head(10)
df

Unnamed: 0,relative_name,relative_type,alignment,relative_count
574,"[adoptive paternal distant ancestor, deceased,...","[(adoptive paternal distant ancestor, deceased...",Good,52
936,"[maternal ancestor, maternal ancestor, materna...","[(maternal ancestor), (maternal ancestor), (ma...",Good,46
75,"[son, son, son, half-brother, half-sister, ado...","[(son), (son), (son), (half-brother), (half-si...",Good,36
340,"[adoptive paternal distant ancestor, deceased,...","[(adoptive paternal distant ancestor, deceased...",Good,30
1100,"[mother, Batman, father, maternal grandfather,...","[(mother), (Batman, father), (maternal grandfa...",Good,21
121,"[distant ancestor, deceased, distant ancestor,...","[(distant ancestor, deceased), (distant ancest...",Bad,19
637,"[paternal grandfather, presumed deceased, pres...","[(paternal grandfather, presumed deceased), (p...",Bad,17
1251,"[father, mother, uncle, deceased, aunt, deceas...","[(father), (mother), (uncle, deceased), (aunt,...",Good,17
515,"[maternal ancestor, deceased, maternal ancesto...","[(maternal ancestor, deceased), (maternal ance...",Good,17
750,"[father, deceased, mother, deceased, brother, ...","[(father, deceased), (mother, deceased), (brot...",Bad,16


# 9. Find out any other interesting insights from given data. 

**A. Which 3 comic characters can you recommend to your friends to read or watch?**


In [None]:
# NA