In [7]:
import numpy as np
import pandas as pd
import ast

# Load dataset and pre-process

In [2]:
# Load the raw superhero dataset.
df = pd.read_csv("superheroes_nlp_dataset.csv")

# Parse overall_score as a nullable integer column; entries that cannot be
# read as numbers are coerced to <NA>.
# NOTE(review): presumably those non-numeric entries mark an "infinite" score -- confirm against the CSV.
df['overall_score'] = pd.to_numeric(df['overall_score'], errors='coerce').astype(pd.Int64Dtype())

# Sentinel used for the unparseable scores: twice the second-largest parsed score.
# Later cells compare against `second_max_value` to pick out these rows.
second_max_value = df['overall_score'].nlargest(2).iloc[-1] * 2

# Fill the coerced <NA> entries with the sentinel. Assigning the result back
# (instead of a chained `inplace=True` call on the column) keeps the update
# effective when pandas Copy-on-Write is enabled.
df['overall_score'] = df['overall_score'].fillna(second_max_value)

# Show the three heroes with the highest overall score.
selected_columns = ['name', 'overall_score']
df[selected_columns].sort_values(['overall_score'], ascending=[False]).head(3)

Unnamed: 0,name,overall_score
333,Curse,458
1387,War Machine II,458
1313,Thunderbird III,458


# Q1. The most powerful hero

`combined_score = intelligence_score + strength_score + speed_score + durability_score + power_score + combat_score`

In [3]:

# Number of superpowers per hero. The `superpowers` column stores a Python
# literal as text, so it is parsed with ast.literal_eval before counting.
df['superpower_count'] = df['superpowers'].apply(lambda powers: len(ast.literal_eval(powers)))

# Combined score = plain sum of the six individual ability scores.
df['combined_score'] = (
    df['intelligence_score']
    + df['strength_score']
    + df['speed_score']
    + df['durability_score']
    + df['power_score']
    + df['combat_score']
)

# Label missing alignment / name values. Assigning the filled column back
# (rather than `df.col.fillna(..., inplace=True)`) keeps the update effective
# when pandas Copy-on-Write is enabled.
df['alignment'] = df['alignment'].fillna('not available')
df['name'] = df['name'].fillna('not available')

# Rank every hero by combined score, breaking ties by superpower count,
# and show the top one.
selected_columns = ['name', 'combined_score', 'superpower_count']
df[selected_columns].sort_values(['combined_score','superpower_count'], ascending=[False,False]).head(1)


Unnamed: 0,name,combined_score,superpower_count
526,Golden Master's Mech,600,106


# Q1.A The most powerful superhero from each creator

In [4]:
# For every creator, keep the hero row(s) whose combined_score equals that
# creator's maximum (ties are all kept), ordered by combined_score, highest first.
max_power_rows = df.groupby('creator').apply(lambda x: x[x['combined_score'] == x['combined_score'].max()]).sort_values(by='combined_score', ascending=False)

# Highest combined_score reached by any hero of each creator.
# (Variable name retained in case other cells reference it.)
max_price = df.groupby('creator')['combined_score'].max()

# One row per creator holding that creator's best combined_score.
result = max_price.reset_index()

# Ten creators with the highest best-hero score.
result.sort_values(by='combined_score', ascending=False).head(10)


Unnamed: 0,creator,combined_score
5,DC Comics,600
22,Marvel Comics,600
21,Lego,600
10,George R. R. Martin,600
19,J. R. R. Tolkien,600
17,Image Comics,600
31,Shueisha,590
20,Konami,575
6,Dark Horse Comics,575
13,Hasbro,565


# Q2. Top 5 Superpowers 

In [5]:
# Reload the dataset from disk into a separate frame for this question.
df_1 = pd.read_csv("superheroes_nlp_dataset.csv")

# Parse overall_score as a nullable integer column; entries that cannot be
# read as numbers are coerced to <NA>.
df_1['overall_score'] = pd.to_numeric(df_1['overall_score'], errors='coerce').astype(pd.Int64Dtype())

# Sentinel used for the unparseable scores: twice the second-largest parsed score.
second_max_value = df_1['overall_score'].nlargest(2).iloc[-1] * 2

# Fill the coerced <NA> entries with the sentinel. Assigning the result back
# (instead of a chained `inplace=True` call on the column) keeps the update
# effective when pandas Copy-on-Write is enabled.
df_1['overall_score'] = df_1['overall_score'].fillna(second_max_value)

# Turn the textual superpower literals into Python objects.
df_1['superpowers'] = df_1['superpowers'].apply(ast.literal_eval)

# One indicator column per superpower, summed per hero. Heroes without any
# superpower produce no row here; they are zero-filled after the join below.
df_1_superpowers = (
    df_1["superpowers"]
    .explode()
    .dropna()
    .pipe(pd.get_dummies)
    .groupby(level=0)
    .sum()
)
df_1_superpowers.columns = df_1_superpowers.columns.str.lower().str.replace(" ", "_")

# Hero attributes side by side with the superpower indicators; missing values become 0.
selected_columns=['name', 'overall_score','intelligence_score','strength_score','speed_score','durability_score','power_score','combat_score']
heropower = df_1[selected_columns].join(df_1_superpowers).fillna(0)

# Split heroes by whether their overall_score equals the sentinel.
infinite = heropower[heropower['overall_score'] == second_max_value]
non_infinite = heropower[heropower['overall_score'] != second_max_value]

# How many heroes in each group own each superpower.
infinite_sums = infinite.drop(selected_columns, axis=1).sum(axis=0)
non_infinite_sums = non_infinite.drop(selected_columns, axis=1).sum(axis=0)

# Superpowers held by at least one sentinel-score hero and by no other hero.
infinite_power = infinite_sums[(infinite_sums > 0) & (non_infinite_sums == 0)]
infinite_power

apotheosis                1.0
omnipotent                4.0
orbing                    1.0
salvation                 1.0
willpower_manipulation    1.0
dtype: float64

# Q3. Which race has the most immortal superheroes?

In [6]:
# Average combined_score per race, ordered from the highest mean downwards.
# NOTE(review): this ranks races by mean combined score; it does not count
# immortal heroes as the section heading asks -- confirm the intended metric.
mean_score_by_race = df.groupby(['type_race'])['combined_score'].mean()
data_race = (
    mean_score_by_race
    .reset_index(name='mean_power_score')
    .sort_values(by='mean_power_score', ascending=False)
)

# Race with the highest mean combined score.
data_race.head(1)

Unnamed: 0,type_race,mean_power_score
53,Saiyan,581.0


# Q4. Name the creator having most superheroes of type “Parademon”.