## Initial Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import plotly.offline as py
color = sns.color_palette()
import plotly.graph_objs as go
from plotly import tools

In [2]:
# Switch plotly's offline module into notebook mode so py.iplot() output is
# rendered inside the notebook.
py.init_notebook_mode(connected=True)
%matplotlib inline

import warnings
# Suppress every warning for the rest of the session. NOTE(review): this also
# hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

## Loading the data

In [3]:
# Read the hero attribute table (path is relative to the working directory)
# and preview its first 10 rows.
heroes = pd.read_csv('heroes_information.csv')
heroes.head(10)

Unnamed: 0.1,Unnamed: 0,name,Gender,Eye color,Race,Hair color,Height,Publisher,Skin color,Alignment,Weight
0,0,A-Bomb,Male,yellow,Human,No Hair,203.0,Marvel Comics,-,good,441.0
1,1,Abe Sapien,Male,blue,Icthyo Sapien,No Hair,191.0,Dark Horse Comics,blue,good,65.0
2,2,Abin Sur,Male,blue,Ungaran,No Hair,185.0,DC Comics,red,good,90.0
3,3,Abomination,Male,green,Human / Radiation,No Hair,203.0,Marvel Comics,-,bad,441.0
4,4,Abraxas,Male,blue,Cosmic Entity,Black,-99.0,Marvel Comics,-,bad,-99.0
5,5,Absorbing Man,Male,blue,Human,No Hair,193.0,Marvel Comics,-,bad,122.0
6,6,Adam Monroe,Male,blue,-,Blond,-99.0,NBC - Heroes,-,good,-99.0
7,7,Adam Strange,Male,blue,Human,Blond,185.0,DC Comics,-,good,88.0
8,8,Agent 13,Female,blue,-,Blond,173.0,Marvel Comics,-,good,61.0
9,9,Agent Bob,Male,brown,Human,Brown,178.0,Marvel Comics,-,good,81.0


In [4]:
# Column dtypes and non-null counts; columns with fewer non-null entries than rows have missing values.
heroes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 734 entries, 0 to 733
Data columns (total 11 columns):
Unnamed: 0    734 non-null int64
name          734 non-null object
Gender        734 non-null object
Eye color     734 non-null object
Race          734 non-null object
Hair color    734 non-null object
Height        734 non-null float64
Publisher     719 non-null object
Skin color    734 non-null object
Alignment     734 non-null object
Weight        732 non-null float64
dtypes: float64(2), int64(1), object(8)
memory usage: 63.2+ KB


In [5]:
# Print the number of missing entries for the two columns that are checked
# for gaps: Publisher and Weight.
for column_name in ('Publisher', 'Weight'):
    missing_total = heroes[column_name].isnull().sum()
    print("missing value count in " + column_name + ":", missing_total)

missing value count in Publisher: 15
missing value count in Weight: 2


In [6]:
# Drop the leftover CSV index column. errors='ignore' keeps the cell
# re-runnable once the column is already gone.
heroes = heroes.drop(['Unnamed: 0'], axis=1, errors='ignore')

# '-' is used in the data as a "not recorded" placeholder; it is replaced with
# 'unknown' in every column of the frame (not only Publisher).
heroes = heroes.replace(to_replace='-', value='unknown')

# Fill the genuinely missing publishers by assigning the column back.
# Calling fillna(inplace=True) on the heroes['Publisher'] selection is chained
# assignment: pandas warns about it and, under Copy-on-Write, it no longer
# updates the parent frame at all.
heroes['Publisher'] = heroes['Publisher'].fillna('unknown')

In [7]:
# Re-check dtypes and non-null counts after the cleanup above.
heroes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 734 entries, 0 to 733
Data columns (total 10 columns):
name          734 non-null object
Gender        734 non-null object
Eye color     734 non-null object
Race          734 non-null object
Hair color    734 non-null object
Height        734 non-null float64
Publisher     734 non-null object
Skin color    734 non-null object
Alignment     734 non-null object
Weight        732 non-null float64
dtypes: float64(2), object(8)
memory usage: 57.4+ KB


In [8]:
# How often each distinct Weight value occurs, most frequent first.
heroes['Weight'].value_counts()

-99.0     237
 79.0      23
 54.0      23
 81.0      22
 90.0      19
 52.0      15
 86.0      15
 59.0      14
 56.0      13
 77.0      13
 61.0      13
 101.0     12
 50.0      12
 88.0      11
 63.0      11
 83.0      10
 95.0      10
 74.0      10
 65.0       9
 72.0       8
 68.0       8
 104.0      8
 135.0      8
 57.0       7
 97.0       7
 99.0       7
 117.0      6
 108.0      6
 55.0       5
 70.0       5
         ... 
 14.0       1
 356.0      1
 105.0      1
 324.0      1
 360.0      1
 236.0      1
 36.0       1
 140.0      1
 128.0      1
 248.0      1
 16.0       1
 45.0       1
 234.0      1
 96.0       1
 82.0       1
 76.0       1
 198.0      1
 116.0      1
 176.0      1
 170.0      1
 47.0       1
 78.0       1
 27.0       1
 25.0       1
 178.0      1
 132.0      1
 38.0       1
 412.0      1
 320.0      1
 855.0      1
Name: Weight, Length: 135, dtype: int64

Hmm, there are a lot of negative weights (-99.0 alone appears 237 times). Weights can't be negative: a superhero could be as light as air, but not lighter than nothing, so let's replace them with NaN.

In [9]:
# Rows whose Weight is missing (NaN).
heroes[heroes['Weight'].isnull()]

Unnamed: 0,name,Gender,Eye color,Race,Hair color,Height,Publisher,Skin color,Alignment,Weight
286,Godzilla,unknown,unknown,Kaiju,unknown,108.0,unknown,grey,bad,
389,King Kong,Male,yellow,Animal,Black,30.5,unknown,unknown,good,


In [10]:
# -99.0 is the value that shows up in place of a real Height/Weight; every
# occurrence of it anywhere in the frame becomes NaN so it counts as missing.
heroes = heroes.replace(to_replace=-99.0, value=np.nan)

In [11]:
# Non-null counts again, now that the -99.0 placeholders are NaN.
heroes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 734 entries, 0 to 733
Data columns (total 10 columns):
name          734 non-null object
Gender        734 non-null object
Eye color     734 non-null object
Race          734 non-null object
Hair color    734 non-null object
Height        517 non-null float64
Publisher     734 non-null object
Skin color    734 non-null object
Alignment     734 non-null object
Weight        495 non-null float64
dtypes: float64(2), object(8)
memory usage: 57.4+ KB


So it turns out the Height attribute also had a lot of negative values. Now we have a lot of missing values to fill.

In [12]:
# The two numeric columns that need imputing, as their own frame.
ht_wt = heroes.loc[:, ['Height', 'Weight']]

In [13]:
# Impute missing heights and weights with each column's median.
#
# sklearn.preprocessing.Imputer is no longer available in current
# scikit-learn releases, so the original import fails on a fresh install.
# pandas produces the same values (median() skips NaN) without the extra
# dependency, and keeps ht_wt's column names and row index, so the result
# lines up with `heroes` when the frames are concatenated.
heroes_h_w = ht_wt.fillna(ht_wt.median())

In [14]:
# Per column: True if any value is still missing after imputation.
heroes_h_w.isnull().any()

Height    False
Weight    False
dtype: bool

In [15]:
# Swap the raw Height/Weight columns for the imputed ones. The two frames are
# joined column-wise, so the imputed columns end up at the right of the frame.
measure_columns = ['Height', 'Weight']
heroes_without_h_w = heroes.drop(measure_columns, axis=1)
heroes = pd.concat((heroes_without_h_w, heroes_h_w), axis=1)
heroes.head()

Unnamed: 0,name,Gender,Eye color,Race,Hair color,Publisher,Skin color,Alignment,Height,Weight
0,A-Bomb,Male,yellow,Human,No Hair,Marvel Comics,unknown,good,203.0,441.0
1,Abe Sapien,Male,blue,Icthyo Sapien,No Hair,Dark Horse Comics,blue,good,191.0,65.0
2,Abin Sur,Male,blue,Ungaran,No Hair,DC Comics,red,good,185.0,90.0
3,Abomination,Male,green,Human / Radiation,No Hair,Marvel Comics,unknown,bad,203.0,441.0
4,Abraxas,Male,blue,Cosmic Entity,Black,Marvel Comics,unknown,bad,183.0,81.0


## Some Insights

First lets see the distribution of the number of super heroes from each of the Publishers

In [16]:
# Characters per publisher, plus each publisher's share of the total in percent.
publisher_series = heroes['Publisher'].value_counts()
publishers = [publisher for publisher in publisher_series.index]
publications = list(publisher_series.div(publisher_series.sum()).mul(100))

In [17]:
# Pie chart of the publisher shares computed in the previous cell.
pie_layout_options = dict(
    title='comic-wise publications distributions',
    height=950,
    width=950,
)
trace = go.Pie(values=publications, labels=publishers)
layout = go.Layout(**pie_layout_options)
data = [trace]
fig = go.Figure(layout=layout, data=data)
py.iplot(fig, filename='comics-wise-distribution')

__Marvel Comics and DC Comics definitely have a huge market captured__

I'm not sure what the Alignment attribute means, but the values suggest it is a flag for whether the character is a 'Hero', a 'Villain' or a 'Neutral' character. I'm also curious which characters are actually neutral and which ones fall into the unknown alignment.

In [18]:
# Characters whose Alignment was recorded as 'unknown'.
heroes.loc[heroes['Alignment']=='unknown']

Unnamed: 0,name,Gender,Eye color,Race,Hair color,Publisher,Skin color,Alignment,Height,Weight
33,Anti-Venom,Male,blue,Symbiote,Blond,Marvel Comics,unknown,unknown,229.0,358.0
110,Blackwulf,Male,red,Alien,White,Marvel Comics,unknown,unknown,188.0,88.0
138,Brundlefly,Male,unknown,Mutant,unknown,unknown,unknown,unknown,193.0,81.0
426,Man of Miracles,unknown,blue,God / Eternal,Silver,Image Comics,unknown,unknown,183.0,81.0
535,Q,Male,unknown,God / Eternal,unknown,Star Trek,unknown,unknown,183.0,81.0
676,Trickster,Male,blue,Human,Blond,DC Comics,unknown,unknown,183.0,81.0
692,Venompool,Male,unknown,Symbiote,unknown,Marvel Comics,unknown,unknown,226.0,81.0


Well, Venom was definitely a bad guy, which means Anti-Venom must be a heroic character. Maybe someone who stood up against Venom (giving Spiderman some free time for Mary Jane), or maybe someone born out of Venom. I don't know; if you know about this character, do let me know...
But at least this explains the kind of characters whose alignment is not known.

Now there's also someone called Venompool....I don't know how many characters Venom has bonded with in the past. But let's first check the alignment of Deadpool.

In [19]:
# Look up the row for Deadpool by exact name match.
heroes.loc[heroes['name']=='Deadpool']

Unnamed: 0,name,Gender,Eye color,Race,Hair color,Publisher,Skin color,Alignment,Height,Weight
212,Deadpool,Male,brown,Mutant,No Hair,Marvel Comics,unknown,neutral,188.0,95.0


Hmm, Deadpool's alignment is set to neutral, maybe because he doesn't restrict himself to fighting only the bad guys and enjoys kicking the b*** of some of the X-Men as well :P

In [20]:
# Number of characters per Alignment value.
heroes['Alignment'].value_counts()

good       496
bad        207
neutral     24
unknown      7
Name: Alignment, dtype: int64

## Number of Heroes vs Number of Villains
Let's see the number of heroes, villains, neutral and unknown-alignment characters for each publisher.

In [21]:
# Build one row per publisher with the number of good / bad / neutral /
# unknown characters.
#
# A single crosstab replaces the original loop, which filtered the whole
# frame four times per publisher and grew `df` one row at a time.
tot_pub = heroes.Publisher.value_counts().index
col_names = ['Publisher', 'total_heroes', 'total_villian', 'total_neutral', 'total_unknown']

# Alignment values in the same order as the count columns in col_names[1:].
alignment_values = ['good', 'bad', 'neutral', 'unknown']

alignment_counts = pd.crosstab(heroes['Publisher'], heroes['Alignment']).reindex(
    index=tot_pub,             # publishers ordered by roster size, largest first
    columns=alignment_values,  # fixed column order; absent alignments become 0
    fill_value=0,
)
alignment_counts.columns = col_names[1:]

df = alignment_counts.reset_index(drop=True)
df.insert(0, 'Publisher', list(tot_pub))

In [22]:
# Grouped bar chart: for every publisher, one bar per alignment count column
# of `df`. Each pair is (df column, legend name).
bar_specs = (
    ('total_heroes', 'total_heroes'),
    ('total_villian', 'total_villians'),
    ('total_neutral', 'total_neutral'),
    ('total_unknown', 'total_unknown'),
)

data = [
    go.Bar(x=list(df['Publisher']), y=list(df[column]), name=legend_name)
    for column, legend_name in bar_specs
]
trace1, trace2, trace3, trace4 = data

layout = go.Layout(
    barmode='group',
    title='Publisher-wise number of heroes vs number of villians',
)

fig = go.Figure(layout=layout, data=data)
py.iplot(fig, filename='heroes-vs-villians per Publisher')

Clearly there are more heroes than villains at almost every publisher. This is really disheartening, as in all the battles I read about during my childhood....__it was never an equal fight, because the villains are outnumbered in both the Marvel and DC universes!!__


The exception is <i>Image Comics</i> [ zoom in ], where there are just 2 heroes against 11 villains. Interesting!

## Gender Distribution - overall and alignment-wise

In [23]:
def _gender_percentages(gender_column):
    """Return ``(labels, percentages)`` for the values in ``gender_column``.

    Labels follow ``value_counts`` order (most frequent first); each
    percentage is that label's share of the rows passed in, so the values
    sum to 100 for a non-empty column.
    """
    counts = gender_column.value_counts()
    return list(counts.index), list((counts / counts.sum()) * 100)


# gender distribution
genders, distribution = _gender_percentages(heroes['Gender'])

trace = go.Pie(labels=genders, values=distribution)
layout = go.Layout(
    title='overall gender distributions',
    height=500,
    width=500
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='gender-distribution')

# gender distribution by alignment: one pie per alignment value on a 2x2 grid.
# Each entry is (Alignment value, trace name, x-domain, y-domain); the table
# replaces four copy-pasted computations and four hand-written pie dicts.
alignment_pies = [
    ('good', 'heroes', [0, 0.48], [0.51, 1]),
    ('bad', 'villians', [0.52, 1], [0.51, 1]),
    ('neutral', 'neutral characters', [0, 0.48], [0, 0.49]),
    ('unknown', 'unknown characters', [0.52, 1], [0, 0.49]),
]

pie_traces = []
for alignment, trace_name, x_domain, y_domain in alignment_pies:
    pie_labels, pie_values = _gender_percentages(
        heroes.loc[heroes['Alignment'] == alignment, 'Gender']
    )
    pie_traces.append({
        "labels": pie_labels,
        "values": pie_values,
        "type": "pie",
        "name": trace_name,
        "domain": {'x': x_domain, 'y': y_domain},
        "textinfo": "label"
    })

fig = {
    "data": pie_traces,
    "layout": {"title": "Gender distribution among Heroes, Villians and Neutral Characters",
               "showlegend": False}
}

py.iplot(fig, filename='Gender distribution')

So few! We need more women to join the dark side....Harley Quinn, Catwoman, Poison Ivy, etc. have been a few of my favourite negative characters. __The world needs more sexy negative feminine characters!__

## Alignment of superheroes by gender

In [24]:
# Split the roster by the two named genders; rows with any other Gender value
# are in neither frame.
is_male = heroes['Gender'] == 'Male'
is_female = heroes['Gender'] == 'Female'
male_df = heroes[is_male]
female_df = heroes[is_female]

In [25]:
# Alignment counts for each gender, shown side by side.
male_alignment = male_df['Alignment'].value_counts()
female_alignment = female_df['Alignment'].value_counts()

trace_m = go.Bar(x=male_alignment.index, y=male_alignment.values, name='male')
trace_f = go.Bar(x=female_alignment.index, y=female_alignment.values, name='female')

data = [trace_m, trace_f]
layout = go.Layout(barmode='group', title='Alignment of super heroes by gender')

fig = go.Figure(layout=layout, data=data)
py.iplot(fig, filename='alignment-by-gender')

## Distribution of superheroes by Race

In [26]:
# Count characters per race once and reuse the result for both axes.
race_counts = heroes['Race'].value_counts()

trace = go.Bar(
    x=race_counts.index,
    y=race_counts.values,
    name='Races'
)

# The original passed barmode='bar', which is not one of plotly's barmode
# values ('stack', 'group', 'overlay', 'relative'). With a single trace the
# mode has nothing to arrange, so it is left at its default.
layout = go.Layout(
    title='Distribution of heroes across different races'
)

fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename='distribution-by-race')

## Skin color distribution

__Let's find more heroes like HULK__

[ basically those whose race is Human or Human-like but whose skin color has changed ]

<i>super-powers at the cost of skin-change!</i> Nope...I'd rather stay human.

In [27]:
# Races treated as human or human-derived. The empty string is kept from the
# original list so the selection is unchanged.
human_like_races = ['Human', 'Human / Radiation', 'Human / Clone', 'Human-Kree',
                    'Human / Cosmic', 'Human / Altered', 'Human-Vuldarian',
                    'Human-Vulcan', 'Human-Spartoi', '']
# Skin colours that do not count as "changed" (ordinary or not recorded).
excluded_skin_colors = ['unknown', 'white', 'black', 'gray', 'grey']

human_heroes_changed_skin_color = heroes.loc[
    heroes['Race'].isin(human_like_races)
    & ~heroes['Skin color'].isin(excluded_skin_colors)
]

# Bars are ordered by value_counts (most frequent colour first).
skin_counts = human_heroes_changed_skin_color['Skin color'].value_counts()

# Bar fill keyed by skin colour. The original supplied these (and the hover
# text) as fixed-order lists, which only matched the bars if value_counts
# happened to return green, red, blue, silver, gold, purple in that order.
# Four-component values are written as rgba(...) throughout.
bar_colors = {
    'green': 'rgba(0, 250, 32, 0.8)',
    'red': 'rgba(255,0,0,0.9)',
    'blue': 'rgba(0,0,255,0.7)',
    'silver': 'rgba(192,192,192,0.8)',
    'gold': 'rgba(255,255,0,0.9)',
    'purple': 'rgba(128,0,128,0.7)',
}
# Neutral fill for any skin colour without an entry above.
fallback_color = 'rgba(128,128,128,0.8)'

# Hover text per bar: the names of the characters with that skin colour. The
# mask is taken from the filtered frame itself rather than from `heroes`.
hover_names = [
    str(human_heroes_changed_skin_color.loc[
        human_heroes_changed_skin_color['Skin color'] == skin, 'name'
    ].values)
    for skin in skin_counts.index
]

trace = go.Bar(
    x=list(skin_counts.index),
    y=list(skin_counts.values),
    text=hover_names,
    marker=dict(color=[bar_colors.get(skin, fallback_color) for skin in skin_counts.index])
)

# barmode='bar' is not a valid plotly barmode value; a single trace needs none.
layout = go.Layout(
    title='Human Heroes with changed skin color'
)

fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename='Human-Heroes-changed-skin-color')

In [28]:
# overall skin color distribution (the 'unknown' bucket is included here)
skin_series = heroes['Skin color'].value_counts()
skins = [label for label in skin_series.index]
color_distribution = list(skin_series.div(skin_series.sum()).mul(100))

trace = go.Pie(values=color_distribution, labels=skins)

pie_layout_options = dict(title='skin color distributions', height=500, width=500)
layout = go.Layout(**pie_layout_options)

data = [trace]
fig = go.Figure(layout=layout, data=data)
py.iplot(fig, filename='skin color distribution')

In [29]:
# skin color distribution among the rest 10% heroes whose skin color is known

known_skin = heroes.loc[heroes['Skin color'] != 'unknown', 'Skin color']
skin_series = known_skin.value_counts()
skins = [label for label in skin_series.index]
color_distribution = list(skin_series.div(skin_series.sum()).mul(100))

# Slice colours are applied by position, i.e. in value_counts order. The
# trailing comment on each entry names the label it was presumably chosen
# for -- NOTE(review): confirm the order still matches the data.
slice_colors = [
    'rgba(0,255,0,0.9)', #green
    'rgba(0,0,255,0.8)', #blue
    'rgba(255,0,0,1)', #red
    'rgba(255,255,255,0.5)', #white
    'rgba(192,192,192,1)', #grey
    'rgba(128,128,128,0.8)', #silver
    'rgba(255,215,0,1)', #gold
    'rgba(128,0,128,0.8)', #purple
    'rgba(255,255,0,0.7)', #yellow
    'rgba(255,0,0,0.5)', #pink
    'rgba(255,140,0,0.8)', #orange/white
    'rgba(128,0,0,0.9)', #red/black
    'rgba(0,255,255,0.6)', #blue-white
    'rgba(0,0,0,0.8)', #black
    'rgba(255,165,0,0.7)', #orange
    'rgba(128,128,128,0.6)', #gray
]
slice_outline = dict(color='rgb(8,48,107)', width=0.3)

trace = go.Pie(
    values=color_distribution,
    labels=skins,
    textinfo='value',
    hoverinfo='label+percent',
    marker=dict(colors=slice_colors, line=slice_outline)
)

layout = go.Layout(title='skin color distributions', height=700, width=700)

data = [trace]
fig = go.Figure(layout=layout, data=data)
py.iplot(fig, filename='skin color distribution')

## Hair color distribution

__Now let's find more heroes like Professor X__

[ basically those who're bald ]

In [30]:
# Collapse hair colour into two categories: "No Hair" stays as it is and every
# other value (including 'unknown') becomes 'Hair'.
bald_or_not = heroes['Hair color'].where(heroes['Hair color']=="No Hair", other='Hair')
bald_counts = bald_or_not.value_counts()

# Hover text is looked up from each bar's category instead of being a fixed
# ['not-bald', 'bald'] list, so it stays correct whichever category is larger.
hover_labels = {'Hair': 'not-bald', 'No Hair': 'bald'}

trace = go.Bar(
    x=bald_counts.index,
    y=bald_counts.values,
    name='bald vs not-bald',
    text=[hover_labels[category] for category in bald_counts.index]
)

# barmode='bar' is not a valid plotly barmode value; a single trace needs none.
layout = go.Layout(
    title='bald vs not-bald'
)

fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename='distribution-by-baldness')

In [31]:
# hair color distribution

# excluding the bald types and unknown hair colors
has_known_hair = ~heroes['Hair color'].isin(['No Hair','unknown'])
hair_df = heroes.loc[has_known_hair, 'Hair color']

# The same colour appears with different casing (e.g. 'Blond' & 'blond'), so
# lower-case everything first, then collapse the 'brownn' misspelling.
hair_df = hair_df.astype(str).str.lower().str.replace('brownn','brown')

hair_series = hair_df.value_counts()
hair_colors = [label for label in hair_series.index]
color_distribution = list(hair_series.div(hair_series.sum()).mul(100))

# Slice colours are applied by position, i.e. in value_counts order. The
# trailing comment on each entry names the label it was presumably chosen
# for -- NOTE(review): confirm the order still matches the data.
slice_colors = [
    'rgba(0,0,0,0.8)', #black
    'rgba(243,243,164,0.8)', #blond
    'rgba(165,104,42,0.7)', #brown
    'rgba(255,0,0,0.9)', #red
    'rgba(255,255,255,1)', #white
    'rgba(165,42,42,0.9)', #auburn
    'rgba(0,255,0,0.7)', #green
    'rgba(165,88,29,0.7)', #strawberry blond
    'rgba(128,128,128,0.9)', #grey
    'rgba(128,0,128,0.8)', #purple
    'rgba(192,192,192,0.5)', #silver
    'rgba(216,210,181,1)', #brown/white
    'rgba(0,0,255,0.8)', #blue
    'rgba(255,165,0,0.6)', #orange
    'rgba(255,255,0,1)', #yellow
    'rgba(255,0,255,1)', #magenta
    'rgba(116,68,56,1)', #red / grey
    'rgba(60,42,8,1)', #brown / black
    'rgba(75,0,130,1)', #indigo
    'rgba(255,215,0,0.9)', #gold
    'rgba(20,28,27,1)', #black / blue
    'rgba(251,194,123,0.7)', #orange / white
    'rgba(251,223,214,0.8)', #red / white
    'rgba(255,64,0,1)', #red / orange
    'rgba(255,192,203,0.9)', #pink
]
slice_outline = dict(color='rgb(8,48,107)', width=0.3)

trace = go.Pie(
    values=color_distribution,
    labels=hair_colors,
    textinfo='value',
    hoverinfo='label+percent',
    marker=dict(colors=slice_colors, line=slice_outline)
)

layout = go.Layout(title='distributions by Hair color', height=750, width=750)

data = [trace]
fig = go.Figure(layout=layout, data=data)
py.iplot(fig, filename='hair color distribution')

### Let's look into the super-powers of these heroes now

In [32]:
# Read the hero-by-power table (path is relative to the working directory)
# and preview the first rows.
powers = pd.read_csv('super_hero_powers.csv')
powers.head()

Unnamed: 0,hero_names,Agility,Accelerated Healing,Lantern Power Ring,Dimensional Awareness,Cold Resistance,Durability,Stealth,Energy Absorption,Flight,...,Web Creation,Reality Warping,Odin Force,Symbiote Costume,Speed Force,Phoenix Force,Molecular Dissipation,Vision - Cryo,Omnipresent,Omniscient
0,3-D Man,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,A-Bomb,False,True,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Abe Sapien,True,True,False,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Abin Sur,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Abomination,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


wow! everything is in boolean...my life is so much simplified now :))

In [33]:
# Multiply the whole frame by 1: the True/False power flags come out as 1/0,
# while the hero_names strings are displayed unchanged.
powers = powers.mul(1)
powers.head(2)

Unnamed: 0,hero_names,Agility,Accelerated Healing,Lantern Power Ring,Dimensional Awareness,Cold Resistance,Durability,Stealth,Energy Absorption,Flight,...,Web Creation,Reality Warping,Odin Force,Symbiote Costume,Speed Force,Phoenix Force,Molecular Dissipation,Vision - Cryo,Omnipresent,Omniscient
0,3-D Man,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A-Bomb,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Most powerful superhero

In [34]:
# Number of powers per hero = row-wise sum of the power flag columns.
# The columns are chosen by name: the original slice iloc[:, 1:] also picked
# up total_powers itself once it existed, so re-running the cell inflated
# every total.
power_columns = [
    column for column in powers.columns
    if column not in ('hero_names', 'total_powers')
]
powers['total_powers'] = powers[power_columns].sum(axis=1)
powers.head(2)

Unnamed: 0,hero_names,Agility,Accelerated Healing,Lantern Power Ring,Dimensional Awareness,Cold Resistance,Durability,Stealth,Energy Absorption,Flight,...,Reality Warping,Odin Force,Symbiote Costume,Speed Force,Phoenix Force,Molecular Dissipation,Vision - Cryo,Omnipresent,Omniscient,total_powers
0,3-D Man,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4.0
1,A-Bomb,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,7.0


In [35]:
# One bar per hero, height = number of powers.
# The original also passed text=['names','total powers']. Bar text is
# per-bar, so that only attached two meaningless labels to the first two
# bars; hovering already shows the hero name (x) and total (y).
trace = go.Bar(
    x=powers['hero_names'],
    y=powers['total_powers']
)

# barmode='bar' is not a valid plotly barmode value; a single trace needs none.
layout = go.Layout(
    title='most powerfull superhero'
)

fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename='most-powerfull-superhero')

That's a lot; let's plot only the top 30 superheroes.

In [36]:
# Rank heroes by number of powers and plot only the strongest ones.
powers = powers.sort_values('total_powers', ascending=False)

# Number of heroes shown in the chart.
TOP_N_HEROES = 30
top_heroes = powers.head(TOP_N_HEROES)

# The original also passed text=['names','total powers']. Bar text is
# per-bar, so that only attached two meaningless labels to the first two
# bars; hovering already shows the hero name (x) and total (y).
trace = go.Bar(
    x=top_heroes['hero_names'],
    y=top_heroes['total_powers']
)

# barmode='bar' is not a valid plotly barmode value; a single trace needs none.
layout = go.Layout(
    title='most powerfull superhero'
)

fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename='most-powerfull-superhero')

### and the Winner is: Spectre

So there are characters even more powerful than Superman and Goku.

#### <i>At least the internet will now have an answer to the __"Superman vs Goku"__ question. Superman is clearly the winner here! </i>
Goku can evolve into multiple levels of Super Saiyan, though, and this dataset doesn't tell us about the powers of a Super Saiyan.

## Most common super-powers

with a lil inspiration from other kernels, i thought it would be interesting to see the most common super powers as well.

In [37]:
# Flag columns only (hero names removed); total_powers is dropped further down.
df = powers.drop(['hero_names'], axis=1)

# Heroes per power = number of rows whose flag equals 1.
# The result has one row per power and a single column labelled 1, the same
# shape the original produced by filling an empty frame column by column
# with value_counts(), transposing it and dropping the 0 column. A power
# held by nobody now yields 0 instead of NaN.
df2 = df.drop(['total_powers'], axis=1).eq(1).sum().to_frame(name=1)

In [38]:
# Give the count column a descriptive name.
# The original first called df2.sort_values(1, ascending=False) and threw the
# result away (no effect), and that call raised KeyError on a re-run because
# column 1 no longer existed after the rename. The actual sort is done in the
# following cell.
df2 = df2.rename(columns={1: 'total_heroes'})

In [39]:
# Order powers from most to least common, then expose the power name (the
# index) as an ordinary column for plotting.
df2 = df2.sort_values(by='total_heroes', ascending=False)
df2['super_power'] = df2.index

In [40]:
# Only powers held by more than this many heroes are plotted.
MIN_HEROES_FOR_COMMON_POWER = 100

# Filter once and reuse the result for both axes.
common_powers = df2.loc[df2['total_heroes'] > MIN_HEROES_FOR_COMMON_POWER]

trace = go.Bar(
    x=np.array(common_powers['super_power']),
    y=np.array(common_powers['total_heroes'])
)

# barmode='bar' is not a valid plotly barmode value; a single trace needs none.
layout = go.Layout(
    title='most common super powers'
)

fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename='most common super powers')

## Most unique super-powers

In [41]:
# Powers that exactly one hero has.
list(df2['super_power'].loc[df2['total_heroes']==1])

['Electrical Transport',
 'Molecular Dissipation',
 'Phoenix Force',
 'Speed Force',
 'Hyperkinesis',
 'Changing Armor',
 'Hair Manipulation',
 'Anti-Gravity',
 'Spatial Awareness',
 'Banish',
 'Omnitrix',
 'Thirstokinesis',
 'Biokinesis',
 'Intuitive aptitude']