In [1]:
## Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
## Importing the OS and JSON Modules
import os,json

In [2]:
df1 = pd.read_csv('data/superhero_info.csv')

In [3]:
## Saving the 2 new hero and publisher columns into the dataframe
df1[['Hero','Publisher']] = df1['Hero|Publisher'].str.split('|',expand=True)
df1.head(2)

Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,Hero,Publisher
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics


In [4]:
## drop the original column 
df1 = df1.drop(columns=['Hero|Publisher'])
df1.head(2)

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,Hero,Publisher
0,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics
1,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics


In [5]:
## use .str.replace to replace all single quotes
df1['Measurements'] = df1['Measurements'].str.replace("'",'"')
## Apply the json.loads to the full column
df1['Measurements'] = df1['Measurements'].apply(json.loads)
df1['Measurements'].head()

0    {'Height': '203.0 cm', 'Weight': '441.0 kg'}
1     {'Height': '191.0 cm', 'Weight': '65.0 kg'}
2     {'Height': '185.0 cm', 'Weight': '90.0 kg'}
3    {'Height': '203.0 cm', 'Weight': '441.0 kg'}
4    {'Height': '193.0 cm', 'Weight': '122.0 kg'}
Name: Measurements, dtype: object

In [6]:
# creating the columns for the newly created dictionary
measure_sep = df1['Measurements'].apply(pd.Series)
measure_sep

Unnamed: 0,Height,Weight
0,203.0 cm,441.0 kg
1,191.0 cm,65.0 kg
2,185.0 cm,90.0 kg
3,203.0 cm,441.0 kg
4,193.0 cm,122.0 kg
...,...,...
458,183.0 cm,83.0 kg
459,165.0 cm,52.0 kg
460,66.0 cm,17.0 kg
461,170.0 cm,57.0 kg


In [7]:
# concat measure_sep with original dataframe
df1 = pd.concat((df1, measure_sep), axis = 1)
df1.head(2)

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,Hero,Publisher,Height,Weight
0,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics,203.0 cm,441.0 kg
1,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics,191.0 cm,65.0 kg


In [8]:
df1 = df1.drop(columns=['Measurements'])

In [9]:
# Cleaning up the powers dataset
df2 = pd.read_csv('data/superhero_powers.csv')
df2.head(2)

Unnamed: 0,hero_names,Powers
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed"
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super..."


In [10]:
# Renaming the hero_names column to combine with the previous dataset
df2.rename(columns={'hero_names': 'Hero'}, inplace=True)
df2.head(2)

Unnamed: 0,Hero,Powers
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed"
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super..."


In [11]:
# Creating one dataframe with both csv data files
df3 = pd.merge(df1, df2, on ='Hero')
df3

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero,Publisher,Height,Weight,Powers
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0 cm,441.0 kg,"Accelerated Healing,Durability,Longevity,Super..."
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0 cm,65.0 kg,"Agility,Accelerated Healing,Cold Resistance,Du..."
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,185.0 cm,90.0 kg,Lantern Power Ring
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Abomination,Marvel Comics,203.0 cm,441.0 kg,"Accelerated Healing,Intelligence,Super Strengt..."
4,Male,Human,bad,No Hair,blue,Unknown,Absorbing Man,Marvel Comics,193.0 cm,122.0 kg,"Cold Resistance,Durability,Energy Absorption,S..."
...,...,...,...,...,...,...,...,...,...,...,...
458,Male,Human,good,Blond,blue,Unknown,Yellowjacket,Marvel Comics,183.0 cm,83.0 kg,"Size Changing,Animal Oriented Powers"
459,Female,Human,good,Strawberry Blond,blue,Unknown,Yellowjacket II,Marvel Comics,165.0 cm,52.0 kg,"Flight,Energy Blasts,Size Changing"
460,Male,Yoda's species,good,White,brown,green,Yoda,George Lucas,66.0 cm,17.0 kg,"Agility,Stealth,Danger Sense,Marksmanship,Weap..."
461,Female,Human,good,Black,blue,Unknown,Zatanna,DC Comics,170.0 cm,57.0 kg,"Cryokinesis,Telepathy,Magic,Fire Control,Proba..."


In [12]:
## showing the lists are really strings
df3.loc[1,'Powers']

'Agility,Accelerated Healing,Cold Resistance,Durability,Underwater breathing,Marksmanship,Weapons Master,Longevity,Intelligence,Super Strength,Telepathy,Stamina,Immortality,Reflexes,Enhanced Sight,Sub-Mariner'

In [13]:
df3['Powers_list'] = list(df3['Powers'].str.split(','))
df3['Powers_list']

0      [Accelerated Healing, Durability, Longevity, S...
1      [Agility, Accelerated Healing, Cold Resistance...
2                                   [Lantern Power Ring]
3      [Accelerated Healing, Intelligence, Super Stre...
4      [Cold Resistance, Durability, Energy Absorptio...
                             ...                        
458              [Size Changing, Animal Oriented Powers]
459               [Flight, Energy Blasts, Size Changing]
460    [Agility, Stealth, Danger Sense, Marksmanship,...
461    [Cryokinesis, Telepathy, Magic, Fire Control, ...
462    [Super Speed, Intangibility, Time Travel, Time...
Name: Powers_list, Length: 463, dtype: object

In [14]:
## exploding the column of lists
exploded = df3.explode('Powers_list')
exploded[['Hero','Powers_list']].head(5)

Unnamed: 0,Hero,Powers_list
0,A-Bomb,Accelerated Healing
0,A-Bomb,Durability
0,A-Bomb,Longevity
0,A-Bomb,Super Strength
0,A-Bomb,Stamina


In [15]:
## saving the unique values from the exploded column
cols_to_make = exploded['Powers_list'].dropna().unique()
cols_to_make

array(['Accelerated Healing', 'Durability', 'Longevity', 'Super Strength',
       'Stamina', 'Camouflage', 'Self-Sustenance', 'Agility',
       'Cold Resistance', 'Underwater breathing', 'Marksmanship',
       'Weapons Master', 'Intelligence', 'Telepathy', 'Immortality',
       'Reflexes', 'Enhanced Sight', 'Sub-Mariner', 'Lantern Power Ring',
       'Super Speed', 'Invulnerability', 'Animation', 'Super Breath',
       'Energy Absorption', 'Elemental Transmogrification',
       'Fire Resistance', 'Natural Armor', 'Molecular Manipulation',
       'Heat Resistance', 'Matter Absorption', 'Stealth', 'Flight',
       'Power Suit', 'Energy Blasts', 'Energy Beams', 'Power Cosmic',
       'Heat Generation', 'Danger Sense', 'Teleportation', 'Phasing',
       'Force Fields', 'Hypnokinesis', 'Energy Manipulation',
       'Invisibility', 'Enhanced Senses', 'Jump', 'Substance Secretion',
       'Natural Weapons', 'Wallcrawling', 'Vision - Thermal',
       'Power Augmentation', 'Cryokinesis', 'Dupli

In [16]:
for col in cols_to_make:
    df3[col] = df3['Powers'].str.contains(col)
df3.head()

  df3[col] = df3['Powers'].str.contains(col)
  df3[col] = df3['Powers'].str.contains(col)
  df3[col] = df3['Powers'].str.contains(col)
  df3[col] = df3['Powers'].str.contains(col)
  df3[col] = df3['Powers'].str.contains(col)
  df3[col] = df3['Powers'].str.contains(col)
  df3[col] = df3['Powers'].str.contains(col)
  df3[col] = df3['Powers'].str.contains(col)
  df3[col] = df3['Powers'].str.contains(col)
  df3[col] = df3['Powers'].str.contains(col)
  df3[col] = df3['Powers'].str.contains(col)
  df3[col] = df3['Powers'].str.contains(col)
  df3[col] = df3['Powers'].str.contains(col)
  df3[col] = df3['Powers'].str.contains(col)
  df3[col] = df3['Powers'].str.contains(col)
  df3[col] = df3['Powers'].str.contains(col)
  df3[col] = df3['Powers'].str.contains(col)
  df3[col] = df3['Powers'].str.contains(col)
  df3[col] = df3['Powers'].str.contains(col)
  df3[col] = df3['Powers'].str.contains(col)
  df3[col] = df3['Powers'].str.contains(col)
  df3[col] = df3['Powers'].str.contains(col)
  df3[col]

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero,Publisher,Height,Weight,...,Hair Manipulation,Weather Control,Nova Force,Odin Force,Phoenix Force,Power Sense,Qwardian Power Ring,Melting,Changing Armor,Terrakinesis
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0 cm,441.0 kg,...,False,False,False,False,False,False,False,False,False,False
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0 cm,65.0 kg,...,False,False,False,False,False,False,False,False,False,False
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,185.0 cm,90.0 kg,...,False,False,False,False,False,False,False,False,False,False
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Abomination,Marvel Comics,203.0 cm,441.0 kg,...,False,False,False,False,False,False,False,False,False,False
4,Male,Human,bad,No Hair,blue,Unknown,Absorbing Man,Marvel Comics,193.0 cm,122.0 kg,...,False,False,False,False,False,False,False,False,False,False


In [17]:
# drop powers clumns
df3 = df3.drop(columns=['Powers','Powers_list'])

### 2. What is the average height of heroes for each publisher?

In [18]:
df3['Height'] = df3['Height'].str.replace(' cm', '')
df3['Weight'] = df3['Weight'].str.replace(' kg', '')
df3.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero,Publisher,Height,Weight,...,Hair Manipulation,Weather Control,Nova Force,Odin Force,Phoenix Force,Power Sense,Qwardian Power Ring,Melting,Changing Armor,Terrakinesis
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0,...,False,False,False,False,False,False,False,False,False,False
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0,65.0,...,False,False,False,False,False,False,False,False,False,False
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,185.0,90.0,...,False,False,False,False,False,False,False,False,False,False
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Abomination,Marvel Comics,203.0,441.0,...,False,False,False,False,False,False,False,False,False,False
4,Male,Human,bad,No Hair,blue,Unknown,Absorbing Man,Marvel Comics,193.0,122.0,...,False,False,False,False,False,False,False,False,False,False


In [19]:
df3['Height'] = df3['Height'].astype('float')
df3['Weight'] = df3['Weight'].astype('float')

In [20]:
publishers = df3['Publisher'].dropna().unique()
publishers

array(['Marvel Comics', 'Dark Horse Comics', 'DC Comics', 'Team Epic TV',
       'George Lucas', 'Shueisha', 'Star Trek', 'Unknown', 'Image Comics'],
      dtype=object)

In [22]:
df3.groupby(['Publisher'], as_index=False)['Height'].mean()

Unnamed: 0,Publisher,Height
0,DC Comics,181.923913
1,Dark Horse Comics,176.909091
2,George Lucas,159.6
3,Image Comics,211.0
4,Marvel Comics,191.546128
5,Shueisha,171.5
6,Star Trek,181.5
7,Team Epic TV,180.75
8,Unknown,178.0
