# Applying Advanced Transformations

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
info = pd.read_csv("Data/superhero_info - superhero_info.csv", low_memory=False)
powers = pd.read_csv("Data/superhero_powers - superhero_powers.csv", low_memory=False)

## Clean the files and combine them into one final DataFrame.

This dataframe should have the following columns:

Hero (Just the name of the Hero)

Publisher

Gender

Eye color

Race

Hair color

Height (numeric)

Skin color

Alignment

Weight (numeric)

Plus, one-hot-encoded columns for every power that appears in the dataset. E.g.:

    Agility

    Flight

    Superspeed

    etc.
Hint: There is a space in "100 kg" or "52.5 cm"

In [3]:
info.head()

Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}"
2,Abin Sur|DC Comics,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}"
3,Abomination|Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
4,Absorbing Man|Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}"


### Splitting name from Publisher

In [4]:
info[['Hero','Publisher']] = info['Hero|Publisher'].str.split('|',expand=True)
info = info.drop(columns=['Hero|Publisher'])
info.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,Hero,Publisher
0,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics
1,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics
2,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}",Abin Sur,DC Comics
3,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",Abomination,Marvel Comics
4,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}",Absorbing Man,Marvel Comics


### Splitting height from weight

In [5]:
info[['Height (cm)','Weight (kg)']] = info['Measurements'].str.split(',',expand=True)
info = info.drop(columns=['Measurements'])
info.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero,Publisher,Height (cm),Weight (kg)
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,{'Height': '203.0 cm','Weight': '441.0 kg'}
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,{'Height': '191.0 cm','Weight': '65.0 kg'}
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,{'Height': '185.0 cm','Weight': '90.0 kg'}
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Abomination,Marvel Comics,{'Height': '203.0 cm','Weight': '441.0 kg'}
4,Male,Human,bad,No Hair,blue,Unknown,Absorbing Man,Marvel Comics,{'Height': '193.0 cm','Weight': '122.0 kg'}


#### Clean columns

In [6]:
info.dtypes

Gender         object
Race           object
Alignment      object
Hair color     object
Eye color      object
Skin color     object
Hero           object
Publisher      object
Height (cm)    object
Weight (kg)    object
dtype: object

In [11]:
info['Height (cm)'] = info['Height (cm)'].astype('string')
to_replace = ["{Height:","'", "cm"]
for char in to_replace:
    info['Height (cm)'] = info['Height (cm)'].str.replace(char, '', regex=False)
info['Height (cm)'] = info['Height (cm)'].astype('float')
info.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero,Publisher,Height (cm),Weight (kg)
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,'Weight': '441.0 kg'}
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0,'Weight': '65.0 kg'}
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,185.0,'Weight': '90.0 kg'}
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Abomination,Marvel Comics,203.0,'Weight': '441.0 kg'}
4,Male,Human,bad,No Hair,blue,Unknown,Absorbing Man,Marvel Comics,193.0,'Weight': '122.0 kg'}


In [13]:
info['Weight (kg)'] = info['Weight (kg)'].astype('string')
to_replace = ["Weight:","'", "kg", '}']
for char in to_replace:
    info['Weight (kg)'] = info['Weight (kg)'].str.replace(char, '', regex=False)
info['Weight (kg)'] = info['Weight (kg)'].astype('float')
info.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero,Publisher,Height (cm),Weight (kg)
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0,65.0
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,185.0,90.0
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Abomination,Marvel Comics,203.0,441.0
4,Male,Human,bad,No Hair,blue,Unknown,Absorbing Man,Marvel Comics,193.0,122.0


### OneHotEncoding

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import json, os

In [15]:
powers.head()

Unnamed: 0,hero_names,Powers
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed"
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super..."
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du..."
3,Abin Sur,Lantern Power Ring
4,Abomination,"Accelerated Healing,Intelligence,Super Strengt..."


# I'm so sorry, I can't figure this out.  Nothing anyone said on discord makes sense, the code along example isn't applicable.  There must be something I don't know because I've tried about two dozen different configurations and nothing makes any sense

In [67]:
# Got help from Eduardo_Galindez in discord, I don't know the other way to do this using json.loads
powers['Powers2'] = powers['Powers'].str.split(',').explode
powers['Powers2'] = powers['Powers2'].apply(pd.Series)
powers['Powers'] = powers['Powers'].apply(json.loads)
powers.head()

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [59]:
cols_to_make = powers['Powers2'].dropna().unique()
cols_to_make

array([<bound method Series.explode of 0        [Agility, Super Strength, Stamina, Super Speed]
       1      [Accelerated Healing, Durability, Longevity, S...
       2      [Agility, Accelerated Healing, Cold Resistance...
       3                                   [Lantern Power Ring]
       4      [Accelerated Healing, Intelligence, Super Stre...
                                    ...
       662               [Flight, Energy Blasts, Size Changing]
       663    [Cold Resistance, Durability, Longevity, Super...
       664    [Agility, Stealth, Danger Sense, Marksmanship,...
       665    [Cryokinesis, Telepathy, Magic, Fire Control, ...
       666    [Super Speed, Intangibility, Time Travel, Time...
       Name: Powers, Length: 667, dtype: object>                                               ],
      dtype=object)

In [54]:
for cols in cols_to_make:
    powers[col] = powers['Powers'].str.contains(col)
powers.head()

AttributeError: Can only use .str accessor with string values!