In [34]:
# Import libraries
import pandas as pd
import numpy as np
import pickle

# Data cleaning

When data is acquired from the web, it is rarely clean and ready to use. In this notebook, we load the datasets we previously parsed and we clean the different columns. Please remember that even if only one part of the dataframe is displayed, we already checked that the cleaning worked for all of it.

# Part 1: Marvel dataset

In [35]:
# Load the pickled Marvel characters table.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('data/characters_marvel.txt', 'rb') as f:
    characters_marvel = pickle.load(f)

# Display the raw frame (pandas truncates the rendering).
characters_marvel

Unnamed: 0,URL,Real Name,Current Alias,Relatives,Affiliation
0,/wiki/Aaron_Fox_(Earth-616),Aaron Fox,,", /wiki/Beth_Fox_(Earth-616)",
1,/wiki/Acrobat_(1940s)_(Earth-616),nknown,Acrobat,,
2,/wiki/Abigail_Mercury_(Clone)_(Earth-616),Abigail Mercury,,", /wiki/Abigail_Mercury_(Earth-616)",
3,/wiki/Ace_Maxwell_(Earth-616),Ace Maxwell,,,
4,/wiki/Abigail_Boylen_(Earth-616),"Abigail ""Abby"" Boylen",Cloud 9,", #cite_note-Avengers_The_Initiative_Vol_1_1-2",", /wiki/Champions_(Earth-616), /wiki/Undergrou..."
...,...,...,...,...,...
28034,/wiki/Zxaxz_(Earth-616),Zxaxz,,,
28035,/wiki/Zuwena_(Earth-616),Zuwena,,,", /wiki/Elephant%27s_Trunk_(Earth-616)"
28036,/wiki/Zurvan_(Earth-616),Zurvan,,", /wiki/Ahura_Mazda_(Earth-616), /wiki/Ahriman...",
28037,/wiki/Zygo_(Earth-616),Zygo,General Zygo,,


### `Real Name` cleaning:

In [36]:
# Inspect the raw value frequencies to spot placeholder labels before cleaning.
characters_marvel['Real Name'].value_counts()

nknown                                                       3671
Unknown                                                       462
Unrevealed                                                    106
Not Applicable                                                 26
Unknown (The symbiote takes the name of its current host)      24
                                                             ... 
Henry Mortonson                                                 1
Sophia Sanduval                                                 1
Diatrice Alraune                                                1
Pih-Junn                                                        1
Baxter (Full name unknown)                                      1
Name: Real Name, Length: 22973, dtype: int64

**We can see that a few categories correspond to unknown names, we group them together under the label `Unknown`.**

In [37]:
# Removing links of the format [#] (footnote markers such as " [1]" or "[12]").
# regex=True is passed explicitly: the default changed to False in pandas 2.0,
# which would silently turn this pattern into a literal (non-matching) string.
# This runs before the placeholder grouping so that a value left empty or equal
# to a placeholder once its marker is stripped is grouped as well.
characters_marvel['Real Name'] = characters_marvel['Real Name'].str.replace(
    r'\s?\[\d+\]', '', regex=True
)

# Raw labels that all stand for "no usable real name".
# 'nknown' looks like a truncated 'Unknown' coming from the parsing step.
unknown_labels = [
    'nknown',
    'Unrevealed',
    '',
    'N/A',
    'Unknown (The symbiote takes the name of its current host)',
    'None',
]
characters_marvel.loc[characters_marvel['Real Name'].isin(unknown_labels), 'Real Name'] = 'Unknown'

characters_marvel['Real Name'].value_counts()

Unknown                             4299
Not Applicable                        26
Martin (full name unrevealed)         11
James "Jamie" Arthur Madrox           10
Verschlagen (first name unknown)       6
                                    ... 
Rusty                                  1
Vincent Farnsworth                     1
Taina Miranda                          1
Brule                                  1
Daemian Wainscroft                     1
Name: Real Name, Length: 22892, dtype: int64

### `Current Alias` cleaning:

In [38]:
# Inspect the raw alias frequencies; the empty string is by far the most common value.
characters_marvel["Current Alias"].value_counts()

                                         15870
Nova                                        27
Ghost Rider                                 16
Crimson Dynamo                              16
Black Knight                                15
                                         ...  
Jack Serious                                 1
Moon Wolf                                    1
Pierce                                       1
Big                                          1
Dreamer, Bestower of Unlimited Wishes        1
Name: Current Alias, Length: 10053, dtype: int64

In [39]:
# Removing links of the format [#] (footnote markers such as " [1]" or "[12]").
# regex=True is explicit because the pandas 2.0 default (regex=False) would
# treat the pattern as a literal string and match nothing.
characters_marvel['Current Alias'] = characters_marvel['Current Alias'].str.replace(
    r'\s?\[\d+\]', '', regex=True
)

# Missing aliases are stored as empty strings; label them 'Unknown'.
# Done after stripping the markers so that an alias consisting only of a marker
# is also caught.
characters_marvel.loc[characters_marvel['Current Alias'] == '', 'Current Alias'] = 'Unknown'

characters_marvel['Current Alias'].value_counts()

Unknown           15871
Nova                 27
Ghost Rider          18
Black Knight         16
Crimson Dynamo       16
                  ...  
Bombu                 1
Doctor Mole           1
Jon                   1
Paradise              1
Gunner Gates          1
Name: Current Alias, Length: 10003, dtype: int64

### `Relatives` and `Affiliation` cleaning:

**We transform everything to a list, it will be easier to handle.**

In [40]:
# Show the frame after the name/alias cleaning, before converting the link columns.
characters_marvel

Unnamed: 0,URL,Real Name,Current Alias,Relatives,Affiliation
0,/wiki/Aaron_Fox_(Earth-616),Aaron Fox,Unknown,", /wiki/Beth_Fox_(Earth-616)",
1,/wiki/Acrobat_(1940s)_(Earth-616),Unknown,Acrobat,,
2,/wiki/Abigail_Mercury_(Clone)_(Earth-616),Abigail Mercury,Unknown,", /wiki/Abigail_Mercury_(Earth-616)",
3,/wiki/Ace_Maxwell_(Earth-616),Ace Maxwell,Unknown,,
4,/wiki/Abigail_Boylen_(Earth-616),"Abigail ""Abby"" Boylen",Cloud 9,", #cite_note-Avengers_The_Initiative_Vol_1_1-2",", /wiki/Champions_(Earth-616), /wiki/Undergrou..."
...,...,...,...,...,...
28034,/wiki/Zxaxz_(Earth-616),Zxaxz,Unknown,,
28035,/wiki/Zuwena_(Earth-616),Zuwena,Unknown,,", /wiki/Elephant%27s_Trunk_(Earth-616)"
28036,/wiki/Zurvan_(Earth-616),Zurvan,Unknown,", /wiki/Ahura_Mazda_(Earth-616), /wiki/Ahriman...",
28037,/wiki/Zygo_(Earth-616),Zygo,General Zygo,,


In [41]:
# Each raw value is a ', '-joined string of links that starts with a stray ', '.
# Strip that leading separator only (anchored with ^): an unanchored single
# replacement would merge the first two links of any value that does not start
# with ', '. Then split the remainder into a list of links.
# NOTE: an empty string splits to [''] (which pandas displays as []), not to a
# truly empty list.
for column in ('Relatives', 'Affiliation'):
    characters_marvel[column] = (
        characters_marvel[column]
        .str.replace(r'^, ', '', regex=True)
        .str.split(', ')
    )

characters_marvel

Unnamed: 0,URL,Real Name,Current Alias,Relatives,Affiliation
0,/wiki/Aaron_Fox_(Earth-616),Aaron Fox,Unknown,[/wiki/Beth_Fox_(Earth-616)],[]
1,/wiki/Acrobat_(1940s)_(Earth-616),Unknown,Acrobat,[],[]
2,/wiki/Abigail_Mercury_(Clone)_(Earth-616),Abigail Mercury,Unknown,[/wiki/Abigail_Mercury_(Earth-616)],[]
3,/wiki/Ace_Maxwell_(Earth-616),Ace Maxwell,Unknown,[],[]
4,/wiki/Abigail_Boylen_(Earth-616),"Abigail ""Abby"" Boylen",Cloud 9,[#cite_note-Avengers_The_Initiative_Vol_1_1-2],"[/wiki/Champions_(Earth-616), /wiki/Undergroun..."
...,...,...,...,...,...
28034,/wiki/Zxaxz_(Earth-616),Zxaxz,Unknown,[],[]
28035,/wiki/Zuwena_(Earth-616),Zuwena,Unknown,[],[/wiki/Elephant%27s_Trunk_(Earth-616)]
28036,/wiki/Zurvan_(Earth-616),Zurvan,Unknown,"[/wiki/Ahura_Mazda_(Earth-616), /wiki/Ahriman_...",[]
28037,/wiki/Zygo_(Earth-616),Zygo,General Zygo,[],[]


### Save the cleaned dataframe:

In [42]:
# Persist the cleaned Marvel frame. The context manager guarantees the file is
# flushed and closed, instead of leaving an open handle behind.
with open('data/clean_marvel.txt', 'wb') as out_file:
    pickle.dump(characters_marvel, out_file)

# Part 2: DC 

In [43]:
# Load the pickled DC characters table.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('data/characters_dc.txt', 'rb') as f:
    characters_dc = pickle.load(f)

# Display the raw frame (pandas truncates the rendering).
characters_dc

Unnamed: 0,URL,Real Name,Current Alias,Relatives,Affiliation
0,/wiki/Aaron_Hayley_(New_Earth),Aaron Hayley,Swamp Thing,,
1,/wiki/Abigail_Fine_(Smallville),Abigail Fine,Abigail Fine,", /wiki/Elise_Fine_(Smallville)",
2,/wiki/Adam_Strange_(JSA:_The_Golden_Age),Adam Strange,Adam Strange,,
3,/wiki/Alan_Barnes_(New_Earth),Alan Barnes,Brainstorm,,
4,/wiki/Alan_Scott_(Earth_2),Alan Scott,Green Lantern,", /wiki/Sam_Zhao_(Earth_2)",", /wiki/Wonders_of_the_World, /wiki/The_Green"
...,...,...,...,...,...
10472,/wiki/Zeta_(Earth-One),Unknown,Zeta,,", /wiki/Pantheon"
10473,/wiki/Zotan_(Earth-S),Zotan,Zotan,,
10474,/wiki/Zond_(Earth-One),Zond,Zond the Sorcerer,,", /wiki/Morgaine_le_Fey"
10475,/wiki/Zora_Vi-Lar_(Earth-One),Zora Vi-Lar,Black Flame,,


### `Real Name` cleaning:

In [44]:
# Inspect the raw value frequencies to spot placeholder labels before cleaning.
characters_dc['Real Name'].value_counts()

Unknown            4361
Bruce Wayne         166
Kal-El              118
Lois Lane            86
None                 77
                   ... 
Sebastian Clark       1
David Stevens         1
Hurrambi Marlo        1
Nicki Jones           1
Khaji Da              1
Name: Real Name, Length: 10899, dtype: int64

In [45]:
# Removing links of the format [#] (footnote markers such as " [1]" or "[12]").
# regex=True is explicit because the pandas 2.0 default (regex=False) would
# treat the pattern as a literal string and match nothing.
characters_dc['Real Name'] = characters_dc['Real Name'].str.replace(
    r'\s?\[\d+\]', '', regex=True
)

# Group the placeholder values ('None' and the empty string) under 'Unknown'.
characters_dc.loc[characters_dc['Real Name'].isin(['None', '']), 'Real Name'] = 'Unknown'

characters_dc['Real Name'].value_counts()

Unknown              4501
Bruce Wayne           166
Kal-El                118
Lois Lane              86
Alfred Pennyworth      64
                     ... 
Sebastian Clark         1
David Stevens           1
Hurrambi Marlo          1
Nicki Jones             1
Kara Zor-El II          1
Name: Real Name, Length: 10887, dtype: int64

### `Current Alias` cleaning:

In [46]:
# Inspect the raw alias frequencies; the empty string is the most common value.
characters_dc['Current Alias'].value_counts()

                       4979
Green Lantern           444
Batman                  203
Superman                150
Wonder Woman             98
                       ... 
Green Hood                1
Carla Marcus-Jordan       1
Herculina                 1
White Whip                1
Olivia Hardy              1
Name: Current Alias, Length: 9334, dtype: int64

In [47]:
# Removing links of the format [#] (footnote markers such as " [1]" or "[12]").
# regex=True is explicit because the pandas 2.0 default (regex=False) would
# treat the pattern as a literal string and match nothing.
characters_dc['Current Alias'] = characters_dc['Current Alias'].str.replace(
    r'\s?\[\d+\]', '', regex=True
)

# Missing aliases are stored as empty strings; label them 'Unknown'.
# Done after stripping the markers so that an alias consisting only of a marker
# is also caught.
characters_dc.loc[characters_dc['Current Alias'] == '', 'Current Alias'] = 'Unknown'

characters_dc['Current Alias'].value_counts()

Unknown                4979
Green Lantern           444
Batman                  203
Superman                150
Wonder Woman             98
                       ... 
Carla Marcus-Jordan       1
Herculina                 1
White Whip                1
Airstryke                 1
Crisis                    1
Name: Current Alias, Length: 9329, dtype: int64

### `Relatives` and `Affiliation` cleaning:

**Again, we transform everything to a list.**

In [48]:
# Each raw value is a ', '-joined string of links that starts with a stray ', '.
# Strip that leading separator only (anchored with ^): an unanchored single
# replacement would merge the first two links of any value that does not start
# with ', '. Then split the remainder into a list of links.
# NOTE: an empty string splits to [''] (which pandas displays as []), not to a
# truly empty list.
for column in ('Relatives', 'Affiliation'):
    characters_dc[column] = (
        characters_dc[column]
        .str.replace(r'^, ', '', regex=True)
        .str.split(', ')
    )

characters_dc

Unnamed: 0,URL,Real Name,Current Alias,Relatives,Affiliation
0,/wiki/Aaron_Hayley_(New_Earth),Aaron Hayley,Swamp Thing,[],[]
1,/wiki/Abigail_Fine_(Smallville),Abigail Fine,Abigail Fine,[/wiki/Elise_Fine_(Smallville)],[]
2,/wiki/Adam_Strange_(JSA:_The_Golden_Age),Adam Strange,Adam Strange,[],[]
3,/wiki/Alan_Barnes_(New_Earth),Alan Barnes,Brainstorm,[],[]
4,/wiki/Alan_Scott_(Earth_2),Alan Scott,Green Lantern,[/wiki/Sam_Zhao_(Earth_2)],"[/wiki/Wonders_of_the_World, /wiki/The_Green]"
...,...,...,...,...,...
10472,/wiki/Zeta_(Earth-One),Unknown,Zeta,[],[/wiki/Pantheon]
10473,/wiki/Zotan_(Earth-S),Zotan,Zotan,[],[]
10474,/wiki/Zond_(Earth-One),Zond,Zond the Sorcerer,[],[/wiki/Morgaine_le_Fey]
10475,/wiki/Zora_Vi-Lar_(Earth-One),Zora Vi-Lar,Black Flame,[],[]


### Save the cleaned dataframe:

In [49]:
# Persist the cleaned DC frame. The context manager guarantees the file is
# flushed and closed, instead of leaving an open handle behind.
with open('data/clean_dc.txt', 'wb') as out_file:
    pickle.dump(characters_dc, out_file)