In [2]:
# import library
import pandas as pd

# load dataset: the Marvel Wikia character table published in the
# fivethirtyeight/data GitHub repository, read directly from the raw URL
# (this cell needs network access to run)
url = "https://raw.githubusercontent.com/fivethirtyeight/data/master/comic-characters/marvel-wikia-data.csv"
marvel_df = pd.read_csv(url)

# preview the first 5 rows; as the last expression it is rendered as the cell output
marvel_df.head()

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,Year
0,1678,Spider-Man (Peter Parker),\/Spider-Man_(Peter_Parker),Secret Identity,Good Characters,Hazel Eyes,Brown Hair,Male Characters,,Living Characters,4043.0,Aug-62,1962.0
1,7139,Captain America (Steven Rogers),\/Captain_America_(Steven_Rogers),Public Identity,Good Characters,Blue Eyes,White Hair,Male Characters,,Living Characters,3360.0,Mar-41,1941.0
2,64786,"Wolverine (James \""Logan\"" Howlett)",\/Wolverine_(James_%22Logan%22_Howlett),Public Identity,Neutral Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,3061.0,Oct-74,1974.0
3,1868,"Iron Man (Anthony \""Tony\"" Stark)",\/Iron_Man_(Anthony_%22Tony%22_Stark),Public Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,2961.0,Mar-63,1963.0
4,2460,Thor (Thor Odinson),\/Thor_(Thor_Odinson),No Dual Identity,Good Characters,Blue Eyes,Blond Hair,Male Characters,,Living Characters,2258.0,Nov-50,1950.0


In [3]:
# Tidy the raw frame in a single chain:
#   1. drop the columns that are not used in this analysis
#   2. give the remaining columns readable, consistently cased names
marvel_df = (
    marvel_df
    .drop(columns=['urlslug', 'GSM'])
    .rename(columns={
        'page_id': 'Page_Id',
        'name': 'Char Name',
        'ID': 'ID Status',
        'ALIGN': 'Align',
        'EYE': 'Eye',
        'HAIR': 'Hair',
        'SEX': 'Sex',
        'ALIVE': 'Alive',
        'APPEARANCES': 'Total Appearances',
        'FIRST APPEARANCE': 'First Appearance',
    })
)

# preview the first 5 rows with the new column names
marvel_df.head()

Unnamed: 0,Page_Id,Char Name,ID Status,Align,Eye,Hair,Sex,Alive,Total Appearances,First Appearance,Year
0,1678,Spider-Man (Peter Parker),Secret Identity,Good Characters,Hazel Eyes,Brown Hair,Male Characters,Living Characters,4043.0,Aug-62,1962.0
1,7139,Captain America (Steven Rogers),Public Identity,Good Characters,Blue Eyes,White Hair,Male Characters,Living Characters,3360.0,Mar-41,1941.0
2,64786,"Wolverine (James \""Logan\"" Howlett)",Public Identity,Neutral Characters,Blue Eyes,Black Hair,Male Characters,Living Characters,3061.0,Oct-74,1974.0
3,1868,"Iron Man (Anthony \""Tony\"" Stark)",Public Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,Living Characters,2961.0,Mar-63,1963.0
4,2460,Thor (Thor Odinson),No Dual Identity,Good Characters,Blue Eyes,Blond Hair,Male Characters,Living Characters,2258.0,Nov-50,1950.0


In [4]:
# print a summary of the frame: number of rows, per-column non-null counts and dtypes
# (the non-null counts show which columns contain missing values)
marvel_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16376 entries, 0 to 16375
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Page_Id            16376 non-null  int64  
 1   Char Name          16376 non-null  object 
 2   ID Status          12606 non-null  object 
 3   Align              13564 non-null  object 
 4   Eye                6609 non-null   object 
 5   Hair               12112 non-null  object 
 6   Sex                15522 non-null  object 
 7   Alive              16373 non-null  object 
 8   Total Appearances  15280 non-null  float64
 9   First Appearance   15561 non-null  object 
 10  Year               15561 non-null  float64
dtypes: float64(2), int64(1), object(8)
memory usage: 1.4+ MB


In [5]:
# handling missing data

## Replacement value for each column that has nulls:
##   - categorical columns get an explicit placeholder label
##   - Total Appearances gets the column median (computed here, before any fill)
## Columns not listed (Alive, First Appearance, Year) are left as-is, so any
## row still holding a null in one of them is removed by the dropna() below.
marvel_df = marvel_df.fillna({
    'ID Status': 'No Dual Identity',
    'Align': 'Gray Characters',
    'Eye': 'Eye-Less',
    'Hair': 'Hair-Less',
    'Sex': 'Non-Binary',
    'Total Appearances': marvel_df['Total Appearances'].median(),
}).dropna(axis=0)

# print data information after handling missing data
marvel_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15561 entries, 0 to 16175
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Page_Id            15561 non-null  int64  
 1   Char Name          15561 non-null  object 
 2   ID Status          15561 non-null  object 
 3   Align              15561 non-null  object 
 4   Eye                15561 non-null  object 
 5   Hair               15561 non-null  object 
 6   Sex                15561 non-null  object 
 7   Alive              15561 non-null  object 
 8   Total Appearances  15561 non-null  float64
 9   First Appearance   15561 non-null  object 
 10  Year               15561 non-null  float64
dtypes: float64(2), int64(1), object(8)
memory usage: 1.4+ MB


In [6]:
# Both numeric columns were read as float; cast them to integers in one call.
marvel_df = marvel_df.astype({
    'Total Appearances': int,
    'Year': int,
})

# preview the first 5 rows of the cleaned data
marvel_df.head()

Unnamed: 0,Page_Id,Char Name,ID Status,Align,Eye,Hair,Sex,Alive,Total Appearances,First Appearance,Year
0,1678,Spider-Man (Peter Parker),Secret Identity,Good Characters,Hazel Eyes,Brown Hair,Male Characters,Living Characters,4043,Aug-62,1962
1,7139,Captain America (Steven Rogers),Public Identity,Good Characters,Blue Eyes,White Hair,Male Characters,Living Characters,3360,Mar-41,1941
2,64786,"Wolverine (James \""Logan\"" Howlett)",Public Identity,Neutral Characters,Blue Eyes,Black Hair,Male Characters,Living Characters,3061,Oct-74,1974
3,1868,"Iron Man (Anthony \""Tony\"" Stark)",Public Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,Living Characters,2961,Mar-63,1963
4,2460,Thor (Thor Odinson),No Dual Identity,Good Characters,Blue Eyes,Blond Hair,Male Characters,Living Characters,2258,Nov-50,1950


In [13]:
# visualize data with altair
## horizontal bar chart of the 30 characters with the most comic appearances

import altair as alt

# Select the top 30 by appearance count explicitly instead of relying on the
# row order of the source CSV.
marvel = marvel_df.nlargest(30, 'Total Appearances')

# Base chart: one bar per character, bars ordered by descending appearance count.
chart1 = alt.Chart(marvel).mark_bar().encode(
    y=alt.Y("Char Name", sort="-x", title="Char Name"),
    x=alt.X("Total Appearances", title="Total Appearances"),
    color=alt.Color('Char Name:N', legend=None),
    tooltip=["Char Name", "Total Appearances"]
).properties(
    title='Top 30 Number Appearances in Comic of Marvel Characters',
    width=600,
    height=300
)

# Styled, interactive version of the base chart (title/axis fonts, pan & zoom).
chart = chart1.configure_title(fontSize=20, anchor='start', color='black').configure_axis(
    labelFontSize=12,
    titleFontSize=14
).interactive()

# Display the configured chart; showing `chart1` here would silently discard
# the title/axis configuration and the interactivity set up above.
chart.display()


In [14]:
# visualize data with altair
## bubble chart: the five most-featured characters, placed by the year of their
## first appearance, with bubble size showing total appearances

# Select the top 5 by appearance count explicitly instead of relying on the
# row order of the source CSV.
marvel2 = marvel_df.nlargest(5, 'Total Appearances')

chart2 = alt.Chart(marvel2).mark_circle(
    opacity=0.8,
    stroke='black',
    strokeWidth=1,
    strokeOpacity=0.4
).encode(
    alt.X('Year:O')
        .title("The First Year Marvel Character Appeared in Comics"),
    alt.Y('Char Name')
        .title("Marvel Characters Names")
        .sort(field="Total Appearances", op="sum", order='descending'),
    alt.Size('Total Appearances:Q')
        .scale(range=[0, 1000])
        .title('Total Appearances')
        .legend(clipHeight=30, format='s'),
    alt.Color('Char Name').legend(None),
    tooltip=[
        "Char Name",
        # Year is a plain integer (e.g. 1962), not a datetime. Typing it as
        # temporal makes it be read as milliseconds since the epoch, so every
        # tooltip would show 1970. Use the same ordinal type as the x axis.
        alt.Tooltip("Year:O"),
        alt.Tooltip("Total Appearances:Q", format='~s')
    ],
).properties(
    width=450,
    height=320,
    title=alt.Title(
        text="Top 5 Number of Marvel Characters Appearances in Comics",
        subtitle="The size of the bubble represents the total appearances marvel characters in comics",
        anchor='start'
    )
).configure_axisY(
    domain=False,
    ticks=False,
    offset=10
).configure_axisX(
    grid=False,
).configure_view(
    stroke=None
).interactive()

chart2.display()

In [15]:
# visualize data with altair
## radial chart of the number of living and deceased characters

# Count characters per 'Alive' category. Naming the index and the value column
# explicitly yields the columns ['Alive', 'count'] regardless of pandas version
# (older releases name the value_counts() output differently).
Alive = marvel_df['Alive'].value_counts().rename_axis('Alive').reset_index(name='count')
print(Alive)

# Shared encodings: arc angle and radius both scale with the count.
# Colour is keyed on the category itself so the legend shows
# "Living Characters" / "Deceased Characters" rather than the raw counts.
base = alt.Chart(Alive).encode(
    theta=alt.Theta("count:Q", stack=True),
    radius=alt.Radius("count", scale=alt.Scale(type="sqrt", zero=True, rangeMin=20)),
    color="Alive:N",
).properties(
    title="Number of Living and Deceased Characters",
)

# Arc layer
c1 = base.mark_arc(innerRadius=20, stroke="#fff")

# Label layer: category name placed just outside each arc
c2 = base.mark_text(radiusOffset=40).encode(text="Alive")

c1 + c2

                 Alive  count
0    Living Characters  11921
1  Deceased Characters   3640


In [16]:
# visualize data with altair
## radial chart of the number of characters per 'Sex' category

# Count characters per 'Sex' category. value_counts() already returns the
# categories in descending order of frequency. Naming the index and the value
# column explicitly yields the columns ['Sex', 'count'] regardless of pandas
# version (older releases name the value_counts() output differently).
Sex = marvel_df['Sex'].value_counts().rename_axis('Sex').reset_index(name='count')
print(Sex)

# Shared encodings: arc angle and radius both scale with the count.
# Colour is keyed on the category itself so the legend shows the category
# names rather than the raw counts.
base = alt.Chart(Sex).encode(
    theta=alt.Theta("count:Q", stack=True),
    radius=alt.Radius("count", scale=alt.Scale(type="sqrt", zero=True, rangeMin=20)),
    color="Sex:N",
).properties(
    title="Number of Sex Type Characters",
)

# Arc layer
c1 = base.mark_arc(innerRadius=20, stroke="#fff")

# Label layer: category name placed just outside each arc
c2 = base.mark_text(radiusOffset=40).encode(text="Sex")

c1 + c2

                      Sex  count
0         Male Characters  11100
1       Female Characters   3628
2              Non-Binary    795
3      Agender Characters     36
4  Genderfluid Characters      2


In [17]:
# visualize data with altair
## bar chart of the number of characters per hair type

# Frequency table of hair types, most common first.
Hair = (
    marvel_df['Hair']
    .value_counts()
    .reset_index()
    .sort_values(by='count', ascending=False)
)
print(Hair)

# One bar per hair type; bar colour follows the hair type as well.
alt.Chart(Hair, title="Number of Hair Type Characters").mark_bar().encode(
    x=alt.X("Hair"),
    y=alt.Y('count'),
    color=alt.Color("Hair", scale=alt.Scale(scheme='pinkyellowgreen')),
)

                     Hair  count
0               Hair-Less   4003
1              Black Hair   3585
2              Brown Hair   2244
3              Blond Hair   1498
4                 No Hair   1118
5                    Bald    786
6              White Hair    719
7                Red Hair    599
8               Grey Hair    514
9              Green Hair    112
10            Auburn Hair     73
11              Blue Hair     55
12  Strawberry Blond Hair     46
13            Orange Hair     43
14            Purple Hair     42
15              Pink Hair     30
16          Variable Hair     29
17            Yellow Hair     20
18            Silver Hair     16
19              Gold Hair      8
20     Reddish Blond Hair      6
21       Light Brown Hair      5
22           Magenta Hair      5
23      Orange-brown Hair      3
24            Bronze Hair      1
25              Dyed Hair      1


In [18]:
# visualize data with altair
## bar chart of the number of characters per eye type

# Frequency table of eye types, most common first.
Eye = (
    marvel_df['Eye']
    .value_counts()
    .reset_index()
    .sort_values(by='count', ascending=False)
)
print(Eye)

# One bar per eye type; bar colour follows the eye type as well.
alt.Chart(Eye, title="Number of Eye Type Characters").mark_bar().encode(
    x=alt.X("Eye"),
    y=alt.Y('count'),
    color=alt.Color("Eye", scale=alt.Scale(scheme='redyellowblue')),
)

                Eye  count
0          Eye-Less   9349
1         Blue Eyes   1857
2        Brown Eyes   1821
3        Green Eyes    580
4        Black Eyes    514
5          Red Eyes    480
6        White Eyes    363
7       Yellow Eyes    238
8         Grey Eyes     92
9        Hazel Eyes     67
10    Variable Eyes     46
11      Purple Eyes     25
12      Orange Eyes     24
13          One Eye     21
14        Pink Eyes     19
15        Gold Eyes     14
16      Silver Eyes     11
17      Violet Eyes     10
18          No Eyes      7
19       Amber Eyes      6
20  Yellow Eyeballs      6
21    Multiple Eyes      5
22   Black Eyeballs      3
23     Magenta Eyes      2
24    Compound Eyes      1


In [19]:
# visualize data with altair
## pie chart of the number of characters per alignment

# Frequency table of alignments, most common first.
Align = (
    marvel_df['Align']
    .value_counts()
    .reset_index()
    .sort_values(by='count', ascending=False)
)
print(Align)

# Encodings shared by the slice layer and the label layer:
# slice angle from the count, one colour per alignment, no legend
# (the labels drawn around the pie identify the slices instead).
base = alt.Chart(Align, title="Number of Align Type Characters").encode(
    theta=alt.Theta("count:Q", stack=True),
    color=alt.Color("Align:N", legend=None),
)

# Slices
pie = base.mark_arc(outerRadius=120)
# Alignment names placed outside the slices
text = base.mark_text(radius=160, size=10).encode(text=alt.Text("Align:N"))

pie + text

                Align  count
0      Bad Characters   6450
1     Good Characters   4366
2     Gray Characters   2672
3  Neutral Characters   2073


In [20]:
# visualize data with altair
## scatter plot of the number of characters per identity status

# Frequency table of identity statuses, most common first.
ID_Status = (
    marvel_df['ID Status']
    .value_counts()
    .reset_index()
    .sort_values(by='count', ascending=False)
)
print(ID_Status)

# One fixed-size point per identity status, coloured by status (legend hidden);
# hovering a point shows the status and its count.
alt.Chart(ID_Status).mark_circle(size=100).encode(
    alt.X('ID Status'),
    alt.Y('count'),
    alt.Color('ID Status:N', legend=None),
    tooltip=['ID Status', 'count'],
).interactive()

                       ID Status  count
0                Secret Identity   5976
1               No Dual Identity   5200
2                Public Identity   4370
3  Known to Authorities Identity     15
