In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import thinkstats2
import thinkplot

import statsmodels.formula.api as smf

import pickle

In [2]:
# Read the raw DC and Marvel character tables (paths are relative to
# the notebook's working directory).
dc_df, marvel_df = (
    pd.read_csv(csv_name) for csv_name in ('dc_comic.csv', 'marvel_comic.csv')
)

In [3]:
# Add a column identifying the character's universe (1 = DC, 0 = Marvel).
dc_df['dcmar'] = 1
marvel_df['dcmar'] = 0

# Make the year variable consistent: the Marvel file calls it 'Year',
# the DC file 'YEAR'. The guard keeps this cell re-runnable once 'Year'
# has already been replaced.
if 'Year' in marvel_df.columns:
    marvel_df['YEAR'] = marvel_df['Year']
    marvel_df = marvel_df.drop('Year', axis=1)

# Combine. ignore_index=True gives the stacked frame a fresh 0..n-1
# index; without it the row labels of the two source frames overlap and
# label-based lookups on both_df become ambiguous.
frames = [dc_df, marvel_df]
both_df = pd.concat(frames, sort=False, ignore_index=True)

In [4]:
# Preview the first rows of the DC frame, including the new 'dcmar' flag.
dc_df.head()

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,YEAR,dcmar
0,1422,Batman (Bruce Wayne),\/wiki\/Batman_(Bruce_Wayne),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,3093.0,"1939, May",1939.0,1
1,23387,Superman (Clark Kent),\/wiki\/Superman_(Clark_Kent),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,2496.0,"1986, October",1986.0,1
2,1458,Green Lantern (Hal Jordan),\/wiki\/Green_Lantern_(Hal_Jordan),Secret Identity,Good Characters,Brown Eyes,Brown Hair,Male Characters,,Living Characters,1565.0,"1959, October",1959.0,1
3,1659,James Gordon (New Earth),\/wiki\/James_Gordon_(New_Earth),Public Identity,Good Characters,Brown Eyes,White Hair,Male Characters,,Living Characters,1316.0,"1987, February",1987.0,1
4,1576,Richard Grayson (New Earth),\/wiki\/Richard_Grayson_(New_Earth),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,1237.0,"1940, April",1940.0,1


In [5]:
# Summary statistics for the numeric DC columns.
dc_df.describe()

Unnamed: 0,page_id,APPEARANCES,YEAR,dcmar
count,6896.0,6541.0,6827.0,6896.0
mean,147441.209252,23.625134,1989.766662,1.0
std,108388.631149,87.378509,16.824194,0.0
min,1380.0,1.0,1935.0,1.0
25%,44105.5,2.0,1983.0,1.0
50%,141267.0,6.0,1992.0,1.0
75%,213203.0,15.0,2003.0,1.0
max,404010.0,3093.0,2013.0,1.0


In [6]:
# Check the columns of the combined frame.
both_df.columns

Index(['page_id', 'name', 'urlslug', 'ID', 'ALIGN', 'EYE', 'HAIR', 'SEX',
       'GSM', 'ALIVE', 'APPEARANCES', 'FIRST APPEARANCE', 'YEAR', 'dcmar'],
      dtype='object')

In [7]:
# Preview the first rows of the Marvel frame, including the new 'dcmar' flag.
marvel_df.head()

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,dcmar,YEAR
0,1678,Spider-Man (Peter Parker),\/Spider-Man_(Peter_Parker),Secret Identity,Good Characters,Hazel Eyes,Brown Hair,Male Characters,,Living Characters,4043.0,Aug-62,0,1962.0
1,7139,Captain America (Steven Rogers),\/Captain_America_(Steven_Rogers),Public Identity,Good Characters,Blue Eyes,White Hair,Male Characters,,Living Characters,3360.0,Mar-41,0,1941.0
2,64786,"Wolverine (James \""Logan\"" Howlett)",\/Wolverine_(James_%22Logan%22_Howlett),Public Identity,Neutral Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,3061.0,Oct-74,0,1974.0
3,1868,"Iron Man (Anthony \""Tony\"" Stark)",\/Iron_Man_(Anthony_%22Tony%22_Stark),Public Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,2961.0,Mar-63,0,1963.0
4,2460,Thor (Thor Odinson),\/Thor_(Thor_Odinson),No Dual Identity,Good Characters,Blue Eyes,Blond Hair,Male Characters,,Living Characters,2258.0,Nov-50,0,1950.0


In [8]:
# Summary statistics for the numeric Marvel columns.
marvel_df.describe()

Unnamed: 0,page_id,APPEARANCES,dcmar,YEAR
count,16376.0,15280.0,16376.0,15561.0
mean,300232.082377,17.033377,0.0,1984.951803
std,253460.403399,96.372959,0.0,19.663571
min,1025.0,1.0,0.0,1939.0
25%,28309.5,1.0,0.0,1974.0
50%,282578.0,3.0,0.0,1990.0
75%,509077.0,8.0,0.0,2000.0
max,755278.0,4043.0,0.0,2013.0


In [9]:
# NOTE(review): this cell repeats the previous marvel_df.describe() call
# unchanged; presumably both_df.describe() was intended - confirm, or
# delete the duplicate cell.
marvel_df.describe()

Unnamed: 0,page_id,APPEARANCES,dcmar,YEAR
count,16376.0,15280.0,16376.0,15561.0
mean,300232.082377,17.033377,0.0,1984.951803
std,253460.403399,96.372959,0.0,19.663571
min,1025.0,1.0,0.0,1939.0
25%,28309.5,1.0,0.0,1974.0
50%,282578.0,3.0,0.0,1990.0
75%,509077.0,8.0,0.0,2000.0
max,755278.0,4043.0,0.0,2013.0


In [10]:
# 'GSM' is N/A for most characters, meaning they do not identify as one
# of the listed gender or sexual minorities. Give those rows an explicit
# code so that 'GSM' can still be used as a predictor without dropna()
# discarding most of the data.
for frame in (both_df, marvel_df, dc_df):
    frame['GSM'] = frame['GSM'].fillna('Not Stated')

In [11]:
# 'FIRST APPEARANCE' carries nearly the same information as 'YEAR' and
# is not used in the analysis, so remove it from every frame.
both_df, marvel_df, dc_df = (
    frame.drop(columns=['FIRST APPEARANCE'])
    for frame in (both_df, marvel_df, dc_df)
)

In [12]:
def map_cat_to_number(variable):
    """Creates a mapping from a categorical label to an integer.

    Labels are numbered in the order of the variable's categories. The
    mapped values are always returned as plain numbers, never as a
    categorical, so the result can be used directly in numeric code.

    Args:
        variable (pandas Series): The categorical variable to map. A
            Series that is not categorical yet is converted first.

    Returns:
        cat_dict (dictionary): A dictionary with keys as the categorical
            labels and values as the integers corresponding to the label.
        mapped_variable (pandas Series): The variable with categorical
            labels replaced with numbers, keeping the original index and
            name. Missing values stay NaN (which makes the Series float);
            without missing values the Series holds integers.
    """
    categorical = variable.astype('category')
    cat_dict = {
        label: code for code, label in enumerate(categorical.cat.categories)
    }

    # Map through an object-typed view: depending on the pandas version,
    # calling .map() on a categorical Series returns another categorical
    # (with integer categories) instead of numeric values.
    mapped_variable = categorical.astype(object).map(cat_dict)
    return cat_dict, mapped_variable

In [13]:
def object_types_to_numeric(df):
    """Converts columns of type 'object' to numeric values.

    The conversion is done on a copy, so the data frame passed in is not
    modified.

    Args:
        df (pandas DataFrame): The data frame in which to convert columns.

    Returns:
        cat_dicts (dictionary): Maps the name of each converted column to
            the label -> integer dictionary built for that column.
        df (pandas DataFrame): A copy of the data frame with converted
            columns.
    """
    # Work on a copy so the caller's frame keeps its original labels.
    df = df.copy()
    cat_dicts = {}
    for col in df.select_dtypes(include=['object']).columns:
        cat_dicts[col], df[col] = map_cat_to_number(df[col].astype('category'))

    return cat_dicts, df

In [14]:
# Convert data frames to saveable formats: every 'object' column is
# replaced by integer codes. Each frame gets its own label -> code
# dictionaries, built from that frame's own categories, so codes should
# only be decoded with the dictionary that belongs to the same frame.
cat_dicts_both, both_df = object_types_to_numeric(both_df)
cat_dicts_marvel, marvel_df = object_types_to_numeric(marvel_df)
cat_dicts_dc, dc_df = object_types_to_numeric(dc_df)

In [15]:
# Persist the label -> code dictionaries, one pickle file per frame.
for pkl_path, mapping in (
    ('cat_dicts_both.pkl', cat_dicts_both),
    ('cat_dicts_marvel.pkl', cat_dicts_marvel),
    ('cat_dicts_dc.pkl', cat_dicts_dc),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(mapping, f)

In [16]:
# Write each encoded data frame to its own HDF5 file, replacing any
# existing file of the same name.
hdf_targets = (
    (both_df, 'both_df.hdf5', 'both'),
    (marvel_df, 'marvel_df.hdf5', 'marvel'),
    (dc_df, 'dc_df.hdf5', 'dc'),
)
for frame, hdf_path, hdf_key in hdf_targets:
    frame.to_hdf(hdf_path, key=hdf_key, mode='w')