In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample records that deliberately mix several "missing" markers:
# np.nan, None, and the placeholder strings 'NA' / 'Missing'.
# Note that 'age' holds strings, not numbers.
people = dict(
    first=['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    last=['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'],
    email=[
        'Corey.Schafer@email.com',
        'Jane.Doe@email.com',
        'John.Doe@email.com',
        None,
        np.nan,
        'Anonymous@email.com',
        'NA',
    ],
    age=['33', '55', '63', '36', None, None, 'Missing'],
)

In [3]:
# Build a DataFrame from the `people` dict; each key becomes a column.
df = pd.DataFrame(people)
# Display the full frame so the rows with missing data can be seen.
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,Corey.Schafer@email.com,33
1,Jane,Doe,Jane.Doe@email.com,55
2,John,Doe,John.Doe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [4]:
# With no arguments, dropna() removes every row that contains at least one
# missing value. Row 6 survives because 'NA' and 'Missing' are still plain
# strings at this point, not real missing values.
df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,Corey.Schafer@email.com,33
1,Jane,Doe,Jane.Doe@email.com,55
2,John,Doe,John.Doe@email.com,63
6,,Missing,,Missing


In [5]:
# Same call with the defaults spelled out: axis='index' drops rows, and
# how='any' drops a row as soon as any of its values is missing.
df.dropna(axis='index', how='any')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,Corey.Schafer@email.com,33
1,Jane,Doe,Jane.Doe@email.com,55
2,John,Doe,John.Doe@email.com,63
6,,Missing,,Missing


In [6]:
# how='all' drops a row only when every value in it is missing;
# here that removes just row 4.
df.dropna(axis='index', how='all')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,Corey.Schafer@email.com,33
1,Jane,Doe,Jane.Doe@email.com,55
2,John,Doe,John.Doe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [7]:
# axis='columns' applies the test to columns instead of rows. No column is
# entirely missing, so nothing is dropped.
df.dropna(axis='columns', how='all')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,Corey.Schafer@email.com,33
1,Jane,Doe,Jane.Doe@email.com,55
2,John,Doe,John.Doe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [8]:
# how='any' on columns drops every column that has at least one missing
# value. All four columns do, so only the index is left.
df.dropna(axis='columns', how='any')

0
1
2
3
4
5
6


In [9]:
# subset limits the missing-value check to the listed columns: rows are
# dropped only when 'email' is missing (rows 3 and 4 here).
df.dropna(axis='index', how='any', subset=['email'])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,Corey.Schafer@email.com,33
1,Jane,Doe,Jane.Doe@email.com,55
2,John,Doe,John.Doe@email.com,63
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [10]:
# With how='all' and a two-column subset, a row is dropped only when both
# 'last' and 'email' are missing (row 4 here).
df.dropna(axis='index', how='all', subset=['last', 'email'])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,Corey.Schafer@email.com,33
1,Jane,Doe,Jane.Doe@email.com,55
2,John,Doe,John.Doe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [11]:
# Turn the custom placeholder strings into real missing values so that
# dropna/isna/fillna recognise them. Both placeholders are handled in a
# single replace call, and the result is assigned back rather than mutating
# the frame with inplace=True.
df = df.replace(['NA', 'Missing'], np.nan)
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,Corey.Schafer@email.com,33.0
1,Jane,Doe,Jane.Doe@email.com,55.0
2,John,Doe,John.Doe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [12]:
# Now that the placeholder strings have been converted to NaN, row 6 is
# dropped as well; only the fully populated rows 0-2 remain.
df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,Corey.Schafer@email.com,33
1,Jane,Doe,Jane.Doe@email.com,55
2,John,Doe,John.Doe@email.com,63


In [13]:
# Boolean mask with the same shape as df: True wherever a value is missing.
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [14]:
# Return a copy in which every missing value is replaced by the string
# 'MISSING'. df itself is not modified because the result is not assigned.
df.fillna('MISSING')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,Corey.Schafer@email.com,33
1,Jane,Doe,Jane.Doe@email.com,55
2,John,Doe,John.Doe@email.com,63
3,Chris,Schafer,MISSING,36
4,MISSING,MISSING,MISSING,MISSING
5,MISSING,MISSING,Anonymous@email.com,MISSING
6,MISSING,MISSING,MISSING,MISSING


In [15]:
# Any scalar can be used as the fill value; here 0 replaces every missing
# entry, including those in the text columns.
df.fillna(0)

Unnamed: 0,first,last,email,age
0,Corey,Schafer,Corey.Schafer@email.com,33
1,Jane,Doe,Jane.Doe@email.com,55
2,John,Doe,John.Doe@email.com,63
3,Chris,Schafer,0,36
4,0,0,0,0
5,0,0,Anonymous@email.com,0
6,0,0,0,0


In [16]:
# Select only the 'first' and 'last' columns and fill their missing values
# with 0; the result contains just those two columns.
df[['first', 'last']].fillna(0)

Unnamed: 0,first,last
0,Corey,Schafer
1,Jane,Doe
2,John,Doe
3,Chris,Schafer
4,0,0
5,0,0
6,0,0


In [17]:
# Inspect the column dtypes: 'age' is object because its values were
# entered as strings.
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [18]:
# np.nan is a float value.
type(np.nan)

float

In [19]:
# Averaging fails while 'age' still holds strings; catch the TypeError and
# print its message instead of letting the cell raise.
try:
    df.loc[:, 'age'].mean()
except TypeError as err:
    print("TypeError: {}".format(err))

TypeError: can only concatenate str (not "int") to str


In [20]:
# Convert the string ages to float so numeric operations work; float is the
# target type because the column still contains NaN.
df['age'] = df['age'].astype(float)

In [21]:
# Confirm the conversion: 'age' is now float64.
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [22]:
# The mean now works; missing ages are skipped, so this is the average of
# the four known values.
df['age'].mean()

46.75

In [23]:
# Extra strings that read_csv should parse as missing values.
na_vals = ['NA', 'Missing']

# NOTE: this rebinds `df`, replacing the small example frame used above
# with the survey results, indexed by the 'Respondent' column.
df = pd.read_csv(
    'data/survey_results_public.csv',
    index_col='Respondent',
    na_values=na_vals,
)

# Companion schema file, indexed by its 'Column' column.
schema_df = pd.read_csv(
    'data/survey_results_schema.csv',
    index_col='Column',
)

In [24]:
# Preview the first ten YearsCode entries; the dtype is object, so the
# numbers are stored as strings.
df['YearsCode'].head(10)

Respondent
1       4
2     NaN
3       3
4       3
5      16
6      13
7       6
8       8
9      12
10     12
Name: YearsCode, dtype: object

In [25]:
# YearsCode is still an object column, so taking the mean raises a
# TypeError; catch it and print the message.
try:
    df.loc[:, 'YearsCode'].mean()
except TypeError as err:
    print("TypeError: {}".format(err))

TypeError: can only concatenate str (not "int") to str


In [26]:
# Try the direct cast to float. It fails because some entries are text
# labels rather than numbers, so catch the ValueError and print the value
# that could not be converted.
try:
    df['YearsCode'] = df.loc[:, 'YearsCode'].astype(float)
except ValueError as err:
    print("ValueError: {}".format(err))

ValueError: could not convert string to float: 'Less than 1 year'


In [27]:
# List the distinct values to find the non-numeric labels that block the
# float conversion.
df['YearsCode'].unique()

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', 'Less than 1 year', '30', '9', '26', '40', '19',
       '15', '20', '28', '25', '1', '22', '11', '33', '50', '41', '18',
       '34', '24', '23', '42', '27', '21', '36', '32', '39', '38', '31',
       '37', 'More than 50 years', '29', '44', '45', '48', '46', '43',
       '47', '49'], dtype=object)

In [28]:
# Map the 'Less than 1 year' label to 0 so the column can later be cast to
# float. The result is assigned back to the column: calling
# replace(..., inplace=True) on df['YearsCode'] is a chained in-place
# operation that acts on a temporary Series and does not update df under
# pandas Copy-on-Write.
df['YearsCode'] = df['YearsCode'].replace('Less than 1 year', 0)

In [29]:
# Map the 'More than 50 years' label to 51 so the column can later be cast
# to float. The result is assigned back to the column: calling
# replace(..., inplace=True) on df['YearsCode'] is a chained in-place
# operation that acts on a temporary Series and does not update df under
# pandas Copy-on-Write.
df['YearsCode'] = df['YearsCode'].replace('More than 50 years', 51)

In [30]:
# With the two text labels replaced by numbers, the cast to float can
# succeed.
df['YearsCode'] = df['YearsCode'].astype(float)
# Mean years of coding experience across respondents.
df.loc[:, 'YearsCode'].mean()

11.684505040635964

In [31]:
# Median years of coding experience, for comparison with the mean.
df.loc[:, 'YearsCode'].median()

9.0