# Importing the Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
import random

# Generating the dataset

In [3]:
np.random.seed(0)
degree_option = ['Bachelor', 'Masters', 'No Degree']
age_option = [18, 25, 30]
degree = []
age = []
for i in range(0, 10):
    s = np.random.choice(degree_option)
    y = np.random.choice(age_option)
    degree.append(s)
    age.append(y)
    
for i in range(0,2):
    s = np.random.choice(list(range(0,7)),replace=False)
    y = np.random.choice(list(range(3,10)),replace=False)
    degree[s]= np.nan
    age[y] = np.nan
    

# Creating a dataframe and adding other columns

In [7]:
df = pd.DataFrame(np.array([degree,age]).T, columns=['Degree', 'Age'])

In [8]:

df['Degree'] = np.where(df['Degree']== 'nan', np.nan, df['Degree'])
df['Age'] = np.where(df['Age']== 'nan', np.nan, df['Age'])

df.insert(0,'city', [np.random.choice(['Berlin', 'Madrid', 'Lisbon']) for i in range(0,10)])
df['Salary'] = ['45,000'] + [np.nan for i in range(0,9)]
df['married'] = [np.random.choice([1,0]) for i in range(0,10)]


In [9]:
df

Unnamed: 0,city,Degree,Age,Salary,married
0,Lisbon,,25.0,45000.0,0
1,Berlin,Bachelor,25.0,,1
2,Lisbon,,30.0,,1
3,Lisbon,Bachelor,30.0,,1
4,Berlin,Bachelor,18.0,,0
5,Lisbon,Bachelor,,,0
6,Berlin,Masters,30.0,,1
7,Berlin,No Degree,,,0
8,Berlin,Masters,25.0,,1
9,Madrid,Masters,25.0,,1


## Creating Variations of the dataframe

In [10]:
df_example1 = df.drop('Salary', axis=1)
df_example2 =  df.drop(['Degree', 'Age'], axis=1)
df_example3 =  df.drop(['Degree','Salary'], axis=1)
df_example4 =  df.drop(['Age','Salary'], axis=1)

# Imputation methods
### For removing rows

In [11]:
df_example1

Unnamed: 0,city,Degree,Age,married
0,Lisbon,,25.0,0
1,Berlin,Bachelor,25.0,1
2,Lisbon,,30.0,1
3,Lisbon,Bachelor,30.0,1
4,Berlin,Bachelor,18.0,0
5,Lisbon,Bachelor,,0
6,Berlin,Masters,30.0,1
7,Berlin,No Degree,,0
8,Berlin,Masters,25.0,1
9,Madrid,Masters,25.0,1


In [12]:
remove_rows = df_example1.dropna(axis=0)

In [13]:
remove_rows

Unnamed: 0,city,Degree,Age,married
1,Berlin,Bachelor,25,1
3,Lisbon,Bachelor,30,1
4,Berlin,Bachelor,18,0
6,Berlin,Masters,30,1
8,Berlin,Masters,25,1
9,Madrid,Masters,25,1


### For removing columns

In [14]:
df_example2

Unnamed: 0,city,Salary,married
0,Lisbon,45000.0,0
1,Berlin,,1
2,Lisbon,,1
3,Lisbon,,1
4,Berlin,,0
5,Lisbon,,0
6,Berlin,,1
7,Berlin,,0
8,Berlin,,1
9,Madrid,,1


In [15]:
remove_column = df_example2.drop('Salary', axis=1)

In [16]:
remove_column

Unnamed: 0,city,married
0,Lisbon,0
1,Berlin,1
2,Lisbon,1
3,Lisbon,1
4,Berlin,0
5,Lisbon,0
6,Berlin,1
7,Berlin,0
8,Berlin,1
9,Madrid,1


### Mean Imputation

In [17]:
df_example3

Unnamed: 0,city,Age,married
0,Lisbon,25.0,0
1,Berlin,25.0,1
2,Lisbon,30.0,1
3,Lisbon,30.0,1
4,Berlin,18.0,0
5,Lisbon,,0
6,Berlin,30.0,1
7,Berlin,,0
8,Berlin,25.0,1
9,Madrid,25.0,1


In [18]:
age_mean = df_example3['Age'].astype(float).mean()
df_example3['Age'] = df_example3['Age'].fillna(age_mean)

In [21]:
age_mean

26.0

In [19]:
df_example3

Unnamed: 0,city,Age,married
0,Lisbon,25,0
1,Berlin,25,1
2,Lisbon,30,1
3,Lisbon,30,1
4,Berlin,18,0
5,Lisbon,26,0
6,Berlin,30,1
7,Berlin,26,0
8,Berlin,25,1
9,Madrid,25,1


### Mode Imputation

In [22]:
df_example4

Unnamed: 0,city,Degree,married
0,Lisbon,,0
1,Berlin,Bachelor,1
2,Lisbon,,1
3,Lisbon,Bachelor,1
4,Berlin,Bachelor,0
5,Lisbon,Bachelor,0
6,Berlin,Masters,1
7,Berlin,No Degree,0
8,Berlin,Masters,1
9,Madrid,Masters,1


In [23]:
degree_mode = df_example4['Degree'].mode()
df_example4['Degree'] = df_example4['Degree'].fillna(degree_mode[0])

In [24]:
df_example4

Unnamed: 0,city,Degree,married
0,Lisbon,Bachelor,0
1,Berlin,Bachelor,1
2,Lisbon,Bachelor,1
3,Lisbon,Bachelor,1
4,Berlin,Bachelor,0
5,Lisbon,Bachelor,0
6,Berlin,Masters,1
7,Berlin,No Degree,0
8,Berlin,Masters,1
9,Madrid,Masters,1


### Random sampling Imputation

In [25]:
df_example5 = df_example4.copy().drop('Degree', axis=1)
df_example5.insert(1, 'Age', df['Age'])

In [26]:
df_example5

Unnamed: 0,city,Age,married
0,Lisbon,25.0,0
1,Berlin,25.0,1
2,Lisbon,30.0,1
3,Lisbon,30.0,1
4,Berlin,18.0,0
5,Lisbon,,0
6,Berlin,30.0,1
7,Berlin,,0
8,Berlin,25.0,1
9,Madrid,25.0,1


In [27]:
# df_example5['age_impute'] = df_example5['Age'].copy()
df_example5.insert(2, 'age_impute', df['Age'])
random_sample = df_example5['Age'].dropna().sample(
    df_example5['Age'].isnull().sum(), random_state=0)

In [28]:
random_sample

8    25
2    30
Name: Age, dtype: object

In [29]:
random_sample.index = df_example5[df_example5['Age'].isnull()].index

In [33]:
random_sample

5    25
7    30
Name: Age, dtype: object

In [31]:
df_example5.loc[df_example5['Age'].isnull(), 'age_impute'] = random_sample

In [32]:
df_example5

Unnamed: 0,city,Age,age_impute,married
0,Lisbon,25.0,25,0
1,Berlin,25.0,25,1
2,Lisbon,30.0,30,1
3,Lisbon,30.0,30,1
4,Berlin,18.0,18,0
5,Lisbon,,25,0
6,Berlin,30.0,30,1
7,Berlin,,30,0
8,Berlin,25.0,25,1
9,Madrid,25.0,25,1


### Arbitrary Values Imputation

In [34]:
df_example6 =  df.drop(['Degree','Salary'], axis=1)
df_example6

Unnamed: 0,city,Age,married
0,Lisbon,25.0,0
1,Berlin,25.0,1
2,Lisbon,30.0,1
3,Lisbon,30.0,1
4,Berlin,18.0,0
5,Lisbon,,0
6,Berlin,30.0,1
7,Berlin,,0
8,Berlin,25.0,1
9,Madrid,25.0,1


In [35]:
df_example6['Age'] = df_example6['Age'].fillna(-1)
df_example6

Unnamed: 0,city,Age,married
0,Lisbon,25,0
1,Berlin,25,1
2,Lisbon,30,1
3,Lisbon,30,1
4,Berlin,18,0
5,Lisbon,-1,0
6,Berlin,30,1
7,Berlin,-1,0
8,Berlin,25,1
9,Madrid,25,1


### Missing Category Imputation

In [36]:
df_example7 =  df.drop(['Age','Salary'], axis=1)
df_example7

Unnamed: 0,city,Degree,married
0,Lisbon,,0
1,Berlin,Bachelor,1
2,Lisbon,,1
3,Lisbon,Bachelor,1
4,Berlin,Bachelor,0
5,Lisbon,Bachelor,0
6,Berlin,Masters,1
7,Berlin,No Degree,0
8,Berlin,Masters,1
9,Madrid,Masters,1


In [37]:
df_example7['Degree'] = df_example7['Degree'].fillna('Missing')
df_example7

Unnamed: 0,city,Degree,married
0,Lisbon,Missing,0
1,Berlin,Bachelor,1
2,Lisbon,Missing,1
3,Lisbon,Bachelor,1
4,Berlin,Bachelor,0
5,Lisbon,Bachelor,0
6,Berlin,Masters,1
7,Berlin,No Degree,0
8,Berlin,Masters,1
9,Madrid,Masters,1


### Missing Indicator Method 

In [38]:
df_example8 =  df.drop(['Age','Salary'], axis=1)
df_example8

Unnamed: 0,city,Degree,married
0,Lisbon,,0
1,Berlin,Bachelor,1
2,Lisbon,,1
3,Lisbon,Bachelor,1
4,Berlin,Bachelor,0
5,Lisbon,Bachelor,0
6,Berlin,Masters,1
7,Berlin,No Degree,0
8,Berlin,Masters,1
9,Madrid,Masters,1


In [39]:
df_example8.insert(2, 'Degree_indicator',(np.where(df_example8['Degree'].isna(), 1, 0)) )
df_example8['Degree'] = df_example8['Degree'].fillna(degree_mode[0])

In [40]:
df_example8

Unnamed: 0,city,Degree,Degree_indicator,married
0,Lisbon,Bachelor,1,0
1,Berlin,Bachelor,0,1
2,Lisbon,Bachelor,1,1
3,Lisbon,Bachelor,0,1
4,Berlin,Bachelor,0,0
5,Lisbon,Bachelor,0,0
6,Berlin,Masters,0,1
7,Berlin,No Degree,0,0
8,Berlin,Masters,0,1
9,Madrid,Masters,0,1
