# Preprocess missing data using Pandas for Python

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the Titanic passenger table and show its first 20 rows.
df = pd.read_csv("titanic_original.csv")
preview = df.head(20)
print(preview)

# Visualize missing data

In [None]:
# Dictionary with the number of NaN values per column.
# Keys are the column names (as str), values are plain Python ints.

columns = df.columns
# isnull().sum() counts the NaNs of every column in one vectorised pass.
# int() converts the NumPy integers so the printed dictionary stays readable
# (under NumPy 2 they would otherwise print as np.int64(...)).
D = {str(name): int(count) for name, count in df.isnull().sum().items()}
print(D)

In [None]:
# Bar chart of the number of missing values in each column of df.

positions = range(len(D))
# Materialise the dict views as lists: older matplotlib releases cannot
# convert dict_values / dict_keys objects themselves.
plt.bar(positions, list(D.values()), align='center')
plt.xticks(positions, list(D.keys()), rotation='vertical')
# Take the row count from the data instead of hard-coding 1310, so the title
# stays correct if the input file changes.
plt.title('Missing values per column (out of {} rows)'.format(len(df)))
plt.show()

# Basic solutions

In [None]:
# Complete-case analysis: keep only the rows without a NaN in any column.

# dropna() defaults to axis=0 (rows) and how='any'.
df2 = df.dropna()
print(df2.head(10))

# Note: only use complete-case analysis when few values are missing (< 5%).

# Single value imputation/filling

In [None]:
# Mean imputation: replace missing ages with the column average
# (only applicable to numerical data).
mean_age = df['age'].mean()
print(mean_age)

# The {column: value} mapping limits the fill to the 'age' column.
df_mean = df.fillna(value={'age': mean_age})
print(df_mean.loc[:, ['age']].head(20))



In [None]:
# Dictionary with the number of NaN values per column after mean imputation.
columns = df_mean.columns
# One vectorised count over all columns; int() keeps the values plain ints.
D1 = {str(name): int(count) for name, count in df_mean.isnull().sum().items()}

# Plot missing values per column
positions = range(len(D1))
# Lists instead of dict views: older matplotlib releases cannot convert
# dict_values / dict_keys objects themselves.
plt.bar(positions, list(D1.values()), align='center')
plt.xticks(positions, list(D1.keys()), rotation='vertical')
# Row count taken from the data rather than hard-coded as 1310.
plt.title('Missing values per column (out of {} rows)'.format(len(df_mean)))
plt.show()

In [None]:
# Compare the standard deviation of 'age' before and after mean imputation.
print("SD before mean imputation: ", df['age'].std(), sep="")
print("SD after mean imputation: ", df_mean['age'].std(), sep="")

# Mean imputation underestimates the true SD: every imputed value equals the mean, so it adds no spread.

In [None]:
# Backward filling: each NaN takes the next valid value further down its
# column. NaNs with no valid value after them stay NaN.
# DataFrame.bfill() replaces fillna(method="bfill"), whose `method` argument
# is deprecated in pandas 2.1+.
df_back = df.bfill()

print(df_back[['age']].head(20))

# Caveat: filling from a neighbouring row replicates outliers.

In [None]:
# Forward filling: each NaN takes the last valid value further up its
# column. NaNs with no valid value before them stay NaN.
# DataFrame.ffill() replaces fillna(method="ffill"), whose `method` argument
# is deprecated in pandas 2.1+.
df_forward = df.ffill()

print(df_forward[['age']].head(20))