#Different Types of Cleaning
#1. Remove unwanted observations
#2. Fix Structural Errors
#3. Remove Unwanted Outliers
#4. Missing Categorical Data
#5. Missing Numeric Data

# 2.1 Remove Unwanted Observations

In [None]:
# Two kinds of unwanted observations:
#   - Duplicate observations
#   - Irrelevant observations

In [None]:
# Import libraries and configure display / plotting defaults
import numpy as np
import pandas as pd
# Show up to 100 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 100)
from matplotlib import pyplot as plt

# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

import seaborn as sns
# Use seaborn's 'darkgrid' style for the plots that follow
sns.set_style('darkgrid')

In [None]:
# Load the raw dataset from CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer='real_estate_data.csv')

In [None]:
# Remove exact duplicate rows (the first occurrence of each is kept),
# then report the resulting (rows, columns) shape
df = df.drop_duplicates(keep='first')
print(df.shape)

# 2.2 Fix structural errors

In [None]:
# Display the unique values of 'basement'
df['basement'].unique()

In [None]:
# A missing 'basement' value is treated as 0
df['basement'] = df['basement'].fillna(value=0)
df['basement']

In [None]:
# Display the unique values of 'basement' again, now that NaN has been filled
df['basement'].unique()

In [None]:
# Class distribution for 'roof' (one horizontal bar per category)
sns.countplot(data=df, y='roof')

In [None]:
# Normalise inconsistent 'roof' labels:
#   'composition'                              -> 'Composition'
#   'asphalt'                                  -> 'Asphalt'
#   'shake-shingle' / 'asphalt,shake-shingle'  -> 'Shake Shingle'
#
# The result is assigned back to the column instead of calling
# df.roof.replace(..., inplace=True): an in-place call on a column accessor is
# a chained assignment, which warns in recent pandas and leaves df unchanged
# when Copy-on-Write is enabled.
df['roof'] = df['roof'].replace({
    'composition': 'Composition',
    'asphalt': 'Asphalt',
    'shake-shingle': 'Shake Shingle',
    'asphalt,shake-shingle': 'Shake Shingle',
})

In [None]:
# Class distribution for 'roof', re-plotted to check the label clean-up
sns.countplot(data=df, y='roof')

In [None]:
# Class distribution for 'exterior_walls'
sns.countplot(data=df, y='exterior_walls')

In [None]:
# 'Rock, Stone' should be 'Masonry'.
# Assign the result back to the column: an in-place replace on df.exterior_walls
# is a chained assignment that recent pandas warns about and that does not
# modify df under Copy-on-Write.
df['exterior_walls'] = df['exterior_walls'].replace('Rock, Stone', 'Masonry')
sns.countplot(y='exterior_walls', data=df)

In [None]:
# 'Concrete' and 'Block' should both be 'Concrete Block'.
# Assign the result back to the column rather than using inplace=True on the
# column accessor (chained assignment; ineffective under Copy-on-Write).
df['exterior_walls'] = df['exterior_walls'].replace(['Concrete', 'Block'], 'Concrete Block')
sns.countplot(y='exterior_walls', data=df)

# Good Reasons for Removing Outliers

1. Suspicious measurements that are unlikely to be genuine real estate data
2. Outliers that belong to a different population
3. Outliers that belong to a different problem

# 2.3 Remove Unwanted Outliers

In [None]:
# Box plot of 'tx_price' using the seaborn library.
# The column is mapped to the x-axis by keyword: current seaborn releases treat
# the first positional argument as `data`, so a bare positional Series is no
# longer drawn as a horizontal box.
sns.boxplot(data=df, x='tx_price')

In [None]:
# Box plot of 'lot_size'; the column is passed by keyword so it is explicitly
# mapped to the x-axis (a positional Series is interpreted as `data` by
# current seaborn releases).
sns.boxplot(data=df, x='lot_size')

In [None]:
# Box plot of 'tx_price' with the x-axis limited to 0 - 1,000,000.
# The column is mapped to x explicitly so that plt.xlim applies to the value
# axis. (The original line was also indented by one space, which raises
# IndentationError.)
sns.boxplot(data=df, x='tx_price')
plt.xlim(0, 1000000)
plt.show()

# Violin plot of 'tx_price' using the seaborn library
sns.violinplot(data=df, x='tx_price')
plt.show()

In [None]:
# Violin plot for 'beds' ('sqft' and 'lot_size' are plotted in the next cell).
# The column is passed by keyword so it is mapped to the x-axis.
sns.violinplot(data=df, x='beds')
# plt.show must be called; the bare name `plt.show` only references the function
plt.show()

In [None]:
# Violin plot for 'sqft' (column mapped to the x-axis by keyword; a positional
# Series is interpreted as `data` by current seaborn releases)
sns.violinplot(data=df, x='sqft')
plt.show()

# Violin plot for 'lot_size'
sns.violinplot(data=df, x='lot_size')
plt.show()

In [None]:
# Sort 'lot_size' from largest to smallest and show the five largest values
df['lot_size'].sort_values(ascending=False).head(5)

In [None]:
# Remove lot_size outliers: keep only rows whose lot_size is <= 500000.
# Note: rows with a missing (NaN) lot_size also fail this comparison and are
# dropped.
# .copy() makes the filtered result an independent DataFrame, so the column
# assignments in later cells do not trigger SettingWithCopyWarning.
df = df[df['lot_size'] <= 500000].copy()

# Print the number of remaining rows
print(len(df))

# 2.4. Missing Categorical Data

In [None]:
# Ways to handle missing values:
#   - Drop
#   - Impute
#   - Flag & Fill

In [None]:
# Count the missing values in each categorical (object-dtype) feature
df.select_dtypes(include='object').isna().sum()

In [None]:
# Fill missing values in 'exterior_walls' with 'Missing'.
# The label is capitalised to match the one used for the other categorical
# columns in the next cell; a lowercase 'missing' would create a separate,
# inconsistent category.
df['exterior_walls'] = df['exterior_walls'].fillna('Missing')
# Number of missing values left in the column after filling
df['exterior_walls'].isnull().sum()

In [None]:
# Replace NaN with the label 'Missing' in every object-dtype column
for column in df.select_dtypes(include=['object']).columns:
    df[column] = df[column].fillna(value='Missing')

In [None]:
# Re-count the missing values per categorical feature after filling
df.select_dtypes(include='object').isna().sum()

# 2.5. Missing Numeric Data

In [None]:
# Count the missing values in each non-object (numerical) feature
df.select_dtypes(exclude='object').isna().sum()

In [None]:
# Save the cleaned DataFrame to a new CSV file without the row index.
# `index` is documented as a bool; index=None only suppressed the index because
# None is falsy, so the explicit False is used instead.
df.to_csv('cleaned_df.csv', index=False)