# Dealing with Missing Values

In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load the housing dataset from a CSV file located in the working directory.
df = pd.read_csv('housing.csv')

In [3]:
df.shape

(20640, 10)

In [4]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [6]:
df.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

## Deal with Missing Values  

In [7]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [8]:
# Percentage of missing cells across the whole frame.
# df.size is rows * columns; it replaces np.product(df.shape), an alias that
# was deprecated in NumPy 1.25 and removed in NumPy 2.0.
df.isnull().sum().sum() / df.size * 100

0.1002906976744186

In [9]:
# Option 1: discard every row that contains at least one missing value.
# dropna() returns a new frame, so the original df is left untouched.
df_cp = df.dropna()
df_cp.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [10]:
# Option 2: replace missing values with a constant (here 0).
# fillna() returns a new frame and the result is only inspected, not stored,
# so df_cp2 itself still contains its NaNs after this cell.
df_cp2 = df.copy()
df_cp2.fillna(value=0).isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [11]:
# Forward fill: each missing value takes the most recent preceding non-NaN
# value in its column (axis=0 walks down the rows).
# DataFrame.ffill replaces fillna(method='ffill'), which has been deprecated
# since pandas 2.1.
df_cp2.ffill(axis=0).isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [12]:
# Imputation/Imputing missing values using Sklearn
from sklearn.impute import SimpleImputer


In [13]:
# Imputer that substitutes each NaN with the mean of its column.
simple_imputer = SimpleImputer(
    missing_values=np.nan,
    strategy='mean',
)


In [14]:
# Mean-impute total_bedrooms using simple_imputer.
# The call has to stay on one logical line: with the argument list on a line
# of its own, the first line binds the fit_transform method object to the
# column and the imputation result is computed and thrown away.
# fit_transform takes a 2-D (n_samples, n_features) array and returns one, so
# reshape on the way in and flatten on the way out before assigning the column.
bedrooms_2d = df['total_bedrooms'].to_numpy().reshape(-1, 1)
df_cp2['total_bedrooms'] = simple_imputer.fit_transform(bedrooms_2d).ravel()

In [15]:
df_cp2.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

# Parsing Dates

In [16]:
# Load the date-parsing sample; names= supplies the column labels.
# NOTE: this rebinds df, so the housing frame loaded earlier is no longer
# reachable through that name.
df = pd.read_csv(
    'parsing_dates.csv',
    names=['Date', 'Todo', 'Price'],
)

In [17]:
df

Unnamed: 0,Date,Todo,Price
0,2018-02-20,Dinning,200
1,2019-03-23,Shopping,330
2,2020-09-20,Jogging,400
3,2021-10-04,Partying,500
4,2022-11-09,Coding,20


In [18]:
df.dtypes

Date     object
Todo     object
Price     int64
dtype: object

In [19]:
# Convert the string Date column to datetime64 using an explicit
# year-month-day format, keeping the original column alongside it.
date_format = '%Y-%m-%d'
df['parsed_date'] = pd.to_datetime(df['Date'], format=date_format)

In [20]:
df.dtypes

Date                   object
Todo                   object
Price                   int64
parsed_date    datetime64[ns]
dtype: object

# Categorical Encoding

In [21]:
# Small toy dataset: one categorical column and one numeric column.
fruit_names = ['Mango', 'Apple', 'Orange', 'Pineapple', 'Grape']
fruit_weights = [20, 10, 15, 500, 4]
df = pd.DataFrame({'Fruits': fruit_names, 'Weights': fruit_weights})

In [22]:
df

Unnamed: 0,Fruits,Weights
0,Mango,20
1,Apple,10
2,Orange,15
3,Pineapple,500
4,Grape,4


## Label Encoding

In [23]:
df.dtypes

Fruits     object
Weights     int64
dtype: object

In [24]:
# Switch Fruits to pandas' categorical dtype; the .cat accessor used in the
# next step is only available on categorical columns.
df['Fruits'] = df['Fruits'].astype('category')

In [25]:
df.dtypes

Fruits     category
Weights       int64
dtype: object

In [26]:
# Label encoding with pandas.
# The .cat accessor exposes categorical-specific attributes; .cat.codes holds
# the integer code of each row's category (one code per unique category).
fruit_codes = df['Fruits'].cat.codes
df['Encoding_with_pandas'] = fruit_codes

In [27]:
df

Unnamed: 0,Fruits,Weights,Encoding_with_pandas
0,Mango,20,2
1,Apple,10,0
2,Orange,15,3
3,Pineapple,500,4
4,Grape,4,1


In [28]:
# using sklearn
from sklearn.preprocessing import LabelEncoder

In [29]:
# Encoder that maps each distinct label to an integer.
label_encoder = LabelEncoder()

In [30]:
# Learn the set of distinct labels present in the Fruits column.
label_encoder.fit(df['Fruits'])

In [31]:
# Apply the fitted encoder and store the integer labels next to the
# pandas-based encoding so the two can be compared.
sklearn_codes = label_encoder.transform(df['Fruits'])
df['Encoded_with_sklearn'] = sklearn_codes

In [32]:
df

Unnamed: 0,Fruits,Weights,Encoding_with_pandas,Encoded_with_sklearn
0,Mango,20,2,2
1,Apple,10,0,0
2,Orange,15,3,3
3,Pineapple,500,4,4
4,Grape,4,1,1


## One Hot Encoding

In [33]:
from sklearn.preprocessing import OneHotEncoder

In [34]:
# One-hot encoder with default settings.
one_hot_encoder = OneHotEncoder()

In [35]:
# Double brackets select a one-column DataFrame (2-D) rather than a Series.
# The encoded result is converted with .toarray() in the following cell.
fruit_column = df[['Fruits']]
transformed_one_hot_encoding = one_hot_encoder.fit_transform(fruit_column)

In [36]:
# Densify the encoded matrix and wrap it in a DataFrame of integer indicators
# (columns are left with their default positional labels).
dense_one_hot = transformed_one_hot_encoding.toarray()
df2 = pd.DataFrame(dense_one_hot, dtype='int')
df2

Unnamed: 0,0,1,2,3,4
0,0,0,1,0,0
1,1,0,0,0,0
2,0,0,0,1,0
3,0,0,0,0,1
4,0,1,0,0,0


In [37]:
# One-hot encode with pandas: one indicator column per fruit category.
dummy_df = pd.get_dummies(df['Fruits'])

In [38]:
# Attach the indicator columns to the original frame; join aligns on the index.
df3 = df.join(dummy_df)
df3

Unnamed: 0,Fruits,Weights,Encoding_with_pandas,Encoded_with_sklearn,Apple,Grape,Mango,Orange,Pineapple
0,Mango,20,2,2,False,False,True,False,False
1,Apple,10,0,0,True,False,False,False,False
2,Orange,15,3,3,False,False,False,True,False
3,Pineapple,500,4,4,False,False,False,False,True
4,Grape,4,1,1,False,True,False,False,False
