In [1]:
import pandas as pd
from io import StringIO

In [15]:
# Small CSV sample with two missing cells: column C in the second data row
# and column D in the third. The header must not start with a space,
# otherwise pandas names the first column ' A' instead of 'A'.
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

# read_csv expects a path or file-like object, so wrap the string in StringIO.
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [5]:
# Count the missing values in each column.
df.isnull().sum(axis=0)

 A    0
B     0
C     1
D     1
dtype: int64

In [6]:
# Remove every row that contains at least one missing value.
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [7]:
# Remove every column that contains at least one missing value.
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,0.0,11.0


In [10]:
# A notebook cell only renders its last expression, so the first two variants
# are passed to display() (provided by the IPython kernel) to make all three
# results visible.

# Only drop rows in which every column is NaN.
display(df.dropna(how='all'))

# Drop rows that have fewer than 4 non-NaN values.
display(df.dropna(thresh=4))

# Only drop rows where NaN appears in specific columns (here: 'C').
df.dropna(subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,0.0,11.0,12.0,


#### Dropping rows or columns loses data and should generally be avoided. Instead, we can impute the missing values.

In [11]:
#### Mean Imputation

from sklearn.preprocessing import Imputer

In [17]:
imr = Imputer(missing_values='NaN', strategy='mean', axis = 0)
imr = imr.fit(df)
imputed_data = imr.transform(df.values)
imputed_data

array([[  1. ,   2. ,   3. ,   4. ],
       [  5. ,   6. ,   7.5,   8. ],
       [ 10. ,  11. ,  12. ,   6. ]])

Other options for `strategy` are `median` and `most_frequent`.

## Categorical Data

In [18]:
import pandas as pd

# Toy data set: one nominal feature (color), one ordinal feature (size),
# one numeric feature (price) and a class label.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [20]:
# Ordinal encoding for the size labels: M < L < XL.
size_mapping = dict(XL=3, L=2, M=1)

# Replace each size string with its integer rank.
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [23]:
import numpy as np

# np.unique yields the distinct class labels in sorted order; number them from 0.
class_mapping = {
    name: code
    for code, name in enumerate(np.unique(df['classlabel']))
}
class_mapping

{'class1': 0, 'class2': 1}

In [24]:
# Swap each class-label string for its integer code.
df['classlabel'] = df.classlabel.map(class_mapping)

In [25]:
# Display the frame to check the integer-encoded 'classlabel' column.
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [27]:
# Reverse the lookup (integer code -> label string) and restore the original labels.
inv_class_map = dict(zip(class_mapping.values(), class_mapping.keys()))
df['classlabel'] = df.classlabel.map(inv_class_map)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [28]:
from sklearn.preprocessing import LabelEncoder

# Encode the class-label strings as integers with scikit-learn's LabelEncoder.
class_le = LabelEncoder().fit(df['classlabel'].values)
y = class_le.transform(df['classlabel'].values)
y

array([0, 1, 0], dtype=int64)

In [29]:
# Map the integer codes in y back to the original class-label strings.
class_le.inverse_transform(y)

array(['class1', 'class2', 'class1'], dtype=object)

#### One-hot encoding of nominal features

In [30]:
# Feature matrix with the columns color, size, price.
X = df[['color', 'size', 'price']].values

# Integer-encode the color column (column 0) and write the codes back into X.
color_le = LabelEncoder().fit(X[:, 0])
X[:, 0] = color_le.transform(X[:, 0])
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

In [31]:
## THE ABOVE IS BAD since it assumes an ordering of the colors
## Use one-hot encoding to create dummy features

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder(categorical_features=...) was removed in scikit-learn 0.22;
# choosing which columns to encode is now done with a ColumnTransformer.
ohe = OneHotEncoder()
color_transformer = ColumnTransformer(
    [('onehot', ohe, [0])],   # one-hot encode column 0 (color)
    remainder='passthrough',  # keep size and price, appended after the dummy columns
    sparse_threshold=0,       # always return a dense array
)
# X is an object array, so cast the combined result to float.
color_transformer.fit_transform(X).astype(float)

array([[  0. ,   1. ,   0. ,   1. ,  10.1],
       [  0. ,   0. ,   1. ,   2. ,  13.5],
       [  1. ,   0. ,   0. ,   3. ,  15.3]])

In [32]:
## "get_dummies" can also create the dummy variables: in the recorded output only
## the string column ('color') is expanded, while price and size are kept as they are.
## dtype=int keeps the dummy columns as 0/1 integers; pandas >= 2.0 would
## otherwise return booleans.

pd.get_dummies(df[["price", "color", "size"]], dtype=int)

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,1,0,1,0
1,13.5,2,0,0,1
2,15.3,3,1,0,0
