In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not only the last one.
# The multi-output cells further down in this notebook rely on this setting.
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import Image
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import pandas as pd
from io import StringIO

In [12]:
# Tiny in-memory CSV fixture: the empty fields (row 2, column C and
# row 3, column D) are parsed by pandas as NaN.
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

# StringIO wraps the string in a file-like object so read_csv can parse it
# without touching the filesystem.
df = pd.read_csv(filepath_or_buffer=StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [9]:
# isnull() yields a boolean mask of missing cells; summing it down each column
# (axis=0) counts the missing values per column.
# Here C and D each contain exactly one NaN.
df.isnull().sum(axis=0)

A    0
B    0
C    1
D    1
dtype: int64

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [15]:
# To remove missing values (each call returns a new DataFrame; df itself is not modified)
df.dropna() # drop every row that contains at least one NaN
df.dropna(axis=1) # drop every column that contains at least one NaN

# there are more parameters in dropna
df.dropna(how='all')    # drop a row only if ALL of its values are NaN
df.dropna(thresh=4)     # keep only rows that have at least 4 non-NaN values
df.dropna(subset=['C']) # drop rows where NaN appears in specific columns (here C)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


In [20]:
# Imputing missing values: replace each NaN with the mean of its column.
#
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed
# in 0.22. Its replacement, `sklearn.impute.SimpleImputer`, always imputes
# column-wise (the old `axis=0` behaviour) and treats NaN as the missing-value
# marker by default, so neither argument is needed any more.
from sklearn.impute import SimpleImputer
imr = SimpleImputer(strategy='mean')
# Fit and transform on the same plain array so the estimator sees a consistent
# input type (fitting on a DataFrame and transforming an array triggers a
# feature-names warning on recent scikit-learn versions).
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data

array([[  1. ,   2. ,   3. ,   4. ],
       [  5. ,   6. ,   7.5,   8. ],
       [ 10. ,  11. ,  12. ,   6. ]])

In [3]:
# Handling categorical data
import pandas as pd

# Example frame with one nominal feature (color), one ordinal feature (size)
# and one numerical feature (price), plus the class label.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df
# Note: the learning algorithms we use for classification do not make use of
# any ordinal information in the class labels.

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [None]:
# To make sure that the learning algorithm interprets the ordinal features correctly, we convert their string values to integers