In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from io import StringIO

In [2]:
csv_data = '''A,B,C,D,E
            5.0,2.0,3.0,,6
            1.0,6.0,,8.0,5
            0.0,11.0,12.0,4.0,5
            3.0,,3.0,5.0,
            5.0,1.0,4.0,2.0,4
           '''

In [3]:
df = pd.read_csv(StringIO(csv_data))

In [4]:
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [5]:
df.dropna()

Unnamed: 0,A,B,C,D,E
2,0.0,11.0,12.0,4.0,5.0
4,5.0,1.0,4.0,2.0,4.0


In [6]:
df.dropna(how='all')

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [7]:
df.dropna(subset=['C'])

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [8]:
df.fillna(0)

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,0.0,6.0
1,1.0,6.0,0.0,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,0.0,3.0,5.0,0.0
4,5.0,1.0,4.0,2.0,4.0


In [9]:
df['B'] = df['B'].fillna(df['B'].mean())
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,5.0,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [10]:
df['C'] = df['C'].fillna(df['C'].mode())
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,5.0,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [11]:
df['D'] = df['D'].fillna(df['D'].median())
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,4.5,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,5.0,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [12]:
df['E'] = df['E'].fillna(df['E'].min())
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,4.5,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,5.0,3.0,5.0,4.0
4,5.0,1.0,4.0,2.0,4.0


In [13]:
df2 = pd.DataFrame(
    [['green', 'M', 10.1, 1],
    ['red', 'L', 13.5, 2],
    ['blue', 'XL', 15.3, 1]]
)
df2.columns = ['color', 'size', 'price', 'classlabel']
df2

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,1
1,red,L,13.5,2
2,blue,XL,15.3,1


In [14]:
size_mapping = {
    'XL':3,
    'L':2,
    'M':1
}
df2['size'] = df2['size'].map(size_mapping)
df2

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,2
2,blue,3,15.3,1


In [15]:
pd.get_dummies(df2['color'])

Unnamed: 0,blue,green,red
0,0,1,0
1,0,0,1
2,1,0,0


In [16]:
onehot_encoding = pd.get_dummies(df2['color'], prefix = 'color')

In [17]:
df2 = df2.drop('color', 1)
df2

  df2 = df2.drop('color', 1)


Unnamed: 0,size,price,classlabel
0,1,10.1,1
1,2,13.5,2
2,3,15.3,1


In [18]:
pd.concat([onehot_encoding, df2],axis=1)

Unnamed: 0,color_blue,color_green,color_red,size,price,classlabel
0,0,1,0,1,10.1,1
1,0,0,1,2,13.5,2
2,1,0,0,3,15.3,1


In [19]:
from IPython.display import Math

In [20]:
Math(r'x^{(i)}_{norm}=\frac{x^{(i)}-x_{min}}{x_{max}-x_{min}}')

<IPython.core.display.Math object>

In [21]:
wine = datasets.load_wine()
x = pd.DataFrame(wine['data'], columns=wine['feature_names'])
print("target_names: "+str(wine['target_names']))
y = pd.DataFrame(wine['target'], columns=['target_names'])
data = pd.concat([x,y], axis=1)
data.head(3)

target_names: ['class_0' 'class_1' 'class_2']


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target_names
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0


In [22]:
data['alcohol'] = (data['alcohol'] - data['alcohol'].min())/\
                            (data['alcohol'].max() - data['alcohol'].min())
data.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target_names
0,0.842105,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,0.571053,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,0.560526,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,0.878947,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,0.581579,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [23]:
Math(r'x^{(i)}_{std}=\frac{x^{(i)}-\mu_{x}}{\sigma_{x}}')

<IPython.core.display.Math object>

In [24]:
data['alcohol'] = (data['alcohol'] - data['alcohol'].mean())/\
                            (data['alcohol'].std())
data.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target_names
0,1.514341,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,0.245597,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,0.196325,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,1.686791,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,0.294868,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
