<a href="https://colab.research.google.com/github/afif-af/ml_pytorch_scikitlearn/blob/main/Chapter04ml_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [99]:
import pandas as pd
from io import StringIO


In [100]:
# Small in-memory CSV sample: the second data row leaves column C empty and
# the last data row supplies no value for column D.
csv_data = '\n'.join([
    'A,B,C,D',
    '1.0,2.0,3.0,4.0',
    '5.0,6.0,,8.0',
    '10.0,11.0,12.0',
])

In [101]:
# Parse the CSV text through an in-memory file object; the empty fields show up as NaN.
df = pd.read_csv(filepath_or_buffer=StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [102]:
# Count the missing entries in each column.
df.isna().sum()

Unnamed: 0,0
A,0
B,0
C,1
D,1


In [103]:
# View the frame as the plain NumPy array that scikit-learn estimators consume.
df.to_numpy()

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

In [104]:
# Drop every row that contains at least one NaN.
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [105]:
# Drop every column that contains at least one NaN.
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [106]:
# Drop only rows in which every value is NaN (no row of this frame qualifies).
df.dropna(axis='index', how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [107]:
# Keep only rows that have at least 4 non-NaN values.
df.dropna(axis='index', thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [108]:
# Drop rows only when the NaN appears in column 'C'.
df.dropna(axis='index', subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


In [109]:
from sklearn.impute import SimpleImputer
import numpy as np

In [110]:
# Mean imputation: each NaN is replaced by the mean of its column.
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
# Learn the column means and fill the gaps in a single call.
imputed_data = imr.fit_transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [111]:
# pandas-only equivalent of the mean imputation: fill each NaN with its column mean.
df.fillna(value=df.mean())

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


In [112]:
import pandas as pd

# Toy data set of three rows; each row holds four values (two strings, a
# float and another string). Column labels are still the default integers here.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ]
)

In [113]:
# Label the columns so that later cells can address them by name.
df.columns = [
    'color',
    'size',
    'price',
    'classlabel',
]

In [114]:
# Display the frame with its named columns.
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [115]:
# Size label -> integer code; a larger code stands for a larger size.
size_mapping = dict(XL=3, L=2, M=1)

In [116]:
# Replace each size label with its integer code from size_mapping.
# NOTE: this overwrites the column, so the cell is not idempotent - running it
# again on the already-encoded column finds no matching keys in size_mapping.
df['size']=df['size'].map(size_mapping)

In [117]:
# Display the frame after the size column has been mapped to integers.
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [118]:
# Swap keys and values so the integer codes can be turned back into size labels.
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
# Shown only as a preview; the encoded column in df is left unchanged.
df['size'].map(inv_size_mapping)

Unnamed: 0,size
0,M
1,L
2,XL


In [119]:
import numpy as np

# Number the distinct class labels 0, 1, ... in the order np.unique returns them.
class_mapping = {
    name: code
    for code, name in enumerate(np.unique(df['classlabel']))
}
class_mapping

{'class1': 0, 'class2': 1}

In [120]:
# Encode the class labels as the integers defined in class_mapping (overwrites the column).
df['classlabel']=df['classlabel'].map(class_mapping)

In [121]:
# Display the frame with the integer-encoded class labels.
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


In [122]:
# Invert the label -> integer dictionary and use it to restore the string labels.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [123]:
from sklearn.preprocessing import LabelEncoder

# Let scikit-learn derive the label -> integer encoding instead of a hand-built dict.
class_le = LabelEncoder()
class_le.fit(df['classlabel'].values)
y = class_le.transform(df['classlabel'].values)
y

array([1, 0, 1])

In [124]:
# Map the integer codes in y back to the original class-label strings.
class_le.inverse_transform(y)

array(['class2', 'class1', 'class2'], dtype=object)

In [125]:
# Feature matrix as a NumPy array (object dtype because strings and numbers are mixed).
x = df[['color', 'size', 'price']].values

# Integer-encode the color column (column 0) in place.
color_le = LabelEncoder()
color_le.fit(x[:, 0])
x[:, 0] = color_le.transform(x[:, 0])
x

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

In [128]:
from sklearn.preprocessing import OneHotEncoder

# Rebuild the feature matrix so that column 0 holds the color strings again.
x = df[['color', 'size', 'price']].values

# One-hot encode the color column only; the encoder needs a 2-D (n_samples, 1)
# input, and its sparse result is converted to a dense array for display.
color_ohe = OneHotEncoder()
color_ohe.fit_transform(x[:, [0]]).toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [129]:
from sklearn.compose import ColumnTransformer

x = df[['color', 'size', 'price']].values

# One-hot encode column 0 and pass columns 1 and 2 through unchanged.
c_transf = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), [0]),
        ('nothing', 'passthrough', [1, 2]),
    ]
)

In [130]:
# Apply the column transformer to x and cast the result to a float array.
c_transf.fit_transform(x).astype(float)

array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])

In [131]:
# pandas shortcut for one-hot encoding: only the string column (color) is
# expanded into dummy columns; size and price are left as they are.
pd.get_dummies(data=df[['color', 'size', 'price']])

Unnamed: 0,size,price,color_blue,color_green,color_red
0,1,10.1,False,True,False
1,2,13.5,False,False,True
2,3,15.3,True,False,False


In [132]:
# Same encoding, but drop the first dummy column (color_blue) to remove the
# redundancy among the one-hot columns.
pd.get_dummies(data=df[['color', 'size', 'price']], drop_first=True)

Unnamed: 0,size,price,color_green,color_red
0,1,10.1,True,False
1,2,13.5,False,True
2,3,15.3,False,False


In [133]:
# scikit-learn counterpart of drop_first=True: the encoder omits the first category.
color_ohe = OneHotEncoder(categories='auto', drop='first')

# Encode column 0 with that encoder; columns 1 and 2 pass through unchanged.
c_transf = ColumnTransformer(
    transformers=[
        ('onehot', color_ohe, [0]),
        ('nothing', 'passthrough', [1, 2]),
    ]
)

In [134]:
# Apply the drop-first column transformer to x and cast the result to a float array.
c_transf.fit_transform(x).astype(float)

array([[ 1. ,  0. ,  1. , 10.1],
       [ 0. ,  1. ,  2. , 13.5],
       [ 0. ,  0. ,  3. , 15.3]])

In [135]:
# Recreate the toy frame with the original string sizes, naming the columns
# directly in the constructor.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['colors', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,colors,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [137]:
# Threshold encoding of the size column as two 0/1 indicator columns:
#   'x>M' is 1 for sizes 'L' and 'XL', 'x>L' is 1 for size 'XL' only.
df['x>M'] = df['size'].apply(lambda size: int(size in {'L', 'XL'}))
df['x>L'] = df['size'].apply(lambda size: int(size == 'XL'))

In [138]:
# Remove the raw size column from df in place, then display what is left.
# NOTE: not idempotent - a second run fails because 'size' no longer exists.
del df['size']
df

Unnamed: 0,colors,price,classlabel,x>M,x>L
0,green,10.1,class2,0,0
1,red,13.5,class1,1,0
2,blue,15.3,class2,1,1


In [143]:
# Download the Wine data set from the UCI repository (needs network access).
# header=None: the file is read without a header row, so the columns get
# integer labels until names are assigned.
df_wine = pd.read_csv(
    filepath_or_buffer=(
        'https://archive.ics.uci.edu/ml/'
        'machine-learning-databases/wine/wine.data'
    ),
    header=None,
)

In [145]:
# Column 0 is the class label; the remaining 13 columns are the features.
df_wine.columns = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

# List the distinct class labels present in the data.
print('Class label', np.unique(df_wine['Class label']))

Class label [1 2 3]


In [146]:
# Preview the first five rows of the labelled Wine frame.
df_wine.head(n=5)

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [147]:
from sklearn.model_selection import train_test_split

In [148]:
# Column 0 holds the class label; every column after it is a feature.
# Slice with `1:` (not a bare `1`) so that x keeps all 13 feature columns as a
# 2-D (n_samples, n_features) array. A bare `1` selects only the 'Alcohol'
# column and yields a 1-D array.
x, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values

# Hold out 30% of the samples for testing. stratify=y keeps the class
# proportions the same in both splits; random_state=0 makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)