### Imputing missing values dengan Imputer

In [23]:
import pandas as pd
from sklearn.preprocessing import Imputer

In [24]:
df = pd.read_csv('Data.csv')
df

Unnamed: 0,Country,Age,Salary,Purchased,Unnamed: 4
0,France,44.0,72000.0,No,
1,Spain,27.0,48000.0,Yes,
2,Germany,30.0,54000.0,No,
3,Spain,38.0,61000.0,No,
4,Germany,40.0,,Yes,
5,France,35.0,58000.0,Yes,
6,Spain,,52000.0,No,
7,France,48.0,79000.0,Yes,
8,Germany,50.0,83000.0,No,
9,France,37.0,67000.0,Yes,


In [25]:
df.isnull().sum()

Country        0
Age            1
Salary         1
Purchased      0
Unnamed: 4    10
dtype: int64

In [26]:
df.dropna()

Unnamed: 0,Country,Age,Salary,Purchased,Unnamed: 4


In [27]:
# drop kolom spesifik yang mengandung NaN 
df.dropna(subset=['Age'])

Unnamed: 0,Country,Age,Salary,Purchased,Unnamed: 4
0,France,44.0,72000.0,No,
1,Spain,27.0,48000.0,Yes,
2,Germany,30.0,54000.0,No,
3,Spain,38.0,61000.0,No,
4,Germany,40.0,,Yes,
5,France,35.0,58000.0,Yes,
7,France,48.0,79000.0,Yes,
8,Germany,50.0,83000.0,No,
9,France,37.0,67000.0,Yes,


In [28]:
df.iloc[:, 1:3]

Unnamed: 0,Age,Salary
0,44.0,72000.0
1,27.0,48000.0
2,30.0,54000.0
3,38.0,61000.0
4,40.0,
5,35.0,58000.0
6,,52000.0
7,48.0,79000.0
8,50.0,83000.0
9,37.0,67000.0


In [29]:
# replace every occurrence of missing_values to one defined by strategy
# which can be mean, median, mode. Axis = 0 means rows, 1 means column
# imputer untuk me-replace missing value

imputer = Imputer(missing_values='NaN', strategy='mean', axis = 0)
df.iloc[:, 1:3] = imputer.fit_transform(df.iloc[:, 1:3])
df

Unnamed: 0,Country,Age,Salary,Purchased,Unnamed: 4
0,France,44.0,72000.0,No,
1,Spain,27.0,48000.0,Yes,
2,Germany,30.0,54000.0,No,
3,Spain,38.0,61000.0,No,
4,Germany,40.0,63777.777778,Yes,
5,France,35.0,58000.0,Yes,
6,Spain,38.777778,52000.0,No,
7,France,48.0,79000.0,Yes,
8,Germany,50.0,83000.0,No,
9,France,37.0,67000.0,Yes,


### 2. Encoding Data Kategori

In [30]:
# Label Encoder will replace every categorical variable with number. Useful for replacing yes by 1, no by 0.
# One Hot Encoder will create a separate column for every variable and give a value of 1 where the variable is present
from sklearn.preprocessing import LabelEncoder

In [31]:
label_encoder = LabelEncoder()
temp = df.copy() #pakai .copy agar refrensinya tidak terbawa/ tidak terubah
temp.iloc[:, 0] = label_encoder.fit_transform(df.iloc[:, 0]) #untuk country
print(label_encoder.classes_)
temp.iloc[:, 3] = label_encoder.fit_transform(df.iloc[:, 3]) #untuk purchased
print(label_encoder.classes_)
print(temp)

['France' 'Germany' 'Spain']
['No' 'Yes']
   Country        Age        Salary  Purchased  Unnamed: 4
0        0  44.000000  72000.000000          0         NaN
1        2  27.000000  48000.000000          1         NaN
2        1  30.000000  54000.000000          0         NaN
3        2  38.000000  61000.000000          0         NaN
4        1  40.000000  63777.777778          1         NaN
5        0  35.000000  58000.000000          1         NaN
6        2  38.777778  52000.000000          0         NaN
7        0  48.000000  79000.000000          1         NaN
8        1  50.000000  83000.000000          0         NaN
9        0  37.000000  67000.000000          1         NaN


In [32]:
# you can pass an array of indices of categorical features
# one_hot_encoder = OneHotEncoder(categorical_features=[0])
# temp = df.copy()
# temp.iloc[:, 0] = one_hot_encoder.fit_transform(df.iloc[:, :0])
# temp
# you can achieve the same thing using get_dummies
pd.get_dummies(df.iloc[:, :-1])

Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Spain,Purchased_No,Purchased_Yes
0,44.0,72000.0,1,0,0,1,0
1,27.0,48000.0,0,0,1,0,1
2,30.0,54000.0,0,1,0,1,0
3,38.0,61000.0,0,0,1,1,0
4,40.0,63777.777778,0,1,0,0,1
5,35.0,58000.0,1,0,0,0,1
6,38.777778,52000.0,0,0,1,1,0
7,48.0,79000.0,1,0,0,0,1
8,50.0,83000.0,0,1,0,1,0
9,37.0,67000.0,1,0,0,0,1


### 3. Binarizing

In [33]:
from sklearn.datasets import load_iris

iris_dataset = load_iris()
X = iris_dataset.data
y = iris_dataset.target
feature_names = iris_dataset.feature_names
print(feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [34]:
X[:, 1]

array([3.5, 3. , 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.4, 3. ,
       3. , 4. , 4.4, 3.9, 3.5, 3.8, 3.8, 3.4, 3.7, 3.6, 3.3, 3.4, 3. ,
       3.4, 3.5, 3.4, 3.2, 3.1, 3.4, 4.1, 4.2, 3.1, 3.2, 3.5, 3.1, 3. ,
       3.4, 3.5, 2.3, 3.2, 3.5, 3.8, 3. , 3.8, 3.2, 3.7, 3.3, 3.2, 3.2,
       3.1, 2.3, 2.8, 2.8, 3.3, 2.4, 2.9, 2.7, 2. , 3. , 2.2, 2.9, 2.9,
       3.1, 3. , 2.7, 2.2, 2.5, 3.2, 2.8, 2.5, 2.8, 2.9, 3. , 2.8, 3. ,
       2.9, 2.6, 2.4, 2.4, 2.7, 2.7, 3. , 3.4, 3.1, 2.3, 3. , 2.5, 2.6,
       3. , 2.6, 2.3, 2.7, 3. , 2.9, 2.9, 2.5, 2.8, 3.3, 2.7, 3. , 2.9,
       3. , 3. , 2.5, 2.9, 2.5, 3.6, 3.2, 2.7, 3. , 2.5, 2.8, 3.2, 3. ,
       3.8, 2.6, 2.2, 3.2, 2.8, 2.8, 2.7, 3.3, 3.2, 2.8, 3. , 2.8, 3. ,
       2.8, 3.8, 2.8, 2.8, 2.6, 3. , 3.4, 3.1, 3. , 3.1, 3.1, 3.1, 2.7,
       3.2, 3.3, 3. , 2.5, 3. , 3.4, 3. ])

Kita akan mengubah 0 jika dibawah rata-rata, dan 1 jika diatas rata-rata

In [35]:
from sklearn.preprocessing import Binarizer
binarizer_obj = Binarizer(threshold=X[:, 1].mean()) #pakai threshold untuk menentukan angka jika dibawah/atas rata2. untuk kasus ini 0 jika dibawah dan 1 jika diatas
X[:, 1:2] = binarizer_obj.fit_transform(X[:, 1].reshape(-1, 1))
X[:, 1]

array([1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 1., 0.,
       0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       1., 1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0.])

### 4. Fitur Scaling

In [19]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler

df = pd.read_csv('Data.csv').dropna()
print(df)
X = df[["Age", "Salary"]].values.astype(np.float64)
print(X)

Empty DataFrame
Columns: [Country, Age, Salary, Purchased, Unnamed: 4]
Index: []
[]


In [22]:
standard_scaler = StandardScaler()
normalizer = Normalizer()
min_max_scaler = MinMaxScaler()

print("Standardization")
print(standard_scaler.fit_transform(X,[y]))

# print("Normalizing")
# print(normalizer.fit_transform(X))

# print("MinMax Scaling")
# print(min_max_scaler.fit_transform(X))

Standardization


ValueError: Found array with 0 sample(s) (shape=(0, 2)) while a minimum of 1 is required by StandardScaler.

### 5.1 Count Vectorizer

In [36]:
from sklearn.feature_extraction.text import CountVectorizer #"feature_extraction.text" adalah submodul

docs = ["Mayur mayur is a nice boy.", "Mayur rock! wohooo!", "My name is Mayur, and I am a Pythonista!"]
cv = CountVectorizer() #inisiasi baru
X = cv.fit_transform(docs) #fit_transform adalah methode
print(X) 
print(cv.vocabulary_) 
print(X.todense()) # kemunculan per kata



  (0, 2)	1
  (0, 7)	1
  (0, 3)	1
  (0, 4)	2
  (1, 10)	1
  (1, 9)	1
  (1, 4)	1
  (2, 8)	1
  (2, 0)	1
  (2, 1)	1
  (2, 6)	1
  (2, 5)	1
  (2, 3)	1
  (2, 4)	1
{'mayur': 4, 'is': 3, 'nice': 7, 'boy': 2, 'rock': 9, 'wohooo': 10, 'my': 5, 'name': 6, 'and': 1, 'am': 0, 'pythonista': 8}
[[0 0 1 1 2 0 0 1 0 0 0]
 [0 0 0 0 1 0 0 0 0 1 1]
 [1 1 0 1 1 1 1 0 1 0 0]]


In [41]:
from sklearn.feature_extraction import DictVectorizer

docs = [{"Aku": 1, "suka": 1, "makan": 2}, {"Aku": 1, "tidak": 1, "suka": 2, "makan": 3, "kambing": 1, "bakar": 2, "madu": 3}]
dv = DictVectorizer(sort=False) #jadi sort=false ini untuk mengurutkan indeks per kata, lihat jadi urutkan? 0-6
X = dv.fit_transform(docs)
print(X)
print(dv.vocabulary_)
print(X.todense())

  (0, 0)	1.0
  (0, 1)	1.0
  (0, 2)	2.0
  (1, 0)	1.0
  (1, 1)	2.0
  (1, 2)	3.0
  (1, 3)	1.0
  (1, 4)	1.0
  (1, 5)	2.0
  (1, 6)	3.0
{'Aku': 0, 'suka': 1, 'makan': 2, 'tidak': 3, 'kambing': 4, 'bakar': 5, 'madu': 6}
[[1. 1. 2. 0. 0. 0. 0.]
 [1. 2. 3. 1. 1. 2. 3.]]


In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

tfidf_vectorizer = TfidfVectorizer() 
cv_vectorizer = CountVectorizer()
docs = ["Mayur is a Guitarist Guitarist", "Mayur is Musician", "Mayur is also a programmer"]
X_idf = tfidf_vectorizer.fit_transform(docs)
X_cv = cv_vectorizer.fit_transform(docs)
print(X_idf.todense())
print(tfidf_vectorizer.vocabulary_)
print(X_cv.todense())

[[0.         0.92276146 0.27249889 0.27249889 0.         0.        ]
 [0.         0.         0.45329466 0.45329466 0.76749457 0.        ]
 [0.6088451  0.         0.35959372 0.35959372 0.         0.6088451 ]]
{'mayur': 3, 'is': 2, 'guitarist': 1, 'musician': 4, 'also': 0, 'programmer': 5}
[[0 2 1 1 0 0]
 [0 0 1 1 1 0]
 [1 0 1 1 0 1]]
