Handling Missing Data using SimpleImputer

In [9]:
from sklearn.impute import SimpleImputer
import numpy as np

# 3x3 toy matrix in which every column has exactly one missing entry (NaN).
data = np.array(
    [
        [1, 2, np.nan],
        [4, np.nan, 6],
        [np.nan, 8, 9],
    ]
)

# Replace each NaN with the mean of the observed values in its column.
# Other available strategies: 'median', 'most_frequent', or
# 'constant' (used together with fill_value=<x>).
imputer = SimpleImputer(strategy="mean")

# Learn the per-column statistic and fill the gaps in a single call.
# A new array is returned; `data` itself still contains its NaNs.
imputedData = imputer.fit_transform(data)

# Show the original matrix first, then the imputed one.
print(data, imputedData, sep="\n")

[[ 1.  2. nan]
 [ 4. nan  6.]
 [nan  8.  9.]]
[[1.  2.  7.5]
 [4.  5.  6. ]
 [2.5 8.  9. ]]


Handling Missing Data using KNNImputer

In [14]:
from sklearn.impute import KNNImputer
import numpy as np

# Four samples with three features; every row is missing exactly one value.
data = np.array(
    [
        [1, 2, np.nan],
        [4, np.nan, 6],
        [np.nan, 8, 9],
        [4, np.nan, 2],
    ]
)

# Each missing value is estimated from its 2 nearest neighbouring samples
# (n_neighbors=2) instead of from a global column statistic.
knnImputer = KNNImputer(n_neighbors=2)

# Fit on the data and fill the gaps in one call; `data` is left untouched.
imputedData = knnImputer.fit_transform(data)

# Show the original matrix first, then the imputed one.
print(data, imputedData, sep="\n")

[[ 1.  2. nan]
 [ 4. nan  6.]
 [nan  8.  9.]
 [ 4. nan  2.]]
[[1.  2.  4. ]
 [4.  5.  6. ]
 [2.5 8.  9. ]
 [4.  5.  2. ]]


Using fit() and transform() with MinMaxScaler

In [21]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Four samples, two features.
data = np.array([[10, 20], [15, 30], [25, 40], [30, 60]])

scaler = MinMaxScaler()

# Step 1 - fit(): learn each column's minimum and maximum from the data.
scaler.fit(data)

# Step 2 - transform(): rescale with the statistics learned in step 1.
# With the default settings every column is mapped onto the range [0, 1].
scaled_data = scaler.transform(data)

print(scaled_data)

[[0.   0.  ]
 [0.25 0.25]
 [0.75 0.5 ]
 [1.   1.  ]]


Feature Extraction using CountVectorizer

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
# Small corpus: one string per document.
documents = [
    "Your Service is very very bad",
    "Tcs is service based company.",
    "You work in bad service company.",
]

# Bag-of-words model: builds a vocabulary from the corpus and counts how often
# each vocabulary term occurs in each document.
count_vectorizer = CountVectorizer()

# Rows correspond to documents, columns to vocabulary indices.
count_matrix = count_vectorizer.fit_transform(documents)

# vocabulary_ maps each (lower-cased, punctuation-free) token to its column.
print("Vocabulary:", count_vectorizer.vocabulary_)

# The result is sparse; convert it to a dense array for a readable printout.
print(count_matrix.toarray())

Vocabulary: {'your': 10, 'service': 5, 'is': 4, 'very': 7, 'bad': 0, 'tcs': 6, 'based': 1, 'company': 2, 'you': 9, 'work': 8, 'in': 3}
[[1 0 0 0 1 1 0 2 0 0 1]
 [0 1 1 0 1 1 1 0 0 0 0]
 [1 0 1 1 0 1 0 0 1 1 0]]


Feature Extraction using DictVectorizer

In [31]:
from sklearn.feature_extraction import DictVectorizer

# One dict per sample, mixing a numeric feature with a string-valued one.
data = [
    {"age": 30, "gender": "female"},
    {"age": 12, "gender": "male"},
    {"age": 35, "gender": "male"},
]

# Numeric values are kept as-is; each distinct string value becomes its own
# indicator column named "<feature>=<value>".
vectorizer = DictVectorizer()

# The result is a SciPy sparse matrix (one row per sample).
features_matrix = vectorizer.fit_transform(data)

# Column names, in the same order as the matrix columns.
feature_names = vectorizer.get_feature_names_out()

# Printing the sparse matrix lists only its stored (row, column) entries.
print(feature_names, features_matrix, sep="\n")

['age' 'gender=female' 'gender=male']
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6 stored elements and shape (3, 3)>
  Coords	Values
  (0, 0)	30.0
  (0, 1)	1.0
  (1, 0)	12.0
  (1, 2)	1.0
  (2, 0)	35.0
  (2, 2)	1.0


Encoding Categorical Variables
Label Encoding

In [3]:
from sklearn.preprocessing import LabelEncoder

# Categorical labels to be converted into integer codes.
data = ["red", "blue", "green", "blue", "red"]

# Each distinct label gets one integer; codes follow the sorted label order,
# so here blue -> 0, green -> 1, red -> 2.
label_encoder = LabelEncoder()

# Learn the label set and encode the list in one call.
encoded_data = label_encoder.fit_transform(data)

print(encoded_data)


[2 0 1 0 2]


One-Hot Encoding

In [7]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Sample data as a single-column 2-D array (one row per sample), which is the
# layout the encoder expects: shape (n_samples, n_features).
data = np.array([["red"], ["blue"], ["green"], ["blue"], ["red"]])

# sparse_output=False makes the encoder return a dense NumPy array rather
# than a sparse matrix, so the result prints as a plain 0/1 table.
onehot_encoder = OneHotEncoder(sparse_output=False)

# Learn the categories and encode in one call. There is one output column per
# category, in sorted order: blue, green, red.
encoded_data = onehot_encoder.fit_transform(data)

print(encoded_data)


[[0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]]
