<a href="https://colab.research.google.com/github/zainali60/machine-learning-with-python-practice/blob/main/chap4_handling_numerical_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn import preprocessing

In [2]:
# Build a single-column feature: five observations, one value each
feature = np.array([-500.5, -100.1, 0, 100.1, 900.9]).reshape(-1, 1)
# Instantiate a scaler that rescales values into the range 0 to 1
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

In [3]:
# Show the scaler's configuration (the cell's last expression is rendered as output)
minmax_scale


MinMaxScaler(copy=True, feature_range=(0, 1))

In [4]:
# Learn the min/max from the feature, then rescale that same feature
scaled_feature = minmax_scale.fit(feature).transform(feature)

In [5]:
# Show the rescaled feature; the smallest value maps to 0 and the largest to 1
scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

In [6]:
# Build a single-column feature whose last value is far larger than the rest
x = np.array([-1000.1, -200.2, 500.5, 600.6, 9000.9]).reshape(-1, 1)
# Instantiate a standardizing scaler with its default settings
scaler = preprocessing.StandardScaler()

In [7]:
# Show the scaler's configuration (the cell's last expression is rendered as output)
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [8]:
# Fit the scaler on x, then standardize x with the fitted statistics
standardized = scaler.fit(x).transform(x)
standardized

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

In [9]:
# Summarize the standardized feature, then report both statistics
standardized_mean = standardized.mean()
standardized_std = standardized.std()
print("Mean:", round(standardized_mean))
print("Standard deviation:", standardized_std)

Mean: 0
Standard deviation: 1.0


In [10]:
# Create a RobustScaler with default settings, as an alternative to
# StandardScaler for the outlier-heavy feature `x`
robust_scaler = preprocessing.RobustScaler()

In [11]:
# Fit the robust scaler on `x` and transform it in one step.
# In the recorded output the middle value (500.5) maps to 0.
robust_scaler.fit_transform(x)

array([[-1.87387612],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61488511]])

In [12]:
from sklearn.preprocessing import Normalizer

In [13]:
# Feature matrix: five observations (rows) with two features (columns) each
features = np.array(
    [
        [0.5, 0.5],
        [1.1, 3.4],
        [1.5, 20.2],
        [1.63, 34.4],
        [10.9, 3.3],
    ]
)
# Normalizer configured for the L2 (Euclidean) norm
normalizer = Normalizer(norm="l2")
normalizer

Normalizer(copy=True, norm='l2')

In [14]:
# Transform feature matrix: each observation (row) is rescaled to unit L2 norm.
# NOTE(review): transform is called without a prior fit; the recorded output
# shows that this works for Normalizer here.
normalizer.transform(features)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [15]:
# Rescale every observation to unit L2 norm with a throwaway normalizer
features_l2_norm = (
    Normalizer(norm="l2")
    .transform(features)
)
# Display the normalized matrix
features_l2_norm

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [16]:
# Rescale every observation to unit L1 norm with a throwaway normalizer
features_l1_norm = (
    Normalizer(norm="l1")
    .transform(features)
)
# Display the normalized matrix
features_l1_norm

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

In [17]:
 # Print sum
first_value, second_value = features_l1_norm[0]
print("Sum of the first observation's values:", first_value + second_value)

Sum of the first observation's values: 1.0


In [18]:
from sklearn.preprocessing import PolynomialFeatures

In [22]:
# Feature matrix: three identical observations [2, 3]
features = np.array([[2, 3]] * 3)

# Generator for all polynomial terms up to degree 3, without the constant column
polynomial_interaction = PolynomialFeatures(include_bias=False, degree=3)
polynomial_interaction

PolynomialFeatures(degree=3, include_bias=False, interaction_only=False,
                   order='C')

In [23]:
# Create polynomial features. For each row [a, b] the recorded output columns
# are a, b, a^2, a*b, b^2, a^3, a^2*b, a*b^2, b^3 (degree <= 3, no bias column).
polynomial_interaction.fit_transform(features)


array([[ 2.,  3.,  4.,  6.,  9.,  8., 12., 18., 27.],
       [ 2.,  3.,  4.,  6.,  9.,  8., 12., 18., 27.],
       [ 2.,  3.,  4.,  6.,  9.,  8., 12., 18., 27.]])

In [24]:
from sklearn.preprocessing import FunctionTransformer

In [25]:
# Feature matrix: three identical observations [2, 3]
features = np.array([[2, 3] for _ in range(3)])
# Define a simple function
def add_ten(x):
    """Return ``x`` with 10 added to it.

    The addition is delegated to ``x + 10``, so scalars, NumPy arrays and
    pandas objects are all shifted element-wise.
    """
    shifted = x + 10
    return shifted
# Wrap add_ten in a transformer object
ten_transformer = FunctionTransformer(func=add_ten)
# Apply the wrapped function to the feature matrix
ten_transformer.transform(features)

array([[12, 13],
       [12, 13],
       [12, 13]])

In [27]:
import pandas as pd
# Wrap the feature matrix in a DataFrame with named columns
df = pd.DataFrame(
    data=features,
    columns=["Feature_1", "Feature_2"],
)
df

Unnamed: 0,Feature_1,Feature_2
0,2,3
1,2,3
2,2,3


In [28]:
# Apply add_ten across the DataFrame; in the recorded output every value is increased by 10
df.apply(add_ten)

Unnamed: 0,Feature_1,Feature_2
0,12,13
1,12,13
2,12,13


In [29]:
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

In [30]:
# Simulate 10 observations with 2 features around a single center;
# the target labels returned by make_blobs are discarded
features, _ = make_blobs(
    n_samples=10,
    n_features=2,
    centers=1,
    random_state=1,
)


In [31]:
# Overwrite both values of the first observation with an extreme value
features[0, :] = 10000

In [32]:
# Create detector. contamination is the assumed share of outliers
# (0.1 of 10 observations = 1 observation). random_state is fixed so the
# randomized covariance estimate gives the same result on every run.
outlier_detector = EllipticEnvelope(contamination=.1, random_state=0)
# Fit detector
outlier_detector.fit(features)
# Predict outliers: -1 marks an outlier, 1 marks an inlier
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [33]:
# Create one feature: take the first column of the simulated matrix as a 1-D array.
# NOTE(review): this rebinds `feature`, which an earlier cell used for the min-max example.
feature = features[:,0]
# Create a function to return index of outliers
def indicies_of_outliers(x, iqr_multiplier=1.5):
    """Return the positions of values lying outside the IQR fences.

    A value is an outlier when it is below ``Q1 - iqr_multiplier * IQR`` or
    above ``Q3 + iqr_multiplier * IQR``, where Q1 and Q3 are the 25th and
    75th percentiles of ``x`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    x : array-like
        Numeric values to inspect.
    iqr_multiplier : float, default 1.5
        Width of the fences in units of IQR. The default reproduces the
        previously hard-coded value.

    Returns
    -------
    tuple of ndarray
        The result of ``np.where`` on the outlier mask: one index array per
        dimension of ``x``.
    """
    values = np.asarray(x)
    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * iqr_multiplier)
    upper_bound = q3 + (iqr * iqr_multiplier)
    return np.where((values > upper_bound) | (values < lower_bound))
# Run function on the single feature; the recorded output flags position 0,
# the observation that was overwritten with 10000
indicies_of_outliers(feature)

(array([0]),)

In [34]:
# Build the houses table in one step; the last row is an implausible listing
houses = pd.DataFrame(
    {
        'Price': [534433, 392333, 293222, 4322032],
        'Bathrooms': [2, 3.5, 2, 116],
        'Square_Feet': [1500, 2500, 1500, 48000],
    }
)

In [35]:
# Keep only the rows with fewer than 20 bathrooms
houses.loc[houses['Bathrooms'] < 20]

Unnamed: 0,Price,Bathrooms,Square_Feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [43]:
# Add a flag column: 0 where Bathrooms is below 20, otherwise 1
has_plausible_bathrooms = houses["Bathrooms"] < 20
houses["Outlier"] = np.where(has_plausible_bathrooms, 0, 1)
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


In [44]:
# Log feature: take the natural log of the whole column in one vectorized
# call instead of looping over its values in Python
houses["Log_Of_Square_Feet"] = np.log(houses["Square_Feet"])
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier,Log_Of_Square_Feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


In [45]:
from sklearn.preprocessing import Binarizer


In [46]:
# Ages of five people, one observation per row
age = np.array([[6],[12],[20],[36],[65]])
# Create binarizer. threshold must be passed by keyword: current scikit-learn
# rejects it as a positional argument.
binarizer = Binarizer(threshold=18)

In [47]:
# Transform feature: ages greater than the threshold of 18 become 1, the rest 0
binarizer.fit_transform(age)

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [48]:
# Bin feature: each age is replaced by the index of the bin it falls into.
# The edges 20, 30, 64 are left-inclusive: in the recorded output 20 lands in bin 1.
np.digitize(age, bins=[20,30,64])

array([[0],
       [0],
       [1],
       [2],
       [3]])

In [50]:
# A single bin edge acts as a threshold: ages below 18 map to 0, the rest to 1
np.digitize(age, bins=[18])

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [51]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

In [52]:
# Make simulated feature matrix: 50 observations, 2 features, 3 blob centers
features, _ = make_blobs(n_samples=50,
                         n_features=2,
                         centers=3,
                         random_state=1)
# Create DataFrame
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])
# Make k-means clusterer. n_init is pinned because its default changed across
# scikit-learn releases (10 -> "auto"); relying on the default triggers a
# warning on some versions and different behaviour on others.
clusterer = KMeans(n_clusters=3, n_init=10, random_state=0)
# Fit clusterer
clusterer.fit(features)
# Predict values: store each observation's cluster label as a new column
dataframe["group"] = clusterer.predict(features)
# View first few observations
dataframe.head(5)

Unnamed: 0,feature_1,feature_2,group
0,-9.877554,-3.336145,2
1,-7.28721,-8.353986,0
2,-6.943061,-7.023744,0
3,-7.440167,-8.791959,0
4,-6.641388,-8.075888,0


In [53]:
# Feature matrix whose last observation is missing its first value
features = np.array(
    [
        [1.1, 11.1],
        [2.2, 22.2],
        [3.3, 33.3],
        [4.4, 44.4],
        [np.nan, 55],
    ]
)
features

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4],
       [ nan, 55. ]])

In [54]:
# Keep only the observations (rows) in which every value is present
features[(~np.isnan(features)).all(axis=1)]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

In [55]:
# Load the feature matrix into a DataFrame with named columns
dataframe = pd.DataFrame(data=features, columns=["feature_1", "feature_2"])
# Drop every row that contains at least one missing value
dataframe.dropna(axis=0, how="any")

Unnamed: 0,feature_1,feature_2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4
