# Tạo mới đặc trưng - Feature extraction

## 1. Sử dụng biến đổi toán học

In [1]:
import pandas as pd

In [2]:
# Small example frame with two integer columns, built row by row
# (each tuple is one observation: verlocity, time).
df = pd.DataFrame(
    data=[(7, 8), (9, 10), (11, 6), (20, 4), (10, 3)],
    columns=["verlocity", "time"],
)
df

Unnamed: 0,verlocity,time
0,7,8
1,9,10
2,11,6
3,20,4
4,10,3


In [3]:
# New feature from a mathematical transform: element-wise product of
# the 'verlocity' and 'time' columns.
df['distance'] = df['verlocity'].mul(df['time'])
df

Unnamed: 0,verlocity,time,distance
0,7,8,56
1,9,10,90
2,11,6,66
3,20,4,80
4,10,3,30


## 2. Đếm số lần, tần suất xuất hiện

In [4]:
# Example frame: a numeric column plus a categorical colour label per row.
df = pd.DataFrame(dict(
    Value=[100, 150, 50, 200, 100, 100],
    Color=["Red", "Red", "Blue", "Red", "Green", "Blue"],
))
df

Unnamed: 0,Value,Color
0,100,Red
1,150,Red
2,50,Blue
3,200,Red
4,100,Green
5,100,Blue


In [9]:
# Frequency table of the 'Color' column as a plain dict {colour: occurrences},
# ordered from most to least frequent (value_counts' default ordering).
color_count = (
    df['Color']
    .value_counts(sort=True)
    .to_dict()
)
color_count

{'Red': 3, 'Blue': 2, 'Green': 1}

In [10]:
# Look up each row's colour in the frequency dict and store the count
# next to it as a new feature column.
color_frequency = df['Color'].map(color_count)
df['Color_count'] = color_frequency
df

Unnamed: 0,Value,Color,Color_count
0,100,Red,3
1,150,Red,3
2,50,Blue,2
3,200,Red,3
4,100,Green,1
5,100,Blue,2


## 3. Đếm số lượng theo nhiều cột

In [11]:
# 0/1 indicator columns, one per vehicle type; each inner list is one row
# in the order (Bus, Car, Motobike).
df = pd.DataFrame(
    [
        [0, 0, 0],
        [0, 0, 1],
        [0, 0, 0],
        [1, 0, 0],
        [0, 1, 1],
    ],
    columns=["Bus", "Car", "Motobike"],
)
df

Unnamed: 0,Bus,Car,Motobike
0,0,0,0
1,0,0,1
2,0,0,0
3,1,0,0
4,0,1,1


In [13]:
vehicles = ['Car', 'Motobike', 'Bus']
# Row-wise maximum over the 0/1 vehicle columns: the result is 1 when at
# least one of them is 1 for that row, and 0 otherwise.
df['Use_vehicle'] = df.loc[:, vehicles].max(axis='columns')
df

Unnamed: 0,Bus,Car,Motobike,Use_vehicle
0,0,0,0,0
1,0,0,1,1
2,0,0,0,0
3,1,0,0,1
4,0,1,1,1


## 4. Phân rã đặc trưng từ chuỗi (string) có cấu trúc

In [14]:
# Single-column frame of free-text OS descriptions ("<company> <os name/version>").
df = pd.Series(
    ['Apple MacOS X 10.0', 'Microsoft Windows 11', 'Mcrosoft Window XP'],
    name="OSInfo",
).to_frame()
df

Unnamed: 0,OSInfo
0,Apple MacOS X 10.0
1,Microsoft Windows 11
2,Mcrosoft Window XP


In [21]:
# Split on the first space only (n=1): everything before it goes to column 0,
# the remainder of the string to column 1.
(
    df['OSInfo']
    .str.split(pat=' ', n=1, expand=True)
)

Unnamed: 0,0,1
0,Apple,MacOS X 10.0
1,Microsoft,Windows 11
2,Mcrosoft,Window XP


In [23]:
# Same first-space split as before, with the positional column labels 0/1
# replaced by descriptive names.
(
    df['OSInfo']
    .str.split(pat=' ', n=1, expand=True)
    .rename({0: 'Company', 1: 'OS'}, axis='columns')
)

Unnamed: 0,Company,OS
0,Apple,MacOS X 10.0
1,Microsoft,Windows 11
2,Mcrosoft,Window XP


In [24]:
# Build the Company/OS columns from the first-space split, then attach them
# to the original frame by index. The result is only displayed; df itself
# is not modified.
company_os = (
    df['OSInfo']
    .str.split(pat=' ', n=1, expand=True)
    .rename(columns={0: 'Company', 1: 'OS'})
)
df.join(company_os)

Unnamed: 0,OSInfo,Company,OS
0,Apple MacOS X 10.0,Apple,MacOS X 10.0
1,Microsoft Windows 11,Microsoft,Windows 11
2,Mcrosoft Window XP,Mcrosoft,Window XP


## 5. Tổng hợp đặc trưng

In [26]:
# Two categorical columns assembled side by side from named Series.
df = pd.concat(
    [
        pd.Series(["Toyota", "Audi", "Honda", "Honda", "Toyota", "Mercedes"], name="Make"),
        pd.Series(["Senda", "Senda", "Crossover", "Hatchback", "SUV", "Sendan"], name="Type"),
    ],
    axis=1,
)
df

Unnamed: 0,Make,Type
0,Toyota,Senda
1,Audi,Senda
2,Honda,Crossover
3,Honda,Hatchback
4,Toyota,SUV
5,Mercedes,Sendan


In [27]:
# Combine the two categorical columns into one composite feature,
# joined with an underscore (e.g. "Toyota_SUV").
df["Make_Type"] = df["Make"].str.cat(df["Type"], sep="_")
df

Unnamed: 0,Make,Type,Make_Type
0,Toyota,Senda,Toyota_Senda
1,Audi,Senda,Audi_Senda
2,Honda,Crossover,Honda_Crossover
3,Honda,Hatchback,Honda_Hatchback
4,Toyota,SUV,Toyota_SUV
5,Mercedes,Sendan,Mercedes_Sendan


## 6. Tổng hợp đặc trưng theo nhóm

In [28]:
# Example frame: one salary observation per row, labelled with its city.
df = pd.DataFrame(dict(
    City=["Danang", "HCM", "Hanoi", "HCM", "HCM", "Hanoi", "Danang"],
    Salary=[10, 20, 15, 8, 12, 15, 14],
))
df

Unnamed: 0,City,Salary
0,Danang,10
1,HCM,20
2,Hanoi,15
3,HCM,8
4,HCM,12
5,Hanoi,15
6,Danang,14


In [29]:
# Mean salary per city: one value per group, indexed by City.
df.groupby("City")["Salary"].mean()

Unnamed: 0_level_0,Salary
City,Unnamed: 1_level_1
Danang,12.0
HCM,13.333333
Hanoi,15.0


In [30]:
# transform (unlike agg) returns one value per original row: each row gets
# the mean salary of its own city, aligned to df's index.
salary_by_city = df.groupby("City")["Salary"]
salary_by_city.transform('mean')

Unnamed: 0,Salary
0,12.0
1,13.333333
2,15.0
3,13.333333
4,13.333333
5,15.0
6,12.0


In [32]:
# Store each row's city-level mean salary as a new group-aggregate feature.
city_salary = df.groupby("City")["Salary"]
df['AvgSalary'] = city_salary.transform('mean')
df

Unnamed: 0,City,Salary,AvgSalary
0,Danang,10,12.0
1,HCM,20,13.333333
2,Hanoi,15,15.0
3,HCM,8,13.333333
4,HCM,12,13.333333
5,Hanoi,15,15.0
6,Danang,14,12.0


## 7. Đặc trưng cụm (phân khúc)

In [42]:
# Fresh copy of the city/salary example so this section starts without the
# columns added in the previous one.
df = pd.DataFrame.from_dict(
    {
        "City": ["Danang", "HCM", "Hanoi", "HCM", "HCM", "Hanoi", "Danang"],
        "Salary": [10, 20, 15, 8, 12, 15, 14],
    },
    orient="columns",
)
df

Unnamed: 0,City,Salary
0,Danang,10
1,HCM,20
2,Hanoi,15
3,HCM,8
4,HCM,12
5,Hanoi,15
6,Danang,14


In [44]:
from sklearn.cluster import KMeans
import numpy as np

# Cluster the salaries into 3 groups and keep the cluster id as a feature.
# random_state is pinned so that Restart & Run All reproduces the same
# assignment; without it the centroid initialisation is random and the
# cluster ids (and possibly the grouping) can change from run to run.
kmeans = KMeans(n_clusters=3, n_init='auto', random_state=42)
# KMeans needs a 2-D (n_samples, n_features) input, so select the column as a
# one-column frame before converting to an array.
df['SalCluster'] = kmeans.fit_predict(df[['Salary']].to_numpy())
# Cluster ids are arbitrary labels rather than ordered quantities, so store
# them as a categorical column instead of integers.
df['SalCluster'] = df["SalCluster"].astype('category')
df

Unnamed: 0,City,Salary,SalCluster
0,Danang,10,1
1,HCM,20,2
2,Hanoi,15,0
3,HCM,8,1
4,HCM,12,0
5,Hanoi,15,0
6,Danang,14,0


## 8. Sử dụng đặc trưng giảm chiều với PCA

In [49]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn import datasets
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the iris measurements and standardise every feature (zero mean, unit
# variance) before PCA; the scaled values are wrapped in a DataFrame that
# carries the original feature names.
iris = datasets.load_iris()
scaler = StandardScaler()
X = pd.DataFrame(
    data=scaler.fit_transform(iris.data),
    columns=iris['feature_names'],
)

# Project the standardised features onto the first two principal components.
pca = PCA(n_components=2)
X_new = pca.fit_transform(X)

# Show the scaled features with the two component scores appended as columns.
X.assign(PC1=X_new[:, 0], PC2=X_new[:, 1])

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),PC1,PC2
0,-0.900681,1.019004,-1.340227,-1.315444,-2.264703,0.480027
1,-1.143017,-0.131979,-1.340227,-1.315444,-2.080961,-0.674134
2,-1.385353,0.328414,-1.397064,-1.315444,-2.364229,-0.341908
3,-1.506521,0.098217,-1.283389,-1.315444,-2.299384,-0.597395
4,-1.021849,1.249201,-1.340227,-1.315444,-2.389842,0.646835
...,...,...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832,1.870503,0.386966
146,0.553333,-1.282963,0.705921,0.922303,1.564580,-0.896687
147,0.795669,-0.131979,0.819596,1.053935,1.521170,0.269069
148,0.432165,0.788808,0.933271,1.448832,1.372788,1.011254
