In [1]:
# part 1
# feature extraction 特征提取 


In [13]:
from sklearn.feature_extraction import DictVectorizer

# DictVectorizer: feature extraction from a list of dicts.
# Useful when a dataset contains many categorical features.
# Terminology: matrix = 2-D array, vector = 1-D array.
#
# Key point: categorical values are one-hot encoded.

dictData = [
    {'city': "北京", 'temperature': 100},
    {'city': "上海", 'temperature': 60},
    {'city': "深圳", 'temperature': 30},
]

# 1. Instantiate the transformers: the first keeps the default (sparse)
#    output, the second is asked for a dense array via sparse=False.
transfer_sparse, transfer = DictVectorizer(), DictVectorizer(sparse=False)

# 2. fit_transform() returns a sparse matrix by default.
data_new_s = transfer_sparse.fit_transform(dictData)
data_new = transfer.fit_transform(dictData)

# Sparse result: feature names first, then one "(row, column)  value" line
# per stored entry.
print(transfer_sparse.feature_names_, data_new_s, sep="\n")
# (0, 1)	1.0
# (0, 3)	100.0
# (1, 0)	1.0
# (1, 3)	60.0
# (2, 2)	1.0
# (2, 3)	30.0

# Dense result: the same feature names, then the full 2-D array.
print(transfer.feature_names_, data_new, sep="\n")
# ['city=上海', 'city=北京', 'city=深圳', 'temperature']
# [[  0.   1.   0. 100.]
#  [  1.   0.   0.  60.]
#  [  0.   0.   1.  30.]]

# In the sparse printout, the leading tuple is the (row, column) position the
# value would occupy in the dense matrix; the number after it is the value.
# A sparse matrix stores only the non-zero entries, so when there are many
# categories it saves a significant amount of memory.


['city=上海', 'city=北京', 'city=深圳', 'temperature']
  (0, 1)	1.0
  (0, 3)	100.0
  (1, 0)	1.0
  (1, 3)	60.0
  (2, 2)	1.0
  (2, 3)	30.0
['city=上海', 'city=北京', 'city=深圳', 'temperature']
[[  0.   1.   0. 100.]
 [  1.   0.   0.  60.]
 [  0.   0.   1.  30.]]


In [26]:
# Text feature extraction (English)

from sklearn.feature_extraction.text import CountVectorizer
# Approach 1: CountVectorizer
# Returns a matrix of token counts (one row per document, one column per
# vocabulary term).

data = [
    "life is short,i like python",
    "life is too long,i dislike python",
]

# 1. Instantiate the transformer.
#    With the default token pattern only tokens of two or more characters are
#    kept, which is why the single-letter word "i" is absent from the
#    vocabulary.
transfer = CountVectorizer()

# 2. Learn the vocabulary and count the tokens in one step.
data_new = transfer.fit_transform(data)

# Show the vocabulary, then the counts converted to a dense array.
print(transfer.get_feature_names_out(), data_new.toarray(), sep="\n")


['dislike' 'is' 'life' 'like' 'long' 'python' 'short' 'too']
[[0 1 1 1 0 1 1 0]
 [1 1 1 0 1 1 0 1]]


In [5]:
# Text feature extraction (Chinese)
from sklearn.feature_extraction.text import CountVectorizer
import jieba

data = [
    "一种还是一种今天很残酷，明天更残酷，后天很美好，但绝对大部分是死在明天晚上，所以每个人不要放弃今天。",
    "我们看到的从很远星系来的光是在几百万年之前发出的，这样当我们看到宇宙时，我们是在看它的过去。",
    "如果只用一种方式了解某样事物，你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。",
]

# Chinese text has no spaces between words, so segment each sentence with
# jieba and re-join the tokens with single spaces; CountVectorizer can then
# split the result into terms.
data_cut = [" ".join(jieba.lcut(sentence)) for sentence in data]

# 1. Instantiate the transformer; stop_words is the list of terms to drop
#    from the vocabulary.
transfer = CountVectorizer(stop_words=["一种", "不要"])

# 2. Learn the vocabulary and count the tokens.
data_new = transfer.fit_transform(data_cut)

# Show the vocabulary, then the counts converted to a dense array.
print(transfer.get_feature_names_out(), data_new.toarray(), sep="\n")






['不会' '之前' '了解' '事物' '今天' '光是在' '几百万年' '发出' '取决于' '只用' '后天' '含义' '大部分'
 '如何' '如果' '宇宙' '我们' '所以' '放弃' '方式' '明天' '星系' '晚上' '某样' '残酷' '每个' '看到'
 '真正' '秘密' '绝对' '美好' '联系' '过去' '还是' '这样']
[[0 0 0 0 2 0 0 0 0 0 1 0 1 0 0 0 0 1 1 0 2 0 1 0 2 1 0 0 0 1 1 0 0 1 0]
 [0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 1 3 0 0 0 0 1 0 0 0 0 2 0 0 0 0 0 1 0 1]
 [1 0 4 3 0 0 0 0 1 1 0 1 0 1 1 0 1 0 0 1 0 0 0 1 0 0 0 2 1 0 0 1 0 0 0]]


In [10]:
# TF-IDF text feature extraction
# (term frequency - inverse document frequency)
# TF-IDF is used to pick out the keywords of a document.
#   tf  : term frequency - how often a term occurs in a document.
#   idf : inverse document frequency - a measure of how informative a term is
#         across the whole corpus.
# Textbook form:
#   idf    = log10(N / (df + 1))
#            N  = total number of documents in the corpus
#            df = number of documents that contain the term
#   tf-idf = tf * idf
# The larger a term's tf-idf value, the more useful it is for classification.
#
# NOTE: TfidfVectorizer's defaults differ from the textbook form. With
# smooth_idf=True it computes idf = ln((1 + N) / (1 + df)) + 1, and with
# norm="l2" every row is then scaled to unit Euclidean length. That is why the
# values printed below are not plain tf * log10(...) products.
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np  # not needed by this cell any more; kept in case later cells rely on np

# NOTE: this cell reuses `data_cut` (the jieba-segmented sentences) built in
# the Chinese CountVectorizer cell, so that cell has to be run first.
transfer = TfidfVectorizer()
data_new = transfer.fit_transform(data_cut)
print(transfer.get_feature_names_out())
print(data_new.toarray())
# data_new is a SciPy sparse matrix, so ask it for its maximum directly
# instead of routing the call through np.amax's fallback for non-ndarray
# objects. The printed value is the same.
print(data_new.max())  # largest tf-idf value


['一种' '不会' '不要' '之前' '了解' '事物' '今天' '光是在' '几百万年' '发出' '取决于' '只用' '后天' '含义'
 '大部分' '如何' '如果' '宇宙' '我们' '所以' '放弃' '方式' '明天' '星系' '晚上' '某样' '残酷' '每个'
 '看到' '真正' '秘密' '绝对' '美好' '联系' '过去' '还是' '这样']
[[0.30847454 0.         0.20280347 0.         0.         0.
  0.40560694 0.         0.         0.         0.         0.
  0.20280347 0.         0.20280347 0.         0.         0.
  0.         0.20280347 0.20280347 0.         0.40560694 0.
  0.20280347 0.         0.40560694 0.20280347 0.         0.
  0.         0.20280347 0.20280347 0.         0.         0.20280347
  0.        ]
 [0.         0.         0.         0.2410822  0.         0.
  0.         0.2410822  0.2410822  0.2410822  0.         0.
  0.         0.         0.         0.         0.         0.2410822
  0.55004769 0.         0.         0.         0.         0.2410822
  0.         0.         0.         0.         0.48216441 0.
  0.         0.         0.         0.         0.2410822  0.
  0.2410822 ]
 [0.12001469 0.15780489 0.         0