In [17]:
import numpy as np 
import scipy
import sklearn 

In [18]:
feature_matrix = np.load('F:/dataset/feature_matrix_save.npy')
class_list = np.load('F:/dataset/class_result_save.npy')

### 数据归一化

In [19]:
from sklearn import preprocessing
##归一化
normalized_x=preprocessing.normalize(feature_matrix)
print(normalized_x)
##标准化
standardized_x=preprocessing.scale(feature_matrix)
print(standardized_x)

[[0.31622777 0.31622777 0.31622777 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
[[ 7.38617327  4.24099011 14.10673598 ... -0.0316386  -0.0316386
  -0.0316386 ]
 [-0.1353881  -0.15381829 -0.07088812 ... -0.0316386  -0.0316386
  -0.0316386 ]
 [-0.1353881  -0.15381829 -0.07088812 ... -0.0316386  -0.0316386
  -0.0316386 ]
 ...
 [-0.1353881  -0.15381829 -0.07088812 ... -0.0316386  -0.0316386
  -0.0316386 ]
 [-0.1353881  -0.15381829 -0.07088812 ... -0.0316386  -0.0316386
  -0.0316386 ]
 [-0.1353881  -0.15381829 -0.07088812 ... -0.0316386  -0.0316386
  -0.0316386 ]]




### 特征选择

在解决一个实际问题的过程中，选择合适的特征或者构建特征的能力特别重要。这成为特征选择或者特征工程。
特征选择时一个很需要创造力的过程，更多的依赖于直觉和专业知识，并且有很多现成的算法来进行特征的选择。

In [22]:
from sklearn.ensemble import RandomTreesEmbedding
model = RandomTreesEmbedding()
print(feature_matrix.shape) # 原特征矩阵规模
feature_matrix =(model.fit(feature_matrix, class_list)).transform(feature_matrix)
print("************************************************")
print(feature_matrix.shape) # 特征选择后 特征矩阵的规模

(1000, 3992)
************************************************
(1000, 76)


### 特征提取

用TFIDF算法来计算特征词的权重值是表示当一个词在这篇文档中出现的频率越高，同时在其他文档中出现的次数越少，则表明该词对于表示这篇文档的区分能力越强，所以其权重值就应该越大。

In [24]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
feature_matrix = tfidf_transformer.fit_transform(feature_matrix).toarray()

### 朴素贝叶斯(Naive Bayes)
朴素贝叶斯是一个很著名的机器学习算法，主要是根据训练样本的特征来计算各个类别的概率，在多分类问题上用的比较多。

In [25]:
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
# 构建朴素贝叶斯模型
model = GaussianNB()
model.fit(feature_matrix, class_list)
print(model)
# 使用测试集进行测试(此处将训练集做测试集)
expected = class_list
predicted = model.predict(feature_matrix)
# 输出测试效果
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

GaussianNB(priors=None)
             precision    recall  f1-score   support

          0       1.00      0.08      0.15       500
          1       0.52      1.00      0.68       500

avg / total       0.76      0.54      0.42      1000

[[ 40 460]
 [  0 500]]


### k近邻(k-Nearest Neighbours)
k近邻算法常常被用作是分类算法一部分，比如可以用它来评估特征，在特征选择上我们可以用到它。

In [26]:
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
# 构建knn模型
model = KNeighborsClassifier()
model.fit(feature_matrix, class_list)
print(model)
# 使用测试集进行测试(此处将训练集做测试集)
expected = class_list
predicted = model.predict(feature_matrix)
# 输出测试效果
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
             precision    recall  f1-score   support

          0       0.51      0.99      0.68       500
          1       0.88      0.06      0.11       500

avg / total       0.70      0.53      0.39      1000

[[496   4]
 [470  30]]


### 决策树(Decision Tree)
分类与回归树(Classification and Regression Trees ,CART)算法常用于特征含有类别信息的分类或者回归问题，这种方法非常适用于多分类情况。

In [27]:
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
# 构建决策数模型
model = DecisionTreeClassifier()
model.fit(feature_matrix, class_list)
print(model)
# 使用测试集进行测试(此处将训练集做测试集)
expected = class_list
predicted = model.predict(feature_matrix)
# 输出测试效果
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
             precision    recall  f1-score   support

          0       0.94      0.09      0.17       500
          1       0.52      0.99      0.69       500

avg / total       0.73      0.54      0.43      1000

[[ 46 454]
 [  3 497]]
