## 创建大数据集

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [20]:
from sklearn.datasets import make_classification

x, y = make_classification(n_samples=10**5,
                           n_features=5,
                           n_informative=3,
                           random_state=101)
D = np.c_[y, x]
np.savetxt('data_set/large_dataset_10_5.csv', D, delimiter=',')
del (D, x, y)
x, y = make_classification(n_samples=10**6,
                           n_features=5,
                           n_informative=3,
                           random_state=101)
D = np.c_[y, x]
np.savetxt('data_set/large_dataset_10_6.csv', D, delimiter=',')
del (D, x, y)
x, y = make_classification(n_samples=10**7,
                           n_features=5,
                           n_informative=3,
                           random_state=101)
D = np.c_[y, x]
np.savetxt('data_set/large_dataset_10_7.csv', D, delimiter=',')
del (D, x, y)

In [34]:
import os

os.remove('data_set/large_dataset_10_5.csv')
os.remove('data_set/large_dataset_10_6.csv')
os.remove('data_set/large_dataset_10_7.csv')

In [12]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import Perceptron, SGDClassifier, PassiveAggressiveClassifier
from datetime import datetime

In [27]:
classifiers = {
    'SGD hinge loss':
        SGDClassifier(loss='hinge', random_state=101, max_iter=10),
    'SGD log loss':
        SGDClassifier(loss='log', random_state=101, max_iter=10),
    'Perceptron':
        Perceptron(random_state=101, max_iter=100),
    'Bernoulli':
        BernoulliNB(),
    'Gaussian':
        GaussianNB(),
    'PassiveAggressive':
        PassiveAggressiveClassifier(random_state=101, max_iter=10)
}

In [29]:
for algo in classifiers:
    start = datetime.now()
    streaming = pd.read_csv('data_set/large_dataset_10_6.csv',
                            header=None,
                            chunksize=1000)
    model = classifiers[algo]
    cumulative_accuracy = []
    for n, chunk in enumerate(streaming):
        y = chunk.iloc[:, 0]
        X = chunk.iloc[:, 1:]
        if n >= 800:
            cumulative_accuracy.append(model.score(X, y))
        model.partial_fit(X, y, classes=np.unique(y))
    elapsed_time = datetime.now() - start
    print('{} : mean accuracy {:.2f}% in {} seconds'.format(
        algo, np.mean(cumulative_accuracy)*100, elapsed_time.total_seconds()))

SGD hinge loss : mean accuracy 76.66% in 10.005246 seconds
SGD log loss : mean accuracy 74.41% in 10.134123 seconds
Perceptron : mean accuracy 66.97% in 9.312127 seconds
Bernoulli : mean accuracy 65.29% in 9.634428 seconds
Gaussian : mean accuracy 86.35% in 9.904968 seconds
PassiveAggressive : mean accuracy 71.08% in 10.08805 seconds


In [30]:
l7=pd.read_csv('data_set/large_dataset_10_7.csv')

In [31]:
l7.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999999 entries, 0 to 9999998
Data columns (total 6 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   1.000000000000000000e+00   float64
 1   -4.976686997050616679e-01  float64
 2   -1.810883063255535275e+00  float64
 3   -8.626345720665737904e-01  float64
 4   2.328431776653829832e-01   float64
 5   -4.629339945485108743e+00  float64
dtypes: float64(6)
memory usage: 457.8 MB


In [32]:
l6=pd.read_csv('data_set/large_dataset_10_6.csv')

In [33]:
l6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999999 entries, 0 to 999998
Data columns (total 6 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   1.000000000000000000e+00   999999 non-null  float64
 1   2.869647574340774998e+00   999999 non-null  float64
 2   -1.020285386650018999e+00  999999 non-null  float64
 3   -2.092419718316489874e+00  999999 non-null  float64
 4   -3.937372437152131255e-02  999999 non-null  float64
 5   -2.629424326924810096e+00  999999 non-null  float64
dtypes: float64(6)
memory usage: 45.8 MB
