In [1]:
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report

In [2]:
X, y = make_classification(n_samples=50000, n_features=10, n_classes=3, n_clusters_per_class=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

In [3]:
clf1 = SGDClassifier(max_iter=1000, tol=0.01)
clf1.fit(X_train, y_train)

SGDClassifier(tol=0.01)

In [4]:
train_score = clf1.score(X_train, y_train)
test_score = clf1.score(X_test, y_test)
train_score, test_score

(0.8951058823529412, 0.8961333333333333)

In [5]:
print(classification_report(y_test, clf1.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2480
           1       0.82      0.90      0.86      2531
           2       0.90      0.80      0.85      2489

    accuracy                           0.90      7500
   macro avg       0.90      0.90      0.90      7500
weighted avg       0.90      0.90      0.90      7500



## Using partial_fit method

In [6]:
train_data = np.concatenate((X_train, y_train[:, np.newaxis]), axis=1)

In [7]:
train_data[0:5]

array([[ 1.24042289,  0.10002442, -0.44983233, -1.92482739,  0.57771092,
         0.65775891,  2.02609801,  1.7058331 ,  2.16055171,  0.63410265,
         0.        ],
       [-0.32757087, -0.78129376, -0.42310614,  1.39521212,  1.01430331,
         0.10536407, -1.89602304, -1.21252172,  0.4683628 , -0.94102037,
         1.        ],
       [ 0.82448869, -0.11673529, -0.00438467, -1.40797018, -1.93312086,
        -0.06397497, -0.62749607,  1.24430831, -0.10589255,  0.53361687,
         0.        ],
       [-1.34712599, -0.63465092, -0.15818871,  0.44066522, -1.45568132,
         0.75359541,  1.07637709, -0.43508308, -0.0181856 ,  0.75027002,
         2.        ],
       [-0.82155069,  0.46849312, -1.13541348,  0.49511237,  0.14247327,
        -0.61890584,  1.35470773, -0.45983958,  0.47064362,  0.26011256,
         2.        ]])

In [8]:
a = np.asarray(train_data)
np.savetxt('train_data.csv', a, delimiter=',')

In [9]:
clf2 = SGDClassifier(max_iter=1000, tol=0.01)

Now, read from this file in chunks using pandas read_csv method

In [10]:
import pandas as pd

chunksize = 1000

iter = 1
for train_df in pd.read_csv("train_data.csv", chunksize=chunksize, iterator=True):
  X_train_partial = train_df.iloc[:, 0:10]  # since there are 10 features in dataset
  y_train_partial = train_df.iloc[:, 10] # last column is the label
  # Need to pass the classes in the first iteration, since it's not guaranteed to have the same classes in all chunks of data.
  if iter == 1:
    clf2.partial_fit(X_train_partial, y_train_partial, classes=np.array([0, 1, 2]))
  else:
    clf2.partial_fit(X_train_partial, y_train_partial)
  print(f'After iteration #{iter}')
  print(clf2.coef_)
  print(clf2.intercept_)
  iter += 1

After iteration #1
[[ 17.30883616   8.94935066   8.67703669 -44.86769448   4.76750203
   -8.1339534    5.81022729  39.27658731   9.92539366  24.55562183]
 [ 40.03097522  -7.7488511   -0.6438107   20.36076595  17.14732982
   25.17687676   3.04335293 -15.81677537   7.97610946 -51.47513358]
 [-31.99984564  15.34895285  11.27343507  24.04851121   2.3288867
  -12.60072897 -13.55759605 -22.00399477   0.22593542   5.97665104]]
[-25.46436397 -67.14918763 -24.29929563]
After iteration #2
[[ 1.77273707e+01  1.51456220e+01 -1.33094504e+01 -3.28089535e+01
  -7.06402115e+00 -1.56310001e-01 -5.81126073e+00  2.89330167e+01
  -1.97381710e-02  1.36853513e+01]
 [ 2.21627664e+01  1.44431576e+01 -3.37304224e+00  9.05392325e+00
   3.52132611e+00  2.19125648e+00 -1.23570595e+01 -6.85052387e+00
  -6.71864314e+00 -2.65636119e+01]
 [-1.55476653e+01  1.07396712e-01 -2.89256672e+00 -2.72098995e+00
   2.73076117e+00  2.14088231e+00 -4.90161332e+00  1.68636418e+00
   7.01503202e+00  1.54683645e+01]]
[-20.40928495 

In [11]:
test_score = clf2.score(X_test, y_test)
print(test_score)

0.8794666666666666


  "X does not have valid feature names, but"


In [12]:
print(classification_report(y_test, clf2.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      0.94      0.96      2480
           1       0.80      0.88      0.84      2531
           2       0.86      0.82      0.84      2489

    accuracy                           0.88      7500
   macro avg       0.88      0.88      0.88      7500
weighted avg       0.88      0.88      0.88      7500



  "X does not have valid feature names, but"
