In [38]:
from sklearn.datasets import fetch_20newsgroups

documents_train = fetch_20newsgroups(
    subset = "train",
    categories = ["comp.graphics", "comp.windows.x", "rec.autos", "sci.space"]
)

documents_test = fetch_20newsgroups(
    subset = "test",
    categories = ["comp.graphics", "comp.windows.x", "rec.autos", "sci.space"]
)

In [39]:
len(documents_train["data"])
len(documents_test["data"])

1574

In [40]:
print(documents_train["data"][0])
print(documents_train["target"][0])
print(documents_train.target_names)

From: orourke@sophia.smith.edu (Joseph O'Rourke)
Subject: Re: Delaunay Triangulation
Organization: Smith College, Northampton, MA, US
Lines: 22

In article <lsk1v9INN93c@caspian.usc.edu> zyeh@caspian.usc.edu (zhenghao yeh) writes:
>
>Does anybody know what Delaunay Triangulation is?
>Is there any reference to it? 
>Is it useful for creating 3-D objects? If yes, what's the advantage?

There is a vast literature on Delaunay triangulations, literally
hundreds of papers.  A program is even provided with every copy of 
Mathematica nowadays.  You might look at this if you are interested in 
using it for creating 3D objects:

@article{Boissonnat5,
  author = "J.D. Boissonnat",
  title = "Geometric Structures for Three-Dimensional Shape Representation",
  journal = "ACM Transactions on Graphics",
  month = "October",
  year = {1984},
  volume = {3},
  number = {4},
  pages = {266-286}
}


0
['comp.graphics', 'comp.windows.x', 'rec.autos', 'sci.space']


## Bag of Words

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(documents_train["data"])

X_train = vectorizer.transform(documents_train["data"])
X_test = vectorizer.transform(documents_test["data"])
X_train.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [42]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

mdl = MultinomialNB()
mdl.fit(X_train, documents_train["target"])

ypred = mdl.predict(X_test)

print(classification_report(documents_test["target"], ypred))

              precision    recall  f1-score   support

           0       0.81      0.90      0.85       389
           1       0.93      0.80      0.86       395
           2       0.97      0.99      0.98       396
           3       0.95      0.97      0.96       394

    accuracy                           0.91      1574
   macro avg       0.92      0.91      0.91      1574
weighted avg       0.92      0.91      0.91      1574



## Term Frequency

In [43]:
from sklearn.feature_extraction.text import TfidfTransformer

tf_transformer = TfidfTransformer(use_idf = False)

X_train_tf = tf_transformer.fit_transform(X_train)
X_test_tf = tf_transformer.transform(X_test)

mdl.fit(X_train_tf, documents_train["target"])

ypred = mdl.predict(X_test_tf)

print(classification_report(documents_test["target"], ypred))

              precision    recall  f1-score   support

           0       0.82      0.78      0.80       389
           1       0.92      0.79      0.85       395
           2       0.85      0.98      0.91       396
           3       0.92      0.95      0.93       394

    accuracy                           0.87      1574
   macro avg       0.88      0.87      0.87      1574
weighted avg       0.88      0.87      0.87      1574



## Term Frequency Inverse Document Frequency

In [44]:
tfidf_transformer = TfidfTransformer(use_idf = True)

X_train_tfidf = tfidf_transformer.fit_transform(X_train)
X_test_tfidf = tfidf_transformer.transform(X_test)

mdl.fit(X_train_tfidf, documents_train["target"])

ypred = mdl.predict(X_test_tfidf)

print(classification_report(documents_test["target"], ypred))

              precision    recall  f1-score   support

           0       0.85      0.80      0.83       389
           1       0.91      0.84      0.87       395
           2       0.90      0.98      0.94       396
           3       0.93      0.97      0.95       394

    accuracy                           0.90      1574
   macro avg       0.90      0.90      0.90      1574
weighted avg       0.90      0.90      0.90      1574



# Deep Leaning

In [45]:
import tensorflow as tf
from tensorflow.keras import layers, Sequential

In [46]:
X_train.shape

(2364, 36608)

In [47]:
text_vectorizer = layers.TextVectorization(
    max_tokens = 10000,
    output_mode = "tf-idf"
)

text_vectorizer.adapt(documents_train["data"])
X_train = text_vectorizer(documents_train["data"])
X_test = text_vectorizer(documents_test["data"])

X_train

<tf.Tensor: shape=(2364, 10000), dtype=float32, numpy=
array([[ 47.9559   ,   0.7299776,   0.765898 , ...,   0.       ,
          0.       ,   0.       ],
       [ 37.29903  ,   0.7299776,   2.297694 , ...,   0.       ,
          0.       ,   0.       ],
       [ 15.985299 ,   3.649888 ,   1.531796 , ...,   0.       ,
          0.       ,   0.       ],
       ...,
       [ 26.642166 ,   5.1098433,   1.531796 , ...,   0.       ,
          0.       ,   0.       ],
       [ 79.9265   ,  18.979418 ,   7.65898  , ...,   0.       ,
          0.       ,   0.       ],
       [101.24023  ,   8.759731 ,   5.361286 , ...,   0.       ,
          0.       ,   0.       ]], dtype=float32)>

In [48]:
mdl = Sequential()

mdl.add(layers.Input(shape=(X_train.shape[1], )))
mdl.add(layers.Dense(128, activation = "relu"))
mdl.add(layers.Dense(128, activation = "relu"))
mdl.add(layers.Dense(4, activation = "softmax"))

mdl.compile(optimizer = "adam", loss = "sparse_categorical_crossentropy", metrics = ["accuracy"])

mdl.fit(X_train, documents_train["target"], epochs = 10)

Epoch 1/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6886 - loss: 0.9243
Epoch 2/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9927 - loss: 0.0346
Epoch 3/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 1.0000 - loss: 0.0063
Epoch 4/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 1.0000 - loss: 0.0019
Epoch 5/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 1.0000 - loss: 9.3713e-04
Epoch 6/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 1.0000 - loss: 4.7390e-04
Epoch 7/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 1.0000 - loss: 2.4962e-04
Epoch 8/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 1.0000 - loss: 2.0040e-04
Epoch 9/10
[1m74/74[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x333719c10>

In [49]:
mdl.evaluate(X_test, documents_test.target)

[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 763us/step - accuracy: 0.8950 - loss: 0.5053


[0.4895995259284973, 0.9040660858154297]

## Sequential Data for Deep Learning

In [50]:
text_vectorizer = layers.TextVectorization(
    max_tokens = 1000,
    output_mode = "int",
    output_sequence_length = 10
)

text_vectorizer.adapt(documents_train["data"])
X_train = text_vectorizer(documents_train["data"])
X_test = text_vectorizer(documents_test["data"])

X_train = tf.one_hot(X_train, depth = 1000)
X_test = tf.one_hot(X_test, depth = 1000)
X_train.shape

TensorShape([2364, 10, 1000])

In [51]:
mdl = Sequential()

mdl.add(layers.Input(shape=X_train.shape[1:]))
mdl.add(layers.LSTM(128))
mdl.add(layers.Dense(128, activation = "relu"))
mdl.add(layers.Dense(4, activation = "softmax"))

mdl.compile(optimizer = "adam", loss = "sparse_categorical_crossentropy", metrics = ["accuracy"])

mdl.fit(X_train, documents_train["target"], epochs = 10)

Epoch 1/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.3584 - loss: 1.3473
Epoch 2/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6494 - loss: 0.8918
Epoch 3/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.8057 - loss: 0.5554
Epoch 4/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.8464 - loss: 0.4274
Epoch 5/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8623 - loss: 0.3891
Epoch 6/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8839 - loss: 0.3174
Epoch 7/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8936 - loss: 0.2826
Epoch 8/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8963 - loss: 0.2786
Epoch 9/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x162e8ddd0>

In [52]:
mdl.evaluate(X_test, documents_test["target"])

[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6707 - loss: 1.0174


[1.0309085845947266, 0.6766200661659241]

## Embedding

In [58]:
text_vectorizer = layers.TextVectorization(
    max_tokens = 10000,
    output_mode = "int",
    output_sequence_length = 100
)

text_vectorizer.adapt(documents_train["data"])
X_train = text_vectorizer(documents_train["data"])
X_test = text_vectorizer(documents_test["data"])

In [59]:
mdl = Sequential()

mdl.add(layers.Embedding(input_dim=10000, output_dim=128))
mdl.add(layers.LSTM(128))
mdl.add(layers.Dense(128, activation = "relu"))
mdl.add(layers.Dense(4, activation = "softmax"))

mdl.compile(optimizer = "adam", loss = "sparse_categorical_crossentropy", metrics = ["accuracy"])

mdl.fit(X_train, documents_train["target"], epochs = 10)

Epoch 1/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 48ms/step - accuracy: 0.2999 - loss: 1.3521
Epoch 2/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - accuracy: 0.4944 - loss: 0.8975
Epoch 3/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - accuracy: 0.5591 - loss: 0.8250
Epoch 4/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 49ms/step - accuracy: 0.7080 - loss: 0.6216
Epoch 5/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 52ms/step - accuracy: 0.7587 - loss: 0.5558
Epoch 6/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 46ms/step - accuracy: 0.8293 - loss: 0.4510
Epoch 7/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 54ms/step - accuracy: 0.7945 - loss: 0.5858
Epoch 8/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 47ms/step - accuracy: 0.8449 - loss: 0.4302
Epoch 9/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x162b964d0>

In [60]:
mdl.evaluate(X_test, documents_test["target"])

[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.7654 - loss: 0.7655


[0.7629914879798889, 0.7642948031425476]