In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report
import numpy as np

from os import listdir
from os.path import isfile, join, isdir



In [2]:
from tensorflow.keras import layers
from tensorflow.keras import models
import tensorflow as tf
from tensorflow.keras import activations
from tensorflow.keras import utils

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Transform your data into matrices

## 1. Define the classes 

First we want to define the classes into which we want to classify the data. In the folder 'bbc' you will find one folder per category. Each folder holds a list of .txt files, each containing one text that belongs to that category.

Can you find a way to read the different categories automatically from the 'bbc' folder?

In [3]:
path_origin = 'bbc/'

# Each sub-directory of the data folder names one category; plain files are ignored.
categories = list(
    filter(lambda entry: isdir(join(path_origin, entry)), listdir(path_origin))
)

#Don't change this.
print(categories)
assert sorted(categories) == ['business', 'entertainment', 'politics', 'sport', 'tech']

['entertainment', 'business', 'sport', 'politics', 'tech']


## 2. Read the data into x and y

Construct two lists x and y. 
- x has to be a list of strings where 1 string is the data within 1 .txt file and we want to read all the subfolders within /bbc/.
- y has the same length as x and holds the corresponding label for each text in x, so the text at position 50 in x has the label at position 50 in y.

Tip: sometimes there may be errors when trying to read a file. You don't have to focus on solving this problem. Just use try: ... except: to skip the files that pose problems. Be sure to only add a label to y when you have actually managed to read a file and add it to X.

In [4]:
# Read every text file of every category into X and record its category in y.
# X and y are only ever appended to together, so they stay the same length.
X = []
y = []
for cat in categories:
    mypath = path_origin+cat+'/'
    print(mypath)
    files = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    for file in files:
        try:
            # The context manager closes the handle even when read() fails.
            with open(join(mypath, file), "r") as handle:
                string = handle.read()
        except (OSError, UnicodeDecodeError):
            # Unreadable or undecodable file: skip it without adding a label.
            continue
        # Empty files carry no information for the classifier, so they are skipped.
        if len(string)>0:
            X.append(string)
            y.append(cat)

#Don't change this.
print(len(X), len(y))
assert len(X) == len(y)

bbc/entertainment/
bbc/business/
bbc/sport/
bbc/politics/
bbc/tech/
2224 2224


## 3. Split your data

We need to make a separate test and training dataset. We use the train set to train a model. Then we use the test set to check how the model performs on data it has not seen before.

For this you can use the train_test_split() of sklearn. Please look up how it works. We want to have 25% of the data of X in the test set.

In [6]:
# Hold out 25% of the documents for testing; the fixed seed makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

#X_train, X_test, y_train, y_test = train_test_split(...)

Now that this is done, we want to make sure all categories are well represented in both datasets. Please run the cell below and check how many samples of each category we have within each dataset. Do you think all categories are sufficiently represented?

In [7]:
#You don't have to change this, you just have to run it.
# One row per label list (all data, train, test); one count per category, in `categories` order.
for labels in (y, y_train, y_test):
    print([labels.count(cat) for cat in categories])

[386, 510, 510, 417, 401]
[283, 372, 399, 318, 296]
[103, 138, 111, 99, 105]


## 4. TF-IDF

As a next step we want to transform our data into a matrix using TF-IDF. We will use sklearn's predefined TfidfVectorizer class. Please have a look at the different arguments you can give it. Are there any arguments that might be interesting to add? Please make sure you leave max_features=num_features.

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [10]:
# Vocabulary size: the vectorizer keeps at most this many terms, which is also
# the number of input features the network receives.
num_features = 10_000
# Number of target classes; needed later for the one-hot encoding and the output layer.
num_cat = len(categories)

#Play with this
tfidf = TfidfVectorizer(max_features=num_features)

#Don't change this, but understand what is happening.
def tfidf_features(txt, flag):
    """Turn raw texts into a TF-IDF matrix using the module-level `tfidf` vectorizer.

    With flag == "train" the vectorizer is fitted on `txt` before transforming it;
    with any other flag the already-fitted vectorizer is only applied to `txt`.
    """
    if flag != "train":
        return tfidf.transform(txt)
    return tfidf.fit_transform(txt)


# Fit on the training texts only, then reuse the fitted vectorizer for the test texts.
X_trans_train = tfidf_features(X_train, "train")
X_trans_test = tfidf_features(X_test, "test")

print(X_trans_train.shape, X_trans_test.shape)

(1668, 10000) (556, 10000)


## 5. Transform y into a matrix

The y vector is also made of strings, but remember, computers don't understand words. We have to attach a number to each class and then perform a one-hot encoding. See the image below to understand one-hot encoding, or read more at the following link:

https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/#:~:text=A%20one%20hot%20encoding%20is,is%20marked%20with%20a%201.

<img src="assets/1hot.png" width=1000 height=1000 />

In [11]:
#You don't have to change any code in this cell. Please understand what is happening.
# Step 1: map every category name to an integer id.
lb = LabelEncoder().fit(categories)
y_trans_train, y_trans_test = lb.transform(y_train), lb.transform(y_test)

# Step 2: expand each integer id into a one-hot row with num_cat columns.
y_hot_train = utils.to_categorical(y_trans_train, num_cat)
y_hot_test = utils.to_categorical(y_trans_test, num_cat)

print(y_trans_train.shape, y_trans_test.shape)

(1668,) (556,)


# Define your model

We have finished preparing our input and output matrices. Now we have to indicate how we want to build the model. Please fill in the gaps and add as many hidden layers as you think necessary.

In [None]:
# Template network: TF-IDF vector in, one probability per category out.
model = tf.keras.Sequential()
#define input layer
model.add(layers.Dense(1000, input_shape=(num_features,)))


#Add hidden layers here

#define output layer
# Dense and Activation are only reachable through the imported `layers` module;
# the bare names are never imported in this notebook.
model.add(layers.Dense(num_cat))
model.add(layers.Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [16]:
# Fully connected classifier: three ReLU hidden layers (1000 -> 500 -> 50 units)
# followed by a softmax output with one unit per category.
model = tf.keras.Sequential()
#define input layer
model.add(layers.Dense(1000, input_shape=(num_features,)))
model.add(layers.Activation(activations.relu))
#model.add(layers.Dropout(0.5))

model.add(layers.Dense(500))
model.add(layers.Activation(activations.relu))
#model.add(layers.Dropout(0.5))

model.add(layers.Dense(50))
model.add(layers.Activation(activations.relu))
#model.add(layers.Dropout(0.5))

model.add(layers.Dense(num_cat))
model.add(layers.Activation(activations.softmax))

# categorical_crossentropy is paired with the one-hot encoded targets fed to fit().
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 1000)              10001000  
_________________________________________________________________
activation_4 (Activation)    (None, 1000)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 500)               500500    
_________________________________________________________________
activation_5 (Activation)    (None, 500)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 50)                25050     
_________________________________________________________________
activation_6 (Activation)    (None, 50)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 5)                

Now that we have defined the model, we will train it by feeding it the input and output matrices we have prepared.

In [17]:
# Model Training
nb_epochs = 1
batch_size = 64

# Keep the object returned by fit() in `history` for later inspection.
history = model.fit(
    X_trans_train,
    y_hot_train,
    epochs=nb_epochs,
    batch_size=batch_size,
    verbose=1,
)



Now let the model make predictions based on:
1. the input data it has already seen before
2. the new test data

In [18]:
# Sequential.predict_classes() no longer exists in current tf.keras releases.
# Taking the argmax over the softmax outputs of predict() yields the same integer
# class ids, comparable with y_trans_train / y_trans_test.
y_train_predclass = np.argmax(model.predict(X_trans_train, batch_size=batch_size), axis=-1)
y_test_predclass = np.argmax(model.predict(X_trans_test, batch_size=batch_size), axis=-1)

Compare the model's performance on the two datasets.

In [19]:
#Model Prediction
# Compare the integer-encoded true labels with the predicted class ids on both splits.
# The headings start with an escaped newline ("\n"); the original literals had lost
# their backslashes and printed a stray "n" instead.

print("\n\nDeep Neural Network - Train accuracy:",round(accuracy_score( y_trans_train, y_train_predclass),3))

print("\nDeep Neural Network - Test accuracy:",round(accuracy_score( y_trans_test,y_test_predclass),3))

print("\nDeep Neural Network - Train Classification Report")

print (classification_report(y_trans_train,y_train_predclass))

print("\nDeep Neural Network - Test Classification Report")

print (classification_report(y_trans_test,y_test_predclass))

nnDeep Neural Network - Train accuracy: 0.993
nDeep Neural Network - Test accuracy: 0.96
nDeep Neural Network - Train Classification Report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       372
           1       0.99      1.00      0.99       283
           2       1.00      0.99      0.99       318
           3       1.00      1.00      1.00       399
           4       0.98      0.99      0.99       296

    accuracy                           0.99      1668
   macro avg       0.99      0.99      0.99      1668
weighted avg       0.99      0.99      0.99      1668

nDeep Neural Network - Test Classification Report
              precision    recall  f1-score   support

           0       0.93      0.96      0.95       138
           1       0.96      0.98      0.97       103
           2       0.99      0.91      0.95        99
           3       1.00      0.98      0.99       111
           4       0.94      0.96      0.95       