In [53]:
import numpy as np
import pandas as pd 
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from tensorflow.keras import models,layers

In [2]:
# Load the raw dataset from a CSV in the working directory; the cells below
# read its 'category' and 'text' columns.
data = pd.read_csv(filepath_or_buffer="bbc-text.csv")

In [3]:
# Preview the first five rows to sanity-check the load.
data.head(n=5)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [4]:
# Class balance: number of rows per category label.
data.loc[:, 'category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [34]:
# Features are the raw text column; targets are the category labels.
X, y = data['text'], data['category']

In [35]:
# Train/test split: hold out 20% of the samples for testing; the fixed
# random_state seeds the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [38]:
# Vocabulary cap handed to the tokenizer as num_words; the model cell also
# uses it as the input width.
max_words = 1000

# Word-level tokenizer (char_level=False).
tokenize = keras.preprocessing.text.Tokenizer(
    num_words=max_words,
    char_level=False,
)

In [39]:
# Build the vocabulary from the training texts only, so the test split does
# not influence which words are kept.
tokenize.fit_on_texts(X_train)

# Encode both splits as bag-of-words matrices. 'binary' is the default mode
# of texts_to_matrix; it is spelled out here for readability.
x_train, x_test = (
    tokenize.texts_to_matrix(texts, mode='binary')
    for texts in (X_train, X_test)
)

In [40]:
# Display the encoded training matrix returned by tokenize.texts_to_matrix.
x_train

array([[0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 1., 0.]])

In [41]:
# Map category strings to integer ids. The encoder is fitted on the training
# labels only and then applied to both splits.
encoder = LabelEncoder().fit(y_train)
y_train, y_test = encoder.transform(y_train), encoder.transform(y_test)

In [42]:
# Display the integer-encoded test labels produced by encoder.transform.
y_test

array([1, 4, 3, 3, 3, 0, 0, 4, 1, 4, 0, 0, 4, 1, 1, 0, 4, 0, 2, 3, 1, 2,
       3, 3, 1, 3, 4, 3, 0, 2, 0, 0, 4, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0, 4,
       4, 0, 4, 0, 2, 0, 1, 4, 2, 1, 0, 0, 0, 0, 1, 2, 1, 0, 3, 0, 3, 4,
       2, 0, 1, 0, 1, 2, 4, 4, 0, 3, 3, 0, 2, 2, 2, 1, 0, 0, 3, 1, 3, 1,
       3, 1, 4, 0, 0, 0, 1, 4, 3, 2, 0, 1, 2, 3, 3, 3, 1, 1, 4, 0, 1, 3,
       4, 1, 1, 3, 0, 3, 3, 2, 3, 1, 4, 0, 3, 4, 3, 3, 2, 4, 0, 2, 2, 4,
       3, 3, 1, 2, 2, 1, 2, 2, 2, 4, 4, 4, 2, 0, 2, 0, 0, 0, 0, 0, 0, 3,
       4, 2, 2, 0, 0, 4, 3, 4, 4, 1, 3, 1, 3, 2, 4, 4, 0, 2, 3, 3, 4, 0,
       0, 2, 4, 0, 2, 2, 4, 1, 0, 2, 3, 2, 0, 3, 2, 0, 4, 2, 2, 0, 1, 2,
       2, 1, 2, 0, 3, 1, 2, 4, 1, 4, 2, 0, 4, 2, 4, 1, 4, 2, 3, 3, 0, 4,
       1, 1, 3, 2, 3, 3, 0, 4, 3, 2, 0, 3, 1, 4, 4, 2, 0, 3, 4, 3, 1, 1,
       3, 2, 2, 0, 2, 1, 0, 4, 4, 0, 4, 3, 1, 0, 4, 3, 1, 1, 2, 3, 0, 3,
       3, 4, 2, 3, 0, 1, 2, 3, 0, 2, 3, 3, 4, 2, 2, 0, 2, 3, 2, 4, 2, 1,
       4, 2, 3, 4, 3, 1, 4, 2, 3, 3, 3, 2, 3, 3, 1,

In [43]:
# One-hot encode the integer class ids for use with categorical cross-entropy.
# The class count is taken from the largest id in the training labels.
# NOTE: this cell overwrites y_train/y_test, so re-running it on labels that
# are already one-hot would recompute num_classes from 0/1 values.
num_classes = y_train.max() + 1
y_train, y_test = (
    keras.utils.to_categorical(labels, num_classes)
    for labels in (y_train, y_test)
)

In [45]:
# Report the shapes of the feature matrices and the one-hot label arrays.
for _name, _arr in (
    ('x_train', x_train),
    ('x_test', x_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(f'{_name} shape:', _arr.shape)

x_train shape: (1780, 1000)
x_test shape: (445, 1000)
y_train shape: (1780, 5)
y_test shape: (445, 5)


In [60]:
# Feed-forward classifier over the bag-of-words vectors:
#   input (max_words) -> Dense(1024, relu) -> Dense(212, relu) -> softmax(num_classes)
model = models.Sequential()

# The first hidden layer needs its own non-linearity. Without it, this layer
# and the next Dense compose into a single affine map and the 1024 units add
# no representational power.
model.add(layers.Dense(1024, activation='relu', input_shape=(max_words,)))

# Second hidden layer. ReLU is applied here via `activation`, so no separate
# Activation('relu') layer is needed afterwards (relu(relu(x)) == relu(x)).
model.add(layers.Dense(212, activation='relu'))

# Output layer: one unit per class; softmax yields a probability distribution.
model.add(layers.Dense(num_classes, activation='softmax'))

# The labels are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [64]:
# Training hyperparameters.
# batch_size and epochs are passed to model.fit / model.evaluate below.
# NOTE(review): drop_ratio is not referenced by any cell visible here (the
# model has no Dropout layer) -- confirm whether it is still needed.
batch_size, epochs, drop_ratio = 32, 4, 0.5

In [65]:
# Train the classifier, holding out 10% of the training data for validation.
# The returned object is kept in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1,
)

Train on 1602 samples, validate on 178 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [71]:
# Evaluate on the held-out test split; the result is stored in `score`.
score = model.evaluate(
    x=x_test,
    y=y_test,
    batch_size=batch_size,
)


