In [31]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from scipy.ndimage.interpolation import shift
from keras import optimizers
from keras.layers import Dense, Activation, Dropout
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
from keras.utils import np_utils
from scipy.sparse import csr_matrix, hstack
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import preprocessing
from sklearn.externals import joblib

In [2]:
# Load the raw dataset; later cells read its 'class' and 'name' columns.
csv_path = 'data_for_categorical.csv'
data = pd.read_csv(csv_path)

In [3]:
# Sections kept for the classifier; rows whose 'class' is not listed are dropped.
Classification = [
    "world", "politics", "sport", "football", "culture", "business",
    "lifeandstyle", "fashion", "environment", "technology", "travel",
]
in_scope = data['class'].isin(Classification)
data = data.loc[in_scope]
print(data.shape)

(396607, 2)


In [4]:
# Encode the string class labels as integer ids (id i <-> le.classes_[i]).
# `DataFrame.as_matrix` was deprecated in pandas 0.23 and removed in 1.0;
# `.values` on the column yields the same 1-D label array on every version.
labels = data['class'].values
le = preprocessing.LabelEncoder()
Y = le.fit_transform(labels)
num_classes = len(le.classes_)
Y

array([10, 10, 10, ...,  9,  9,  9])

In [5]:
# One-hot encode the integer class ids into a (n_samples, num_classes) matrix.
# The ids are re-derived from the fitted encoder instead of reading `Y` itself,
# so re-running this cell cannot one-hot encode an already one-hot matrix.
Y = np_utils.to_categorical(le.transform(data['class'].values), num_classes)
Y

array([[ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.]])

In [6]:
# Turn each title in the 'name' column into a TF-IDF vector, keeping at most
# 7000 vocabulary terms.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so vocabulary and IDF weights also see the held-out titles.
titles = data['name'].tolist()
vectorizer = TfidfVectorizer(max_features=7000)
X = vectorizer.fit_transform(titles)
X

<396607x7000 sparse matrix of type '<class 'numpy.float64'>'
	with 3219916 stored elements in Compressed Sparse Row format>

In [7]:
# 80/20 train/test split. `random_state` pins the shuffle so the split (and the
# metrics reported from it) is reproducible across kernel restarts; giving
# `test_size` explicitly keeps the two sizes complementary on every sklearn version.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, train_size=0.8, test_size=0.2, random_state=42)
# Keras needs dense input. Cast to float32 (Keras' default float type) while the
# matrices are still sparse so the dense copies take half the memory of float64.
X_train = X_train.astype('float32').toarray()
X_test = X_test.astype('float32').toarray()

In [24]:
# Fully connected classifier over the TF-IDF features:
# 1000 -> 100 -> 200 -> 300 -> 500 ReLU units, each followed by 50% dropout,
# then a softmax over the classes.
model = Sequential()
# Only the first layer of a Sequential model needs `input_shape`; the later
# layers infer their input size from the previous layer, so the copies that
# were pasted onto every hidden layer have been dropped.
model.add(Dense(1000, activation='relu', input_shape=(X.shape[1],)))
model.add(Dropout(0.5))
for units in (100, 200, 300, 500):
    model.add(Dense(units, activation='relu'))
    model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

RMS = optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0001)
# categorical_crossentropy matches the one-hot encoded targets in `Y`.
model.compile(optimizer=RMS,
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_47 (Dense)             (None, 1000)              7001000   
_________________________________________________________________
dropout_40 (Dropout)         (None, 1000)              0         
_________________________________________________________________
dense_48 (Dense)             (None, 100)               100100    
_________________________________________________________________
dropout_41 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_49 (Dense)             (None, 200)               20200     
_________________________________________________________________
dropout_42 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_50 (Dense)             (None, 300)               60300     
__________

In [None]:
# Train for 20 epochs in batches of 10000 samples.
# NOTE(review): the X_test/y_test split doubles as the validation set here.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=10000,
    epochs=20,
    validation_data=(X_test, y_test),
)

In [27]:
# Per-class precision / recall / F1 on the held-out split.
# Both the predicted probabilities and the one-hot targets are reduced to
# class ids by taking the index of the largest entry in each row.
class_names = list(le.classes_)
probabilities = model.predict(X_test)
pred = probabilities.argmax(axis=1)
true = y_test.argmax(axis=1)
report = classification_report(true, pred, target_names=class_names)
print(report)

              precision    recall  f1-score   support

    business       0.81      0.83      0.82      8645
     culture       0.66      0.60      0.63      1558
 environment       0.72      0.72      0.72      4430
     fashion       0.65      0.63      0.64      1164
    football       0.93      0.94      0.93     14764
lifeandstyle       0.75      0.80      0.77      7442
    politics       0.81      0.80      0.81      5960
       sport       0.93      0.91      0.92     15198
  technology       0.77      0.82      0.79      4248
      travel       0.79      0.74      0.76      1671
       world       0.86      0.82      0.84     14242

 avg / total       0.85      0.84      0.84     79322



In [28]:
# Persist the trained network (architecture, weights, optimizer state) as HDF5.
model_path = 'news_title_cls85.h5'
model.save(model_path)

In [32]:
# Persist the fitted TF-IDF vectorizer so new titles can be transformed with
# the same vocabulary the model was trained on.
vectorizer_path = 'tfidf_vectorizer.pkl'
joblib.dump(vectorizer, vectorizer_path)

['tfidf_vectorizer.pkl']

In [33]:
# To restore the fitted vectorizer in a later session (only unpickle files you trust):
# joblib.load('tfidf_vectorizer.pkl')

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=7000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)