In [1]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from scipy.ndimage.interpolation import shift
from keras import optimizers
from keras.layers import Dense, Activation, Dropout, LSTM
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
from keras.utils import np_utils
from scipy.sparse import csr_matrix, hstack
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import preprocessing
from sklearn.externals import joblib
from sklearn.model_selection import cross_val_predict, cross_val_score, StratifiedKFold, KFold
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error

Using TensorFlow backend.


In [2]:
# Labelled dataset; later cells read its 'class' and 'name' columns.
DATA_CSV = 'data_for_categorical.csv'
data = pd.read_csv(DATA_CSV)

In [3]:
# Section labels kept as classification targets; rows whose 'class'
# is not in this list are dropped.
Classification = [
    "world",
    "politics",
    "sport",
    "football",
    "culture",
    "business",
    "lifeandstyle",
    "fashion",
    "environment",
    "technology",
    "travel",
]
in_scope = data['class'].isin(Classification)
data = data.loc[in_scope]
print(data.shape)

(396607, 2)


In [4]:
# Encode the string labels in the 'class' column as integer class ids.
# `DataFrame.as_matrix` was deprecated in pandas 0.23 and removed in 1.0;
# taking `.values` of the column gives the same 1-D array on old and new
# pandas alike, without the extra reshape.
Y = data['class'].values
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)
# le.classes_[i] is the original label that was encoded as id i.
num_classes = len(le.classes_)
Y

array([10, 10, 10, ...,  9,  9,  9])

In [5]:
# One-hot encode the integer class ids (one column per class) to match the
# softmax / categorical_crossentropy model.
# This cell rebinds Y, so it is guarded on ndim: re-running it must not
# one-hot encode a matrix that is already one-hot.
if Y.ndim == 1:
    Y = np_utils.to_categorical(Y, num_classes)
Y

array([[ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.]])

In [6]:
# TF-IDF features over word 1- to 3-grams of the 'name' column; min_df=55
# discards n-grams that appear in fewer than 55 documents.
documents = data['name'].tolist()
vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=55)
X = vectorizer.fit_transform(documents)
X

<396607x13135 sparse matrix of type '<class 'numpy.float64'>'
	with 4157578 stored elements in Compressed Sparse Row format>

In [7]:
def create_model(input_dim=None, n_classes=None):
    """Build and compile the single-hidden-layer softmax classifier.

    Architecture: Dense(13, tanh) -> Dropout(0.5) -> Dense(n_classes, softmax),
    compiled with the RMSprop optimizer and categorical cross-entropy loss,
    tracking accuracy.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, falls back to ``X.shape[1]``
        of the notebook-level feature matrix ``X``.
    n_classes : int, optional
        Width of the softmax output layer. When omitted, falls back to the
        notebook-level ``num_classes``.

    Returns
    -------
    The compiled ``Sequential`` model. Its layer summary is printed as a
    side effect.
    """
    # Fall back to the notebook globals so existing `create_model()` calls
    # keep working unchanged.
    if input_dim is None:
        input_dim = X.shape[1]
    if n_classes is None:
        n_classes = num_classes

    model = Sequential()
    model.add(Dense(13, activation='tanh', input_shape=(input_dim,),
                    bias_initializer='RandomNormal'))
    model.add(Dropout(0.5))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(optimizer='RMSprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.summary()
    return model

In [8]:
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test,
                             epochs=6, batch_size=10000):
    """Fit ``model`` on the training split and report hold-out performance.

    Prints the hold-out accuracy and a per-class classification report whose
    row names come from the notebook-level label encoder ``le``.

    Parameters
    ----------
    model : compiled model exposing ``fit``, ``evaluate`` and ``predict``.
    X_train, y_train : training features and one-hot targets.
    X_test, y_test : hold-out features and one-hot targets.
    epochs : int, default 6
        Number of training epochs passed to ``model.fit``.
    batch_size : int, default 10000
        Batch size passed to ``model.fit``.

    Returns
    -------
    The second value returned by ``model.evaluate`` (the accuracy for models
    built by ``create_model``, which compiles with ``metrics=['accuracy']``).
    """
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)

    # evaluate() returns the loss first, then the compiled metrics.
    score = model.evaluate(X_test, y_test)[1]
    print('\n valid acc :%.4f'%(score))

    # Collapse probabilities / one-hot rows back to integer class ids.
    pred = np.argmax(model.predict(X_test), axis=1)
    true = np.argmax(y_test, axis=1)

    # Pass `labels` explicitly so every name in `target_names` lines up with
    # its class id even when a class is missing from this split's
    # true/predicted labels.
    class_names = list(le.classes_)
    print(classification_report(true, pred,
                                labels=np.arange(len(class_names)),
                                target_names=class_names))
    return score

In [9]:
# X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.8)
# X_train= X_train.toarray()
# X_test = X_test.toarray()

# Densify the sparse TF-IDF matrix before it is fed to the model.
# - The hasattr guard keeps this cell re-runnable: after the first run X is
#   a NumPy array, which has no .toarray().
# - The vectorizer output is float64; casting to float32 (Keras' default
#   float type) before densifying halves the memory of the dense array.
if hasattr(X, 'toarray'):
    X = X.astype(np.float32).toarray()

In [10]:
# Build and compile the classifier; create_model() also prints the layer summary.
model = create_model()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 13)                170768    
_________________________________________________________________
dropout_1 (Dropout)          (None, 13)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 11)                154       
Total params: 170,922
Trainable params: 170,922
Non-trainable params: 0
_________________________________________________________________


In [13]:
# --- Disabled hold-out run (needs the train/test split that is commented out) ---
# history = model.fit(X_train, y_train,
#                     epochs=5,
#                     validation_data = (X_test, y_test),
#                     batch_size=8000)
# pred = np.argmax(model.predict(X_test), axis=1)
# true = np.argmax(y_test, axis=1)
# print(classification_report(true, pred, target_names=list(le.classes_)))
# Figures noted from earlier runs (presumably accuracies -- unverified): 0.5092, 0.5889

# Final fit on the complete dataset; no validation data is passed.
fit_options = dict(epochs=5, batch_size=8000)
history = model.fit(X, Y, **fit_options)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Disabled 5-fold cross-validation experiment, kept for reference.
# NOTE(review): X is already converted to a dense array in an earlier cell, so
# the `X = X.toarray()` line below would fail as written if this were re-enabled.
# kf = KFold(n_splits=5, shuffle=True)
# acc = []
# X = X.toarray()
# for train_idx, test_idx in kf.split(X):
#     print ("Running Fold")
#     model = create_model()
#     acc.append(train_and_evaluate_model(model, X[train_idx], Y[train_idx], X[test_idx], Y[test_idx]))
#     del model
# print('mean acc:%.4f'%(np.mean(acc)))

In [12]:
# Persist the trained network and the fitted TF-IDF vectorizer.
# NOTE(review): the label encoder `le` is not saved in this cell, so a
# reloaded model yields class indices only -- confirm that is intended.
# NOTE(review): this cell's execution count (12) precedes the fit cell's (13);
# re-run top to bottom before trusting the saved file.
MODEL_PATH = 'news_title_cls85.h5'
VECTORIZER_PATH = 'tfidf_vectorizer.pkl'
model.save(MODEL_PATH)
joblib.dump(vectorizer, VECTORIZER_PATH)

['tfidf_vectorizer.pkl']