In [1]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from scipy.ndimage.interpolation import shift
from keras import optimizers
from keras.layers import Dense, Activation, Dropout, LSTM, Merge, Input, Embedding
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
from keras.utils import np_utils
from scipy.sparse import csr_matrix, hstack
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import preprocessing
from sklearn.externals import joblib
from sklearn.model_selection import cross_val_predict, cross_val_score, StratifiedKFold, KFold
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error

Using TensorFlow backend.


In [2]:
data = pd.read_csv('../data_for_categorical.csv')

In [3]:
Classification = ["world","politics","sport","football","culture","business",
                  "lifeandstyle", "fashion","environment","technology","travel"]
data = data.loc[data['class'].isin(Classification)]
print(data.shape)

(396607, 2)


In [4]:
Y = data.as_matrix(columns=['class']).reshape(-1)
le = preprocessing.LabelEncoder()
le.fit(Y)
num_classes = len(list(le.classes_))
Y = le.transform(Y)
Y

array([10, 10, 10, ...,  9,  9,  9])

In [5]:
Y = np_utils.to_categorical(Y, num_classes)
Y

array([[ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.]])

In [6]:
class_weight = {}
weight = np.sum(Y, axis=0)
total = Y.shape[0]
for i in range(11):
    class_weight[i] = weight[i]/total
class_weight

{0: 0.10709089854692429,
 1: 0.019916441212585758,
 2: 0.055447836271170205,
 3: 0.014258447279044495,
 4: 0.1847572029742289,
 5: 0.093505661776015048,
 6: 0.075989581626143762,
 7: 0.19269705274995147,
 8: 0.054696462745236471,
 9: 0.020781277183710829,
 10: 0.18085913763498879}

In [7]:
vectorizer = TfidfVectorizer(min_df=60, ngram_range=(1, 3))
X = vectorizer.fit_transform(data['name'].tolist())
X

<396607x12097 sparse matrix of type '<class 'numpy.float64'>'
	with 4098415 stored elements in Compressed Sparse Row format>

In [8]:
# def create_branch():
#     first_model = Sequential()
#     first_model.add(Dense(12, activation='relu', input_shape=(X.shape[1],), bias_initializer='RandomNormal'))
#     first_model.add(Dropout(0.5))
#     first_model.add(Dense(100, activation='relu', input_shape=(X.shape[1],), bias_initializer='RandomNormal'))
#     first_model.add(Dropout(0.5))
#     first_model.add(Dense(100, activation='relu', input_shape=(X.shape[1],), bias_initializer='RandomNormal'))
#     first_model.add(Dropout(0.5))
#     return first_model

In [20]:
def create_model():
    model = Sequential()
    model.add(Dense(14, activation='relu', input_shape=(X.shape[1],), bias_initializer='RandomNormal'))
    model.add(Dropout(0.5))
    model.add(Dense(14, activation='relu', input_shape=(X.shape[1],), bias_initializer='RandomNormal'))
    model.add(Dropout(0.5))
    model.add(Dense(14, activation='relu', input_shape=(X.shape[1],), bias_initializer='RandomNormal'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='RMSprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.summary()
    return model

In [10]:
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    history = model.fit(X_train, y_train, 
                        epochs=6,
                        batch_size=10000)
    score = model.evaluate(X_test, y_test)[1]
    print('\n valid acc :%.4f'%(score))
    pred = np.argmax(model.predict(X_test), axis=1)
    true = np.argmax(y_test, axis=1)
    print(classification_report(true, pred, target_names=list(le.classes_)))
    return score

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.8)
X_train= X_train.toarray()
X_test = X_test.toarray()
#X =X.toarray()

In [21]:
model = create_model()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 14)                169372    
_________________________________________________________________
dropout_9 (Dropout)          (None, 14)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 14)                210       
_________________________________________________________________
dropout_10 (Dropout)         (None, 14)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 14)                210       
_________________________________________________________________
dropout_11 (Dropout)         (None, 14)                0         
_________________________________________________________________
dense_15 (Dense)             (None, 11)                165       
Total para

In [None]:
history = model.fit(X_train, y_train, 
                    epochs=100,
                    validation_data = (X_test, y_test),
                    class_weight = class_weight,
                    batch_size=8000)

# 0.5092
#0.5889
# history = model.fit([X,X], Y, 
#                     epochs=15,
#                     batch_size=8000)
# pred = np.argmax(model.predict(X_test), axis=1)
# true = np.argmax(y_test, axis=1)
# print(classification_report(true, pred, target_names=list(le.classes_)))

Train on 317285 samples, validate on 79322 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
 16000/317285 [>.............................] - ETA: 9s - loss: 0.1605 - acc: 0.4508

In [29]:
pred = np.argmax(model.predict(X_test ), axis=1)
true = np.argmax(y_test, axis=1)
print(classification_report(true, pred, target_names=list(le.classes_)))

              precision    recall  f1-score   support

    business       0.79      0.80      0.79      8442
     culture       0.92      0.32      0.48      1609
 environment       0.83      0.57      0.68      4470
     fashion       0.94      0.16      0.27      1161
    football       0.91      0.92      0.92     14695
lifeandstyle       0.70      0.79      0.75      7419
    politics       0.86      0.74      0.79      6023
       sport       0.85      0.92      0.88     15203
  technology       0.87      0.69      0.77      4313
      travel       0.92      0.41      0.57      1655
       world       0.71      0.89      0.79     14332

 avg / total       0.82      0.81      0.80     79322



In [13]:
# kf = KFold(n_splits=5, shuffle=True)
# acc = []
# X = X.toarray()
# for train_idx, test_idx in kf.split(X):
#     print ("Running Fold")
#     model = create_model()
#     acc.append(train_and_evaluate_model(model, X[train_idx], Y[train_idx], X[test_idx], Y[test_idx]))
#     del model
# print('mean acc:%.4f'%(np.mean(acc)))

In [14]:
# model.save('news_title_cls85.h5')
# joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')