In [2]:
import keras

In [3]:
import pandas as pd
import json
import numpy as np

Extracting the labels and the contents out of our JSON file.

In [4]:
data = []
with open('News-Classification-DataSet.json') as f:
    for line in f:
        data.append(json.loads(line))
df=pd.DataFrame(columns=["content","label"])
for i in range(0,len(data)):
    content=data[i]['content']
    label=data[i]['annotation']['label'][0]
    df=df.append({'content':content,'label':label},ignore_index=True)

In [5]:
import string,nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

Preprocessing the data by converting each word to lower case, removing stop words, removing the punctuation marks, lemmatizing and stemming.

Stop words are the basic English words not relevant for classification.

Lemmatization considers the context and converts the word to its meaningful base form.

Stemming is the process of  reducing words to their stem, base or the root form. Stemmers use an algorithmic approach of removing prefixes and suffixes. The result might not be an actual dictionary word. 

In [6]:
df['content'] = df['content'].apply(lambda x: " ".join(x.lower() for x in nltk.word_tokenize(x)))
stop = stopwords.words('english')
df['content'] = df['content'].apply(lambda x: " ".join( x for x in nltk.word_tokenize(x) if x not in stop))
df['content'] = df['content'].apply(lambda x: " ".join(x for x in nltk.word_tokenize(x) if x not in string.punctuation))
lm = WordNetLemmatizer()
df['content'] = df['content'].apply(lambda x: " ".join([lm.lemmatize(word,"v") for word in x.split()]))
stem = nltk.stem.SnowballStemmer('english')
df['content'] = df['content'].apply(lambda x: " ".join([stem.stem(word) for word in x.split()]))

Now ecoding the labels into numeric characters i.e. 0,1,2,3 here corresponding to the 4 labels we have, which our predictive models can better understand. 

In [7]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
labels=le.fit_transform(df["label"]) 

Applying TF-IDF transformation to our content.

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf= TfidfVectorizer(analyzer="word")
content=tfidf.fit_transform(df['content'])

Using 10 fold cross validation. Stratification is a technique where we rearrange the data in a way that each fold has a good representation of the whole dataset. This approach ensures that one class of data is not overrepresented especially when the target variable is unbalanced. So it helps reduce both bias and variance. 

In [9]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

In [10]:
from keras import models
from keras import layers
from keras.layers import Dense
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.utils.np_utils import to_categorical

In [11]:
cvscores_test=[] #stores the cross-validation scores

for train, test in kfold.split(content,labels):
    
    model = Sequential()
    ''' Sequential layers are stacked such that every layer passes its output to the next 
    layer without you specifying extra information '''
    
    '''layers.dense is a neural network layer that is fully connected. 
    It takes in an activation function and also the dimension of the output layer. '''
    
    model.add(layers.Dense(16, activation='relu', input_dim=content[train,:].shape[1]))
    model.add(layers.Dense(4, activation='softmax'))
    model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
    
    #Reserving 1000 examples for validation
    
    x_val = content[train[:1000],:]
    partial_x_train = content[train[1000:],:]
    
    y_val = to_categorical(labels[train][:1000])
    partial_y_train = to_categorical(labels[train][1000:])

    es=EarlyStopping(monitor='val_loss', min_delta=0.01, patience=2, mode='auto')
    #EarlyStopping stops training when a monitored quantity has stopped improving. 
     
    history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=100, 
                    batch_size=200,
                    validation_data=(x_val, y_val),
                    callbacks=[es]) 
    
    #The 100 epochs mentioned above is just the maximum limit. EarlyStopping will actually stop before that whenever necessary.
        
    scores = model.evaluate(content[test,:],to_categorical(labels[test]))

    cvscores_test.append(scores[1] * 100)  

Train on 5840 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Train on 5840 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Train on 5840 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Train on 5840 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100


Train on 5840 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Train on 5840 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Train on 5840 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Train on 5840 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100


Epoch 12/100
Epoch 13/100
Epoch 14/100
Train on 5840 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Train on 5840 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100


In [12]:
cvscores_test

[93.15789473684211,
 93.84868421052632,
 93.8157894736842,
 93.88157888462669,
 92.9276316416891,
 93.25657901011016,
 93.45394736842105,
 93.6513158522154,
 93.02631578947368,
 93.05921046357406]

In [13]:
np.mean(cvscores_test),np.std(cvscores_test)

(93.40789474311627, 0.3506646742720511)

So the test data achieved an average accuracy of more than 93% 

In [14]:
predictions=model.predict_classes(tfidf.fit_transform(df['content']))
pred=le.inverse_transform(predictions)
df["Predicted_Label"]=pred
df.to_csv('output.csv')