# Importing packages

In [1]:
import json
import pandas as pd
import numpy as np

### Sk-learn 

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score ,f1_score ,recall_score, precision_score, precision_recall_curve

### Keras 

In [3]:
from keras.utils import to_categorical
from keras import models
from keras import layers

from keras import losses
from keras import metrics

from keras.utils import to_categorical
from keras import optimizers

from keras.wrappers.scikit_learn import KerasClassifier

Using TensorFlow backend.


### Tensorflow 

In [4]:
import tensorflow as tf
tf.enable_eager_execution
from tensorflow import set_random_seed
set_random_seed(2)

# Parsing the data

### Loading the data 

In [5]:
data_all = []
f = open("/home/arolive/Documents/CMI/MSC_2nd_sem/Machine_learning/My_work/Assignment_3/Data/News-Classification-DataSet.json","r")
for line in f:
     data_all.append(json.loads(line.strip()))

### Each array of data is a dictionary of each news items consisting  'content', 'annotation', 'extras', 'metadata'

In [6]:
print(data_all[0].keys())
print(data_all[0]["annotation"].keys())

dict_keys(['content', 'annotation', 'extras', 'metadata'])
dict_keys(['notes', 'label'])


### From each of the list we need contents and annotation ~> label

In [7]:
content = []
label= []
for i in data_all:
    content.append(i["content"])
    label.append(i["annotation"]["label"][0])

# Pre-processing

### Removing noise from contents 

In [8]:
noice = ["\\","-",",",",","\n","#",";",".","'","(",")","@","!","$","&","%"]
for line in range(len(content)): 
    for i in noice:
        content[line] = content[line].replace(str(i)," ")

### Creating pandas dataframe 

In [9]:
data = {"content" : content,  "label" : label}

In [10]:
data = pd.DataFrame(data)

### Describing data and dropping duplicates 

In [11]:
data.describe()

Unnamed: 0,content,label
count,7600,7600
unique,7594,4
top,Sysco Corp the country 39 s largest food se...,SciTech
freq,2,1900


In [12]:
data.drop_duplicates(inplace = True)
data.describe()

Unnamed: 0,content,label
count,7594,7594
unique,7594,4
top,India posted 199/7 and trail Australia by 275 ...,World
freq,1,1900


In [13]:
data.head()

Unnamed: 0,content,label
0,Unions representing workers at Turner Newall...,Business
1,SPACE com TORONTO Canada A second team o...,SciTech
2,AP A company founded by a chemistry research...,SciTech
3,AP It s barely dawn when Mike Fitzpatrick st...,SciTech
4,AP Southern California s smog fighting agenc...,SciTech


### Coverting labels from string to int

In [14]:
set_label = list(set(label))
data['labels'] = data['label'].apply(set_label.index)

In [15]:
data.head()

Unnamed: 0,content,label,labels
0,Unions representing workers at Turner Newall...,Business,3
1,SPACE com TORONTO Canada A second team o...,SciTech,2
2,AP A company founded by a chemistry research...,SciTech,2
3,AP It s barely dawn when Mike Fitzpatrick st...,SciTech,2
4,AP Southern California s smog fighting agenc...,SciTech,2


### Train test split 

In [16]:
content_train, content_test, labels_train, labels_test = train_test_split(data.content, data.labels, test_size=0.30, random_state=69)

# Vectorizer

In [17]:
#vectorizer = TfidfVectorizer(max_features = 15000)
vectorizer = TfidfVectorizer()

In [18]:
vectorised_train_content = vectorizer.fit_transform(content_train)
vectorised_test_content = vectorizer.transform(content_test)

In [19]:
print(vectorised_test_content.shape)
print(vectorised_train_content.shape)

(2279, 17101)
(5315, 17101)


# Models

### forming matrix for each documunts it belongs to

In [20]:
train_labels = to_categorical(labels_train)
test_labels = to_categorical(labels_test)
train_labels.shape

(5315, 4)

### Total unique words 

In [21]:
input_len = len(vectorised_train_content.toarray()[1])
input_len

17101

### Setting up model 

#### Model_1 

Input layer : 
<ul>
    <li>shape : 17101</li>
</ul>
No. of layers hidden layer: 2
<ul>
    <li>Length of each the layer : 50</li>
    <li>Activation function : relu</li>
</ul>
Output layer : 
<ul>
    <li>shape : 4</li>
    <li>Activation function : sigmoid</li>
</ul>
Optimizer : RMSprop
<br>Loss_function : binary_crossentropy


In [22]:
def create_network_1():
    model = models.Sequential()
    model.add(layers.Dense(50, activation='relu', input_shape=(input_len,)))
    model.add(layers.Dense(50, activation='relu'))
    model.add(layers.Dense(4, activation='sigmoid'))
    
    model.compile(optimizer=optimizers.RMSprop(lr=0.001), loss=losses.binary_crossentropy, metrics = ["acc"])
    
    return model

#### Model_2

Input layer : 
<ul>
    <li>shape : 17101</li>
</ul>
No. of layers hidden layer: 1
<ul>
    <li>Length of the layer : 40</li>
    <li>Activation function : relu</li>
</ul>
Output layer : 
<ul>
    <li>shape : 4</li>
    <li>Activation function : softmax</li>
</ul>
Optimizer : RMSprop
<br>Loss_function : binary_crossentropy


In [23]:
def create_network_2():
    model = models.Sequential()
    model.add(layers.Dense(40, activation='relu', input_shape=(input_len,)))
    model.add(layers.Dense(4, activation='softmax'))
    
    model.compile(optimizer=optimizers.RMSprop(lr=0.001), loss=losses.binary_crossentropy, metrics = ["acc"] )
    
    return model

#### Model_3

Input layer : 
<ul>
    <li>shape : 17101</li>
</ul>
No. of layers hidden layer: 2
<ul>
    <li>Length of the layer : 50</li>
    <li>Activation function : relu</li>
</ul>
Output layer : 
<ul>
    <li>shape : 4</li>
    <li>Activation function : softmax</li>
</ul>
Optimizer : RMSprop
<br>Loss_function : binary_crossentropy


In [24]:
def create_network_3():
    model = models.Sequential()
    model.add(layers.Dense(50, activation='relu', input_shape=(input_len,)))
    model.add(layers.Dense(50, activation='relu'))
    model.add(layers.Dense(4, activation='softmax'))
    
    model.compile(optimizer=optimizers.RMSprop(lr=0.001), loss=losses.mean_squared_error, metrics = ["acc"] )
    
    return model

#### Model_4

Input layer : 
<ul>
    <li>shape : 17101</li>
</ul>
No. of layers hidden layer: 2
<ul>
    <li>Length of the layer : 50</li>
    <li>Activation function : exponential</li>
</ul>
Output layer : 
<ul>
    <li>shape : 4</li>
    <li>Activation function : softmax</li>
</ul>
Optimizer : Adagrad
<br>Loss_function : binary_crossentropy


In [25]:
def create_network_4():
    model = models.Sequential()
    model.add(layers.Dense(50, activation='exponential', input_shape=(input_len,)))
    model.add(layers.Dense(50, activation='exponential'))
    model.add(layers.Dense(4, activation='softmax'))
    
    model.compile(optimizer=optimizers.Adagrad(lr=0.01), loss=losses.mean_squared_error, metrics = ["acc"] )
    
    return model

#### Model_5

Input layer : 
<ul>
    <li>shape : 17101</li>
</ul>
No. of layers hidden layer: 1
<ul>
    <li>Length of the layer : 50</li>
    <li>Activation function : selu</li>
</ul>
Output layer : 
<ul>
    <li>shape : 4</li>
    <li>Activation function : sigmoid</li>
</ul>
Optimizer : Adagrad
<br>Loss_function : binary_crossentropy


In [26]:
def create_network_5():
    model = models.Sequential()
    model.add(layers.Dense(50, activation='selu', input_shape=(input_len,)))
    model.add(layers.Dense(50, activation='selu'))
    model.add(layers.Dense(4, activation='sigmoid'))
    
    model.compile(optimizer=optimizers.Adagrad(lr=0.001), loss=losses.binary_crossentropy, metrics = ["acc"])
    
    return model

# Cross-validation

### Based on Network_1 

In [55]:
np.random.seed(5)
cv_model_1 = KerasClassifier(build_fn = create_network_1, epochs=10, batch_size=50, verbose=0)
cv_score_1 = cross_val_score(cv_model_1, vectorised_train_content, train_labels, cv=10 )
cv_score_1

array([0.92904135, 0.93139098, 0.92763158, 0.93656014, 0.94172932,
       0.92137477, 0.93926554, 0.93220339, 0.92372882, 0.92890773])

### Based on Network_2

In [56]:
np.random.seed(5)
cv_model_2 = KerasClassifier(build_fn = create_network_2, epochs=10, batch_size=50, verbose=0)
cv_score_2 = cross_val_score(cv_model_2, vectorised_train_content, train_labels, cv=10 )
cv_score_2

array([0.93280076, 0.93984963, 0.93656015, 0.94078947, 0.94407895,
       0.93314501, 0.93973634, 0.93926554, 0.93079097, 0.93455744])

### Based on Network_3

In [57]:
np.random.seed(5)
cv_model_3 = KerasClassifier(build_fn = create_network_3, epochs=8, batch_size=100, verbose=0)
cv_score_3 = cross_val_score(cv_model_3, vectorised_train_content, train_labels, cv=10 )
cv_score_3

array([0.85150376, 0.85150376, 0.84586467, 0.86654135, 0.87218045,
       0.84934087, 0.86817326, 0.85310735, 0.83804143, 0.85310735])

### Based on Network_4

In [61]:
np.random.seed(15)
cv_model_4 = KerasClassifier(build_fn = create_network_4, epochs=10, batch_size=50, verbose=0)
cv_score_4 = cross_val_score(cv_model_4, vectorised_train_content, train_labels, cv=10 )
cv_score_4

array([0.84398495, 0.62406015, 0.85338346, 0.86090225, 0.88157894,
       0.83427495, 0.87193973, 0.70433145, 0.83427493, 0.84180791])

### Based on Network_5

In [59]:
np.random.seed(5)
cv_model_5 = KerasClassifier(build_fn = create_network_5, epochs=8, batch_size=50, verbose=0)
cv_score_5 = cross_val_score(cv_model_5, vectorised_train_content, train_labels, cv=10 )
cv_score_5

array([0.80827068, 0.83458647, 0.81860902, 0.80451129, 0.80733083,
       0.80838042, 0.81450093, 0.80414313, 0.80838042, 0.82344632])

# Model fitting

### Based on Network_1

In [51]:
np.random.seed(5)
model_1 = create_network_1()
neural_network_1 = model_1.fit(vectorised_train_content,
                    train_labels,
                    epochs=10,
                    batch_size = 50)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Based on model 2 

In [32]:
np.random.seed(5)
model_2 = create_network_2()
neural_network_2 = model_2.fit(vectorised_train_content,
                    train_labels,
                    epochs=10,
                    batch_size = 50)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Based on model 3

In [33]:
np.random.seed(5)
model_3 = create_network_3()
neural_network_3 = model_3.fit(vectorised_train_content,
                    train_labels,
                    epochs=8,
                    batch_size = 100)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


### Based on model 4

In [34]:
np.random.seed(5)
model_4 = create_network_4()
neural_network_4 = model_4.fit(vectorised_train_content,
                    train_labels,
                    epochs=10,
                    batch_size = 50)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Based on model 5

In [35]:
np.random.seed(5)
model_5 = create_network_5()
neural_network_5 = model_5.fit(vectorised_train_content,
                    train_labels,
                    epochs=8,
                    batch_size = 50)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


# Evaluating models

## Model 1 

In [52]:
loss_model_1, acc_model_1 = model_1.evaluate(vectorised_test_content, test_labels)



### Predicting labels

In [53]:
result_1 = model_1.predict(vectorised_test_content)

pred_label_1 = []
for array in result_1:
    pred_label_1.append(np.argmax(array))

### Scores 

In [54]:
prec_model_1 = precision_score(labels_test,pred_label_1, average=None)[0]
f1_model_1 = f1_score(list(labels_test), pred_label_1, average = None)[0]

print("Accuraacy of model 1 : ", acc_model_1)
print("Precision of model 1 : ", prec_model_1)
print("F1 score of model 1  : ", f1_model_1)

Accuraacy of model 1 :  0.9305616498464239
Precision of model 1 :  0.9313893653516295
F1 score of model 1  :  0.9354005167958657


## Model 2

In [36]:
loss_model_2, acc_model_2 = model_2.evaluate(vectorised_test_content, test_labels)



### Predicting labels

In [37]:
result_2 = model_2.predict(vectorised_test_content)

pred_label_2 = []
for array in result_2:
    pred_label_2.append(np.argmax(array))

### Scores 

In [49]:
prec_model_2 = precision_score(labels_test,pred_label_2, average=None)[0]
f1_model_2 = f1_score(list(labels_test), pred_label_2, average = None)[0]

print("Accuraacy of model 2 : ", acc_model_2)
print("Precision of model 2 : ", prec_model_2)
print("F1 score of model 2  : ", f1_model_2)

Accuraacy of model 2 :  0.9393374287491045
Precision of model 2 :  0.9483648881239243
F1 score of model 2  :  0.9508196721311475


## Model 3

In [40]:
loss_model_3, acc_model_3 = model_3.evaluate(vectorised_test_content, test_labels)



### Predicting labels

In [41]:
result_3 = model_3.predict(vectorised_test_content)

pred_label_3 = []
for array in result_3:
    pred_label_3.append(np.argmax(array))


### Scores 

In [42]:
prec_model_3 = precision_score(labels_test,pred_label_3, average=None)[0]
f1_model_3 = f1_score(list(labels_test), pred_label_3, average = None)[0]

print("Accuraacy of model 3 : ", acc_model_3)
print("Precision of model 3 : ", prec_model_3)
print("F1 score of model 3  : ", f1_model_3)

Accuraacy of model 3 :  0.8617814831327796
Precision of model 3 :  0.9366438356164384
F1 score of model 3  :  0.9414802065404474


## Model 4

In [43]:
loss_model_4, acc_model_4 = model_4.evaluate(vectorised_test_content, test_labels)



### Predictin labels

In [44]:
result_4 = model_4.predict(vectorised_test_content)

pred_label_4 = []
for array in result_4:
    pred_label_4.append(np.argmax(array))


### Scores 

In [45]:
prec_model_4 = precision_score(labels_test,pred_label_4, average=None)[0]
f1_model_4 = f1_score(list(labels_test), pred_label_4, average = None)[0]

print("Accuraacy of model 4 : ", acc_model_4)
print("Precision of model 4 : ", prec_model_4)
print("F1 score of model 4  : ", f1_model_4)

Accuraacy of model 4 :  0.860026327362705
Precision of model 4 :  0.9344827586206896
F1 score of model 4  :  0.9360967184801381


## Model 5

In [46]:
loss_model_5, acc_model_5 = model_5.evaluate(vectorised_test_content, test_labels)



### Predictin labels

In [47]:
result_5 = model_5.predict(vectorised_test_content)

pred_label_5 = []
for array in result_5:
    pred_label_5.append(np.argmax(array))


### Scores  

In [48]:
prec_model_5 = precision_score(labels_test,pred_label_5, average=None)[0]
f1_model_5 = f1_score(list(labels_test), pred_label_5, average = None)[0]

print("Accuraacy of model 4 : ", acc_model_5)
print("Precision of model 4 : ", prec_model_5)
print("F1 score of model 4  : ", f1_model_5)

Accuraacy of model 4 :  0.8313953488633632
Precision of model 4 :  0.9095394736842105
F1 score of model 4  :  0.9325463743676222


# Creating Output File

In [62]:
f = open("output_of_topic_classifier.txt",'w')

In [63]:
print("\t\t\t\t\t\t Details of the dataset", file = f)
print("Number of data used for training ::", len(content_train), file = f)
print("Number of data used for testing ::", len(content_test), file = f)
print("Number of different model used :: 5", file = f)
print("\n", file = f)

for i in range(1,6):

    ######################################################################################################################
    neural_network = vars()["neural_network_" + str(i)]
    cv_score = vars()["cv_score_" + str(i)]
    acc_model = vars()["acc_model_" + str(i)]
    prec_model = vars()["prec_model_" + str(i)]
    f1_model = vars()["f1_model_" + str(i)]
    model = vars()["model_" + str(i)]
    model_summary = model.get_config()["layers"]
    ######################################################################################################################

    ######################################################################################################################
    print("\t\t\t\t\t\t\t MODEL",i, file = f)
    ######################################################################################################################

    ######################################################################################################################
    print("\t Structutre of the Network {} " .format(i), file = f)
    print("\n", file = f)
    no_of_layers = len(model_summary)

    print("Number of layers in the model ::", no_of_layers, file = f)
    for i in range(0, no_of_layers - 1):
        print("No. of nodes in layer {} ::" .format(i+1), model_summary[i]["config"]["units"], file = f)
        print("Activation function of layer {} ::" .format(i+1),model_summary[i]["config"]["activation"], file = f)    
    print("No. of nodes in output layer ::", model_summary[no_of_layers - 1]["config"]["units"], file = f)
    print("Activation function of output layer ::", model_summary[no_of_layers - 1]["config"]["activation"], file = f)
    ######################################################################################################################

    ######################################################################################################################
    print("\n", file = f)
    print("\t Details of the model no {}" .format(i), file = f)
    print("\n", file = f)
    print("batch_size ::", neural_network.params['batch_size'], file = f)
    print("epochs ::", neural_network.params['epochs'], file = f)
    print("Cost function used ::", str(model.loss_functions[0]).split(".")[0].split(" ")[1], file = f)
    print("Optimizer used ::", str(model.optimizer).split(".")[2].split(" ")[0], file = f)
    ######################################################################################################################

    ######################################################################################################################
    print("\n", file = f)
    print("\t Evaluation ", file = f)
    print("\n", file = f)
    print("Ten fold cross validation of model {} is : ".format(i), list(map(lambda x: round(x * 100,2), cv_score)), file = f)
    ######################################################################################################################

    ######################################################################################################################
    print("Accuracy is : ", acc_model, file = f)
    print("Precision is : ", prec_model, file = f)
    print("F1 score is : ", f1_model, file = f)
    print("\n", file = f)
    ######################################################################################################################

In [64]:
f.close()