# Implementing Topic Memory Network for short text classification

In [0]:
# mounting the drive
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# importing the libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import random
import os
import copy

# TOPIC MODELLING:
# Topic modeling is an unsupervised machine learning technique that's capable of scanning a set of documents, detecting word and phrase
# patterns within them, and automatically clustering word groups and similar expressions that best characterize a set of documents.


import gensim   #used for topic modelling, document indexing and similarity retreival
from scipy import sparse        #used to create sparse matrices
from gensim.parsing.preprocessing import STOPWORDS
import logging #This module defines functions and classes which implement a flexible event logging system for applications and libraries.

#deep learning libraries


In [0]:
dataset1=pd.read_csv("/content/drive/My Drive/MAJOR 2- WORK IN PROGRESS/dataset_25classes_less7000.csv")

In [0]:
# quick analysis of dataset1

#shape of the dataset
print(dataset1.shape)
print("========================================================")

#finding the columns in the dataset
print(dataset1.columns)
print("========================================================")

#all the unique classes
po=dataset1.category.unique()
for i in po:
    print(i)
print("========================================================")

#number of different classes in the dataset
print(len(po))
print("========================================================")

#value count for each of the class in the dataset
print(dataset1.category.value_counts())
print("========================================================")

#dropping the na isntances from the dataset, adn counting the value counts again
dataset1=dataset1.dropna()
print(dataset1.shape)
print("========================================================")
print(dataset1.category.value_counts())
print("========================================================")


(101585, 3)
Index(['Unnamed: 0', 'text_instance', 'category'], dtype='object')
CRIME
WORLD NEWS
IMPACT
WEIRD NEWS
WOMEN
MEDIA
TECH
RELIGION
SCIENCE
LATINO VOICES
EDUCATION
COLLEGE
ARTS & CULTURE
STYLE
GREEN
TASTE
GOOD NEWS
WORLDPOST
FIFTY
ARTS
DIVORCE
MONEY
ENVIRONMENT
CULTURE & ARTS
24
WOMEN             6980
IMPACT            6918
DIVORCE           6852
CRIME             6810
MEDIA             5629
WEIRD NEWS        5340
GREEN             5244
WORLDPOST         5158
RELIGION          5112
STYLE             4508
SCIENCE           4356
WORLD NEWS        4354
TASTE             4192
TECH              4164
MONEY             3414
ARTS              3018
FIFTY             2802
GOOD NEWS         2796
ARTS & CULTURE    2678
ENVIRONMENT       2646
COLLEGE           2288
LATINO VOICES     2258
CULTURE & ARTS    2060
EDUCATION         2008
Name: category, dtype: int64
(93384, 3)
DIVORCE           6852
WOMEN             6592
IMPACT            6520
CRIME             6080
MEDIA             5088
WEIRD

In [0]:
#combining some of the similar classes into 1
print("combining some of the classes into 1....")
dataset1['category'].replace({"COLLEGE":"COLLEGE & EDUCATION","EDUCATION":"COLLEGE & EDUCATION",
                          "GREEN":"ENVIRONMENT","TECH":"TECH & SCIENCE","SCIENCE":"TECH & SCIENCE",
                          "CULTURE & ARTS":"ARTS & CULTURE","WEIRD NEWS":"NEWS","GOOD NEWS":"NEWS","WORLD NEWS":"NEWS","ARTS":"ARTS & CULTURE"}, inplace=True)

removed_classes=["LATINO VOICES","FIFTY"]
dataset1=dataset1[~dataset1['category'].isin(removed_classes)]
print("processing DONE.")


#value count of the new dataset formed
print(dataset1.category.value_counts())
print("========================================================")

#printing the head of the dataset
print(dataset1.head(5))
print("========================================================")


combining some of the classes into 1....
processing DONE.
NEWS                   11668
TECH & SCIENCE          8116
ENVIRONMENT             7312
ARTS & CULTURE          7099
DIVORCE                 6852
WOMEN                   6592
IMPACT                  6520
CRIME                   6080
MEDIA                   5088
RELIGION                4413
TASTE                   4036
COLLEGE & EDUCATION     3961
STYLE                   3821
WORLDPOST               3820
MONEY                   3413
Name: category, dtype: int64
   Unnamed: 0  ... factor_class
0           0  ...          NaN
1           1  ...          NaN
2          22  ...          NaN
3          23  ...          NaN
4          24  ...          NaN

[5 rows x 4 columns]


In [0]:
# adding a new column in the dataset for holding hte factorised value of the classes
import numpy as np
dataset1['factor_class'] = np.nan

In [0]:
# factorising the caterogy column of the dataset .i.e, converting classes into numbers using hard code

print("creating a dictionary for mapping each class to a unique number")
classes=dataset1.category.unique()
count =1
dict_classes= {}
for i in classes:
    dict_classes[i]=count
    count+=1

print("final value of count ", count)
print("========================================================")



creating a dictionary for mapping each class to a unique number
final value of count  16


In [0]:
print(dict_classes)

{'CRIME': 1, 'NEWS': 2, 'IMPACT': 3, 'WOMEN': 4, 'MEDIA': 5, 'TECH & SCIENCE': 6, 'RELIGION': 7, 'COLLEGE & EDUCATION': 8, 'ARTS & CULTURE': 9, 'STYLE': 10, 'ENVIRONMENT': 11, 'TASTE': 12, 'WORLDPOST': 13, 'DIVORCE': 14, 'MONEY': 15}


In [0]:
print("making the appropriate changes in the dataset using the dictionary created")
print("========================================================")

for i in range(0,len(dataset1)):
    dataset1.iloc[i,3]=int(dict_classes[dataset1.iloc[i,2]])

making the appropriate changes in the dataset using the dictionary created


In [0]:
print(dataset1.head(10))
print(type(dataset1.factor_class))

   Unnamed: 0  ... factor_class
0           0  ...          1.0
1           1  ...          1.0
2          22  ...          2.0
3          23  ...          2.0
4          24  ...          3.0
5          25  ...          3.0
6          40  ...          2.0
7          41  ...          2.0
8          44  ...          2.0
9          45  ...          2.0

[10 rows x 4 columns]
<class 'pandas.core.series.Series'>


In [0]:
dataset1.columns

Index(['Unnamed: 0', 'text_instance', 'category', 'factor_class'], dtype='object')

In [0]:
dataset1.shape

(88791, 4)

In [0]:
print(len(dataset1))

88791


In [0]:
for i in range(0,len(dataset1)):
    text = gensim.utils.to_unicode(dataset1.iloc[i,1], 'utf8').strip()
    news_lst.append(text)
#splitting each of the instance from the whole document set and creating a list of instances

for i in news_lst[:5]:
	print(i)	
	print("==================================================")

msgs = []
labels = []
label_dict = {}


print("total number of instances in the data file: ",len(news_lst))
print("==================================================")



Verizon Wireless and AT&T are already promoting LTE devices including smartphones and tablets from RIM's rivals. RIM's first
There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV
She left her husband. He killed their children. Just another day in America.
South Korean President Meets North Korea's Kim Jong Un To Talk Trump Summit
The two met to pave the way for a summit between North Korean and the U.S.
total number of instances in the data file:  88792


#for text file as a dataset

#open the dataset,  convert to unicode, 

In [0]:
with open("/content/drive/My Drive/MAJOR 2- WORK IN PROGRESS/TMN/final_tmn_dataset_file.txt", 'r') as fin:
    text = gensim.utils.to_unicode(fin.read(), 'utf8').strip()


#splitting each of the instance from the whole document set and creating a list of instances
news_lst = text.split("\n")
for i in news_lst[:5]:
	print(i)	
	print("==================================================")

msgs = []
labels = []
label_dict = {}


print("total number of instances in the data file: ",len(news_lst))
print("==================================================")

There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV######CRIME
She left her husband. He killed their children. Just another day in America.######CRIME
Will Smith Joins Diplo And Nicky Jam For The 2018 World Cup's Official Song######ENTERTAINMENT
Of course it has a song.######ENTERTAINMENT
Hugh Grant Marries For The First Time At Age 57######ENTERTAINMENT
total number of instances in the data file:  401990


#splitting each instance into text and label and tokenising each instance

In [0]:

#%%==================================================================================================================================================================================


#%%==================================================================================================================================================================================

errors=0
passed=0

co=2

for n_i, line in enumerate(news_lst):  #enumerate function adds a counter to an iterable, here that being the  news_lst
	try:
	    msg, label = line.strip().split("######")
        #.tokenise ===== Iteratively yield tokens as unicode strings, optionally removing accent marks and lowercasing it.
	    msg = list(gensim.utils.tokenize(msg, lower=True))
	    msgs.append(msg)
	    if label not in label_dict:
	        label_dict[label] = len(label_dict)  #later on used to factorise the labels using this dictionary, generated somewhat like 
        #crime:1, entertainment:2, politics :3 ....
	    labels.append(label_dict[label]) #factorisation process: the labels are converted from strings to numbers using the dictionary created in the above step
	    passed+=1
	except:  #to handle all the text instances that were not in the correct format required
		errors+=1
		# print("problem statement was: ==============================")
		# print(line)


print("==================================================")
print("==================================================")
print("==================================================")
print("total instances: ",len(news_lst))
print("passed instances: ",passed)
print("failed instances: ",errors)
print("==================================================")
print("==================================================")
print("==================================================")


#%%==================================================================================================================================================================================


total instances:  401990
passed instances:  401706
failed instances:  284


#how the tokenised instance looks

In [0]:
print(msgs[:5])
print("\n===================================================================\n")
print(labels[:5])

print(len(msgs))
print(len(labels))

[['there', 'were', 'mass', 'shootings', 'in', 'texas', 'last', 'week', 'but', 'only', 'on', 'tv'], ['she', 'left', 'her', 'husband', 'he', 'killed', 'their', 'children', 'just', 'another', 'day', 'in', 'america'], ['will', 'smith', 'joins', 'diplo', 'and', 'nicky', 'jam', 'for', 'the', 'world', 'cup', 's', 'official', 'song'], ['of', 'course', 'it', 'has', 'a', 'song'], ['hugh', 'grant', 'marries', 'for', 'the', 'first', 'time', 'at', 'age']]


[0, 0, 1, 1, 1]
401706
401706


#forming  a dictionary of tokens

and handling and analysing the dictionary for better understanding of the code snippet

In [0]:
# build dictionary
dictionary = gensim.corpora.Dictionary(msgs)
#convert each set of tokens,formed for each of the document present in the dataset, formed by gensim.utils.tokenise to their BOW representation

In [0]:
len(dictionary)

84788

In [0]:
print(dictionary[3])
print(dictionary.dfs[3])

mass
565


In [0]:
su=0
for i in range(0,len(dictionary)):
    su+=dictionary.dfs[i]

In [0]:
print(su)

5508981


In [0]:
for i in range(0,len(dictionary)):
    print(dictionary[i],end=" /////// ")



checking the working of dfs method of the dictionary

In [0]:
for i in range(0,20):
    print('The word "' + dictionary[i] + '" appears in', dictionary.dfs[i],'documents')


The word "but" appears in 20205 documents
The word "in" appears in 82674 documents
The word "last" appears in 4682 documents
The word "mass" appears in 565 documents
The word "on" appears in 44386 documents
The word "only" appears in 5292 documents
The word "shootings" appears in 229 documents
The word "texas" appears in 1053 documents
The word "there" appears in 10748 documents
The word "tv" appears in 1519 documents
The word "week" appears in 6901 documents
The word "were" appears in 5817 documents
The word "america" appears in 4047 documents
The word "another" appears in 3064 documents
The word "children" appears in 4294 documents
The word "day" appears in 10316 documents
The word "he" appears in 11815 documents
The word "her" appears in 10603 documents
The word "husband" appears in 1229 documents
The word "just" appears in 11810 documents


copying and using token2id

In [0]:
import copy

temp_dict=copy.deepcopy(dictionary)
ppo=temp_dict.token2id.get

In [0]:
count=10
for i in dictionary.items():
    if count>0:
        print(i)
        count-=1
    else:
        break

(0, 'but')
(1, 'in')
(2, 'last')
(3, 'mass')
(4, 'on')
(5, 'only')
(6, 'shootings')
(7, 'texas')
(8, 'there')
(9, 'tv')


In [0]:
'in' in dictionary.token2id

True

In [0]:
popo=list(map(temp_dict.token2id.get, STOPWORDS))
print(popo[:5])
print(len(popo))

[1858, 585, 1823, 1771, 6876]
337


In [0]:
for i in popo:
    print(temp_dict[i])

always
me
if
well
yours
afterwards
whereby
please
seems
hence
yourself
from
next
former
cant
within
bottom
move
were
detail
towards
those
whereas
inc
an
itself
found
mostly
may
already
herself
does
do
becoming
take


KeyError: ignored

#filtering the dictionary

In [0]:

#python is based completely  references ad addresses, so normal copying using the "=" operator do not the way its expected to
#any change made in any of the documents is reflected in the other one, thats why we use copy library's deepcopy method

import copy
bow_dictionary = copy.deepcopy(dictionary)

bow_dictionary.filter_tokens(list(map(bow_dictionary.token2id.get, STOPWORDS)))


In [0]:
print(len(bow_dictionary))

84461


In [0]:
count =10
for i in bow_dictionary.values():
    if count>0:
        print(i)
        count-=1

mass
shootings
texas
tv
week
america
children
day
husband
killed


In [0]:
print(len(bow_dictionary))
count=0
for i in bow_dictionary.values():
    if len(i)==1:
        count+=1

print(count)

84461
46


In [0]:

len_1_words = list(filter(lambda w: len(w) == 1, bow_dictionary.values()))


In [0]:
print(len_1_words[:5])
print(len(len_1_words))

['s', 'd', 'u', 't', 'f']
46


In [0]:
bow_dictionary.filter_tokens(list(map(bow_dictionary.token2id.get, len_1_words)))
bow_dictionary.filter_extremes(no_below=3, keep_n=None)



In [0]:
bow_dictionary.compactify()

In [0]:
print(len(bow_dictionary))
count =10
for i in bow_dictionary.values():
    if count>0:
        print(i)
        count-=1

41080
mass
shootings
texas
tv
week
america
children
day
husband
killed


In [0]:
print(msgs[0])

print(bow_dictionary.doc2bow(msgs[0]))

['there', 'were', 'mass', 'shootings', 'in', 'texas', 'last', 'week', 'but', 'only', 'on', 'tv']
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]


In [0]:
print(bow_dictionary[0])
print(bow_dictionary[1])
print(bow_dictionary[2])
print(bow_dictionary[3])
print(bow_dictionary[4])

mass
shootings
texas
tv
week


#bow_dictionary.doc2bow(doc)

In [0]:
for i in range(0,5):
    print(msgs[i])
    print(bow_dictionary.doc2bow(msgs[i]))
    print("\n========================================================\n")

['there', 'were', 'mass', 'shootings', 'in', 'texas', 'last', 'week', 'but', 'only', 'on', 'tv']
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]


['she', 'left', 'her', 'husband', 'he', 'killed', 'their', 'children', 'just', 'another', 'day', 'in', 'america']
[(5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)]


['will', 'smith', 'joins', 'diplo', 'and', 'nicky', 'jam', 'for', 'the', 'world', 'cup', 's', 'official', 'song']
[(11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)]


['of', 'course', 'it', 'has', 'a', 'song']
[(18, 1), (20, 1)]


['hugh', 'grant', 'marries', 'for', 'the', 'first', 'time', 'at', 'age']
[(21, 1), (22, 1), (23, 1), (24, 1), (25, 1)]




In [0]:
wids = list(map(dictionary.token2id.get, msgs[0]))
print(wids)

[8, 11, 3, 6, 1, 7, 2, 10, 0, 5, 4, 9]


In [0]:
wids = np.array(list(filter(lambda x: x is not None, wids))) + 1
print(wids)

[ 9 12  4  7  2  8  3 11  1  6  5 10]


In [0]:
ntm_model = Model(bow_input, [represent_mu, p_x_given_h])
ntm_model.compile(loss=[kl_loss, nnl_loss], loss_weights=[kl_strength, 1.0], optimizer="adagrad")

combine_model = Model([bow_input, seq_input, psudo_input], cls_out)
combine_model.compile(optimizer="adadelta", loss=K.categorical_crossentropy, metrics=["accuracy"])

vis_model = Model([bow_input, seq_input, psudo_input], [represent_mu, wt_emb, match, cls_out])

print_weight_shape(combine_model)