In [None]:
import io
import os
import re
import numpy as np # used for handling numbers
import pandas as pd # used for handling the dataset
# Read the labelled documents (comma-separated, no header row) and the
# keyword list (no header row) into pandas DataFrames.
df = pd.read_csv(
    'drive/My Drive/Colab Notebooks/dataset/labeled_outs.txt',
    delimiter=",",
    header=None,
)
kw = pd.read_csv(
    'drive/My Drive/Colab Notebooks/dataset/frequent.txt',
    header=None,
)

In [None]:
# Name the two columns and force both to plain Python strings.
df.columns = ['label', 'text']
df['text'] = df['text'].astype(str)
df['label'] = df['label'].astype(str)

# Shuffle the rows reproducibly.
# sort_index() first puts the rows back in their load order, so re-running
# this cell yields the same permutation instead of shuffling an already
# shuffled frame; random_state pins the permutation itself.
df = df.sort_index().sample(frac=1, random_state=42)

# Parallel arrays consumed by the feature-extraction and label-encoding cells.
docs = df['text'].values
labels = df['label'].values

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts for every document, with the vectorizer
# limited to max_features=7000.
count_vect = CountVectorizer(
    max_features=7000,
)
X_train_counts = count_vect.fit_transform(docs)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw counts with TF-IDF, densify, and fold each document's
# feature vector into 20 rows (the column count is inferred by reshape).
tfidf_transformer = TfidfTransformer()
tfidf_sparse = tfidf_transformer.fit_transform(X_train_counts)
tfidf_dense = tfidf_sparse.toarray()
X_train_tfidf = tfidf_dense.reshape((tfidf_dense.shape[0], 20, -1))
print(X_train_tfidf.shape)

In [None]:

# Build the document-frequency table: one entry per keyword, all counts
# starting at zero. Its key order defines the feature order used later.
kw.columns = ['words']
dfreq = dict.fromkeys(kw['words'], 0)
print(len(dfreq))

In [None]:
# Hand-rolled TF-IDF restricted to the keyword vocabulary stored in `dfreq`.
# Produces `tf_idf` (one row per document, one column per keyword, in the
# iteration order of `dfreq`), prints the feature count, then folds each row
# into 20 sub-rows for the 1-D convolutional model.

# Tokenise every document once; both passes below reuse the result.
# The pattern is a raw string: '\s' in a plain literal is an invalid escape.
tokenized_docs = [re.split(r'[\s]', d) for d in docs]

# Pass 1 - document frequency: number of documents containing each keyword.
# The counts are reset first so that re-running this cell does not keep
# adding to the totals left behind by a previous run.
for w in dfreq:
  dfreq[w] = 0
for words in tokenized_docs:
  for w in set(words):  # each keyword counts at most once per document
    if w in dfreq:
      dfreq[w] += 1

# Pass 2 - per-document TF-IDF vector.
n_docs = len(docs)
tf_idf = []
for words in tokenized_docs:
  N = len(words)  # token count, including empty tokens produced by the split
  tfreq = {}
  for w in words:
    tfreq[w] = tfreq.get(w, 0) + 1
  row = []
  for w in dfreq:
    if w in tfreq:
      # w occurs in this document, so dfreq[w] >= 1 and the division is safe.
      tf = tfreq[w] * 1.0 / N
      idf = np.log(n_docs * 1.0 / dfreq[w])
      row.append(tf * idf)
    else:
      row.append(0.0)
  tf_idf.append(row)

tf_idf = np.array(tf_idf)
input_dim = len(tf_idf[0])
print(input_dim)
# Fold each feature vector into 20 rows; reshape raises ValueError unless
# input_dim is a multiple of 20.
tf_idf = np.reshape(tf_idf, (tf_idf.shape[0], 20, -1))

In [None]:

# `keras.utils.np_utils` is no longer available in current Keras releases;
# `to_categorical` is exported directly from `keras.utils`.
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
print(labels)
# Encode the string class labels as integer ids.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(labels)
# Convert the integer ids to one-hot rows (one column per class).
dummy_y = to_categorical(encoded_Y)
print(dummy_y)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 18% of the samples for testing. random_state pins the split so
# that the reported metrics refer to the same test rows on every run.
X_train, X_test, y_train, y_test = train_test_split(
    tf_idf, dummy_y, test_size=0.18, random_state=42
)

In [None]:
# Sanity check: show the shape of each split, features before targets,
# train before test.
for split in (X_train, y_train, X_test, y_test):
  print(split.shape)

In [None]:

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Conv1D, MaxPooling1D,GlobalAveragePooling1D

In [None]:

# BatchNormalization is used below but is not among the layer classes
# imported by the notebook's Keras import cell, so bring it in here.
from keras.layers import BatchNormalization

# One softmax unit per class, read from the one-hot targets rather than
# hard-coded, so the model still fits if the number of labels changes.
num_classes = y_train.shape[1]

# Four Conv1D stages (512 -> 256 -> 128 -> 64 filters) with dropout between
# them, followed by global average pooling and a small dense classifier.
# NOTE(review): every MaxPooling1D uses pool_size=1, which looks like a
# pass-through; the layers are kept so the architecture is unchanged.
model = Sequential()

# Stage 1
model.add(Conv1D(filters=512, kernel_size=7,
                 input_shape=(X_train.shape[1], X_train.shape[2]),
                 activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(MaxPooling1D(pool_size=1))
model.add(Dropout(0.4))

# Stage 2
model.add(Conv1D(filters=256, kernel_size=6, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.4))
model.add(MaxPooling1D(pool_size=1))
model.add(Dropout(0.3))

# Stage 3
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(MaxPooling1D(pool_size=1))
model.add(Dropout(0.2))

# Stage 4 (no batch normalisation)
model.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model.add(Dropout(0.2))
model.add(MaxPooling1D(pool_size=1))
model.add(Dropout(0.1))

# Classifier head
model.add(GlobalAveragePooling1D())
model.add(Dropout(0.1))
model.add(Dense(64, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

In [None]:

# Multi-class setup: categorical cross-entropy on one-hot targets, Adam
# optimiser, accuracy reported as 'acc'.
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['acc'])
# summary() prints the layer table itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
model.summary()

In [None]:
# Create the output directory before training, in pure Python rather than
# through a shell escape, so a filesystem problem surfaces before the
# epochs are spent and the cell also runs outside IPython.
os.makedirs('saved_model', exist_ok=True)

# Train for 30 epochs without per-epoch shuffling, holding out 10% of
# X_train for validation.
history = model.fit(X_train, y_train, epochs=30, shuffle=False, verbose=1,
                    validation_split=0.1)

# NOTE(review): the path has no file extension; confirm the installed Keras
# version accepts it.
model.save('saved_model/text_classifier_rnn_tfidf')

In [None]:
# Score the trained model on the held-out test split and show whatever
# evaluate() returns for the compiled loss and metric.
results = model.evaluate(x=X_test, y=y_test)
print(results)

In [None]:
# Per-class scores from the softmax output layer for every test sample.
y_pred = model.predict(x=X_test)

In [None]:
# Build the confusion matrix from the one-hot test targets and the predicted
# class scores: conf_mat[t][p] counts samples whose true class is t and
# whose predicted class is p.
no_cat = len(y_test[0])
test_size = len(y_test)
conf_mat = np.zeros((no_cat, no_cat))
print(conf_mat.shape)

# argmax returns the first index of the largest entry in each row, i.e. the
# class id encoded by a one-hot target or favoured by a prediction.
true_classes = np.argmax(y_test, axis=1)
pred_classes = np.argmax(y_pred, axis=1)
for class_t, class_p in zip(true_classes, pred_classes):
  conf_mat[class_t, class_p] += 1

In [None]:
def _safe_ratio(num, den):
  """Element-wise num / den, returning 0.0 wherever den is 0."""
  out = np.zeros_like(num, dtype=float)
  nonzero = den != 0
  out[nonzero] = num[nonzero] / den[nonzero]
  return out


# Per-class counts derived from conf_mat (rows = true class, columns =
# predicted class).
tp = np.diag(conf_mat).astype(float)   # correctly predicted as class i
fp = conf_mat.sum(axis=0) - tp         # predicted i, but true class differs
fn = conf_mat.sum(axis=1) - tp         # true class i, but predicted otherwise
# Everything that is neither in row i nor in column i. The previous loop
# used `i!=k & j!=k`, which Python parses as the chained comparison
# `i != (k & j) != k`, so it did not compute this quantity.
tn = conf_mat.sum() - tp - fp - fn

# Precision and recall in percent, plus their harmonic mean. A class with
# no predictions or no samples scores 0 instead of turning the averages
# into NaN.
prec = _safe_ratio(tp * 100.0, tp + fp)
rec = _safe_ratio(tp * 100.0, tp + fn)
f_score = _safe_ratio(2 * prec * rec, prec + rec)

# Macro averages over all classes.
print(np.average(prec))
print(np.average(rec))
print(np.average(f_score))