### BERT

#### Importing Packages and Reading the Data

In [1]:
import pandas as pd

# Load the labelled training comments; the columns used later are 'Comment' and 'Topic'.
# NOTE(review): absolute, user-specific path — this cell only runs on the original
# author's machine; consider a configurable data directory.
df = pd.read_csv('/home/users/pgodbole2/TA_Training_Set.csv')
df

Unnamed: 0,Comment,Topic
0,Very hot and sexy nail color;),21
1,"I know, right? Like, an FMEA. I definitely kno...",28
2,"Hello, we only allow people with an account ag...",5
3,Megumin reads das kapital??? New best girl.,2
4,Freedom of speech ? but you use a random anon ...,13
...,...,...
899995,"For many people with ASD, this is not true. As...",1
899996,Hes too brashly for someone in crusading dista...,25
899997,But if you replace the word “man” with “ conse...,36
899998,You can sit down in the cubicle have a browse ...,37


In [2]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from nltk.corpus import stopwords
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from transformers import BertTokenizer
from transformers import DistilBertTokenizer, RobertaTokenizer

#### Data Preprocessing

In [3]:
def tokens(words):
    """Reduce ``words`` to lowercase ASCII letters separated by single spaces.

    Every character outside ``a-z``/``A-Z`` (digits, punctuation, non-ASCII
    letters, whitespace) is replaced by a space, the text is lowercased, and
    runs of whitespace are collapsed.

    Parameters
    ----------
    words : str
        Raw text to normalise.

    Returns
    -------
    str
        The cleaned text; empty if ``words`` contains no ASCII letters.
    """
    return " ".join(re.sub("[^a-zA-Z]", " ", words).lower().split())

# Overwrite the raw 'Comment' column with its letters-only, lowercased form.
# After this cell the original text is no longer available from `df`.
df['Comment'] = df['Comment'].apply(tokens)
df.head()

Unnamed: 0,Comment,Topic
0,very hot and sexy nail color,21
1,i know right like an fmea i definitely know wh...,28
2,hello we only allow people with an account age...,5
3,megumin reads das kapital new best girl,2
4,freedom of speech but you use a random anon ac...,13


In [4]:
# Fetch NLTK's stop-word corpus and keep the English list in the global `stop`.
# NOTE(review): the helper defined in the next cell is also called `stopwords`
# and shadows the `nltk.corpus.stopwords` import used here, so re-running this
# cell after that definition fails.
nltk.download('stopwords')

stop = stopwords.words('english')
stop[0:10]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/users/pgodbole2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [5]:
def stopwords(text):
    """Remove stop words from ``text`` and lowercase what remains.

    The text is split on whitespace; each token is lowercased and kept only
    if it is not found in the module-level ``stop`` collection.

    NOTE(review): this function's name shadows the ``stopwords`` module
    imported from ``nltk.corpus`` at the top of the notebook.

    Parameters
    ----------
    text : str
        Whitespace-separated text.

    Returns
    -------
    str
        The surviving lowercased tokens joined by single spaces.
    """
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered not in stop:
            kept.append(lowered)
    return " ".join(kept)

In [6]:
# Strip stop words from every comment (the helper can be passed directly to apply).
df['Comment'] = df['Comment'].apply(stopwords)
df.head()

Unnamed: 0,Comment,Topic
0,hot sexy nail color,21
1,know right like fmea definitely know one want ...,28
2,hello allow people account age days comment ka...,5
3,megumin reads das kapital new best girl,2
4,freedom speech use random anon account brave,13


In [7]:
# Download the 'omw-1.4' NLTK resource before the WordNet lemmatizer is used below.
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/users/pgodbole2/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [8]:
# Download WordNet and build the lemmatizer; `lem` is read as a global by `word_lem`.
nltk.download('wordnet')
lem = nltk.stem.WordNetLemmatizer()

def word_lem(text):
    """Lemmatize each whitespace-separated word of ``text``.

    Every word is passed on its own to the module-level ``lem.lemmatize``
    (no part-of-speech tag is supplied), and the results are joined with
    single spaces.

    Parameters
    ----------
    text : str
        Whitespace-separated text.

    Returns
    -------
    str
        The lemmatized words joined by single spaces.
    """
    pieces = text.split()
    return " ".join(map(lem.lemmatize, pieces))

# Replace each comment with its lemmatized form (overwrites the column in place).
df['Comment'] = df['Comment'].apply(word_lem)
df.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/users/pgodbole2/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Comment,Topic
0,hot sexy nail color,21
1,know right like fmea definitely know one want ...,28
2,hello allow people account age day comment kar...,5
3,megumin read da kapital new best girl,2
4,freedom speech use random anon account brave,13


In [9]:
# Class-balance check: the output below lists 40 topic labels (1-40),
# each with roughly 22.5k comments.
df['Topic'].value_counts()

20    22572
28    22570
6     22564
15    22550
18    22548
1     22539
33    22533
25    22533
10    22530
29    22527
24    22523
11    22521
27    22519
12    22515
19    22513
3     22513
30    22510
32    22503
4     22503
17    22497
8     22492
7     22492
38    22491
21    22491
22    22486
36    22486
31    22483
35    22483
9     22482
39    22482
14    22469
2     22468
37    22464
23    22460
34    22458
16    22453
26    22452
13    22450
5     22439
40    22436
Name: Topic, dtype: int64

In [13]:
# Inspect the fully preprocessed comment column.
df['Comment']

0                                       hot sexy nail color
1         know right like fmea definitely know one want ...
2         hello allow people account age day comment kar...
3                     megumin read da kapital new best girl
4              freedom speech use random anon account brave
                                ...                        
899995    many people asd true apply retarded equivalent...
899996                he brashly someone crusading distance
899997    replace word man conservative reddit think tot...
899998    sit cubicle browse phone bit downtime much bet...
899999    fried potato lady wonder everyone call reddito...
Name: Comment, Length: 900000, dtype: object

#### Train Test Split

In [19]:
# Positional hold-out split: the first 720,000 rows train, the remainder validate.
TRAIN_ROWS = 720000
train = df.iloc[:TRAIN_ROWS]
test = df.iloc[TRAIN_ROWS:]

In [20]:
# Persist both splits to the working directory. `index` is left at its default,
# so the row index is written as an extra unnamed first column.
train.to_csv('train.csv',encoding='utf-8')
test.to_csv('test.csv',encoding='utf-8')

In [21]:
# Reload the splits from disk.
# NOTE(review): presumably this round-trip is what turns comments emptied by
# preprocessing into NaN, which the following cells filter out — confirm.
train = pd.read_csv('train.csv',encoding='utf-8')
test = pd.read_csv('test.csv',encoding='utf-8')

In [22]:
# Keep only the training rows that still have a comment.
train = train.dropna(subset=['Comment'])

In [23]:
# Drop the 'Unnamed: 0' column picked up from the CSV round-trip.
# Reassigning instead of `inplace=True` avoids mutating the filtered frame from
# the previous cell (which can raise SettingWithCopyWarning); the redundant
# `axis` argument and the tuple-looking parentheses are removed.
train = train.drop(columns=['Unnamed: 0'])
train

Unnamed: 0,Comment,Topic
0,hot sexy nail color,21
1,know right like fmea definitely know one want ...,28
2,hello allow people account age day comment kar...,5
3,megumin read da kapital new best girl,2
4,freedom speech use random anon account brave,13
...,...,...
719995,see need make look bad would,28
719996,uploader buying subscriber view pretty clear l...,38
719997,saying woman wider variety stuff judge someone...,37
719998,haha stage grief still denial friend anger com...,14


In [24]:
# Renumber rows after the NaN filter. The previous index is kept as an 'index'
# column, which the next cell removes.
train = train.reset_index()

In [25]:
# Discard the leftover 'index' column created by reset_index().
train = train.drop(columns=['index'])
train.head()

Unnamed: 0,Comment,Topic
0,hot sexy nail color,21
1,know right like fmea definitely know one want ...,28
2,hello allow people account age day comment kar...,5
3,megumin read da kapital new best girl,2
4,freedom speech use random anon account brave,13


In [26]:
# Keep only the validation rows that still have a comment.
test = test.dropna(subset=['Comment'])

In [27]:
# Drop the 'Unnamed: 0' column picked up from the CSV round-trip.
# Reassigning instead of `inplace=True` avoids mutating the filtered frame from
# the previous cell (which can raise SettingWithCopyWarning); the redundant
# `axis` argument and the tuple-looking parentheses are removed.
test = test.drop(columns=['Unnamed: 0'])
test

Unnamed: 0,Comment,Topic
0,well yeah hot plenty thing wrong hot,30
1,would become addicted thing keep engaging educ...,20
2,side,19
3,give shit comment reddit sing fuck ramos every...,33
4,yeah would chance euron going full psycho eldr...,14
...,...,...
179995,many people asd true apply retarded equivalent...,1
179996,he brashly someone crusading distance,25
179997,replace word man conservative reddit think tot...,36
179998,sit cubicle browse phone bit downtime much bet...,37


In [28]:
# Renumber the validation rows from zero, discarding the old index in one step.
test = test.reset_index(drop=True)
test.head()

Unnamed: 0,Comment,Topic
0,well yeah hot plenty thing wrong hot,30
1,would become addicted thing keep engaging educ...,20
2,side,19
3,give shit comment reddit sing fuck ramos every...,33
4,yeah would chance euron going full psycho eldr...,14


#### Using BertTokenizer

In [29]:
# Load the WordPiece tokenizer matching the 'bert-base-uncased' checkpoint.
# `tokenizer` is read as a global by `bert_encode`.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [30]:
def bert_encode(data, maximum_length):
    """Tokenize every comment in ``data`` into fixed-length BERT inputs.

    Uses the module-level ``tokenizer``. Each comment is padded or truncated
    to exactly ``maximum_length`` token ids (special tokens included).

    Parameters
    ----------
    data : object with a ``Comment`` attribute (e.g. a pandas DataFrame)
        Iterable collection of comment strings.
    maximum_length : int
        Length every encoded sequence is padded or truncated to.

    Returns
    -------
    tuple of numpy.ndarray
        ``(input_ids, attention_masks)``, one row per comment.
    """
    input_ids = []
    attention_masks = []

    # Iterate over the values rather than looking rows up with
    # ``data.Comment[i]``: that lookup is label-based and fails (or silently
    # picks the wrong row) whenever the index is not exactly 0..n-1.
    for comment in data.Comment:
        encoded = tokenizer.encode_plus(
            comment,
            add_special_tokens=True,
            max_length=maximum_length,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True`; truncation is requested explicitly so
            # the tokenizer no longer has to warn and guess a strategy.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids), np.array(attention_masks)

In [31]:
# Encode the training split into fixed-length id/mask arrays; the length (121)
# must match the Input shape declared in `create_model`.
train_input_ids,train_attention_masks = bert_encode(train,121)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [32]:
# Sanity check: (rows, sequence length) of the encoded training ids.
train_input_ids.shape

(703513, 121)

In [33]:
# Encode the validation split with the same sequence length as the training split.
test_input_ids,test_attention_masks = bert_encode(test,121)

In [34]:
# Sanity check: (rows, sequence length) of the encoded validation ids.
test_input_ids.shape

(175899, 121)

#### Model Creation

In [35]:
def create_model(bert_model, max_length=121, num_classes=41, learning_rate=6e-6):
    """Attach a softmax classification head to a BERT encoder and compile it.

    Parameters
    ----------
    bert_model : callable Keras-compatible encoder (e.g. ``TFBertModel``)
        Called with ``[input_ids, attention_masks]``; element ``1`` of its
        output is fed to the classifier (presumably the pooled sequence
        representation — confirm against the encoder's documentation).
    max_length : int, default 121
        Length of the token-id and attention-mask inputs; must equal the
        ``maximum_length`` passed to ``bert_encode``.
    num_classes : int, default 41
        Width of the softmax layer. Topic labels in this notebook run 1-40
        and are used directly as sparse class indices, so 41 units are needed
        for label 40 to be a valid index (unit 0 is never a target).
    learning_rate : float, default 6e-6
        Learning rate for the Adam optimizer.

    Returns
    -------
    tf.keras.Model
        Model compiled with sparse categorical cross-entropy and accuracy.
    """
    input_ids = tf.keras.Input(shape=(max_length,), dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_length,), dtype='int32')

    encoder_output = bert_model([input_ids, attention_masks])
    pooled = encoder_output[1]

    probabilities = tf.keras.layers.Dense(num_classes, activation='softmax')(pooled)
    model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=probabilities)
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [36]:
from transformers import TFBertModel

In [37]:
# Load the pretrained 'bert-base-uncased' encoder that will be fine-tuned.
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

2022-04-24 14:08:36.741210: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-24 14:08:37.171430: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30997 MB memory:  -> device: 0, name: Tesla V100-PCIE-32GB, pci bus id: 0000:3b:00.0, compute capability: 7.0
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you

In [38]:
# Confirm the encoder object was created.
bert_model

<transformers.models.bert.modeling_tf_bert.TFBertModel at 0x7fc45f07ff90>

In [39]:
# Build the classifier on top of the encoder and print its layer summary.
model = create_model(bert_model)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 121)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 121)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 109482240   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
dense (Dense)                   (None, 41)           31529       tf_bert_model[0][1]          

In [40]:
#pip install graphviz

In [41]:
#from keras.utils.vis_utils import plot_model
#plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)


In [42]:
# Integer topic labels for the training and validation splits.
# `dummy` / `dummy2` are plain aliases of the two frames.
dummy, dummy2 = train, test

targets = dummy['Topic'].to_numpy()
targets_y = dummy2['Topic'].to_numpy()

In [43]:
# history = model.fit([train_input_ids,train_attention_masks],targets,validation_data=([test_input_ids,test_attention_masks],targets_y), epochs=4,batch_size=64)

In [None]:
# Fine-tune for 4 epochs, validating on the held-out split after each epoch.
history1 = model.fit(
    x=[train_input_ids, train_attention_masks],
    y=targets,
    validation_data=([test_input_ids, test_attention_masks], targets_y),
    epochs=4,
    batch_size=64,
)

2022-04-24 14:08:43.958587: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/4
Epoch 2/4
Epoch 4/4

In [36]:
# Save the fine-tuned model to 'TA_Model_New.h5' in the working directory.
# NOTE(review): reloading a model that wraps a transformers layer presumably
# needs that layer supplied as a custom object — confirm before relying on this file.
model.save('TA_Model_New.h5')

#### Predicting on Test Data

In [37]:
# Load the unlabelled comments to be predicted. This rebinds nothing from the
# validation split: `test_df` is distinct from `test`.
# NOTE(review): absolute, user-specific path — see the training-set load cell.
test_df = pd.read_csv('/home/users/pgodbole2/TA_Test_Set.csv')
#test_df.drop(columns = 'Topic', inplace = True)
#test_df.reset_index()
test_df.head()

Unnamed: 0,Comment
0,I bought a month and a half out on a stock tha...
1,"Parity used to be the justification, but that ..."
2,Yeah cartel. Legolas is gonna shoot your ass d...
3,"I do think he’s TA, but there’s one thing with..."
4,"Were trying, let you know if anything works"


In [38]:
# Apply the same three cleaning steps, in the same order, as for the training data.
for clean_step in (tokens, stopwords, word_lem):
    test_df['Comment'] = test_df['Comment'].apply(clean_step)
test_df

Unnamed: 0,Comment
0,bought month half stock almost option volume l...
1,parity used justification day free agency way ...
2,yeah cartel legolas gonna shoot as
3,think ta one thing saying thinking maybe maybe...
4,trying let know anything work
...,...
99995,migraine sufferer tell coffee catalyst getting...
99996,drink room temperature tap water
99997,live campus happened super fucked
99998,bro nobody like shit


In [47]:
# Renumber rows from zero, discarding the old index in one step.
test_df = test_df.reset_index(drop=True)
test_df

Unnamed: 0,Comment
0,ultimate farming whats gt join u


In [None]:
#topic = 6

In [39]:
# test_id, test_mask = bert_encode(test_df, 60)



In [40]:
# test_id.shape[0]

100000

In [41]:
# Encode the unlabelled comments before predicting. `test_id` / `test_mask` were
# otherwise only assigned in a commented-out cell, so this cell failed on a
# fresh kernel. The length is 121 (not the 60 in that commented cell) because
# the model's inputs are declared with shape (121,).
test_id, test_mask = bert_encode(test_df, 121)
y_pred = model.predict([test_id, test_mask])

In [42]:
# Inspect the per-class probabilities (one row per comment).
y_pred

array([[3.1582403e-04, 2.8317710e-03, 3.5886184e-04, ..., 1.0604177e-03,
        9.6163946e-01, 1.9981570e-03],
       [2.3811398e-04, 6.1928626e-04, 3.5160317e-04, ..., 6.8623049e-04,
        4.1400149e-04, 3.0466483e-04],
       [1.4460781e-03, 4.9176969e-04, 2.6883291e-02, ..., 1.2549058e-02,
        1.7845783e-03, 1.5454317e-03],
       ...,
       [8.1546261e-04, 7.5611111e-04, 5.1461351e-03, ..., 2.4001565e-02,
        1.3309445e-02, 2.0846674e-02],
       [6.1152258e-04, 5.2602948e-03, 2.4477963e-03, ..., 3.1246687e-03,
        6.8248168e-02, 5.1304810e-03],
       [1.0180919e-03, 6.4107915e-04, 9.1671765e-01, ..., 5.5221515e-03,
        6.9401221e-04, 1.5452536e-03]], dtype=float32)

In [43]:
# Most probable class index per comment. The training labels were used directly
# as class indices, so the index is the topic label.
labels = y_pred.argmax(axis=1)

In [44]:
# Attach the predicted topic to each comment.
test_df['Topic'] = labels

In [45]:
# Write the predictions to 'T_New.csv' in the working directory
# (`index` is left at its default, so the row index is written too).
test_df.to_csv('T_New.csv')

### LSTM

In [None]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['Comment'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Convert each comment to a sequence of word indices, then pad/truncate every
# sequence to MAX_SEQUENCE_LENGTH.
X = pad_sequences(
    tokenizer.texts_to_sequences(df['Comment'].values),
    maxlen=MAX_SEQUENCE_LENGTH,
)
print('Shape of data tensor:', X.shape)

In [None]:
# One-hot encode the topic labels: one indicator column per distinct topic.
topic_indicators = pd.get_dummies(df['Topic'])
Y = topic_indicators.to_numpy()
print('Shape of label tensor:', Y.shape)

In [None]:
# `X_train` / `Y_train` were never defined anywhere in the notebook. Use the same
# positional split as the BERT section (first 720,000 rows for training) and
# keep the remainder as an untouched hold-out.
LSTM_TRAIN_ROWS = 720000
X_train, Y_train = X[:LSTM_TRAIN_ROWS], Y[:LSTM_TRAIN_ROWS]
X_test, Y_test = X[LSTM_TRAIN_ROWS:], Y[LSTM_TRAIN_ROWS:]

# NOTE: this rebinds the global `model`, replacing the BERT classifier.
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# Topics are mutually exclusive and the loss is categorical cross-entropy, so
# the output must be a softmax distribution (the original used sigmoid). The
# layer width follows the number of one-hot label columns.
model.add(Dense(Y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 5
batch_size = 64
from tensorflow.keras.callbacks import EarlyStopping
# validation_split holds out a further 10% of the training rows for early stopping.
history = model.fit(
    X_train,
    Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)],
)