In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
from bert.bert_model import BERT_model
import logging
from util import logger


2024-03-15 23:48:26,402 :: root         : INFO     Configured the logging successfully


In [9]:
def get_data(size=None):
    #df = pd.read_csv('data/train.tsv', delimiter='\t', header=None)
    df = pd.read_csv("C:/Users/anany/Desktop/fyp/data/kn/kannada-dataset/input2.txt", delimiter=', ', header=None)

    if size:
        df = df[:size]
    logging.info('Trainig data:{}'.format(df.shape))
    logging.info('Class distribution:\n{}'.format(df[1].value_counts()))
    return df


In [10]:
def get_max_sent_length(tokenized_df):
    max_len = 0
    for i in tokenized_df.values:
        if len(i) > max_len:
            max_len = len(i)
    return max_len

In [11]:
def print_test_metrics(act, pred):
    logging.info('Confusion Matrix:\n' + str(confusion_matrix(act, pred)))
    logging.info('Report:\n' + str(classification_report(act, pred)))


In [12]:
def sentence_classifier(bert_feature_array, target_labels, TEST_RATIO=0.1):
    # Split train and test
    train_features, test_features, train_labels, test_labels = train_test_split(bert_feature_array, target_labels,
    test_size=TEST_RATIO)

    # Train the model
    classifier = LogisticRegression()
    classifier.fit(train_features, train_labels)
    logging.info('Training complete')

    # Get the test results
    pred_labels = classifier.predict(test_features)
    # logging.info('Pred:{}'.format(pred_labels))
    # Print the evaluation results
    print_test_metrics(test_labels, pred_labels)
    return classifier

In [13]:
def trainer():
    # Load the TSV training data
    df = get_data()

    # Load the tokenizer and BERT models
    bert = BERT_model()
    bert.load_BERT(small=True)

    # Tokenize the sentences in the training data
    tokenized_df = df[0].apply(lambda sent: bert.tokenize_sentence(sent))
    MAX_LEN = get_max_sent_length(tokenized_df)
    logging.info('Maximum sentence length:{}'.format(MAX_LEN))

    # Provide the tokenized sentences and get the BERT embeddings back
    bert_hidden_states = bert.convert_tokenized_sent_to_bert_emb(tokenized_df, MAX_LEN, batch_size=500)
    # TODO Save the features and target lables

    # Slice the hidden states of shape (number of training examples, max number of tokens=MAX_LEN, number of hidden units in BERT=768)
    # And take only the CLS output of the BERT
    bert_feature_array = bert_hidden_states[:,0,:].numpy()

    logging.info('Bert features shape:{}'.format(bert_feature_array.shape))
    # The target labels
    target_labels = df[1]

    # Train a classifier
    classifier= sentence_classifier(bert_feature_array, target_labels)
    
    return classifier
    

In [14]:
if __name__ == "__main__":
    classifier=trainer()


2024-03-15 23:49:59,915 :: root         : INFO     Trainig data:(45798, 2)
2024-03-15 23:49:59,923 :: root         : INFO     Class distribution:
1    40392
0     5406
Name: 1, dtype: int64
2024-03-15 23:50:06,986 :: root         : INFO     BERT has been loaded successfully
2024-03-15 23:50:12,839 :: root         : INFO     Maximum sentence length:18
2024-03-15 23:50:13,324 :: root         : INFO     Padded array shape:(45798, 18)
2024-03-15 23:50:13,329 :: root         : INFO     Attention mask shape:(45798, 18)
2024-03-15 23:50:13,331 :: root         : INFO     Going to get BERT embeddings for 45798 records
2024-03-15 23:50:13,331 :: root         : INFO     Running batch-wise. Original shape:(45798, 18)
2024-03-15 23:53:02,315 :: root         : INFO     Time taken:168 seconds
2024-03-15 23:55:43,435 :: root         : INFO     Time taken:161 seconds
2024-03-15 23:55:43,457 :: root         : INFO     Accumulated emb size:torch.Size([1000, 18, 768])
2024-03-15 23:58:24,979 :: root      

2024-03-16 07:00:35,934 :: root         : INFO     Accumulated emb size:torch.Size([22000, 18, 768])
2024-03-16 07:03:12,688 :: root         : INFO     Time taken:156 seconds
2024-03-16 07:03:12,922 :: root         : INFO     Accumulated emb size:torch.Size([22500, 18, 768])
2024-03-16 07:05:49,566 :: root         : INFO     Time taken:156 seconds
2024-03-16 07:05:49,832 :: root         : INFO     Accumulated emb size:torch.Size([23000, 18, 768])
2024-03-16 07:08:26,163 :: root         : INFO     Time taken:156 seconds
2024-03-16 07:08:26,472 :: root         : INFO     Accumulated emb size:torch.Size([23500, 18, 768])
2024-03-16 07:11:03,104 :: root         : INFO     Time taken:156 seconds
2024-03-16 07:11:03,371 :: root         : INFO     Accumulated emb size:torch.Size([24000, 18, 768])
2024-03-16 07:13:45,914 :: root         : INFO     Time taken:162 seconds
2024-03-16 07:13:46,305 :: root         : INFO     Accumulated emb size:torch.Size([24500, 18, 768])
2024-03-16 07:16:31,006 

2024-03-16 12:54:21,640 :: root         : INFO     Accumulated emb size:torch.Size([45500, 18, 768])
2024-03-16 12:56:07,590 :: root         : INFO     Time taken:105 seconds
2024-03-16 12:56:12,454 :: root         : INFO     Accumulated emb size:torch.Size([45798, 18, 768])
2024-03-16 12:56:12,517 :: root         : INFO     Bert features shape:(45798, 768)
2024-03-16 12:56:17,233 :: root         : INFO     Training complete
2024-03-16 12:56:17,272 :: root         : INFO     Confusion Matrix:
[[ 235  304]
 [  58 3983]]
2024-03-16 12:56:17,305 :: root         : INFO     Report:
              precision    recall  f1-score   support

           0       0.80      0.44      0.56       539
           1       0.93      0.99      0.96      4041

    accuracy                           0.92      4580
   macro avg       0.87      0.71      0.76      4580
weighted avg       0.91      0.92      0.91      4580



In [15]:
classifier

In [16]:
def test_classifier_on_new_data(new_df, classifier):
    # Load the tokenizer and BERT model
    bert = BERT_model()
    bert.load_BERT(small=True)

    # Tokenize the sentences in the new dataframe
    tokenized_df = new_df[0].apply(lambda sent: bert.tokenize_sentence(sent))
    MAX_LEN = get_max_sent_length(tokenized_df)

    # Convert tokenized sentences to BERT embeddings
    bert_hidden_states = bert.convert_tokenized_sent_to_bert_emb(tokenized_df, MAX_LEN)
    bert_feature_array = bert_hidden_states[:, 0, :].numpy()

    # Predict labels using the trained classifier
    predicted_labels = classifier.predict(bert_feature_array)

    return predicted_labels


In [45]:
df1=pd.read_csv("C:/Users/anany/Desktop/stem.txt",header=None)

In [46]:
labels=test_classifier_on_new_data(df1,classifier)

2024-03-23 12:55:39,734 :: root         : INFO     BERT has been loaded successfully
2024-03-23 12:55:39,736 :: root         : INFO     Padded array shape:(1, 12)
2024-03-23 12:55:39,744 :: root         : INFO     Attention mask shape:(1, 12)
2024-03-23 12:55:39,756 :: root         : INFO     Going to get BERT embeddings for 1 records
2024-03-23 12:55:39,920 :: root         : INFO     Time taken:0 seconds


In [47]:
print(labels[0])

1


In [31]:
import pickle

In [35]:
with open('model.pkl', 'wb') as f:
    pickle.dump(classifier, f)
