In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from transformers import TFBertModel, BertTokenizerFast
import pandas as pd
# Hyperparameters shared by tokenization, dataset batching and training.
batch_size: int = 32   # examples per batch in the tf.data pipelines
num_epochs: int = 3    # passes over the training set in model.fit
max_length: int = 51   # token length used by the tokenizer and the model Input layers

# Load preprocessed data.
# NOTE: 'hindi.csv' is written by the data-generation cell further down in this
# notebook, so on a fresh kernel that cell has to be run before this one.
# The rows are shuffled once with a fixed seed so that the positional split below
# does not depend on the row order of the CSV (e.g. rows grouped by label) while
# still being reproducible from run to run.
data = (
    pd.read_csv('hindi.csv')
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

# Split data into a training set (first 70% of rows) and a held-out set (remaining 30%)
train_size = int(len(data) * 0.7)
train_data = data.iloc[:train_size]
test_data = data.iloc[train_size:]

# Tokenize text data using the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

# Shared tokenizer settings for both splits:
# - padding='max_length' pads every sequence to exactly `max_length`; padding=True
#   would only pad to the longest sequence in the call, which does not match the
#   fixed (max_length,) Input layers whenever all texts are shorter than that.
# - return_token_type_ids=False drops the 'token_type_ids' feature, for which the
#   Keras model below declares no input (only 'input_ids' and 'attention_mask').
tokenizer_kwargs = dict(
    max_length=max_length,
    padding='max_length',
    truncation=True,
    return_token_type_ids=False,
)
train_encodings = tokenizer(train_data['text'].tolist(), **tokenizer_kwargs)
test_encodings = tokenizer(test_data['text'].tolist(), **tokenizer_kwargs)

# Convert encodings to TensorFlow datasets
def _labelled_dataset(encodings, labels):
    """Pair the tokenizer output (as a plain dict of features) with its labels."""
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))


train_labels = train_data['label'].tolist()
test_labels = test_data['label'].tolist()

# Training data: shuffle buffer spans the whole training split, then batch.
train_dataset = _labelled_dataset(train_encodings, train_labels)
train_dataset = train_dataset.shuffle(len(train_data)).batch(batch_size)

# Held-out data: batched in its original order, no shuffling.
test_dataset = _labelled_dataset(test_encodings, test_labels).batch(batch_size)

# Load the pretrained multilingual BERT encoder
bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')

# Define the model architecture: BERT encoder -> mean pooling -> MLP head -> sigmoid
input_ids = Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
attention_mask = Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

# Pass the mask by keyword so it cannot be bound to the wrong parameter;
# [0] selects the first element of the encoder output (the per-token hidden states).
embedding_layer = bert_model(input_ids, attention_mask=attention_mask)[0]

# Average only over real tokens: without the mask, padded positions would be
# included in the mean and dilute the sentence representation.
x = GlobalAveragePooling1D()(embedding_layer, mask=attention_mask)

# Classification head: progressively narrower Dense layers, each followed by dropout.
for units in (256, 128, 64, 32):
    x = Dense(units, activation='relu')(x)
    x = Dropout(0.2)(x)

# Single sigmoid unit -> probability of the positive class (binary labels).
output = Dense(1, activation='sigmoid')(x)
model = Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile the model.
# The BERT encoder is trainable here, so use a small fine-tuning learning rate:
# the string 'adam' would use Keras' default of 1e-3, which is far too large for
# updating pretrained transformer weights and tends to wreck them.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit the model for `num_epochs` epochs, reporting metrics on the held-out set after each one.
# NOTE(review): `test_dataset` serves both as validation data here and as the final
# evaluation set below, so the reported test metrics are not from unseen data.
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=num_epochs,
)

# Final evaluation; evaluate() returns the loss followed by the compiled metrics.
test_metrics = model.evaluate(test_dataset)
loss, accuracy = test_metrics
print(f'Test loss: {loss:.2f}')
print(f'Test accuracy: {accuracy:.2f}')

Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/3


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/3
Epoch 3/3
Test loss: 0.69
Test accuracy: 0.53


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.1-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m116.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.1 tokenizers-0.13.2 transformers-4.26.1


In [None]:
import pandas as pd
import numpy as np

# Fixed seed so the synthetic dataset (and anything trained on it) is reproducible.
SEED = 42
rng = np.random.default_rng(SEED)

# Define the number of samples and the number of labels
num_samples = 1000
num_labels = 2

# Toy vocabulary the random "sentences" are drawn from
VOCAB = ['नमस्ते', 'दुनिया', 'फू', 'बार']

# Create some random text data in Hindi: each sample is 10-19 words drawn
# uniformly (with replacement) from VOCAB and joined with single spaces.
text_data = [
    ' '.join(rng.choice(VOCAB, size=rng.integers(10, 20)))
    for _ in range(num_samples)
]

# Create some random labels in [0, num_labels).
# NOTE: labels are drawn independently of the text, so there is no signal to learn;
# a classifier trained on this file is expected to score around chance level.
labels = rng.integers(num_labels, size=num_samples)

# Combine the text data and labels into a DataFrame
data = pd.DataFrame({'text': text_data, 'label': labels})

# Save the data to a CSV file
data.to_csv('hindi.csv', index=False)
