In [246]:
#Important libraries
import pandas as pd
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [247]:
#Reading data set
df = pd.read_csv("/content/dataset.csv", header = 0)

df.head()

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


In [250]:
#Counting null values if any
df.isna().sum()

Text        0
language    0
dtype: int64

In [251]:
#Counting columns per each language
df["language"].value_counts()

Estonian      1000
Swedish       1000
English       1000
Russian       1000
Romanian      1000
Persian       1000
Pushto        1000
Spanish       1000
Hindi         1000
Korean        1000
Chinese       1000
French        1000
Portugese     1000
Indonesian    1000
Urdu          1000
Latin         1000
Turkish       1000
Japanese      1000
Dutch         1000
Tamil         1000
Thai          1000
Arabic        1000
Name: language, dtype: int64

In [252]:
#Giving Label interms of 0 and 1
#Adding new columns in original dataset by appending prefix at the end
def onehot_encode(df, columns, prefixes):
    df = df.copy()
    for column, prefix in zip(columns, prefixes):
        dummies = pd.get_dummies(df[column], prefix=prefix)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(column, axis=1)
    return df

In [253]:
df = onehot_encode(
    df,
    columns=['language'],
    prefixes=['lan']
)

In [254]:
#Printing the updated data_set
df.head()

Unnamed: 0,Text,lan_Arabic,lan_Chinese,lan_Dutch,lan_English,lan_Estonian,lan_French,lan_Hindi,lan_Indonesian,lan_Japanese,...,lan_Portugese,lan_Pushto,lan_Romanian,lan_Russian,lan_Spanish,lan_Swedish,lan_Tamil,lan_Thai,lan_Turkish,lan_Urdu
0,klement gottwaldi surnukeha palsameeriti ning ...,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,sebes joseph pereira thomas på eng the jesuit...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,de spons behoort tot het geslacht haliclona en...,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [255]:
#Spliting the data set
#Text in one columns X
#Labels in the other column y
y = df.drop('Text', axis=1)
X = df['Text']

In [256]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=17)

In [257]:
# Turn pandas dataframe into TensorFlow Dataset
raw_train_dataset = tf.data.Dataset.from_tensor_slices((X_train.values, y_train.values))
raw_test_dataset = tf.data.Dataset.from_tensor_slices((X_test.values, y_test.values))


In [258]:
# Create batches for training the model
batch_size = 32
raw_train_dataset = raw_train_dataset.batch(batch_size)
raw_test_dataset = raw_test_dataset.batch(batch_size)

In [259]:
# Print labels as example by providing one batch
#Converting the tensorflow dataset into numpy array for easier handling
for text_batch, label_batch in raw_train_dataset.take(1):
    for i in range(3):
        print("Review:", text_batch.numpy()[i])
        print("Label:", label_batch.numpy()[i])


Review: b'\xd8\xa7\xd8\xb3 \xda\xa9\xdb\x92 \xd8\xb9\xd9\x84\xd8\xa7\xd9\x88\xdb\x81 \xd8\xa7\xda\xaf\xd8\xb1 \xd8\xaa\xd9\x85\xdb\x81\xd8\xa7\xd8\xb1\xdb\x8c \xd8\xaf\xd9\x84\xdb\x8c\xd9\x84 \xdb\x8c\xdb\x81\xdb\x8c \xdb\x81\xdb\x92 \xda\xa9\xdb\x81 \xda\x86\xd9\x88\xd9\x86\xda\xa9\xdb\x81 \xd8\xac\xd9\x86\xd8\xa7\xd8\xa8 \xd8\xa7\xd8\xa8\xd8\xb1\xd8\xa7\xdb\x81\xdb\x8c\xd9\x85 \xd8\xb9\xd9\x84\xdb\x8c\xdb\x81 \xd8\xa7\xd9\x84\xd8\xb3\xd9\x84\xd8\xa7\xd9\x85 \xd8\xae\xd9\x84\xdb\x8c\xd9\x84 \xd8\xae\xd8\xaf\xd8\xa7 \xdb\x81\xdb\x8c\xda\xba \xd9\x84\xdb\x81\xd9\xb0\xd8\xb0\xd8\xa7 \xd9\x88\xdb\x81 \xd8\xae\xd8\xaf\xd8\xa7 \xda\xa9\xdb\x92 \xd8\xa8\xdb\x8c\xd9\xb9\xdb\x92 \xdb\x81\xdb\x8c\xda\xba \xd8\xaa\xd9\x88 \xd8\xa7\xd8\xb3 \xd8\xa8\xd9\x86\xdb\x8c\xd8\xa7\xd8\xaf \xd9\xbe\xd8\xb1 \xd8\xaa\xd9\x85\xdb\x81\xdb\x8c\xda\xba \xdb\x8c\xdb\x81 \xd8\xa8\xda\xbe\xdb\x8c \xda\xa9\xdb\x81\xd9\x86\xd8\xa7 \xda\x86\xd8\xa7\xdb\x81\xd8\xa6\xdb\x92 \xda\xa9\xdb\x81 \xd8\xac\xd9\x86\xd8\xa7\xd8\

In [260]:
# Get number of unique words in entire dataset
from collections import Counter
results = Counter()
df['Text'].str.split().apply(results.update) # Very computer intensive method
len(results)#279515 unique words

279515

In [261]:
#Convert variable-length text data into a fixed-size vector
max_features = 120000 # Total words to vectorize
sequence_length = 22 # The length of a sentence

vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

# Adapt to our text
vectorize_layer.adapt([t for t in df['Text']])

In [262]:
# Function to vectorize text
def vectorize_text(text):
    text = tf.expand_dims(text, -1)
    return tf.dtypes.cast(vectorize_layer(text), tf.float32)

In [263]:
# Retrieve a batch from the dataset
text_batch, label_batch = next(iter(raw_train_dataset))
first_review, first_label = text_batch[0], label_batch[0]
print("Review:", first_review)
print("Label:", first_label)
print("Vectorized review:", vectorize_text(first_review))

Review: tf.Tensor(b'\xd8\xa7\xd8\xb3 \xda\xa9\xdb\x92 \xd8\xb9\xd9\x84\xd8\xa7\xd9\x88\xdb\x81 \xd8\xa7\xda\xaf\xd8\xb1 \xd8\xaa\xd9\x85\xdb\x81\xd8\xa7\xd8\xb1\xdb\x8c \xd8\xaf\xd9\x84\xdb\x8c\xd9\x84 \xdb\x8c\xdb\x81\xdb\x8c \xdb\x81\xdb\x92 \xda\xa9\xdb\x81 \xda\x86\xd9\x88\xd9\x86\xda\xa9\xdb\x81 \xd8\xac\xd9\x86\xd8\xa7\xd8\xa8 \xd8\xa7\xd8\xa8\xd8\xb1\xd8\xa7\xdb\x81\xdb\x8c\xd9\x85 \xd8\xb9\xd9\x84\xdb\x8c\xdb\x81 \xd8\xa7\xd9\x84\xd8\xb3\xd9\x84\xd8\xa7\xd9\x85 \xd8\xae\xd9\x84\xdb\x8c\xd9\x84 \xd8\xae\xd8\xaf\xd8\xa7 \xdb\x81\xdb\x8c\xda\xba \xd9\x84\xdb\x81\xd9\xb0\xd8\xb0\xd8\xa7 \xd9\x88\xdb\x81 \xd8\xae\xd8\xaf\xd8\xa7 \xda\xa9\xdb\x92 \xd8\xa8\xdb\x8c\xd9\xb9\xdb\x92 \xdb\x81\xdb\x8c\xda\xba \xd8\xaa\xd9\x88 \xd8\xa7\xd8\xb3 \xd8\xa8\xd9\x86\xdb\x8c\xd8\xa7\xd8\xaf \xd9\xbe\xd8\xb1 \xd8\xaa\xd9\x85\xdb\x81\xdb\x8c\xda\xba \xdb\x8c\xdb\x81 \xd8\xa8\xda\xbe\xdb\x8c \xda\xa9\xdb\x81\xd9\x86\xd8\xa7 \xda\x86\xd8\xa7\xdb\x81\xd8\xa6\xdb\x92 \xda\xa9\xdb\x81 \xd8\xac\xd9\x86\xd

In [264]:
# Apply the TextVectorization step to the train and test dataset
train_ds = raw_train_dataset.map(lambda x,y: (vectorize_text(x), y))
test_ds = raw_test_dataset.map(lambda x ,y: (vectorize_text(x) , y))

In [265]:
# Performance measures for faster training
AUTOTUNE = tf.data.experimental.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [266]:
len(df.columns)

23

In [267]:
tf.keras.backend.clear_session()

embedding_dim = 16

model = tf.keras.Sequential([
    layers.Embedding(max_features + 1, embedding_dim, input_length=22),
    layers.GlobalAveragePooling1D(),
    layers.Dense(128, activation=tf.nn.relu),
    layers.Dense(64, activation=tf.nn.relu),
    layers.Dense(32, activation=tf.nn.relu),
    layers.Dense(22, activation=tf.nn.softmax),
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 22, 16)            1920016   
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 128)               2176      
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 22)                726       
                                                        

In [268]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [269]:
epochs = 1
history = model.fit(
    train_ds,
    epochs=epochs
)



In [270]:
loss, accuracy = model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.37007051706314087
Accuracy:  0.897742748260498


In [294]:

to_predict = [text]
# Vectorize text before giving it to the model
to_predict = vectorize_layer(to_predict)
prediction = model.predict(to_predict)

classes = df.columns[1:]

highest_prediction = tf.math.argmax(prediction, 1).numpy()
print(highest_prediction)
print(classes[highest_prediction[0]])
print(f"Certainty: {prediction[0][highest_prediction][0] * 100}%")

[4]
lan_Estonian
Certainty: 96.4853584766388%


In [291]:
text = df.iloc[8]['Text']
text = 'surnukeha palsameeriti ning paigutati mausoleumi surnukeha oli aga liiga hilja ja oskamatult palsameeritud ning hakkas ilmutama lagunemise tundemärke  aastal viidi ta surnukeha mausoleu'