In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

In [None]:
# Dataset merging: read every labelled CSV in the input folder and stack them
# into a single frame with (at least) 'Text' and 'Language' columns.
input_dir = '/kaggle/input/lang-dataset'
frames = []
# sorted() makes the row order deterministic; os.listdir() returns entries in
# arbitrary order, and the label ids built later follow first-appearance order.
for filename in sorted(os.listdir(input_dir)):
    # 'sentences' files are skipped here; sentences.csv is read on its own
    # further down because it uses id / lan_code / sentence columns.
    if 'csv' not in filename or 'sentences' in filename:
        continue
    df1 = pd.read_csv(os.path.join(input_dir, filename))
    # Some files name the label column 'language'; harmonise the header.
    if 'language' in df1.columns:
        df1 = df1.rename(columns={'language': 'Language'})
    frames.append(df1)

# Concatenate once (growing a frame inside the loop copies it every iteration)
# and renumber the rows so index labels are unique across source files.
if frames:
    df = pd.concat(frames, axis=0, ignore_index=True)
else:
    df = pd.DataFrame(columns=['Text','Language'])

In [None]:
# (rows, columns) of the merged frame.
df.shape

In [None]:
# Summary statistics of the merged frame. The method must be *called*; the
# bare attribute only displays the bound-method repr.
df.describe()

In [None]:
# Distinct label values, in order of first appearance (used to spot typos).
df['Language'].unique()

In [None]:
# Normalise misspelled language labels. Assign the result back instead of
# calling replace(..., inplace=True) on the attribute-accessed column: that
# chained in-place form is deprecated and does not update `df` when pandas
# Copy-on-Write is enabled.
df['Language'] = df['Language'].replace(
    {'Portugeese': 'Portuguese', 'Sweedish': 'Swedish'}
)

In [None]:
# Number of distinct label values after the typo fix.
len(df['Language'].unique())

In [None]:
# Word cloud over the whole corpus: join every text into one string first,
# then let WordCloud compute the frequencies (fixed seed for a stable layout).
corpus = ' '.join(df['Text'])
cloud_builder = WordCloud(width=600, height=300, random_state=42)
wc = cloud_builder.generate(corpus)
plt.imshow(wc);

In [None]:
# Length of every sample and its distribution.
# NOTE(review): len() of a string is its number of characters, so despite the
# column name this holds character counts, not word counts — confirm intent.
df['num_words'] = df['Text'].map(len)
sns.displot(df['num_words']);

In [None]:
# Number of rows available for each language, in first-appearance order.
sizes = [
    df[df['Language'] == language].shape[0]
    for language in df['Language'].unique()
]

In [None]:
# Spread of per-language sample counts (highlights under/over-represented labels).
plt.boxplot(sizes)

In [None]:
# Print every per-language row count, followed by how many languages there are.
for n_rows in sizes:
    print(n_rows)
count = len(sizes)
print(count)


In [None]:
# List every language label. The per-language row filter that used to feed the
# (now disabled) "> 4300 rows" check was dead work, so it is no longer computed.
for lang in df['Language'].unique():
    print(lang)

In [None]:
# Second corpus: sentences.csv. Later cells read its 'id', 'lan_code' and
# 'sentence' columns.
data = pd.read_csv('/kaggle/input/lang-dataset/sentences.csv')
data.shape

In [None]:
import json

# Lookup table used below to translate the values of data['lan_code'].
# JSON text is UTF-8 by specification, so read it with an explicit encoding
# rather than relying on the platform default.
with open('/kaggle/input/lang-dataset/lan_to_language.json', 'r', encoding='utf-8') as f:
    js_file = json.load(f)

In [None]:
# Translate codes through the JSON lookup; codes missing from the lookup map
# to NaN and are then filled back in with their original value.
mapped_codes = data['lan_code'].map(js_file)
data['lan_code'] = mapped_codes.fillna(data['lan_code'])

In [None]:
# Distinct language values in the sentence corpus after the mapping.
data['lan_code'].unique()

In [None]:
# Keep the sentence-corpus languages that already have at least 1000 rows in
# `df`. Row counts are computed once up front; re-filtering the whole frame for
# every candidate language gives the same numbers at far higher cost.
# NOTE(review): the threshold is evaluated on `df`, not on `data` — confirm
# that this is the intended source of the counts.
language_counts = df['Language'].value_counts().to_dict()
lang = [
    code
    for code in data['lan_code'].unique()
    if language_counts.get(code, 0) >= 1000
]

In [None]:
# Languages selected for augmentation from the sentence corpus.
lang

In [None]:
# For every selected language take its first 7000 sentences (at most) from the
# sentence corpus and reshape them to the Text / Language schema of `df`.
# The pieces are collected in a list and concatenated once, instead of
# re-copying a growing frame on every iteration. The empty seed frame keeps
# the Text, Language column order and covers the case of an empty `lang`.
per_language = [pd.DataFrame(columns=['Text','Language'])]
for lan in lang:
    t = (
        data[data['lan_code'] == lan]
        .head(7000)
        .drop(columns=['id'])
        .rename(columns={'lan_code': 'Language', 'sentence': 'Text'})
        .reset_index(drop=True)
    )
    per_language.append(t)
df_1 = pd.concat(per_language)

In [None]:
# Preview of the sampled sentence-corpus rows.
df_1

In [None]:
# Append the sampled sentences to the merged dataset. ignore_index renumbers
# the rows so index labels stay unique across the two sources.
# NOTE: this cell rebinds `df`, so running it twice appends `df_1` twice.
df = pd.concat([df, df_1], ignore_index=True)
# `df_1` carries no 'num_words' column, so the appended rows would be NaN and
# silently left out of the length statistics. Recompute it for every row
# (character length, the same quantity the column was first filled with).
df['num_words'] = df['Text'].str.len()
df.head()

In [None]:
# (rows, columns) after appending the sentence-corpus samples.
df.shape

In [None]:
# Summary statistics of the per-sample length column.
# NOTE(review): max_len in the next cell was presumably picked from these — confirm.
df.num_words.describe()

In [None]:
# Sequence length (in tokens) passed to the tokenizer as max_length and used
# as the shape of the Keras input layers.
max_len = 125

In [None]:
!pip install wget

In [None]:
import warnings, wget
# Silences every warning for the rest of the session (including deprecations).
warnings.filterwarnings("ignore")
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout, Flatten
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical, plot_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # plot_confusion_matrix was removed in scikit-learn 1.2 (superseded by
    # ConfusionMatrixDisplay). Keep the name defined so the remaining imports
    # in this cell still run on current releases.
    plot_confusion_matrix = None
# SECURITY NOTE: the helper module is fetched from the tip of a third-party
# branch and then imported, i.e. remote code is executed unpinned. Prefer
# vendoring the file or pinning the URL to a reviewed commit hash.
# Download only when the file is missing so reruns do not fetch it again.
if not os.path.exists("helper_prabowo_ml.py"):
    wget.download("https://raw.githubusercontent.com/yogawicaksana/helper_prabowo/main/helper_prabowo_ml.py",out="helper_prabowo_ml.py")
from helper_prabowo_ml import clean_html, remove_links, remove_special_characters, removeStopWords, remove_, remove_digits, lower, email_address, non_ascii, punct, hashtags
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

In [None]:
def text_preprocess(data,col):
    """Run the helper cleaning functions over one text column.

    Every helper is applied element-wise with ``apply`` and the result is
    assigned back to ``data[col]``, so the object passed in is modified and
    that same object is returned.

    Parameters
    ----------
    data : frame-like object supporting ``data[col]`` get/set
    col : name of the text column to clean

    Returns
    -------
    The ``data`` object that was passed in.

    NOTE(review): the helpers are imported from helper_prabowo_ml and their
    behaviour is not visible here. Names such as ``removeStopWords`` and
    ``non_ascii`` suggest English/ASCII-oriented cleaning, which could strip
    the signal from non-Latin-script samples — confirm before relying on it
    for language identification.
    """
    # Order matters: each step consumes the output of the previous one.
    cleaning_steps = (
        clean_html,
        remove_,
        removeStopWords,
        remove_digits,
        remove_links,
        remove_special_characters,
        punct,
        non_ascii,
        email_address,
        lower,
    )
    for step in cleaning_steps:
        data[col] = data[col].apply(func=step)
    return data

In [None]:
# Clean the 'Text' column. text_preprocess assigns into the frame it receives
# and returns it, so `preprocessed_df` and `df` are the same (modified) object.
preprocessed_df = text_preprocess(df,'Text')
preprocessed_df.head()

In [None]:
# Integer id for every language, numbered in order of first appearance.
labels_dict = {
    language: label_id
    for label_id, language in enumerate(preprocessed_df.Language.unique())
}
labels_dict

In [None]:
# Attach the integer class id for every row; languages missing from
# labels_dict would map to NaN.
preprocessed_df['Label'] = preprocessed_df.Language.map(labels_dict)
preprocessed_df.head()

In [None]:
#preprocessed_df.sample(frac=1)

In [None]:
# 80/20 split, stratified on the language so both parts keep the same class
# proportions; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    preprocessed_df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=preprocessed_df['Language'],
)

In [None]:
# Tokenizer and model come from the same checkpoint; from_pt=True converts the
# published PyTorch weights to TensorFlow.
checkpoint_name = "papluca/xlm-roberta-base-language-detection"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
xlm = TFAutoModelForSequenceClassification.from_pretrained(checkpoint_name, from_pt=True)

In [None]:
def _encode(texts):
    """Tokenize a list of strings into fixed-width TensorFlow tensors.

    padding='max_length' pads every sample to exactly ``max_len`` tokens.
    padding=True pads only to the longest sample in the call, so the tensor
    width depends on the data and can differ between train and test, while the
    Keras input layers are declared with shape (max_len,).
    """
    return tokenizer(text=texts,
                     max_length=max_len,
                     padding='max_length',
                     truncation=True,
                     add_special_tokens=True,
                     return_tensors="tf",
                     return_attention_mask=True,
                     return_token_type_ids=False,
                     verbose=True)


# Same encoding settings for both splits.
X_train = _encode(train_df.Text.tolist())
X_test = _encode(test_df.Text.tolist())

In [None]:
# Symbolic model inputs: one int32 vector of max_len token ids per sample and
# the matching attention mask. The layer names match the dict keys passed to fit().
input_ids = Input(shape=(max_len,),dtype=tf.int32,name='input_ids')
attention_mask = Input(shape=(max_len,),dtype=tf.int32,name='attention_mask')

In [None]:
# NOTE: `xlm` is loaded with TFAutoModelForSequenceClassification, whose first
# output element is the classification logits (one score per label of the
# pretrained checkpoint) — not the final hidden state or a pooled output.
# The dense head below is therefore stacked on those logits.
embeddings = xlm(input_ids,attention_mask=attention_mask)[0]
output = Flatten()(embeddings)
output = Dense(units=1024,activation='relu')(output)
output = Dropout(0.3)(output)
output = Dense(units=512,activation='relu')(output)
output = Dropout(0.2)(output)
output = Dense(units=512,activation='relu')(output)
output = Dropout(0.2)(output)
output = Dense(units=128,activation='relu')(output)
# Size the softmax from the label mapping instead of a hard-coded 30, so the
# output always matches the one-hot targets whatever the number of languages.
num_classes = len(labels_dict)
output = Dense(units=num_classes,activation='softmax')(output)

model = Model(inputs=[input_ids,attention_mask],outputs=output)
# NOTE(review): index 2 is presumably the XLM-R layer (after the two Input
# layers) — verify with model.summary(). Marks it as trainable for fine-tuning.
model.layers[2].trainable = True

In [None]:
# Layer-by-layer overview with output shapes and parameter counts.
model.summary()

In [None]:
# Write a top-to-bottom diagram of the model graph, including tensor shapes, to model.png.
plot_model(model,'model.png',show_shapes=True,dpi=100,rankdir='TB')

In [None]:
# Training configuration: Adam with a small fine-tuning learning rate,
# learning-rate decay and gradient-norm clipping.
optimizer = tf.keras.optimizers.legacy.Adam(
    learning_rate=5e-5,
    epsilon=2e-8,
    decay=0.01,
    clipnorm=1.0,
)
loss = CategoricalCrossentropy()
# NOTE(review): this is plain categorical accuracy that is merely *named*
# 'balanced_accuracy'; the callbacks monitor 'val_balanced_accuracy', so the
# name has to stay in sync with them.
metrics = CategoricalAccuracy('balanced_accuracy')
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics,
)

In [None]:
# Callbacks: stop when validation accuracy stalls and keep the best checkpoint.
# NOTE(review): patience=20 exceeds epochs=3, so early stopping cannot trigger
# in this run — confirm the intended values.
es = EarlyStopping(monitor='val_balanced_accuracy',patience=20,verbose=1,mode='max',restore_best_weights=True)
mc = ModelCheckpoint(filepath='checkpoint',monitor='val_balanced_accuracy',mode='max',save_best_only=True,verbose=1)

# One-hot targets with an explicit width: without num_classes, to_categorical
# infers it from the largest label present, which can differ between splits.
num_classes = len(labels_dict)
y_train = to_categorical(train_df.Label, num_classes=num_classes)
y_test = to_categorical(test_df.Label, num_classes=num_classes)

# NOTE(review): the held-out test split doubles as validation data, so the
# reported validation scores are not an untouched estimate.
r = model.fit(x={'input_ids': X_train['input_ids'], 'attention_mask': X_train['attention_mask']},
              y=y_train,
              epochs=3,
              batch_size=64,
              callbacks=[es,mc],
              validation_data=({'input_ids': X_test['input_ids'], 'attention_mask': X_test['attention_mask']},y_test)
             )

In [None]:
# Training vs. validation loss per epoch, drawn through an explicit Axes.
fig, ax = plt.subplots()
ax.plot(r.history['loss'], 'r', label='train loss')
ax.plot(r.history['val_loss'], 'b', label='test loss')
ax.set_xlabel('No. of Epochs')
ax.set_ylabel('Categorical Crossentropy Loss')
ax.set_title('Loss Graph')
ax.legend();

In [None]:
# Persist the trained model to 'language_detector.h5'.
# NOTE(review): confirm the file reloads cleanly — models that embed a
# transformers layer may need custom objects when loading.
model.save('language_detector.h5')