In [3]:
import pickle
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
data = pd.read_csv('Language_det_train.csv')
X = data['Text']
y = data['Language']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train the model
det_model = MultinomialNB()
det_model.fit(X_train_vec, y_train)

# Predict and evaluate
y_pred = det_model.predict(X_test_vec)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))

Accuracy: 0.9760692464358453
              precision    recall  f1-score   support

      Arabic       1.00      0.97      0.98        93
      Danish       0.99      0.92      0.95        86
       Dutch       1.00      0.98      0.99       111
     English       0.89      1.00      0.94       266
      French       0.98      0.98      0.98       178
      German       0.99      0.96      0.97        80
       Greek       1.00      0.97      0.99        73
       Hindi       1.00      0.91      0.95        11
     Italian       1.00      0.99      0.99       136
     Kannada       1.00      0.95      0.97        75
   Malayalam       1.00      0.97      0.99       116
  Portugeese       0.99      0.97      0.98       138
     Russian       0.99      0.98      0.99       141
     Spanish       0.98      0.99      0.98       152
    Sweedish       0.98      0.98      0.98       120
       Tamil       1.00      0.97      0.99        77
     Turkish       1.00      0.97      0.99       11

In [4]:
with open('lang_det.pickle','wb') as f:
    pickle.dump(det_model,f)
print("Language Detector Model Saved")
with open('vectorizer.pickle','wb') as f:
    pickle.dump(vectorizer,f)
print("Vectorizer Saved")

Language Detector Model Saved
Vectorizer Saved


In [2]:
# Load model directly
import evaluate
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

In [4]:
def translate_to_arabic(text):
  encoded = tokenizer(text, return_tensors="pt")
  generated_tokens = model.generate(
      **encoded,forced_bos_token_id=tokenizer.lang_code_to_id["arb_Arab"]
  )
  return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

In [5]:
def translate_to_english(text):
  encoded = tokenizer(text, return_tensors="pt")
  generated_tokens = model.generate(
      **encoded,forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"]
  )
  return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

In [80]:
meteor = evaluate.load('meteor')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [81]:
bleu = evaluate.load('bleu')

In [84]:
wer = evaluate.load('wer')

In [6]:
data = pd.read_csv('translation_train.csv')
sample = data.sample(2)
predicted_english = list(map(translate_to_english,sample['Arabic']))
predicted_arabic = list(map(translate_to_arabic,sample['English']))

the `lang_code_to_id` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder` this attribute will be removed in `transformers` v4.38


In [9]:
sample

Unnamed: 0,English,Arabic
22915,I'm not surprised.,أنا غير متفاجئ.
22350,I'm counting on you guys.,أعتمد عليكم يا رفاق.


In [10]:
predicted_english

[["I'm not surprised."], ['I trust you guys.']]

In [11]:
predicted_arabic

[['أنا لست متفاجئة'], ['أنا أعتمد عليكم يا رفاق']]

In [92]:
print('Bleu: ',bleu.compute(references=sample['English'].tolist(),predictions=sum(predicted_english,[])))
print('WER: ',wer.compute(references=sample['English'].tolist(),predictions=sum(predicted_english,[])))
print('Meteor: ',meteor.compute(references=sample['English'].tolist(),predictions=sum(predicted_english,[])))

Bleu:  {'bleu': 0.11611843274277883, 'precisions': [0.36012861736334406, 0.14892412231030577, 0.07923169267707082, 0.042784163473818644], 'brevity_penalty': 1.0, 'length_ratio': 1.0366666666666666, 'translation_length': 1866, 'reference_length': 1800}
WER:  0.922668240850059
Meteor:  {'meteor': 0.4135564338106954}


In [93]:
print('Bleu: ',bleu.compute(references=sample['Arabic'].tolist(),predictions=sum(predicted_arabic,[])))
print('WER: ', wer.compute(references=sample['Arabic'].tolist(),predictions=sum(predicted_arabic,[])))
print('Meteor: ', meteor.compute(references=sample['Arabic'].tolist(),predictions=sum(predicted_arabic,[])))

Bleu:  {'bleu': 0.07759276719689627, 'precisions': [0.3304054054054054, 0.13043478260869565, 0.04996096799375488, 0.016835016835016835], 'brevity_penalty': 1.0, 'length_ratio': 1.0143934201507883, 'translation_length': 1480, 'reference_length': 1459}
WER:  0.8598458304134547
Meteor:  {'meteor': 0.2949758855169468}
