# CLASSIFICATION OF NEPALI LANGUAGE DATA

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth',None)   
import collections
from collections import Counter

In [2]:
df = pd.read_csv("train.csv")
df2 = pd.read_csv("valid.csv")

In [3]:
df.head()

Unnamed: 0,headings,paras,label
0,‘लभ स्टेसन’ भियतनाम र इन्डोनेसियामा,"नेपाली कथानक फिल्म ‘लभ स्टेसन’ को टिम यति बेला भियतनाम र इन्डोनेसियामा छ । फिल्मको गीत छायांकनका लागि अभिनेता प्रदीप खड्का, अभिनेत्री जसिता गुरुङ, निर्माता गोविन्द शाही, कोरियोग्राफर शिशिर खाती, क्यामेरा पर्सन आलोक शुक्लासहितको टिम यी देश पुगेका हुन् ।",entertainment
1,चाडबाडलगत्तै तरकारी सस्तियो,दसैंको मुखमा अस्वाभाविक बढेको तरकारी तथा फलफूलको भाउ घट्न थालेको छ । उत्पादन वृद्धिसँगै सहज आपूर्तिका कारण मूल्य घटेको हो ।,business
2,किङ्गफिसर बियर नेपाली बजारमा,एशियाकै ठूलो बियर कम्पनी मध्येको युनाइटेड ब्रुअरीज लिमिटेडले यति ब्रुअरीसँगको सहकार्यमा किङ्गफिसर बियर बजारमा ल्याएको छ ।,business
3,अर्बपतिका आँखा अन्तरिक्षतिर,"संसारका धनाढ्यहरू अन्तरिक्ष यात्रालाई सस्तो र व्यावसायिक रूपमा साकार बनाउने अभियानमा सामेल हुने क्रम बढ्दो छ । यिनैमध्येका हुन् अमेरिकी अर्बपति इलन मस्क, जेफ बेजोस् र बेलायती टाइकुन रिचर्ड ब्रानसन ।",business
4,‘आई एम सरी’ भन्दै सौगात,निकेश खड्का निर्देशित फिल्म ‘फाटेको जुत्ता’ को हालै सार्वजनिक गीतमा अभिनेता सौगात मल्ल र प्रियंका कार्की फिचरिङ छन् । जसमा सौगातले प्रियंकासँग माफी माग्ने मुडमा फकाइरहेका छन् ।,entertainment


In [4]:
#preparing training data
X_train=df['paras']
Y_train=df['label']

In [5]:
#preparing testing data
X_test=df2['paras']
Y_test=df2['label']

### REMOVING STOPWORDS

In [6]:
import nltk

In [7]:
stopWords = set(nltk.corpus.stopwords.words('nepali'))

### Vectorizer

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
def tfidf_features(X_train, X_test):
    tfidf_vectorizer = TfidfVectorizer(stop_words=stopWords)
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    return X_train_tfidf,X_test_tfidf,tfidf_vectorizer.vocabulary_
X_train_tfidf,X_test_tfidf,vocabulary=tfidf_features(X_train,X_test)




# CLASSIFICATION OF NEPALI LANGUAGE

In [10]:
#Convert Sparse to Dense
X_train_Vectorized_Dense = X_train_tfidf.todense()
X_test_Vectorized_Dense = X_test_tfidf.todense()

# Algorithm implemntation

In [13]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier,GradientBoostingClassifier,RandomForestClassifier,AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression,RidgeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier

In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

In [15]:
def ClassificationAlgorithm_dense(func, **kwargs):

  def innerFunction():
    model = func(**kwargs)
    model.fit(X_train_Vectorized_Dense, Y_train)

    Y_pred = model.predict(X_test_Vectorized_Dense)
    
    print("Confusion Matrix : \n",confusion_matrix(Y_test,Y_pred),"\n")
    print("Accuracy Score : ",accuracy_score(Y_test,Y_pred))
    print("F1 Score : ",f1_score(Y_test,Y_pred,average='weighted'))

  return innerFunction()

In [16]:
def ClassificationAlgorithm(func, **kwargs):

  def innerFunction():
    model = func(**kwargs)
    model.fit(X_train_tfidf,Y_train)
    
    Y_pred= model.predict(X_test_tfidf)
    
    print ("Confusion Matrix : \n", confusion_matrix(Y_test, Y_pred), "\n")
    print("Testing Accuracy is  :%s" %(model.score(X_test_tfidf, Y_test)))
    print ("F1 Score : " , f1_score(Y_test, Y_pred, average='weighted'))
    
  return innerFunction()

### SVM

In [17]:
from sklearn.svm import LinearSVC
ClassificationAlgorithm(LinearSVC)

Confusion Matrix : 
 [[447  48  15]
 [ 36 398  17]
 [ 15  22 497]] 

Testing Accuracy is  :0.8976588628762542
F1 Score :  0.8978573923845155


In [18]:
from sklearn.svm import SVC
ClassificationAlgorithm(SVC,kernel='rbf')

Confusion Matrix : 
 [[453  47  10]
 [ 27 409  15]
 [ 13  26 495]] 

Testing Accuracy is  :0.9076923076923077
F1 Score :  0.9081360685048192


In [19]:
from sklearn.svm import SVC
ClassificationAlgorithm(SVC,kernel='poly')

Confusion Matrix : 
 [[463  42   5]
 [ 22 420   9]
 [ 21  37 476]] 

Testing Accuracy is  :0.9090301003344482
F1 Score :  0.9097357237552444


### Naive Bayes

In [20]:
from sklearn.naive_bayes import GaussianNB
ClassificationAlgorithm_dense(GaussianNB)



Confusion Matrix : 
 [[333  39 138]
 [ 79 223 149]
 [ 36  33 465]] 

Accuracy Score :  0.682943143812709
F1 Score :  0.6758247755953466


### BERT MODEL

In [21]:
! pip install tensorboardX pandas simpletransformers transformers

Collecting tensorboardX
  Downloading tensorboardX-2.5.1-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 1.5 MB/s eta 0:00:01
Collecting simpletransformers
  Downloading simpletransformers-0.63.7-py3-none-any.whl (249 kB)
[K     |████████████████████████████████| 249 kB 4.6 MB/s eta 0:00:01
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 3.7 MB/s eta 0:00:01
[?25hCollecting wandb>=0.10.32
  Downloading wandb-0.13.0-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 4.0 MB/s eta 0:00:01
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 3.1 MB/s eta 0:00:01
Collecting tensorboard
  Downloading tensorboard-2.9.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 2.7 MB/s eta 0:00:01
[?25hCollecting streamlit
  Downloading streamlit-1.11.1-py2.py3-none-any.whl (9.1 M

In [25]:
from simpletransformers.classification import ClassificationModel

In [26]:
print(set(Y_train.values))

{'business', 'sports', 'entertainment'}


In [27]:
from sklearn import preprocessing
labeler = preprocessing.LabelEncoder()

df3 = pd.DataFrame(X_train)
df3['label'] = labeler.fit_transform(Y_train)
print(df3.head())

                                                                                                                                                                                                                                                          paras  \
0  नेपाली कथानक फिल्म ‘लभ स्टेसन’ को टिम यति बेला भियतनाम र इन्डोनेसियामा छ । फिल्मको गीत छायांकनका लागि अभिनेता प्रदीप खड्का, अभिनेत्री जसिता गुरुङ, निर्माता गोविन्द शाही, कोरियोग्राफर शिशिर खाती, क्यामेरा पर्सन आलोक शुक्लासहितको टिम यी देश पुगेका हुन् ।   
1                                                                                                                                   दसैंको मुखमा अस्वाभाविक बढेको तरकारी तथा फलफूलको भाउ घट्न थालेको छ । उत्पादन वृद्धिसँगै सहज आपूर्तिका कारण मूल्य घटेको हो ।   
2                                                                                                                                    एशियाकै ठूलो बियर कम्पनी मध्येको युनाइटेड ब्रुअरीज लिमिटेडले यति ब्रुअरीसँगको सहकार्यमा कि

In [29]:
model = ClassificationModel('bert', 'bert-base-multilingual-uncased', num_labels=3, use_cuda=False, args={
    'reprocess_input_data': True,
    'use_cached_eval_features': False,
    'overwrite_output_dir': True,
    'num_train_epochs': 3
})
model.train_model(df3)
print("trained model")

Downloading:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]



  0%|          | 0/5975 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/747 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/747 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/747 [00:00<?, ?it/s]

trained model


In [32]:
eval_frame = pd.DataFrame(X_test)
eval_frame['label'] = labeler.fit_transform(Y_test)
print(eval_frame.head())

                                                                                                                                                                                                                                                paras  \
0                                                            आर्थिक वर्ष २०७४/७५ मा जिल्लामा आएको १ अर्ब ३९ करोड २२ लाख रकम फ्रिज भएको छ ।\nचालुतर्फ ७० करोड ४ लाख ७ हजार र पुँजीगततर्फ ६९ करोड १८ लाख ३ हजार रूपैयाँ खर्च हुन नसकेर फिर्ता गएको हो ।   
1                                                                       गोरखा र नुवाकोटमा भूकम्पबाट क्षतिग्रस्त पचास हजार घरको पुनर्निर्माणका लागि भारत सरकारले १ करोड ६२ लाख अमेरिकी डलर (१ अर्ब ६७ करोड ७० लाख रुपैयाँ) अनुदान सहयोग गर्ने भएको छ ।   
2  सरकारले उपभोक्ता ठगी गर्ने व्यवसायीलाई वाणिज्य, आपूिर्त तथा उपभोक्ता संरक्षण विभागले नै कारबाही गर्न सक्ने अधिकारसहितको कानुन जारी गर्न लागेको छ । प्रस्तावित उपभोक्ता संरक्षण ऐन २०७५ को मस्यौदामा अनुगमन निरीक्षकलाई उक्त अधिकार दिन लागेको हो ।   
3   

In [33]:
print(len(eval_frame))

1495


In [34]:
result, model_outputs, wrong_predictions = model.eval_model(eval_frame)
bads = {}
for pred in wrong_predictions:
    if pred.label in bads:
        bads[pred.label] += 1
    else:
        bads[pred.label] = 1
print("wrong predictions :")
print(bads)



  0%|          | 0/1495 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Running Evaluation:   0%|          | 0/187 [00:00<?, ?it/s]

wrong predictions (by input I think?):
{0: 47, 2: 49, 1: 50}


In [35]:
(1-(47+49+50)/1495)*100   

90.23411371237458

Accuracy =90.23%; Epoches = 3; Simple Transformers + Multiligual BERT