In [1]:
import os

In [6]:
import pandas as pd
import numpy as np
import nltk

In [3]:
from sklearn.model_selection import train_test_split

In [9]:
DATA_DIR = os.path.abspath('./datasets/output/data.csv')

In [10]:
#nltk.download('stopwords')

## Loading Data

In [12]:
df = pd.read_csv(DATA_DIR,index_col=0)
df.head()

Unnamed: 0_level_0,class,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
182,Interview,"﻿विडम्बना, ओलीको सत्ता सवारी पनि सिंहदरबारमा फ..."
127,Interview,﻿नेपाली कांग्रेसको सभापतिमा रामचन्द्र पौडेलको ...
85,Interview,﻿\nदुर्भाग्य नै भन्नुपर्छ हाम्रो प्राथमिकता पे...
162,Interview,"﻿नयनराज पाण्डे, लेखक\nविमोचन हुनै लागेको तपाईं..."
13,Interview,"﻿- भवन भट्ट, उपाध्यक्ष, गैर आवासीय नेपाली संघ\..."


In [30]:
df.dropna(inplace=True)

In [31]:
y = df['class']
X = df['text']

In [128]:
labels=list(set(df['class']))
labels

['Agriculture',
 'Automobiles',
 'Bank',
 'Sports',
 'Employment',
 'Entertainment',
 'Literature',
 'Society',
 'Tourism',
 'Blog',
 'Opinion',
 'Migration',
 'Politics',
 'Technology',
 'Education',
 'World',
 'Interview',
 'Economy',
 'Business']

### Processing Data

In [147]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [148]:
nepali_stop_words = set(stopwords.words('nepali'))

In [149]:
tfidfVectorizer = TfidfVectorizer(stop_words=nepali_stop_words,encoding='utf-8',decode_error='ignore')

In [150]:
tfidfVectorizer = TfidfVectorizer(tokenizer= lambda x: x.split(" "),
                                  sublinear_tf=True, encoding='utf-8',
                                  decode_error='ignore',
                                  max_df=0.5,
                                  min_df=10,
                                  stop_words=nepali_stop_words)

In [151]:
X_vectorized = tfidfVectorizer.fit_transform(X)



In [156]:
X_vectorized

<5475x14431 sparse matrix of type '<class 'numpy.float64'>'
	with 740939 stored elements in Compressed Sparse Row format>

In [157]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [158]:
X_train.shape, y_train.shape

((4380,), (4380,))

In [159]:
X_test.shape, y_test.shape

((1095,), (1095,))

## SVC Classifier

In [48]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, chi2

In [81]:
svc_clf = Pipeline([
    ('vectorizer',tfidfVectorizer),
    ('chi2',SelectKBest(chi2,k=5000)),
    ('clf',SVC())
])
svc_clf_no_feat_sel = Pipeline([
    ('vectorizer',tfidfVectorizer),

    ('clf',SVC())
])

In [82]:
svc_clf.fit(X_train,y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(decode_error='ignore',
                                 stop_words={'अक्सर', 'अगाडी', 'अझै', 'अनुसार',
                                             'अन्तर्गत', 'अन्य', 'अन्यत्र',
                                             'अन्यथा', 'अब', 'अरु', 'अरुलाई',
                                             'अर्को', 'अर्थात', 'अर्थात्',
                                             'अलग', 'आए', 'आजको', 'आत्म', 'आदि',
                                             'आफू', 'आफूलाई', 'आफ्नै', 'आफ्नो',
                                             'आयो', 'उदाहरण', 'उनको', 'उनले',
                                             'उप', 'उहालाई', 'एउटै', ...})),
                ('chi2',
                 SelectKBest(k=5000,
                             score_func=<function chi2 at 0x7f425e595160>)),
                ('clf', SVC())])

In [86]:
svc_clf_no_feat_sel.fit(X_train,y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(decode_error='ignore',
                                 stop_words={'अक्सर', 'अगाडी', 'अझै', 'अनुसार',
                                             'अन्तर्गत', 'अन्य', 'अन्यत्र',
                                             'अन्यथा', 'अब', 'अरु', 'अरुलाई',
                                             'अर्को', 'अर्थात', 'अर्थात्',
                                             'अलग', 'आए', 'आजको', 'आत्म', 'आदि',
                                             'आफू', 'आफूलाई', 'आफ्नै', 'आफ्नो',
                                             'आयो', 'उदाहरण', 'उनको', 'उनले',
                                             'उप', 'उहालाई', 'एउटै', ...})),
                ('clf', SVC())])

In [87]:
print("Accuracy on train Set : ", svc_clf.score(X_train,y_train))
print("Accuracy on test Set : ", svc_clf.score(X_test,y_test))


Accuracy on train Set :  0.9472602739726027
Accuracy on test Set :  0.6365296803652968


In [88]:
print("Accuracy on train Set : ", svc_clf_no_feat_sel.score(X_train,y_train))
print("Accuracy on test Set : ", svc_clf_no_feat_sel.score(X_test,y_test))

Accuracy on train Set :  0.9490867579908676
Accuracy on test Set :  0.6401826484018265


### Random Forest Classifier

In [67]:
from sklearn.ensemble import RandomForestClassifier

In [89]:
rf_clf = Pipeline([
    ('vectorizer',tfidfVectorizer),
    ('chi2',SelectKBest(chi2,k=5000)),
    ('clf',RandomForestClassifier(
            n_estimators=2000, 
            max_depth=20, 
            min_samples_split=6, 
            min_samples_leaf=5, 
            bootstrap=True,
            verbose=1,
            n_jobs = -1
        )
    )
])
rf_clf_no_feat_sel = Pipeline([
    ('vectorizer',tfidfVectorizer),
    ('clf',RandomForestClassifier(
            n_estimators=2000, 
            max_depth=20, 
            min_samples_split=6, 
            min_samples_leaf=5, 
            bootstrap=True,
            verbose=1,
            n_jobs = -1
        )
    )
])

In [90]:
rf_clf.fit(X_train,y_train)
rf_clf_no_feat_sel.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:    8.4s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:    5.8s
[Parallel(n_jobs=

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(decode_error='ignore',
                                 stop_words={'अक्सर', 'अगाडी', 'अझै', 'अनुसार',
                                             'अन्तर्गत', 'अन्य', 'अन्यत्र',
                                             'अन्यथा', 'अब', 'अरु', 'अरुलाई',
                                             'अर्को', 'अर्थात', 'अर्थात्',
                                             'अलग', 'आए', 'आजको', 'आत्म', 'आदि',
                                             'आफू', 'आफूलाई', 'आफ्नै', 'आफ्नो',
                                             'आयो', 'उदाहरण', 'उनको', 'उनले',
                                             'उप', 'उहालाई', 'एउटै', ...})),
                ('clf',
                 RandomForestClassifier(max_depth=20, min_samples_leaf=5,
                                        min_samples_split=6, n_estimators=2000,
                                        n_jobs=-1, verbose=1))])

In [91]:
print("Accuracy on train Set : ", rf_clf.score(X_train,y_train))
print("Accuracy on test Set : ", rf_clf.score(X_test,y_test))

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 2000 out of 2000 | elapsed:    1.5s finished


Accuracy on train Set :  0.7474885844748859


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.4s


Accuracy on test Set :  0.5132420091324201


[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 2000 out of 2000 | elapsed:    0.7s finished


In [92]:
print("Accuracy on train Set : ", rf_clf_no_feat_sel.score(X_train,y_train))
print("Accuracy on test Set : ", rf_clf_no_feat_sel.score(X_test,y_test))

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    1.5s
[Parallel(n_jobs=8)]: Done 2000 out of 2000 | elapsed:    1.7s finished


Accuracy on train Set :  0.689041095890411


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.4s


Accuracy on test Set :  0.5013698630136987


[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 2000 out of 2000 | elapsed:    0.6s finished


### Naive Bayes

In [160]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB

In [161]:
bnb_clf = Pipeline([
    ('vect', tfidfVectorizer),
    ('clf', BernoulliNB(alpha=0.01))
])

mnb_clf = Pipeline([
    ('vect', tfidfVectorizer),
    ('clf', MultinomialNB(alpha=0.01, fit_prior=True))
])

In [162]:
bnb_clf.fit(X_train,y_train)

Pipeline(steps=[('vect',
                 TfidfVectorizer(decode_error='ignore', max_df=0.5, min_df=10,
                                 stop_words={'अक्सर', 'अगाडी', 'अझै', 'अनुसार',
                                             'अन्तर्गत', 'अन्य', 'अन्यत्र',
                                             'अन्यथा', 'अब', 'अरु', 'अरुलाई',
                                             'अर्को', 'अर्थात', 'अर्थात्',
                                             'अलग', 'आए', 'आजको', 'आत्म', 'आदि',
                                             'आफू', 'आफूलाई', 'आफ्नै', 'आफ्नो',
                                             'आयो', 'उदाहरण', 'उनको', 'उनले',
                                             'उप', 'उहालाई', 'एउटै', ...},
                                 sublinear_tf=True,
                                 tokenizer=<function <lambda> at 0x7f42498d4820>)),
                ('clf', BernoulliNB(alpha=0.01))])

In [163]:
print("Accuracy on train Set : ", bnb_clf.score(X_train,y_train))
print("Accuracy on test Set : ", bnb_clf.score(X_test,y_test))

Accuracy on train Set :  0.9123287671232877
Accuracy on test Set :  0.634703196347032


In [164]:
mnb_clf.fit(X_train,y_train)

Pipeline(steps=[('vect',
                 TfidfVectorizer(decode_error='ignore', max_df=0.5, min_df=10,
                                 stop_words={'अक्सर', 'अगाडी', 'अझै', 'अनुसार',
                                             'अन्तर्गत', 'अन्य', 'अन्यत्र',
                                             'अन्यथा', 'अब', 'अरु', 'अरुलाई',
                                             'अर्को', 'अर्थात', 'अर्थात्',
                                             'अलग', 'आए', 'आजको', 'आत्म', 'आदि',
                                             'आफू', 'आफूलाई', 'आफ्नै', 'आफ्नो',
                                             'आयो', 'उदाहरण', 'उनको', 'उनले',
                                             'उप', 'उहालाई', 'एउटै', ...},
                                 sublinear_tf=True,
                                 tokenizer=<function <lambda> at 0x7f42498d4820>)),
                ('clf', MultinomialNB(alpha=0.01))])

In [165]:
print("Accuracy on train Set : ", mnb_clf.score(X_train,y_train))
print("Accuracy on test Set : ", mnb_clf.score(X_test,y_test))

Accuracy on train Set :  0.9424657534246575
Accuracy on test Set :  0.6894977168949772


## Classification Report


In [127]:
from sklearn.metrics import classification_report

#### SVM

In [144]:
y_pred = svc_clf.predict(X_test)
print(classification_report(y_test,y_pred))

               precision    recall  f1-score   support

  Agriculture       1.00      0.70      0.82        20
  Automobiles       1.00      0.83      0.91        24
         Bank       0.95      0.97      0.96       102
         Blog       0.88      0.71      0.79        49
     Business       0.95      0.38      0.54        48
      Economy       0.77      0.89      0.82        87
    Education       0.96      0.87      0.92        31
   Employment       0.97      0.88      0.92        41
Entertainment       0.92      0.90      0.91       124
    Interview       0.51      0.71      0.59        45
   Literature       0.91      0.94      0.93        34
    Migration       0.92      0.86      0.89        14
      Opinion       0.87      0.94      0.90        99
     Politics       0.91      1.00      0.95       100
      Society       0.90      0.76      0.83        34
       Sports       0.85      0.95      0.90       125
   Technology       1.00      0.86      0.92        14
      Tou

In [145]:
y_pred = svc_clf_no_feat_sel.predict(X_test)
print(classification_report(y_test,y_pred))

               precision    recall  f1-score   support

  Agriculture       1.00      0.70      0.82        20
  Automobiles       1.00      0.83      0.91        24
         Bank       0.96      0.96      0.96       102
         Blog       0.90      0.73      0.81        49
     Business       0.95      0.38      0.54        48
      Economy       0.78      0.90      0.83        87
    Education       0.96      0.84      0.90        31
   Employment       0.97      0.90      0.94        41
Entertainment       0.93      0.91      0.92       124
    Interview       0.50      0.69      0.58        45
   Literature       0.91      0.94      0.93        34
    Migration       0.92      0.86      0.89        14
      Opinion       0.86      0.94      0.90        99
     Politics       0.91      1.00      0.95       100
      Society       0.90      0.79      0.84        34
       Sports       0.84      0.95      0.89       125
   Technology       1.00      0.86      0.92        14
      Tou

### Naive Bayes

In [166]:
y_pred = bnb_clf.predict(X_test)
print(classification_report(y_test,y_pred))

               precision    recall  f1-score   support

  Agriculture       0.73      0.52      0.61        21
  Automobiles       0.82      0.54      0.65        26
         Bank       0.78      0.83      0.80        99
         Blog       0.40      0.33      0.37        51
     Business       0.07      0.04      0.05        45
      Economy       0.55      0.73      0.63        99
    Education       0.73      0.57      0.64        28
   Employment       0.77      0.50      0.61        40
Entertainment       0.55      0.65      0.60       117
    Interview       0.07      0.08      0.08        38
   Literature       0.77      0.28      0.41        36
    Migration       0.86      0.33      0.48        18
      Opinion       0.67      0.89      0.77        85
     Politics       0.91      0.91      0.91        98
      Society       0.51      0.47      0.49        45
       Sports       0.66      0.77      0.71       137
   Technology       0.80      0.57      0.67        21
      Tou

In [167]:
y_pred = mnb_clf.predict(X_test)
print(classification_report(y_test,y_pred))

               precision    recall  f1-score   support

  Agriculture       0.70      0.67      0.68        21
  Automobiles       0.83      0.58      0.68        26
         Bank       0.85      0.89      0.87        99
         Blog       0.80      0.24      0.36        51
     Business       0.39      0.31      0.35        45
      Economy       0.52      0.71      0.60        99
    Education       0.82      0.64      0.72        28
   Employment       0.73      0.60      0.66        40
Entertainment       0.72      0.72      0.72       117
    Interview       0.10      0.13      0.11        38
   Literature       0.65      0.78      0.71        36
    Migration       0.80      0.44      0.57        18
      Opinion       0.58      0.91      0.71        85
     Politics       0.90      0.92      0.91        98
      Society       0.55      0.64      0.59        45
       Sports       0.94      0.74      0.83       137
   Technology       0.85      0.52      0.65        21
      Tou

### Random Forest

In [168]:
y_pred = rf_clf.predict(X_test)
print(classification_report(y_test,y_pred))

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.4s


               precision    recall  f1-score   support

  Agriculture       1.00      0.24      0.38        21
  Automobiles       1.00      0.65      0.79        26
         Bank       0.87      0.95      0.91        99
         Blog       1.00      0.08      0.15        51
     Business       0.75      0.27      0.39        45
      Economy       0.64      0.88      0.74        99
    Education       1.00      0.32      0.49        28
   Employment       1.00      0.45      0.62        40
Entertainment       0.84      0.62      0.71       117
    Interview       0.45      0.66      0.53        38
   Literature       1.00      0.61      0.76        36
    Migration       1.00      0.44      0.62        18
      Opinion       0.77      0.99      0.87        85
     Politics       0.75      0.93      0.83        98
      Society       1.00      0.29      0.45        45
       Sports       0.45      0.97      0.61       137
   Technology       1.00      0.10      0.17        21
      Tou

[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 2000 out of 2000 | elapsed:    0.6s finished


In [169]:
y_pred = rf_clf_no_feat_sel.predict(X_test)
print(classification_report(y_test,y_pred))

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.4s


               precision    recall  f1-score   support

  Agriculture       0.00      0.00      0.00        21
  Automobiles       1.00      0.38      0.56        26
         Bank       0.89      0.92      0.91        99
         Blog       1.00      0.04      0.08        51
     Business       0.75      0.20      0.32        45
      Economy       0.57      0.86      0.69        99
    Education       1.00      0.18      0.30        28
   Employment       1.00      0.38      0.55        40
Entertainment       0.79      0.53      0.64       117
    Interview       0.44      0.63      0.52        38
   Literature       1.00      0.44      0.62        36
    Migration       1.00      0.28      0.43        18
      Opinion       0.72      0.99      0.83        85
     Politics       0.70      0.94      0.80        98
      Society       1.00      0.16      0.27        45
       Sports       0.40      0.97      0.57       137
   Technology       0.00      0.00      0.00        21
      Tou

[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 2000 out of 2000 | elapsed:    0.6s finished
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
