## Objective
- Learn the keywords of NLP
- Understand basic of NLP
- Explore the Bag of words for basic analysis of text



## Dataset Preparation:

Prepare the Nepali news dataset

In [4]:
import nltk
import pandas as pd
import requests as req
import matplotlib.pyplot as plt
nltk.download('punkt')



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bibek\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
url = 'https://github.com/bibekb11/nepalinewsdataset/raw/main/nepalidataset.xlsx'
xlfile = req.get(url)
xl = pd.read_excel(xlfile.content)
print(xl.head())


                                                text        category
0  राजस्व अनुसन्धान विभागले जयबाबा इँटा उद्योगविर...  अर्थ / वाणिज्य
1  युरोपियन युनियन र चीन सरकार दुवैतिर समस्याको स...  अर्थ / वाणिज्य
2  डेडिकेटेड तथा टंकलाइनबापतको उठ्नुपर्ने २० अर्ब...  अर्थ / वाणिज्य
3  मुलुकबाट गत वर्ष ४ लाख ५२ हजार रुपैयाँ बराबरको...  अर्थ / वाणिज्य
4  सरकारको कसिलो नीति, पूर्वाधार अभाव, कर्जा प्रव...  अर्थ / वाणिज्य


Frequency Analysis

In [6]:

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import FreqDist
import string



In [7]:
#converting text column to list
text = xl['text'].tolist()

#converting list to string
text_var = str(text)

#tokenization
tokens = word_tokenize(text_var)

# Calculate word frequencies
frequency_distribution = FreqDist(tokens)

# Display the most common words and their frequencies
common_words = frequency_distribution.most_common(10)

print(common_words)



[("'", 41), (',', 36), ('।', 30), ('छ', 15), ('र', 12), ('भएको', 11), ('गरेको', 9), ('छ\\xa0।', 7), ('आएको', 6), ('नेपाल', 6)]


## Task  2.2: Filter Stop words

Improve Performance analysis by filtering stop words


In [8]:
stop_words = set(stopwords.words('nepali'))
other_characters = [',', '।', 'छ\\xa0।', '(', ')', 'छन्\\xa0।'] 
final_stopwords = list(stop_words)+list(other_characters)
filtered_tokens = []

for token in tokens:
    if token not in final_stopwords:
        filtered_tokens.append(token)

frequency_distribution = FreqDist(filtered_tokens)
print(frequency_distribution.most_common(10))



[("'", 41), ('आएको', 6), ('नेपाल', 6), ('भएका', 5), ('अटो', 4), ('इन्जिनियरिङ', 4), ('विश्वविद्यालयको', 4), ('सरकार', 3), ('लागेको', 3), ('लाइन', 3)]


## Task 3: BoW

Task 3: BoW: Prepare Bag of Words (BoW) from the dataset


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
# Create a CountVectorizer object
vectorizer = CountVectorizer()
# Fit and transform the text data to create the BoW representation
X_bow = vectorizer.fit_transform(xl['text'])
# Convert the BoW representation to a DataFrame with feature names
bow_df = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())

bow_df.head()


Unnamed: 0,अख,अट,अद,अध,अन,अप,अफ,अब,अभ,अर,...,२०७८,२०७९,२१,२३,३७,५०,५२,७२,७८,९८
0,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,1,1,0,0,0
4,0,4,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


Classification



In [11]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(bow_df, xl['category'], test_size=0.2, random_state=10)

# Create a Naive Bayes classifier
clf = MultinomialNB()
#training classifier
clf.fit(X_train, y_train)
#category prediction
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.4
