In [4]:
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import joblib
from langdetect import detect
import pickle

import warnings
warnings.filterwarnings('ignore')

In [5]:

# Loading Nepali words and numbers
with open('D://newsclassify//backend//base//idf_values//nepali_tfidf_1000.model','rb') as f:
    nepali_base_tfidf = pickle.load(f)

with open('D://newsclassify//backend//base//idf_values//english_tfidf.model','rb') as f:
    english_base_tfidf = pickle.load(f)
#stop_words
stop_words=open("D://newsclassify//backend//base//nepali_words//stopwords.txt","r",encoding="utf-8")
stop_words=stop_words.read()
stop_words=stop_words.split("//n")

#num file
nepali_num=open("D://newsclassify//backend//base//nepali_words//numbers.txt","r",encoding="utf-8")
nepali_num=nepali_num.read()
nepali_num=nepali_num.split(",")

#suffix file
nepali_suffix=open("D://newsclassify//backend//base//nepali_words//suffix.txt","r",encoding="utf-8")
nepali_suffix=nepali_suffix.read()
nepali_suffix=nepali_suffix.split("//n")


In [6]:
class Classify:
    def __init__(self,news,model_name):
        self.news = news
        self.language = detect(self.news)
        #self.model_selection = model_selection
        if self.language != "en":
            self.pre_processed_news = self.pre_process_nepali_news()
            self.model = joblib.load(f"D:\\newsclassify\\backend\\base\\model\\{model_name}_model_nepali.pkl")
        else:
            self.pre_processed_news = self.pre_process_english_news()
            self.model = joblib.load(f"D:\\newsclassify\\backend\\base\\model\\{model_name}_model_english.pkl")
            
        self.category_class = ['business', 'entertainment', 'politics', 'sport', 'tech']
    def pre_process_nepali_news(self):
        news = self.news
        
        #removing \n and \ufeff
        remove=['\n','\ufeff']
        for i in remove:
            news.replace(i,'')
        
        #Remove Stop Words
        word_tokens = news.split(" ")
        filtered_list = [w for w in word_tokens if not w in stop_words]
        
        #Remove Nepali numbers
        num_filter=[]
        for i in range(0,len(filtered_list)):
            for j in range(0,len(nepali_num)):
                if nepali_num[j] in filtered_list[i]:
                    num_filter.append(filtered_list[i])
                    break
        for filter in num_filter:
            filtered_list.remove(filter)
        
        #Remove English numbers
        num=['0','1','2','3','4','5','6','7','8','9']
        num_filter=[]
        for i in range(0,len(filtered_list)):
            for j in range(0,len(num)):
                if num[j] in filtered_list[i]:
                    num_filter.append(filtered_list[i])
                    break
        for filter in num_filter:
            filtered_list.remove(filter)       
        
        #Stemming Manual
        filtered_string =' '.join(filtered_list)
        
        #stemmed_string=' '.join(filtered_list)
        
        return filtered_string
    
    def pre_process_english_news(self):
        news=str(self.news)
        
        #lowercasing
        news=news.lower()
        
        #Remove Stop Words
        stop_words=set(stopwords.words('english'))
        word_tokens = word_tokenize(news)
        filtered_list = [w for w in word_tokens if not w in stop_words]
        
        #Remove numbers and special Symbols
        #words like 100m 2m were not removed so using this
        num=['0','1','2','3','4','5','6','7','8','9']
        num_filter=[]
        for i in range(0,len(filtered_list)):
            for j in range(0,len(num)):
                if num[j] in filtered_list[i]:
                    num_filter.append(filtered_list[i])
                    break
        
        for filter in num_filter:
            filtered_list.remove(filter)
                    
        filtered_list = [w for w in filtered_list if w.isalnum()]
        filtered_list=  [w for w in filtered_list if not w.isdigit()]
        
        #Lematizing
        wordnet_lemmatizer=WordNetLemmatizer()
        lemmatized_list=[wordnet_lemmatizer.lemmatize(w,wordnet.VERB) for w in filtered_list]
        lemmatized_string=' '.join(lemmatized_list)
        
        return lemmatized_string
    
    def predict_news(self):
        
        if self.language != "en":
            tf_idf = nepali_base_tfidf.transform([self.pre_processed_news]).toarray()
        else:
            tf_idf = english_base_tfidf.transform([self.pre_processed_news]).toarray()
        
        confidence = self.model.predict_proba(tf_idf)
        index = np.argmax(confidence)
        confidence = [np.around(x*100,2) for x in confidence][0]
        return(self.category_class[int(index)].upper(),confidence)
            

In [7]:
english_news = "Russia has asked for detailed project proposals from Nepal for 13 different projects, including the construction of railways and roads, that can be constructed and operated with Russian cooperation. Russia’s request for detailed project proposals for various projects with Nepal comes following a meeting between National Assembly Chairman Ganesh Prasad Timilsina and his Russian counterpart Valentina Matvienko, where discussions were held regarding 13 different projects that Russia could undertake. According to a statement issued by NA Chairman Timilsina’s office, the Russian embassy in Nepal, in a letter to Chairman Timilsina on May 4, has asked the government to move forward with various project proposals. The projects include the construction of electric railways, highways, and cancer hospitals for children along with increasing the scholarship quota for Nepali students to study in Moscow, reads a press statement issued by the National Assembly chairman’s office. Timilsina, who was on an official visit to Russia from April 19 to April 23, had held discussions regarding the projects with the Speaker of the Federal Assembly of Russia, the Upper House of Russia, Valentina Matvienko, the Speaker of the Russian Lower House, the Duma, Vyacheslav Volodin, and the Russian Deputy Prime Minister Dmitry Chernyshenko. As per the statement, Russia has asked for a project proposal to construct the East-West Electric Railway and a railway similar to the Moscow Railway in Kathmandu and to build the Pokhara-Ridi road connecting Nepal’s main tourist destination, Pokhara, to nearby tourist spots. “The estimated cost for the construction of the Pokhara-Ridi road from Pokhara to Gulmi via Parbat is nearly Rs4 billion. As per Chairman Timilsina’s proposal, Russia wants to build this road with its own investment,” reads the statement. Russia has also shown interest in the construction of two cancer hospitals for children in Kathmandu and Pokhara. “Russia has proposed to provide Russian doctors at the initial stage in the hospital and later handover the operations to Nepali doctors,” the statement reads. The statement also mentioned that Russia is trying to provide a scholarship quota to 150 to 200 students annually to study medicine and engineering. In the discussion list with the Nepal side, it has also enlisted some other points including “resumption of direct flights between Russia and Nepal, which were interrupted in 2002, delivery of Russian helicopters that have proven themselves in the mountainous condition of Nepal.” As per the statement, Timilsina has urged government leaders, including Prime Minister Pushpa Kamal Dahal, Finance Minister Prakash Sharan Mahat, Minister for Agriculture Beduram Bhusal, Minister for Physical Infrastructure and Transport Prakash Jwala, Minister for Education Ashok Rai, Minister for Health Mohan Bahadur Basnet and Minister for Youth and Sports Dik Bahadur Limbu, to prepare proposals for the implementation of the projects. "

In [8]:
models = ['KNN','MNB','SVM']
for model in models:
    classification,confidence = Classify(english_news,model).predict_news()
    print(f"-----{model}-----")
    print(f"Clasification is {classification}")
    print(f"Confidence is {confidence}")

-----KNN-----
Clasification is BUSINESS
Confidence is [85.71  0.   14.29  0.    0.  ]
-----MNB-----
Clasification is BUSINESS
Confidence is [52.69  4.42 31.52  3.34  8.03]
-----SVM-----
Clasification is POLITICS
Confidence is [34.87  0.64 63.23  0.25  1.01]


In [9]:
nepali_news = "काठमाडौं (हाम्रो खेलकुद) – नेपालले आइसिसी एकदिवसीय विश्वकप छनोट अघि दक्षिण अफ्रिकामा स्कटल्यान्ड र नेदरल्यान्ड्ससँग अभ्यास खेल खेल्ने भएको छ । नेपाल क्रिकेट संघ (क्यान) का कार्यवाहक सचिव दुर्गाराज पाठकले स्कटल्यान्ड र नेदरल्यान्ड्ससँग अभ्यास खेल खेल्ने बताएका छन् । ‘अभ्यास खेलको लागि दक्षिण अफ्रिकासँग कुरा गरेका छौं । हामीले एउटा ए टिम दिनपर्यो भनेका छौं । वहाँहरु सकरात्मक नै हुनुन्छ । त्यो नभए पनि स्कटल्यान्ड र नेदरल्यान्ड्ससँग वार्मअप खेल्ने कुरा भएको छ’, पाठकले हाम्रो खेलकुदसँग भने । ओडिआई हुँदा खर्च बढ्ने हुँदा अभ्यास खेल हुने पाठकले बताए । ‘ओडिआई चाहिँ नहुने भयो । ओडिआई हुँदा खर्च बढ्ने रहेछ । दक्षिण अफ्रिकामा नेपालको बसाई १० देखि १२ दिनको हुनेछ’, उनले भने । १० टोली सहभागी विश्वकप छनोट आगमी जुन १८ देखि जुलाई ९ सम्म जिम्बावेमा हुनेछ । नेपाल मे अन्तिममा जिम्बावे जानेछ । विश्वकप छनोटमा नेपाल, स्कटल्यान्ड, ओमान, युएई, अमेरिका, श्रीलंका, वेस्टइन्डिज, जिम्बावे, आयरल्यान्ड र नेदरल्यान्ड्स छन् । यी १० टोलीलाई आइसिसीले दुई समूह विभाजन गर्नेछ। हालसम्म आइसिसीले समूह विभाजन गरेको छैन । श्रीलंका र वेस्टन्डिज फरक समूहमा हुने पक्कापक्की छ। चाँडैनै समूह विभाजन गर्नेछ। हरेक समूहमा पाँच टोली हुनेछन् । हरेक समूहबाट शीर्ष तीन टोली सुपर सिक्समा स्थान बनाउनेछन् । सुपर सिक्समा हरेक टोलीले एक अर्कासँग खेल्नेछन् । जसबाट शीर्ष दुई टोली फाइनलमा पुग्नेछन् । फाइनलमा पुग्ने दुई टोलीले नै भारतमा हुने एकदिवसीय विश्वकप खेल्न पाउनेछन् ।"

In [10]:
models = ['KNN','MNB','SVM']
for model in models:
    classification,confidence = Classify(nepali_news,model).predict_news()
    print(f"-----{model}-----")
    print(f"Clasification is {classification}")
    print(f"Confidence is {confidence}")

-----KNN-----
Clasification is SPORT
Confidence is [15.38  7.69  0.   53.85 23.08]
-----MNB-----
Clasification is SPORT
Confidence is [14.27 15.68  0.58 59.45 10.01]
-----SVM-----
Clasification is SPORT
Confidence is [ 3.55  6.58  0.29 86.68  2.9 ]


In [11]:
from langdetect import detect
import math
from langdetect import detect
import math

# Loading Nepali words and numbers

class Summarize:
    def __init__(self,news):
        self.news = news
        self.language=detect(self.news)
        self.summarizeSentence = []
        for t in self.news.split():
            if t not in self.summarizeSentence:
                self.summarizeSentence.append(t)
    
    def calc_idf(self):
        doc_count=len(self.news)
        df={}
        idf={}
        for char in self.summarizeSentence:
            df[char]=0
            idf[char]=0
        #Calculating df
        for i in range(0,len(self.summarizeSentence)):
            for j in range(0,len(self.news)):
                if self.summarizeSentence[i] in self.news[j]:
                    df[self.summarizeSentence[i]]+=1
        #Calculating idf
        for char in self.summarizeSentence:
            idf[char]=math.log((doc_count+1)/(1+df[char]))+1
        return(idf)
    
    def calc_tf_idf(self,sentence):
        idf=self.calc_idf()
        
        tf_idf={}
        word_count={}
    
        for ch in self.summarizeSentence:
            tf_idf[ch]=0
            word_count[ch]=0
        #Calculating tf
        words = sentence.split()
        for ch in words:
            if ch in self.summarizeSentence:
                if ch in word_count:
                    word_count[ch] += 1
                else:
                    word_count[ch] = 1
    
        rough_tfidf=list(self.summarizeSentence)
        for keys in word_count.keys():
            tf_idf[keys]=idf[keys]*word_count[keys]
            if keys in rough_tfidf:
                index=rough_tfidf.index(keys)
                rough_tfidf[index]=tf_idf[keys]
        norm=0
        for i in range(0,len(rough_tfidf)):
            norm+=rough_tfidf[i]**2
        if norm==0:
            norm=1
        for i in range(0,len(rough_tfidf)):
            rough_tfidf[i]=round(rough_tfidf[i]/math.sqrt(norm),8)
        return rough_tfidf
    
    def count_sentence_eng(self):
        cnt=self.news.count(".")+1
        return cnt
    def count_sentence_nep(self):
        cnt=self.news.count("।")+1
        return cnt
    
    def summarize_in_sentence_number(self,number):
        paragraph = self.news
        number = number + 1
        tf_idf_each_sentence = {}
        if self.language=="en":
            each_sentence=paragraph.split(".")
            if "" in each_sentence:
                each_sentence.remove("")
            sentence_count=self.count_sentence_eng()
        else:
            #paragraph=paragraph.replace("।","|")
            each_sentence=paragraph.split("।")
            if "" in each_sentence:
                each_sentence.remove("")
            sentence_count=self.count_sentence_nep()
            
        
        if  (number>sentence_count):
            return("ERROR: Summarization line exceeds total sentence count")
        
        elif (number == 0):
            return("ERROR: Chosen Zero")
        
        else:
            summarized_indexes = {}
            for index in range(len(each_sentence)):
                tf_idf = sum(self.calc_tf_idf(each_sentence[index]))
                each_sentence[index] = f" ({index+1}) " + each_sentence[index]
                tf_idf_each_sentence[index + 1] = tf_idf
                summarized_indexes[tf_idf] = index
                
            sorted_summarized_indexs = sorted(summarized_indexes.items())[-number:]
            sorted_summarized_indexs = sorted([(t[1], t[0]) for t in sorted_summarized_indexs])
            sorted_summarized_indexs = [x[0] for x in sorted_summarized_indexs]
            summarized = []
            
            for index in sorted_summarized_indexs:
                summarized.append(each_sentence[index]) 
            summarized_str = str()
            if self.language == 'en':
                for summarized_sentence in summarized:
                    summarized_str += summarized_sentence
                    summarized_str += '. '
            else:
                for summarized_sentence in summarized:
                    summarized_str += summarized_sentence
                    summarized_str += ' | '
                    
            return summarized_str,tf_idf_each_sentence
                

In [12]:
print(Summarize(english_news).count_sentence_eng())
Summarize(english_news).summarize_in_sentence_number(3)

14


(' (2)  Russia’s request for detailed project proposals for various projects with Nepal comes following a meeting between National Assembly Chairman Ganesh Prasad Timilsina and his Russian counterpart Valentina Matvienko, where discussions were held regarding 13 different projects that Russia could undertake.  (4)  The projects include the construction of electric railways, highways, and cancer hospitals for children along with increasing the scholarship quota for Nepali students to study in Moscow, reads a press statement issued by the National Assembly chairman’s office.  (12)  In the discussion list with the Nepal side, it has also enlisted some other points including “resumption of direct flights between Russia and Nepal, which were interrupted in 2002, delivery of Russian helicopters that have proven themselves in the mountainous condition of Nepal.  (13) ” As per the statement, Timilsina has urged government leaders, including Prime Minister Pushpa Kamal Dahal, Finance Minister P

In [13]:
print(Summarize(nepali_news).count_sentence_nep())
Summarize(nepali_news).summarize_in_sentence_number(3)

23


(' (1) काठमाडौं (हाम्रो खेलकुद) – नेपालले आइसिसी एकदिवसीय विश्वकप छनोट अघि दक्षिण अफ्रिकामा स्कटल्यान्ड र नेदरल्यान्ड्ससँग अभ्यास खेल खेल्ने भएको छ  |  (2)  नेपाल क्रिकेट संघ (क्यान) का कार्यवाहक सचिव दुर्गाराज पाठकले स्कटल्यान्ड र नेदरल्यान्ड्ससँग अभ्यास खेल खेल्ने बताएका छन्  |  (6)  त्यो नभए पनि स्कटल्यान्ड र नेदरल्यान्ड्ससँग वार्मअप खेल्ने कुरा भएको छ’, पाठकले हाम्रो खेलकुदसँग भने  |  (11)  १० टोली सहभागी विश्वकप छनोट आगमी जुन १८ देखि जुलाई ९ सम्म जिम्बावेमा हुनेछ  | ',
 {1: 4.43183725,
  2: 4.097002079999998,
  3: 2.82842712,
  4: 2.62961873,
  5: 2.0,
  6: 3.8452514899999994,
  7: 2.8867513100000006,
  8: 2.0,
  9: 2.236068,
  10: 3.316624740000001,
  11: 3.7407587200000014,
  12: 2.236068,
  13: 3.7129888699999984,
  14: 2.6457512899999998,
  15: 2.44948974,
  16: 2.76887702,
  17: 1.73205081,
  18: 2.236068,
  19: 2.9999999699999993,
  20: 2.6457512899999998,
  21: 2.44948974,
  22: 3.316624740000001})