In [55]:
import numpy as np
import pandas as pd
import math

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import os

<h1>Loading Dataset</h1>

In [56]:
import os

category_class=sorted(['business', 'entertainment', 'politics', 'sport', 'tech'])
df=pd.DataFrame(columns=['News','Category'])

#Temp array
temp=[]

#Convert texts file to dataframe
for i in range(0,len(category_class)):
    path=f"C://Users//Ghost//Desktop//newsclassify//backend//training//Nepali//train//{category_class[i]}"
    os.chdir(path)

    for file in os.listdir():
        # Check whether file is in text format or not
        if file.endswith(".txt"):
            file_path = f"{path}//{file}"
            
            #Exception Handling UnicodeDecode Error
            try:
                with open(file_path, 'r',encoding='utf-8') as f:
                    temp.append(f.read())
            except UnicodeDecodeError:
                pass
        
    
    df_copy=pd.DataFrame({"News": [x for x in temp],
                         "Category":f'{category_class[i]}'})
    df=pd.concat([df, df_copy], axis=0)
    temp=[]

In [57]:
df = df.sample(frac=1).reset_index(drop=True)

In [58]:
df.head()

Unnamed: 0,News,Category
0,\n﻿राजविराज । प्रतिभाशाली मोडेल अशोक आर्यन याद...,entertainment
1,﻿कांग्रेस भेलामा बोल्न सिण्डिकेट लगाएको भन्दै ...,politics
2,\n﻿भूकम्पको कारण देखाउँदै सरकारले आगामी साउनदे...,tech
3,\n﻿हवाई क्षेत्रको नियामक निकाय नेपाल नागरिक उड...,entertainment
4,﻿\n﻿‘ सरकारले साना व्यवसायीलाई मात्र कारवाही ...,business


<h2>PreProcessing</h2>

In [59]:
df_clean=df
df_clean.groupby('Category').Category.count()

Category
business         188
entertainment    544
politics         540
sport            647
tech             104
Name: Category, dtype: int64

### Balancing

In [60]:
## For number greater than 500 dropping the news
for i in range(len(category_class)):
    total_value=df_clean[df_clean['Category']==category_class[i]].shape[0]
    if total_value > 500:
       number_to_drop= total_value - 250
       drop_random=np.random.choice(df_clean[df_clean['Category']==category_class[i]].index,size=number_to_drop,replace=False)
       df_clean=df_clean.drop(drop_random)

In [61]:
df=df_clean
df = df.sample(frac=1).reset_index(drop=True)
df.groupby('Category').Category.count()

Category
business         188
entertainment    250
politics         250
sport            250
tech             104
Name: Category, dtype: int64

<h3>Maping</h3>

In [62]:
m={'business':0, 'entertainment':1, 'politics':2, 'sport':3, 'tech':4}

In [63]:
df['Category_Id']=df['Category'].map(m)

<h3>Visualize</h3>

In [64]:
df["News"].sample(5)
print(df["News"][0])
df["News"][0].replace('\n','')


﻿वर्तमान सूचना प्रविधिको युगमा इन्टरनेट मानिसहरुको दैनिकीको अभिन्न पाटो बनिरहेको छ । यद्यपि विश्वका प्रयोगकर्ताहरुले समान गतिको इन्टरनेट प्राप्त गर्न सकेका छैनन् । कतिपय देशमा निकै तिव्र गतिको इन्टरनेट सेवा प्राप्त छ भने कतिपय देशमा निकै झूर । नेटवर्क ट्रयाकर अकामाइले हालै सार्वजनिक गरेको विश्वका देशहरुको इन्टरनेट गतिको अवस्था विषयको एक प्रतिवेदन दी स्टेट अफ इन्टरनेट अनुसार विश्वका सर्वाधिक तिव्र गतिको इन्टरनेट भएका देशहरु यी हुन्-

१. दक्षिण कोरिया- औषति कनेक्सन स्पीड २२.२ एमबीपीएस सहित दक्षिण कोरिया विश्वको सबैभन्दा तिव्रगतिको इन्टरनेट सेवा रहेको देशको सूचिमा पहिलो स्थानमा लगातार रहन सफल भएको छ । यद्यपि पछिल्लो त्रैमासिकमा देशको औषत इन्टरनेट गतिमा १२ प्रतिशतको गिरावट आएको रिपोर्टले देखाएको छ । यद्यपि अघिल्लो वर्ष भन्दा यो वर्ष त्यहाँको इन्टरनेट स्पीड १.६ प्रतिशतले सुधार भएको छ ।

२.हङकङ- तिव्र इन्टरनेट रहेको देशका रुपमा हंगकंग दोश्रो स्थानमा रहेको छ । त्यहाँको औषत इन्टरनेट स्पीड १६.८ एमबीपिएस रहेकेा छ । वर्षेनी इन्टरनेट स्पीडमा ३७ प्रतिशतले सुधार भएको समेत रिपोर्टले देखाएको छ ।

३. 

'\ufeffवर्तमान सूचना प्रविधिको युगमा इन्टरनेट मानिसहरुको दैनिकीको अभिन्न पाटो बनिरहेको छ । यद्यपि विश्वका प्रयोगकर्ताहरुले समान गतिको इन्टरनेट प्राप्त गर्न सकेका छैनन् । कतिपय देशमा निकै तिव्र गतिको इन्टरनेट सेवा प्राप्त छ भने कतिपय देशमा निकै झूर । नेटवर्क ट्रयाकर अकामाइले हालै सार्वजनिक गरेको विश्वका देशहरुको इन्टरनेट गतिको अवस्था विषयको एक प्रतिवेदन दी स्टेट अफ इन्टरनेट अनुसार विश्वका सर्वाधिक तिव्र गतिको इन्टरनेट भएका देशहरु यी हुन्-१. दक्षिण कोरिया- औषति कनेक्सन स्पीड २२.२ एमबीपीएस सहित दक्षिण कोरिया विश्वको सबैभन्दा तिव्रगतिको इन्टरनेट सेवा रहेको देशको सूचिमा पहिलो स्थानमा लगातार रहन सफल भएको छ । यद्यपि पछिल्लो त्रैमासिकमा देशको औषत इन्टरनेट गतिमा १२ प्रतिशतको गिरावट आएको रिपोर्टले देखाएको छ । यद्यपि अघिल्लो वर्ष भन्दा यो वर्ष त्यहाँको इन्टरनेट स्पीड १.६ प्रतिशतले सुधार भएको छ ।२.हङकङ- तिव्र इन्टरनेट रहेको देशका रुपमा हंगकंग दोश्रो स्थानमा रहेको छ । त्यहाँको औषत इन्टरनेट स्पीड १६.८ एमबीपिएस रहेकेा छ । वर्षेनी इन्टरनेट स्पीडमा ३७ प्रतिशतले सुधार भएको समेत रिपोर्टले देखाएको छ ।३. ज

In [65]:
stop_words_file=open("stopwords.txt","r",encoding="utf-8")
stop_words=stop_words_file.read()
stop_words=stop_words.split("\n")

In [66]:
nepali_num_file=open("numbers.txt","r",encoding="utf-8")
nepali_num=nepali_num_file.read()
nepali_num=nepali_num.split(",")

In [67]:
nepali_suffix_file=open("suffix.txt","r",encoding="utf-8")
nepali_suffix=nepali_suffix_file.read()
nepali_suffix=nepali_suffix.split("\n")

In [68]:
def count_word(word):
    cnt=word.count(" ")+1
    return cnt

def get_first_five_hundred_words(text):
    count=count_word(text)-1
    new_text=""
    for i in range(count):
        if i==1000:
            break
        else:
            new_text += " "
            new_text += text.split(" ")[i]
    return new_text
    

get_first_five_hundred_words("hey my name")
    

' hey my'

<h3>PreProcessing Text</h3>

In [69]:
from Nepali_nlp import Tokenizer
def ProcessText(text):
    text=str(text)
    text=get_first_five_hundred_words(text)
    
    #removing \n and \ufeff
    remove=['\n','\ufeff']
    for i in remove:
        text.replace(i,'')
    
    #read stop words
    #Remove Stop Words
    word_tokens = Tokenizer().word_tokenize(text)
    filtered_list = [w for w in word_tokens if not w in stop_words]
    
    #Remove Nepali numbers
    num_filter=[]
    for i in range(0,len(filtered_list)):
        for j in range(0,len(nepali_num)):
            if nepali_num[j] in filtered_list[i]:
                num_filter.append(filtered_list[i])
                break
    for filter in num_filter:
        filtered_list.remove(filter)
    
    #Remove English numbers
    num=['0','1','2','3','4','5','6','7','8','9']
    num_filter=[]
    for i in range(0,len(filtered_list)):
        for j in range(0,len(num)):
            if num[j] in filtered_list[i]:
                num_filter.append(filtered_list[i])
                break
    for filter in num_filter:
        filtered_list.remove(filter)       
    
    #Stemming Manual
    
    suffix_filter=filtered_list
    '''
    for i in range(len(nepali_suffix)):
        for j in range(len(suffix_filter)):
            if nepali_suffix[i] in suffix_filter[j]:
                suffix_removed=suffix_filter[j].split(nepali_suffix[i])[0]
                suffix_filter.remove(suffix_filter[j])
                suffix_filter.insert(j,suffix_removed)
    '''
    stemmed_string=' '.join(suffix_filter)
    
    #stemmed_string=' '.join(filtered_list)
    
    return stemmed_string

In [70]:
print(ProcessText("\n \ufeff एन्फाको २ ३अध्यक्ष पदमा पराजित उम्मेदवार कर्मा छिरिङ शेर्पाले साधारणसभामा आर्थिक चलखेल भएको आरोप लगाएका छन् । यद्यपी चुनावी परिणाम भने स्वीकार गर्ने उनले बताए ।"))

 एन्फाको पदमा पराजित उम्मेदवार कर्मा छिरिङ शेर्पाले साधारणसभामा आर्थिक चलखेल आरोप लगाएका यद्यपी चुनावी परिणाम स्वीकार


<h3>Analyzing Processed Text</h3>

In [71]:
def rand():
    return np.random.binomial(n=1,p=0.01,size=[1])
def count_word(word):
    cnt=word.count(" ")+1
    return cnt
count=0
for i in range(0,len(df)):
    if rand():
        txt=df['News'][i]
        processed_txt=ProcessText(df['News'][i])
        
        print(txt+'\n \n'+processed_txt)
        print(f"\nMain Text Word Count : {count_word(txt)}\nProcessed Text Word Count : {count_word( processed_txt)}")
        count+=1
    if count==1:
        break
    


﻿दुःख जीवनको जग हो । दुःख सुखको खुड्किलो हो । त्यही जगमा उभिएको ठडिएको जीवनको सिँढी बनाउँछ । बिस्तारै उक्लिए त्यही सिँढीले शिखरमा पुु्र्याउँछ, हतार गरे चिप्लिएर जमिनमा ।
जीवनको यही शृंखला बुझ्न सीताराम (कट्टेल)लाई तीन दशक लाग्यो । बिस्तारै उक्लिएर शिखरमा पुग्दा उनले आफू चढेका त्यस्ता अनेकन् खुड्किला देखे, जहाँ दुःख र सुखका अनगिन्ती डोब थिए ।
क्यानभासमा रंगिन चित्र कोर्ने र महान् चित्रकार बन्ने सपना सजाएका सीतारामको जीवनलय अहिले कलाकारिताले डोर्याइरहेको छ । कलाकारितामात्रै होइन, समाजसेवाको माध्यमबाट परिवर्तनको दियालो जगाउँदैछन् । र, आलोकित तुल्याउँदैछन् भूकम्पबाट खन्डहर बनेका वस्ती । जीवनसंगिनी कुञ्जना (घिमिरे)ले हरेक कोणबाट टेको दिइरहेकी छन् ।

बुधबार मध्यबानेश्वरस्थित आफ्नै कार्यालयमा बाह्रखरीसँग गफिँदा सीताराम असिन–पसिन थिए । भन्दै थिए, “यसरी बोल्दा लगलग काँप्छु म त ।”
० ० ०
२०४० सालमा पूर्वी नेपालको सोलखुम्बुस्थित नेचा बेतघारीमा जन्मिएका सीतारामको बाल्यकाल निकै दुःखमा बित्यो । जेठो सन्तान भएकाले उनीमाथि परिवारको आशा र भरोसा त छँदै थियो– त्यसमाथि पनि स–साना भाइबहिनी हुर्काउनु पर्ने 

<h3>Apply Process Text in our column </h3>

In [72]:
df['News']=df['News'].apply(ProcessText)

<h3>Transforming Text to Vectors for MultiClassification </h3>

In [73]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,ngram_range=(1,1))
features = tfidf.fit_transform(df['News']).toarray()
features.shape

(1042, 1611)

<h3>Displaying highly correlated words in categories </h3>

In [74]:
N = 5
category_id_df = df[['Category', 'Category_Id']].drop_duplicates()
category_id=dict(category_id_df.values)
labels=df['Category_Id']

for Category, category_id in sorted(category_id.items()):
  features_chi2 = chi2(features, labels == category_id)
  indices = np.argsort(features_chi2[0])
  feature_names = np.array(tfidf.get_feature_names_out())[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  print(f"Category : {Category}")
  print(f"    Most Correlated Unigrams  :  {(', '.join(unigrams[-N:]))} \n")

Category : business
    Most Correlated Unigrams  :  थम, समस, चलनचल, जलव, यवस 

Category : entertainment
    Most Correlated Unigrams  :  शकक, कव, नय, कथ, चलच 

Category : politics
    Most Correlated Unigrams  :  मण, ईएक, दव, जव, सप 

Category : sport
    Most Correlated Unigrams  :  इकर, इङ, वकप, रनक, टबल 

Category : tech
    Most Correlated Unigrams  :  इलम, टफ, आइफ, गकर, एप 



In [75]:
df.head()

Unnamed: 0,News,Category,Category_Id
0,﻿वर्तमान सूचना प्रविधिको युगमा इन्टरनेट मानिसह...,tech,4
1,﻿फिल्म ‘गाँठो’को ‘थिम सङ’ सार्वजनिक गरिएको शनि...,entertainment,1
2,﻿जिल्ला सभापतिहरुको भेलामा देउवाको चर्को आलोचन...,politics,2
3,﻿वीरगञ्ज माघ संघिय समाजवादी फोरम नेपाल पर्साको...,politics,2
4,﻿विश्वकप टिकटको मूल्य यसपटक विश्वकप फुटबल प्रत...,sport,3


<h3>Train Test Split</h3>

In [76]:
x=features
y=df.loc[:,'Category_Id'].values
train_x,test_x,train_y,test_y=train_test_split(x,y,test_size=0.2,random_state=100)

<h2>Hyper Tuning</h2>

In [77]:
svc=SVC()
knn=KNeighborsClassifier()
dtc=DecisionTreeClassifier()
mb=MultinomialNB()
rtc=RandomForestClassifier()

In [78]:
models={'svc':svc,'knn':knn,'dtc':dtc,'mb':mb,'rtc':rtc}

In [79]:
params={
    'knn': 
        {'n_neighbors':[3,5,7,9,11,13,15],
        'metric':['cosine','euclidean','manhattan'],
        'weights':['uniform','distance']},
    
    'svc': {'C':[0.1,1,10,100], 
            'gamma':[1,0.1,0.01,0.01], 
            'kernel':['rbf','linear']},
    
    'dtc':{
        'criterion':['gini','entropy'],
        'max_depth':[2,4,6,8,10,12]
    },
    
    'mb':{
        'alpha': [1.0,2.0],
    'fit_prior': [True]
    },
    'rtc':{
        'criterion':['gini','entropy'],
        'max_depth':[2,4,6,8,10,12]
    }
}

In [80]:
model_accuracy={}
score=0.0001
for model in models.keys():
    mod = GridSearchCV(
    models[model],
    params[model],
    verbose=0, #Progress bar showing
    cv=5, #cross validation
    n_jobs=-1, #cores to assign
)
    gridsearch_result=mod.fit(train_x,train_y)
    
    #selects best model
    if(score < float(gridsearch_result.score(test_x,test_y))):
            score=gridsearch_result.score(test_x,test_y)
            best_model=gridsearch_result
            
    predict=mod.predict(test_x)
    print(f"{model} : ",gridsearch_result.best_estimator_)
    if model not in model_accuracy.keys():
        model_accuracy.update({model:accuracy_score(test_y,predict)})

svc :  SVC(C=1, gamma=1, kernel='linear')
knn :  KNeighborsClassifier(metric='cosine', n_neighbors=3, weights='distance')
dtc :  DecisionTreeClassifier(max_depth=10)
mb :  MultinomialNB()
rtc :  RandomForestClassifier(criterion='entropy', max_depth=12)


In [81]:
model_accuracy

{'svc': 0.8373205741626795,
 'knn': 0.784688995215311,
 'dtc': 0.5789473684210527,
 'mb': 0.7320574162679426,
 'rtc': 0.722488038277512}

In [82]:
print(best_model.best_estimator_)
best_model.best_score_

SVC(C=1, gamma=1, kernel='linear')


0.8595916600533873

In [83]:
test_predict = best_model.predict(test_x)
train_accuracy = round(best_model.score(train_x,train_y)*100)
test_accuracy =round(accuracy_score(test_predict, test_y)*100)

print(f"Train Accuracy Score : {train_accuracy}")
print(f"Test Accuracy Score  : {test_accuracy}")
print()
print(classification_report(test_predict, test_y, target_names=category_class))

Train Accuracy Score : 98
Test Accuracy Score  : 84

               precision    recall  f1-score   support

     business       0.94      0.94      0.94        33
entertainment       0.82      0.76      0.79        62
     politics       0.87      0.89      0.88        44
        sport       0.84      0.80      0.82        54
         tech       0.65      0.94      0.77        16

     accuracy                           0.84       209
    macro avg       0.83      0.86      0.84       209
 weighted avg       0.84      0.84      0.84       209



<h3>Predicting Text by Converting Text to Rough Tf-IDF</h3>


<h3>IDF Calculation</h3>

In [84]:

def calc_idf(Text_column):
    col=tfidf.get_feature_names_out()
    doc_count=len(Text_column)
    df={}
    idf={}
    for char in col:
        df[char]=0
        idf[char]=0
    #Calculating df
    for i in range(0,len(col)):
        for j in range(0,len(Text_column)):
            if col[i] in Text_column[j]:
                df[col[i]]+=1
    #Calculating idf
    for char in col:
        idf[char]=math.log((doc_count+1)/(1+df[char]))+1
    return(idf)

idf=calc_idf(df['News'])
len(idf)


1611

<h3>Above cell takes time to run so saving idf values as text For using Flask Operation</h3>

In [85]:
'''
import ast
with open('idf.txt', 'w') as f:
    print(idf, file=f)
with open('idf.txt') as f:
     idf_load=f.read()
idf_load=ast.literal_eval(idf_load) '''

"\nimport ast\nwith open('idf.txt', 'w') as f:\n    print(idf, file=f)\nwith open('idf.txt') as f:\n     idf_load=f.read()\nidf_load=ast.literal_eval(idf_load) "

<h3>TF-IDF Calculation</h3>

In [86]:

def calc_tf_idf(txt,idf):
    idf=idf
    
    tf_idf={}
    word_count={}
    col=tfidf.get_feature_names_out()
    for ch in col:
        tf_idf[ch]=0
        word_count[ch]=0
    #Calculating tf
    words = txt.split()
    for ch in words:
        if ch in col:
            if ch in word_count:
                word_count[ch] += 1
            else:
                word_count[ch] = 1
   
    rough_tfidf=list(col)
    for keys in word_count.keys():
        tf_idf[keys]=idf[keys]*word_count[keys]
        if keys in rough_tfidf:
            index=rough_tfidf.index(keys)
            rough_tfidf[index]=tf_idf[keys]
    norm=0
    for i in range(0,len(rough_tfidf)):
        norm+=rough_tfidf[i]**2
    if norm==0:
        norm=1
    for i in range(0,len(rough_tfidf)):
        rough_tfidf[i]=round(rough_tfidf[i]/math.sqrt(norm),8)
    return rough_tfidf


In [87]:
#For Visualizing
arr=calc_tf_idf(" अक अक अक अख अख ",idf)
print(tfidf.get_feature_names_out()[0:5])
print(arr[0:5])    

['अक' 'अख' 'अग' 'अघ' 'अच']
[0.80779066, 0.58946946, 0.0, 0.0, 0.0]


In [88]:
def Predict_text(txt):
    #process text
    text=ProcessText(txt)
    #calculate tfidf
    tf_idf=calc_tf_idf(txt,idf)
    #Validate if it is actually news
    if all(i > 0.05 for i in tf_idf) or count_word(txt)<0:
        print("Cannot classify")
    else:
        #Predicting
        index=best_model.predict([tf_idf])
        print(category_class[int(index)])

In [89]:
if 'समारोहमा' == 'समारोहमा':
    print('True')

True


In [90]:
correct=0
for i in range(len(test_x)):
    if best_model.predict([test_x[i]])==test_y[i]:
        correct+=1
print(correct/len(test_x)*100)

83.73205741626795


In [91]:
test_y

array([0, 3, 1, 0, 2, 2, 0, 2, 1, 0, 3, 1, 0, 1, 3, 0, 2, 0, 3, 3, 1, 4,
       1, 2, 4, 1, 2, 1, 3, 3, 1, 0, 3, 1, 4, 4, 2, 1, 3, 1, 4, 0, 1, 4,
       4, 4, 4, 2, 3, 4, 1, 1, 3, 3, 1, 3, 0, 3, 4, 4, 1, 1, 4, 2, 4, 3,
       2, 1, 1, 2, 2, 2, 0, 1, 2, 2, 3, 3, 2, 1, 4, 0, 0, 2, 1, 1, 2, 1,
       2, 3, 2, 1, 1, 0, 1, 2, 3, 0, 3, 3, 1, 0, 2, 1, 2, 3, 0, 2, 3, 1,
       4, 3, 2, 1, 1, 1, 3, 1, 3, 0, 0, 0, 2, 4, 2, 0, 2, 1, 3, 0, 1, 4,
       3, 2, 1, 2, 4, 0, 2, 2, 0, 1, 3, 3, 0, 1, 1, 4, 2, 0, 3, 2, 1, 2,
       1, 2, 1, 3, 3, 3, 2, 2, 1, 4, 1, 3, 0, 1, 2, 3, 3, 1, 1, 1, 2, 3,
       3, 4, 2, 1, 3, 3, 2, 3, 0, 0, 1, 3, 0, 1, 3, 0, 0, 3, 3, 3, 4, 2,
       3, 1, 1, 0, 3, 3, 1, 1, 2, 3, 2], dtype=int64)

In [92]:
Predict_text("नेकपा एमाले र माओवादी केन्द्रको वाम गठबन्धनमा राष्ट्रिय जनमोर्चा पनि सामेल भएको छ ।")

sport


In [93]:
Predict_text("eu referendum question unveiled the question to be asked in the referendum on the eu constitution has been unveiled by the government.  it will be:  should the united kingdom approve the treaty establishing a constitution for the european union   the constitution will be incorporated into uk law if there is a yes vote in the referendum  expected in 2006. critics say the constitution is a further step towards a federal europe  but advocates say it ensures effective operation of the enlarged 25-state eu.  if we reject this treaty  britain will be isolated and weak in europe   said foreign secretary jack straw  who along with the rest of the cabinet  will back a  yes  vote.  patriots by definition wanted the uk to be prosperous at home and strong and influential abroad  mr straw said.  our role as a leading member of the eu is a crucial part of securing that.   conservative shadow foreign secretary michael ancram said the referendum question  seems straightforward . but he accused the government of trying to confuse the issue by putting the eu referendum question in the same bill as the ratification of the constitution  when they should be treated as  two separate issues . despite this  underhand trick   the referendum bill stood  no chance of becoming law before the election   he added.  this is tony blair s cheap gesture to the pro-constitution lobby while he runs scared of a debate on europe he knows he cannot win.  neil o brien  director of anti-constitution group vote no  said:  the reality is that the government doesn t want to discuss the eu constitution ahead of the election because they know it is extremely unpopular with voters and with business.   the uk independence party said:  if the government believes that a no vote would mean that we should leave the european union  they should just ask us if we want to leave the eu. then we can be out of it and better off much sooner.   liberal democrat leader charles kennedy  who backs the constitution  said he expected the referendum would come in the first half of next year. he told bbc radio 4 s today programme:  the sooner we get on with this  the better.  he said the question sounded  very neutral  and  balanced   adding it would enable the argument  to be enjoined fairly and squarely on both sides . green mep caroline lucas welcomed tony blair s  courage in keeping his word  on holding a referendum. but she added:  this treaty is a flawed document that will make the eu less accountable  less sustainable  and less just.   mr blair signed the constitution at a ceremony in rome in november  but had already made it clear the issue would be put to voters in a referendum. that promise came after sustained pressure from opposition parties. jack straw  who argues the constitution reflected a  british vision for europe  and gives  national governments a stronger grip   has said the referendum could be held in spring 2006. but in an interview with the financial times  mr blair refused to be pinned down to that date  saying britain would hold a poll  some time in 2006 but when  i don t know . the paper said the prime minister  claimed ignorance  of when other countries were planning to hold their referendums.")

sport


In [94]:
Predict_text("french consumer spending rising french consumers increased their spending by 1.5% in january  a figure which bodes well for the country s economic growth  figures revealed.  the national statistic institute (insee) added that consumer spending in january rose 3.8% on a year-on-year basis. rising sales of household equipment were behind the increase. the insee also said that french consumer prices fell 0.6% in january  but were up 1.6% on an annual basis.  despite the general increase in spending in january  french households bought fewer cars in january. according to the insee  car sales fell 2.8% in january  following a fall of 0.6% in december. but on a year-on-year basis  the sector still saw a sales increase of 6.5%. consumer spending fuelled france s economic growth in the last quarter of 2004 and analysts expect that it will continue to support the economy.  it s a growth that will remain fragile and vulnerable to risks like a strong rise in long-term interest rates  tension in the oil price   emmanuel ferry  from exane bnp paribas told reuters news agency.  meanwhile in italy  consumer confidence rose to its highest level since october 2004. economic research group isae has said that italian consumer confidence rose to 104.4 from 103.3  despite a slight deterioration in short-term sentiment.")

sport


In [95]:
best_model.predict(test_x)

array([0, 3, 1, 0, 2, 2, 0, 2, 3, 3, 3, 2, 0, 1, 3, 0, 1, 0, 3, 3, 1, 4,
       1, 2, 4, 1, 3, 1, 3, 3, 3, 4, 3, 1, 4, 4, 2, 1, 1, 2, 4, 0, 1, 4,
       3, 4, 4, 2, 3, 1, 1, 1, 3, 3, 2, 1, 0, 3, 4, 1, 1, 1, 4, 2, 0, 3,
       2, 3, 1, 2, 2, 2, 0, 1, 1, 2, 1, 3, 3, 1, 1, 0, 0, 2, 1, 3, 2, 1,
       2, 3, 2, 1, 3, 0, 1, 2, 3, 0, 3, 1, 1, 0, 2, 1, 2, 3, 0, 2, 3, 1,
       4, 3, 1, 1, 1, 1, 3, 1, 3, 0, 0, 0, 2, 4, 2, 0, 2, 2, 3, 0, 1, 4,
       3, 2, 1, 2, 1, 0, 2, 2, 0, 1, 2, 3, 0, 1, 1, 4, 2, 0, 3, 2, 1, 2,
       1, 2, 1, 1, 3, 3, 2, 2, 1, 4, 1, 3, 0, 1, 2, 3, 3, 1, 1, 1, 2, 3,
       3, 1, 3, 1, 3, 3, 2, 3, 0, 0, 1, 3, 0, 1, 3, 0, 0, 3, 3, 3, 0, 2,
       1, 1, 1, 0, 1, 3, 1, 3, 2, 3, 2], dtype=int64)

In [96]:
import joblib

In [97]:
joblib.dump(best_model,'model.pkl')

['model.pkl']

In [98]:
model=joblib.load('model.pkl')

In [99]:
def Predict_text_pickle(txt):
    #process text
    text=ProcessText(txt)
    #calculate tfidf
    tf_idf=calc_tf_idf(txt,idf)
    #Validate if it is actually news
    if all(i <.05 for i in tf_idf) or count_word(txt)<100:
        print("Cannot classify")
    else:
        tf_idf_2=calc_tf_idf(" ",idf)
        tf_idf=np.append(tf_idf,tf_idf_2,axis=0)
        tf_idf=tf_idf.reshape(2,-1)
        #Predicting
        index=model.predict(tf_idf[0:1])
        print(Category_class[int(index)])

In [100]:
Predict_text_pickle("The nepali film industry will need some lifting.")
Predict_text_pickle(" ")

Cannot classify
Cannot classify


In [101]:
Predict_text_pickle("fox  too reliant on reality tv  the head of us tv network fox has admitted the broadcaster had relied too heavily on reality tv shows such as the poor-rating who s your daddy.  chief executive gail berman said  in the case of this fall we drifted to too much on the unscripted side . the series who s your daddy  where a young woman tries to pick her natural father for a cash prize caused outrage from adoption groups and rated badly. last season  fox s prime-time audience fell by 600 000 to 5.9 million. ms berman said:  i think the audience expects loud things from fox. sometimes they work  and sometimes they don t.   who s your daddy  the first episode of which was shown on 3 january  pulled in a disappointing audience of 6.3 million  according to the nielsen ratings system. five other episodes of the show had also been filmed will be dropped from fox s schedules  ms berman said. she was predicting a drop in ratings even for some of the network s established reality shows  such as american idol  which is due to start its fourth series this week. fox had unveiled a new strategy last year promising to launch new shows every season  including the traditionally quiet summer season. though that had met with a poor reception  ms berman said  there s no question that the audience  in our mind  is ready  willing and able to accept new programming in the summer . fox has changed this plan  launching new shows in may instead of june. one of the new shows will be the animated series american dad  made by seth macfarlane  the creator of family guy. that series  after becoming a hit on dvd  is also set to return with new episodes.")

Cannot classify


In [102]:
Predict_text_pickle("qpr keeper day heads for preston queens park rangers keeper chris day is set to join preston on a month s loan.  day has been displaced by the arrival of simon royce  who is in his second month on loan from charlton. qpr have also signed italian generoso rossi. r s manager ian holloway said:  some might say it s a risk as he can t be recalled during that month and simon royce can now be recalled by charlton.  but i have other irons in the fire. i have had a  yes  from a couple of others should i need them.   day s rangers contract expires in the summer. meanwhile  holloway is hoping to complete the signing of middlesbrough defender andy davies - either permanently or again on loan - before saturday s match at ipswich. davies impressed during a recent loan spell at loftus road. holloway is also chasing bristol city midfielder tom doherty.")

Cannot classify


In [103]:
Predict_text_pickle("eu referendum question unveiled the question to be asked in the referendum on the eu constitution has been unveiled by the government.  it will be:  should the united kingdom approve the treaty establishing a constitution for the european union   the constitution will be incorporated into uk law if there is a yes vote in the referendum  expected in 2006. critics say the constitution is a further step towards a federal europe  but advocates say it ensures effective operation of the enlarged 25-state eu.  if we reject this treaty  britain will be isolated and weak in europe   said foreign secretary jack straw  who along with the rest of the cabinet  will back a  yes  vote.  patriots by definition wanted the uk to be prosperous at home and strong and influential abroad  mr straw said.  our role as a leading member of the eu is a crucial part of securing that.   conservative shadow foreign secretary michael ancram said the referendum question  seems straightforward . but he accused the government of trying to confuse the issue by putting the eu referendum question in the same bill as the ratification of the constitution  when they should be treated as  two separate issues . despite this  underhand trick   the referendum bill stood  no chance of becoming law before the election   he added.  this is tony blair s cheap gesture to the pro-constitution lobby while he runs scared of a debate on europe he knows he cannot win.  neil o brien  director of anti-constitution group vote no  said:  the reality is that the government doesn t want to discuss the eu constitution ahead of the election because they know it is extremely unpopular with voters and with business.   the uk independence party said:  if the government believes that a no vote would mean that we should leave the european union  they should just ask us if we want to leave the eu. then we can be out of it and better off much sooner.   liberal democrat leader charles kennedy  who backs the constitution  said he expected the referendum would come in the first half of next year. he told bbc radio 4 s today programme:  the sooner we get on with this  the better.  he said the question sounded  very neutral  and  balanced   adding it would enable the argument  to be enjoined fairly and squarely on both sides . green mep caroline lucas welcomed tony blair s  courage in keeping his word  on holding a referendum. but she added:  this treaty is a flawed document that will make the eu less accountable  less sustainable  and less just.   mr blair signed the constitution at a ceremony in rome in november  but had already made it clear the issue would be put to voters in a referendum. that promise came after sustained pressure from opposition parties. jack straw  who argues the constitution reflected a  british vision for europe  and gives  national governments a stronger grip   has said the referendum could be held in spring 2006. but in an interview with the financial times  mr blair refused to be pinned down to that date  saying britain would hold a poll  some time in 2006 but when  i don t know . the paper said the prime minister  claimed ignorance  of when other countries were planning to hold their referendums.")

Cannot classify


In [104]:
Predict_text_pickle("software watching while you work software that can not only monitor every keystroke and action performed at a pc but also be used as legally binding evidence of wrong-doing has been unveiled.  worries about cyber-crime and sabotage have prompted many employers to consider monitoring employees. the developers behind the system claim it is a break-through in the way data is monitored and stored. but privacy advocates are concerned by the invasive nature of such software.  the system is a joint venture between security firm 3ami and storage specialists bridgehead software. they have joined forces to create a system which can monitor computer activity  store it and retrieve disputed files within minutes. more and more firms are finding themselves in deep water as a result of data misuse. sabotage and data theft are most commonly committed from within an organisation according to the national hi-tech crime unit (nhtcu) a survey conducted on its behalf by nop found evidence that more than 80% of medium and large companies have been victims of some form of cyber-crime. bridgehead software has come up with techniques to prove  to a legal standard  that any stored file on a pc has not been tampered with. ironically the impetus for developing the system came as a result of the freedom of information act  which requires companies to store all data for a certain amount of time.  the storage system has been incorporated into an application developed by security firm 3ami which allows every action on a computer to be logged. potentially it could help employers to follow the trail of stolen files and pinpoint whether they had been emailed to a third party  copied  printed  deleted or saved to cd  floppy disk  memory stick or flash card. other activities the system can monitor include the downloading of pornography  the use of racist or bullying language or the copying of applications for personal use. increasingly organisations that handle sensitive data  such as governments  are using biometric log-ins such as fingerprinting to provide conclusive proof of who was using a particular machine at any given time. privacy advocates are concerned that monitoring at work is not only damaging to employee s privacy but also to the relationship between employers and their staff.  that is not the case   said tim ellsmore  managing director of 3ami.  it is not about replacing dialogue but there are issues that you can talk through but you still need proof   he said.  people need to recognise that you are using a pc as a representative of a company and that employers have a legal requirement to store data   he added.")

Cannot classify


In [105]:
Predict_text_pickle("french consumer spending rising french consumers increased their spending by 1.5% in january  a figure which bodes well for the country s economic growth  figures revealed.  the national statistic institute (insee) added that consumer spending in january rose 3.8% on a year-on-year basis. rising sales of household equipment were behind the increase. the insee also said that french consumer prices fell 0.6% in january  but were up 1.6% on an annual basis.  despite the general increase in spending in january  french households bought fewer cars in january. according to the insee  car sales fell 2.8% in january  following a fall of 0.6% in december. but on a year-on-year basis  the sector still saw a sales increase of 6.5%. consumer spending fuelled france s economic growth in the last quarter of 2004 and analysts expect that it will continue to support the economy.  it s a growth that will remain fragile and vulnerable to risks like a strong rise in long-term interest rates  tension in the oil price   emmanuel ferry  from exane bnp paribas told reuters news agency.  meanwhile in italy  consumer confidence rose to its highest level since october 2004. economic research group isae has said that italian consumer confidence rose to 104.4 from 103.3  despite a slight deterioration in short-term sentiment.")

Cannot classify
