In [1]:
import pandas as pd
import glob
import os
import numpy as np
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /home/camilo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/camilo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
books = glob.glob('bbc-fulltext/bbc/*/*.txt')
d = list()
for book_file in books:
    #print(os.path.abspath(book_file))
    with open(book_file, encoding='utf-8') as f:
        catg = os.path.basename(book_file.split('/')[-2])
        news = os.path.basename(book_file.split('/')[-1].split('.')[0])
        lines = f.readlines()
        d.append(pd.DataFrame.from_records([{'news':news, 'catg': catg, 'lines': " ".join(lines)}]))
doc = pd.concat(d)
doc.head()

Unnamed: 0,catg,lines,news
0,politics,Tories pledge free sports lessons\n \n Childre...,171
0,politics,Blair prepares to name poll date\n \n Tony Bla...,401
0,politics,Abortion not a poll issue - Blair\n \n Tony Bl...,405
0,politics,Woolf murder sentence rethink\n \n Plans to gi...,231
0,politics,Hague 'given up' his PM ambition\n \n Former C...,285


In [3]:
porter = PorterStemmer()
def stemSentence(sentence):
    token_words=word_tokenize(sentence)
    token_words
    stem_sentence=[]
    for word in token_words:
       #if porter.stem(word) != word:
       #     print(word," - ",porter.stem(word))
        stem_sentence.append(porter.stem(word=word))
        stem_sentence.append(" ")
    senteence = "".join(stem_sentence)
    #print(sentence,"\n", senteence)
    return senteence
        
doc['words'] = doc.lines.str.strip().str.split('[\W]+')
doc['cl_lines'] = doc.words.str.join(" ")
for index, row in doc.iterrows():
    row['cl_lines']=stemSentence(row['cl_lines']).strip()
#doc['cl_lines'].apply(stemSentence)
doc.head()

Unnamed: 0,catg,lines,news,words,cl_lines
0,politics,Tories pledge free sports lessons\n \n Childre...,171,"[Tories, pledge, free, sports, lessons, Childr...",tori pledg free sport lesson children would be...
0,politics,Blair prepares to name poll date\n \n Tony Bla...,401,"[Blair, prepares, to, name, poll, date, Tony, ...",blair prepar to name poll date toni blair is l...
0,politics,Abortion not a poll issue - Blair\n \n Tony Bl...,405,"[Abortion, not, a, poll, issue, Blair, Tony, B...",abort not a poll issu blair toni blair doe not...
0,politics,Woolf murder sentence rethink\n \n Plans to gi...,231,"[Woolf, murder, sentence, rethink, Plans, to, ...",woolf murder sentenc rethink plan to give murd...
0,politics,Hague 'given up' his PM ambition\n \n Former C...,285,"[Hague, given, up, his, PM, ambition, Former, ...",hagu given up hi PM ambit former conserv leade...


In [4]:
cl_doc = pd.DataFrame(doc, columns=['news','catg', 'cl_lines'])
cl_doc.head()

Unnamed: 0,news,catg,cl_lines
0,171,politics,tori pledg free sport lesson children would be...
0,401,politics,blair prepar to name poll date toni blair is l...
0,405,politics,abort not a poll issu blair toni blair doe not...
0,231,politics,woolf murder sentenc rethink plan to give murd...
0,285,politics,hagu given up hi PM ambit former conserv leade...


In [5]:
corpus = cl_doc['cl_lines'].values
stop_wordds = stopwords.words('english')
vectorizer = TfidfVectorizer(stop_words=stop_wordds)
X = vectorizer.fit_transform(corpus)
print(X.shape)


(2224, 20480)


In [6]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=3)
#print(svd)
X_svd = svd.fit_transform(X)
print(X_svd)

[[ 0.21440846  0.08590012 -0.04226514]
 [ 0.27276225  0.26541399 -0.06813152]
 [ 0.28961755  0.2208493  -0.09741538]
 ...
 [ 0.24199242 -0.13444291 -0.17500457]
 [ 0.195571   -0.13123664 -0.19386461]
 [ 0.1234399  -0.03402338 -0.03113022]]


In [7]:

y = cl_doc["catg"].tolist()
# intilize a null list 
unique_list = [] 
      
# traverse for all elements 
for w in y: 
# check if exists in unique_list or not 
    if w not in unique_list: 
        unique_list.append(w) 
# print list 
for w in unique_list: 
    print(w,) 


politics
business
entertainment
tech
sport


In [44]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score
#from sklearn.model_selection import cross_validate
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(4,3))
#from sklearn.model_selection import cross_val_score
#scoring = ['precision', 'recall', 'accuracy']
#scores = cross_validate(clf, X_svd, y, cv=5, scoring=scoring, return_train_score=False)
#print(scores)

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X_svd, y, random_state=0)
#cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=5)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[105,   4,  10,   0,  13],
       [  4,  75,   3,  14,   1],
       [  9,   0,  96,   0,   0],
       [  0,   6,   0, 114,   0],
       [ 27,  12,   1,   0,  62]])

In [45]:
recall_score(y_test, y_pred, average=None)

array([0.79545455, 0.77319588, 0.91428571, 0.95      , 0.60784314])

In [46]:
precision_score(y_test, y_pred, average=None)

array([0.72413793, 0.77319588, 0.87272727, 0.890625  , 0.81578947])

In [47]:
f1_score(y_test, y_pred, average=None)

array([0.75812274, 0.77319588, 0.89302326, 0.91935484, 0.69662921])

In [48]:
accuracy_score(y_test, y_pred)

0.8129496402877698