In [1]:
#Proprietary content. ©Great Learning. All Rights Reserved. Unauthorized use or distribution prohibited

# Categorizing news items using text classification

## Import the required libraries

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")

#import pandas, xgboost, numpy, textblob, string
#from keras.preprocessing import text, sequence
#from keras import layers, models, optimizers

In [2]:
news = pd.read_csv('bbctext.csv')
print(news.shape)

(2225, 2)


In [3]:
news.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [4]:
news['category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

## create features based on count vectorizer

In [None]:
import nltk
nltk.download()

In [7]:
#remove stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/anchhabra/nltk_data'
    - '/Users/anchhabra/opt/anaconda3/nltk_data'
    - '/Users/anchhabra/opt/anaconda3/share/nltk_data'
    - '/Users/anchhabra/opt/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [6]:
#Adding custom stop words
new_words = ["some","one","like","time","br","movie","film","could","good",'even', 'get', 'would',
             'make', 'really', 'see', 'well', 'much', 'great', 'first', 'people', 'also', 'bad', 
             'show', 'way', 'thing', 'made', 'go', 'think', 'know', 'watch','look','many', 'said',
            'say', 'mr','new','take','told','back']
stop_words = stop_words.union(new_words)

In [7]:
corpus = []
for i in range(0, news.shape[0]):
    #Remove punctuations
    text = re.sub('[^a-zA-Z]', ' ', news['text'][i])
    
    #Convert to lowercase
    text = text.lower()
    ##Convert to list from string
    text = text.split()
    ##Lemmatizing
    lm = WordNetLemmatizer() 
       
    
    text = [lm.lemmatize(word) for word in text if not word in stop_words] 
    text = " ".join(text)
    corpus.append(text)

In [8]:
corpus[22]

'bates seal takeover ken bates completed takeover leeds united year old former chelsea chairman sealed deal gmt friday bought stake club delighted stepping mantel fantastic club recognise leeds club fallen hard time lot hard work ahead club belongs premiership help fan everything bates bought stake guise geneva based company known forward sport fund revealed part plan buy leeds elland road stadium thorp arch training ground due course going tough jon task stabilise cash flow sort remaining creditor bates added light end long tunnel past year matter firefighting start running club outgoing leeds chairman gerald krasner deal ensures medium long term survival club believe bates proposal totally benefit club content bates leeds united continue consolidate move forward took leeds united march club debt since date board succeeded reducing debt worked tirelessly solve problem leeds united eighty percent problem already overcome came agreement bates secure ongoing success krasner revealed cons

In [12]:
#Most frequently occuring words
def get_top_n_words(corpus, n=None):
    vec = TfidfVectorizer(stop_words=stop_words).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [13]:
#Convert most freq words to dataframe for plotting bar plot
top_words = get_top_n_words(corpus, n=20)
top_df = pd.DataFrame(top_words)
top_df.columns=["Word", "Freq"]
top_df

Unnamed: 0,Word,Freq
0,year,50.357041
1,game,37.762725
2,government,28.884761
3,bn,28.811521
4,last,28.172021
5,world,27.595115
6,uk,26.737569
7,labour,26.531352
8,best,26.316804
9,company,26.309189


In [14]:
news.columns

Index(['category', 'text'], dtype='object')

In [15]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = TfidfVectorizer(max_features = 4000, ngram_range=(1, 3))
X = cv.fit_transform(corpus).toarray()
y = news.drop(['text'], axis=1)

In [16]:
print(X.shape, y.shape)

(2225, 4000) (2225, 1)


In [17]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.30, random_state = 0)

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

In [18]:
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(1557, 4000) (1557,)
(668, 4000) (668,)


## Naive Bayes Classifier

In [19]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [20]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)


In [21]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
#cm = confusion_matrix(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cm

array([[130,   1,   7,   4,  10],
       [  3,  92,   4,   0,  10],
       [  3,   0, 107,   1,   2],
       [  2,   1,   1, 172,   1],
       [  2,   1,   0,   0, 114]], dtype=int64)

In [22]:
from sklearn.metrics import accuracy_score, recall_score
print(accuracy_score(y_test, y_pred))
#print(recall_score(y_test, y_pred))

0.9206586826347305


# Linear Classifier

In [23]:
from sklearn.linear_model import LogisticRegression
classifier_LR = LogisticRegression(random_state=0)
classifier_LR.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
#Predict test set reuslts
y_pred_LR = classifier_LR.predict(X_test)

In [25]:
#cm = confusion_matrix(y_test, y_pred)
cm_LR = confusion_matrix(y_test, y_pred_LR)
cm_LR

array([[145,   0,   6,   0,   1],
       [  1, 106,   2,   0,   0],
       [  0,   2, 109,   1,   1],
       [  0,   0,   0, 177,   0],
       [  1,   1,   1,   0, 114]], dtype=int64)

In [26]:
from sklearn.metrics import accuracy_score, recall_score
print(accuracy_score(y_test, y_pred_LR))

0.9745508982035929


## Classification using Random forest

In [27]:
from sklearn.ensemble import RandomForestClassifier
classifier_RF = RandomForestClassifier(n_estimators=500, random_state=123, verbose=4)
classifier_RF.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s


building tree 1 of 500
building tree 2 of 500
building tree 3 of 500
building tree 4 of 500
building tree 5 of 500
building tree 6 of 500
building tree 7 of 500
building tree 8 of 500
building tree 9 of 500
building tree 10 of 500
building tree 11 of 500
building tree 12 of 500
building tree 13 of 500
building tree 14 of 500
building tree 15 of 500
building tree 16 of 500
building tree 17 of 500
building tree 18 of 500
building tree 19 of 500
building tree 20 of 500
building tree 21 of 500
building tree 22 of 500
building tree 23 of 500
building tree 24 of 500
building tree 25 of 500
building tree 26 of 500
building tree 27 of 500
building tree 28 of 500
building tree 29 of 500
building tree 30 of 500
building tree 31 of 500
building tree 32 of 500
building tree 33 of 500
building tree 34 of 500
building tree 35 of 500
building tree 36 of 500
building tree 37 of 500
building tree 38 of 500
building tree 39 of 500
building tree 40 of 500
building tree 41 of 500
building tree 42 of 500
b

building tree 342 of 500
building tree 343 of 500
building tree 344 of 500
building tree 345 of 500
building tree 346 of 500
building tree 347 of 500
building tree 348 of 500
building tree 349 of 500
building tree 350 of 500
building tree 351 of 500
building tree 352 of 500
building tree 353 of 500
building tree 354 of 500
building tree 355 of 500
building tree 356 of 500
building tree 357 of 500
building tree 358 of 500
building tree 359 of 500
building tree 360 of 500
building tree 361 of 500
building tree 362 of 500
building tree 363 of 500
building tree 364 of 500
building tree 365 of 500
building tree 366 of 500
building tree 367 of 500
building tree 368 of 500
building tree 369 of 500
building tree 370 of 500
building tree 371 of 500
building tree 372 of 500
building tree 373 of 500
building tree 374 of 500
building tree 375 of 500
building tree 376 of 500
building tree 377 of 500
building tree 378 of 500
building tree 379 of 500
building tree 380 of 500
building tree 381 of 500


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    7.9s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=4, warm_start=False)

In [28]:
#Predict test set reuslts
y_pred_RF = classifier_RF.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished


In [29]:
#cm = confusion_matrix(y_test, y_pred)
cm_RF = confusion_matrix(y_test, y_pred_RF)
cm_RF

array([[146,   0,   3,   0,   3],
       [  2, 102,   1,   2,   2],
       [  2,   2, 105,   2,   2],
       [  0,   0,   0, 177,   0],
       [  1,   2,   1,   0, 113]], dtype=int64)

In [30]:
from sklearn.metrics import accuracy_score, recall_score
print(accuracy_score(y_test, y_pred_RF))


0.9625748502994012


In [31]:
### Convert to dataframe

tf_vect_df = pd.DataFrame(X, columns=cv.get_feature_names())
tf_vect_df



Unnamed: 0,aaa,ability,able,abroad,absence,absolute,absolutely,abuse,abused,ac,...,york,young,younger,youngster,yuan,yugansk,yuganskneftegas,yukos,zealand,zone
0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.042454,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.045801,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.059556,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2220,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2221,0.0,0.000000,0.0,0.084635,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2222,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2223,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
## Feature importance from random forest
feat_importance = classifier_RF.feature_importances_


feature_cols = tf_vect_df.columns

feat_imp_dict = dict(zip(feature_cols, classifier_RF.feature_importances_))
feat_imp = pd.DataFrame.from_dict(feat_imp_dict, orient='index')
feat_imp.sort_values(by=0, ascending=False).head(10)

Unnamed: 0,0
bn,0.014262
game,0.013345
user,0.012919
firm,0.011637
market,0.01143
minister,0.011247
computer,0.010935
company,0.010891
player,0.010644
government,0.010392
