## Classify Articles:Three Steps
- Start with a corpus of articles
- Identify underlying themes using kmeans++
- Assign themes to new article


In [3]:
#We have 2804 articles under a folder, read and store them in a List of articles
articles = []
for i in range(0, 2804):
    with open('Data/NLP/%s.txt'%i, 'r') as file:
        text = file.read()
        articles.append(text)

In [4]:
print(articles)



In [5]:
#Convert text to TF-IDF representation
from sklearn.feature_extraction.text import TfidfVectorizer

#By setting stopwords to relevant language, it will ignore the stopwords
vectorizer = TfidfVectorizer(max_df=0.5,min_df=2,stop_words='english')

X = vectorizer.fit_transform(articles)
X


<2804x13220 sparse matrix of type '<class 'numpy.float64'>'
	with 280835 stored elements in Compressed Sparse Row format>

In [6]:
print (X[0])

  (0, 12835)	0.06733599036660341
  (0, 6245)	0.06370310603358903
  (0, 4383)	0.07127822024207892
  (0, 3699)	0.06861761774394902
  (0, 950)	0.046724977667531975
  (0, 13134)	0.0622780907314438
  (0, 6435)	0.061142245852373975
  (0, 1648)	0.06447533430679811
  (0, 11319)	0.053834229815857414
  (0, 10951)	0.07894687355503974
  (0, 1291)	0.06758290561613112
  (0, 2279)	0.047223338973571005
  (0, 9326)	0.0642781593136864
  (0, 1292)	0.055482562340284515
  (0, 8259)	0.07377665013036055
  (0, 10923)	0.10044228507666966
  (0, 2448)	0.06809049792662643
  (0, 1730)	0.040223278045178494
  (0, 2059)	0.10350011612958668
  (0, 13048)	0.05347359253941316
  (0, 10302)	0.042660938762487305
  (0, 8623)	0.049288817826055084
  (0, 7910)	0.08734850508251554
  (0, 12211)	0.06594121878805016
  (0, 11730)	0.06888898157373022
  :	:
  (0, 575)	0.07683448118327757
  (0, 882)	0.04222334662742109
  (0, 3398)	0.05187934981734116
  (0, 2618)	0.059667337053642584
  (0, 12739)	0.07064073666904813
  (0, 6196)	0.040775

In [7]:
X[0]

from sklearn.cluster import KMeans

#max_iter means if algorithm don't converge even after this no# of iteration, it will stop
km = KMeans(n_clusters = 3, init = 'k-means++', max_iter = 100, n_init = 1, verbose = True)
km.fit(X)

Initialization complete
Iteration  0, inertia 5295.832
Iteration  1, inertia 2687.842
Iteration  2, inertia 2678.478
Iteration  3, inertia 2673.537
Iteration  4, inertia 2671.267
Iteration  5, inertia 2670.631
Iteration  6, inertia 2670.116
Iteration  7, inertia 2669.586
Iteration  8, inertia 2668.939
Iteration  9, inertia 2668.555
Iteration 10, inertia 2668.321
Iteration 11, inertia 2668.145
Iteration 12, inertia 2668.048
Iteration 13, inertia 2668.007
Iteration 14, inertia 2667.977
Iteration 15, inertia 2667.953
Iteration 16, inertia 2667.915
Iteration 17, inertia 2667.854
Iteration 18, inertia 2667.779
Iteration 19, inertia 2667.717
Iteration 20, inertia 2667.659
Iteration 21, inertia 2667.559
Iteration 22, inertia 2667.467
Iteration 23, inertia 2667.333
Iteration 24, inertia 2667.167
Iteration 25, inertia 2666.938
Iteration 26, inertia 2666.655
Iteration 27, inertia 2666.472
Iteration 28, inertia 2666.348
Iteration 29, inertia 2666.264
Iteration 30, inertia 2666.188
Iteration 31, i

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=3, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=True)

In [8]:
import numpy as np

#km.lables_ is an attribute of K-means object. unique function will print the unique clusters name with each article have been assign one cluster
np.unique(km.labels_, return_counts=True)

(array([0, 1, 2]), array([ 342, 2028,  434], dtype=int64))

In [9]:
#Aggregate text in each cluster
text={}
for i,cluster in enumerate(km.labels_):
        oneDocument = articles[i]
        if cluster not in text.keys():
            text[cluster] = oneDocument
        else:
            text[cluster] += oneDocument

In [10]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from collections import defaultdict
from string import punctuation
from heapq import nlargest
import nltk

_stopwords = set(stopwords.words('english') + list(punctuation)+["million","billion","year","millions","billions","y/y","'s","''","``"])

#Top keywords in each cluster and their count
keywords = {}
counts={}
for cluster in range(3):
    word_sent = word_tokenize(text[cluster].lower())
    word_sent=[word for word in word_sent if word not in _stopwords]
    freq = FreqDist(word_sent)
    keywords[cluster] = nlargest(100, freq, key=freq.get)
    counts[cluster]=freq


                                                                 
                                        

In [11]:
unique_keys={}
for cluster in range(3):
    other_clusters=list(set(range(3))-set([cluster]))
    #collect all the keywords from other clusters
    keys_other_clusters=set(keywords[other_clusters[0]]).union(set(keywords[other_clusters[1]]))
    #Take out these keywords from the whole set to get unique keywords
    unique=set(keywords[cluster])-keys_other_clusters
    #Store top 10 keywords 
    unique_keys[cluster]=nlargest(10, unique, key=counts[cluster].get)

In [12]:
#After analysing words in each cluster we can identify the theme of each cluster
unique_keys

{0: ['ads',
  'video',
  'ad',
  'social',
  'instagram',
  'snapchat',
  'advertisers',
  'news',
  'user',
  'feature'],
 1: ['uber',
  'india',
  'tech',
  'deal',
  'pay',
  'chinese',
  'firm',
  'mr.',
  'around',
  'capital'],
 2: ['quarter',
  'profit',
  'rose',
  'earnings',
  'analysts',
  'stock',
  'cents',
  'net',
  'fell',
  'per']}

In [16]:
article = "US-based Facebook has agreed to buy 9.99% equity stake in oil-to-retail conglomerate Reliance Industries Ltd’s telecommunications arm Reliance Jio for Rs 43,574 crore. While the Mukesh Ambani-led company said that Facebook and Jio will together further India’s digital economy, the deal in itself has retail at its core, as both Facebook and Reliance Jio eye a huge market of 1.3 billion Indians and Indian businesses. “Our focus will be India’s 60 million micro, small and medium businesses, 120 million farmers, 30 million small merchants and millions of small and medium enterprises in the informal sector,” Reliance Jio said in the statement announcing the deal. In fact, RIL has been building up to become a retail giant with several acquisitions and deals in the past as well."
from sklearn.neighbors import KNeighborsClassifier

#here K(n_neighbors)=5 by default
classifier = KNeighborsClassifier(n_neighbors=3)

#training phase
classifier.fit(X,km.labels_)
test=vectorizer.transform([article.encode('ascii',errors='ignore')])
classifier.predict(test)

array([1])