### K N Anantha nandanan
#### Roll No AM.EN.U4CSE19326

BBC news clustering

## Imports

In [27]:
import nltk
import pandas as pd
import os
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from pprint import pprint

### Load the dataset

In [29]:
path = '../Datas/bbc-text.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


### preprocessing

In [30]:
def preProcessing(text):
    #print(type(text))
    stemm_snow = SnowballStemmer("english")
    lem = WordNetLemmatizer()
    text = text.replace("\n", " ")
    text = re.sub(r"[0-9]", "", text)
    text = text.lower()
    text = word_tokenize(text)
    #text = [stemm_snow.stem(word) for word in text]
    text = [lem.lemmatize(word) for word in text]
    text = [char for char in text if char not in string.punctuation]
    text = [word for word in text if word not in stopwords.words('english')]
    text = ' '.join(text)
    
    return text

In [31]:
df['clean_text'] = df['text'].apply(preProcessing)

In [32]:
df.head()

Unnamed: 0,category,text,clean_text
0,tech,tv future in the hands of viewers with home th...,tv future hand viewer home theatre system plas...
1,business,worldcom boss left books alone former worldc...,worldcom bos left book alone former worldcom b...
2,sport,tigers wary of farrell gamble leicester say ...,tiger wary farrell gamble leicester say rushed...
3,sport,yeading face newcastle in fa cup premiership s...,yeading face newcastle fa cup premiership side...
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean twelve raid box office ocean twelve crim...


In [33]:
df.shape

(2225, 3)

In [34]:
docs =[]
for d in df.clean_text:
    docs.append(d.split())

In [35]:
# Remove both rare and common tokens.
from gensim.corpora import Dictionary
# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)
# Filter out words that occur less than 20 documents, or more than 50% of the doc
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [36]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [37]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 2973
Number of documents: 2225


In [56]:
# Train LDA model.
from gensim.models import LdaModel, LsiModel

In [41]:
# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None# Don't evaluate model perplexity, takes too much time.
# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token
model = LdaModel(corpus=corpus, id2word=id2word,num_topics=num_topics)

In [42]:
top_topics = model.top_topics(corpus,topn=2)
pprint(top_topics)

[([(0.019003633, 'mobile'), (0.017456427, 'phone')], -0.2033409239962164),
 ([(0.014300388, 'search'), (0.008270076, 'new')], -0.6286086593528429),
 ([(0.010778276, 'u'), (0.006428739, 'one')], -0.6943313133158895),
 ([(0.01165616, 'film'), (0.010448933, 'u')], -0.8690378470045925),
 ([(0.011175838, 'people'), (0.008026994, 'u')], -0.9737444140339473),
 ([(0.012527265, 'u'), (0.010251986, 'mr')], -1.0027643298708047),
 ([(0.009585452, 'u'), (0.009432481, 'month')], -1.2380784168033652),
 ([(0.010201984, 'player'), (0.008070503, 'england')], -1.4601993905789359),
 ([(0.027912728, 'mr'), (0.011052411, 'blair')], -1.6413895122262547),
 ([(0.013202245, 'mr'), (0.010955351, 'game')], -2.401675995607374)]


In [55]:
len(top_topics)

10

### LSI model

In [59]:
# Train LSI model.
model = LsiModel(
    corpus=corpus,id2word=dictionary, num_topics=5)

In [61]:
model.print_topics(10,5)

[(0, '0.300*"mr" + 0.231*"people" + 0.171*"new" + 0.161*"one" + 0.147*"u"'),
 (1,
  '-0.504*"mr" + 0.234*"game" + 0.219*"music" + 0.213*"best" + -0.208*"labour"'),
 (2,
  '0.450*"best" + 0.350*"song" + 0.238*"award" + -0.206*"game" + 0.203*"music"'),
 (3,
  '-0.518*"game" + 0.285*"music" + 0.216*"people" + 0.183*"mobile" + 0.169*"phone"'),
 (4,
  '0.288*"u" + -0.246*"mr" + -0.223*"game" + -0.206*"mobile" + -0.196*"music"')]

In [62]:
# Create a new corpus, made of previously unseen documents.
other_texts = [  
    ['computer', 'time', 'mobile'],  
    ['vote', 'election', 'candidate'],  
    ['defeat', 'player', 'cup', 'england','cricket']]
other_corpus = [dictionary.doc2bow(text) for text in other_texts]
unseen_doc = other_corpus[2]
vector = model[unseen_doc]  # get topic probability distribution for a document
vector

[(0, 0.12003839476702446),
 (1, 0.1430590665313521),
 (2, -0.06687045827921333),
 (3, -0.2916993855803869),
 (4, -0.009322990375509072)]

In [63]:
for d in other_corpus:
    vector = model[d]
print(vector)

[(0, 0.12003839476702446), (1, 0.1430590665313521), (2, -0.06687045827921333), (3, -0.2916993855803869), (4, -0.009322990375509072)]
