In [72]:
from sklearn.datasets import fetch_20newsgroups

### Categories available from the dataset

In [73]:
fetch_20newsgroups(subset='train').target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

### Segregation of data in to train and test modules

In [74]:
train = fetch_20newsgroups(subset='train',remove=('headers', 'footers', 'quotes'))
test = fetch_20newsgroups(subset='test',remove=('headers', 'footers', 'quotes'))

In [76]:
print(len(train.data))
print(len(test.data))

11314
7532


# Data Preprocessing

In [77]:
from gensim.models.doc2vec import Doc2Vec,TaggedDocument

In [78]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Tokenization, Lowercase Conversion, Stop words removal, Punctuation removal and Lemmatization

In [79]:
from collections import defaultdict
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
lemmatizer = WordNetLemmatizer()
def preprocess(sen):
  tokens= nltk.word_tokenize(sen)
  #to2 = nltk.sent_tokenize(sen)
  #lower case conversion
  low_str = [word.lower() for word in tokens]
  #Stop Words Removal
  stop_words = set(stopwords.words('english'))
  st_str = [word for word in low_str if word not in stop_words]
  #Punctuation Removal
  pu_str = [word for word in st_str if word not in punctuation]
  #Lemmatization
  pos_data = nltk.pos_tag(pu_str)
  tag_dict = defaultdict(lambda : wordnet.NOUN)
  tag_dict['J'] = wordnet.ADJ
  tag_dict['N'] = wordnet.NOUN
  tag_dict['V'] = wordnet.VERB
  tag_dict['R'] = wordnet.ADV
  lem_str = [lemmatizer.lemmatize(word,tag_dict[tag[0]]) for word,tag in pos_data]
  return lem_str

In [80]:
test_sen = "This is a BEAUTIFUL sentence written wonderfully to test Doc2Vec module !!!"
preprocess(test_sen)

['beautiful', 'sentence', 'write', 'wonderfully', 'test', 'doc2vec', 'module']

In [81]:
def apply_preprocess(ip_lst,ip_data,flag):
  '''Method to apply preprocessing to train and test modules '''
  if flag:
    for i in ip_data.data:
      pre_data = preprocess(i)
      ip_lst.append(pre_data)
    return ip_lst
  else:
    for i in range(len(ip_data.target)):
      tr_no = ip_data.target[i]
      name = str(i)+"_"+str(tr_no)+"_"+ip_data.target_names[tr_no]
      ip_lst.append(name)
    return ip_lst


In [82]:
train_data = []
train_data = apply_preprocess(train_data,train,True)
test_data = []
test_data = apply_preprocess(test_data,test,True)
train_labels = []
train_labels = apply_preprocess(train_labels,train,False)
test_labels = []
test_labels = apply_preprocess(test_labels,test,False)

In [83]:
train_tag_data = [TaggedDocument(words=tokens,tags=[labels]) for tokens,labels in zip(train_data,train_labels)]
test_tag_data = [TaggedDocument(words=tokens,tags=[labels]) for tokens,labels in zip(test_data,test_labels)]

In [84]:
test_tag_data[10]

TaggedDocument(words=['uploaded', 'windows', 'on-line', 'review', 'shareware', 'edition', 'ftp.cica.indiana.edu', '/pub/pc/win3/uploads/wolrs7.zip', 'on-line', 'magazine', 'contain', 'review', 'shareware', 'product', '...', 'grab', 'windows', 'on-line', 'bb', '--'], tags=['10_2_comp.os.ms-windows.misc'])

# **Training the Model**

In [99]:
model = Doc2Vec(dm=1,epochs=15) #Distributed memory option (PV-DM) is used.

In [100]:
model.build_vocab(train_tag_data)

In [101]:
model.train(train_tag_data,total_examples=model.corpus_count,epochs=model.epochs)

# **Testing the Model**

In [102]:
test_tag_data[0]

TaggedDocument(words=['little', 'confused', 'model', '88-89', 'bonnevilles', 'heard', 'le', 'se', 'lse', 'sse', 'ssei', 'could', 'someone', 'tell', 'difference', 'far', 'feature', 'performance', 'also', 'curious', 'know', 'book', 'value', 'prefereably', '89', 'model', 'much', 'less', 'book', 'value', 'usually', 'get', 'word', 'much', 'demand', 'time', 'year', 'heard', 'mid-spring', 'early', 'summer', 'best', 'time', 'buy'], tags=['0_7_rec.autos'])

In [103]:
new_doc_vec = model.infer_vector(test_tag_data[0].words, steps=50, alpha=0.25)

In [104]:
new_doc_vec

array([-1.7935109 ,  0.7540246 , -0.16122025, -0.545765  ,  0.45376423,
       -2.032601  , -4.937308  ,  3.3727207 ,  4.7072663 , -3.3231025 ,
       -5.0037346 ,  0.38572502,  4.8789825 ,  0.12678495, -0.9008269 ,
        0.24126776, -2.7013404 , -2.6746    , -3.1907356 , -1.3237969 ,
       -0.0269166 , -6.0495114 ,  1.4782971 ,  3.3871176 , -0.86826503,
        2.3594987 , -4.9592166 ,  3.2656288 ,  6.738087  , -3.075308  ,
       -0.99784577, -4.961427  ,  4.405932  ,  5.6751785 , -1.2428669 ,
        1.4303904 ,  4.114554  , -0.99306524, -4.8093204 ,  2.3648942 ,
        2.546966  , -0.65628314, -2.3360338 ,  1.4044759 ,  3.2958539 ,
        0.25910574,  1.7022059 , -0.28890026,  1.8317842 , -0.04786375,
        0.8769499 ,  0.38989124, -4.528877  ,  0.2891187 ,  2.5197265 ,
       -4.640683  , -2.5646632 , -0.9427929 , -0.9179611 ,  3.0829244 ,
        1.6449412 , -6.422558  , -3.96225   ,  0.04162598, -0.38182175,
        3.1148252 , -0.34094238,  3.6212294 , -4.756236  , -3.51

In [105]:
model.docvecs.most_similar(positive=[new_doc_vec])

[('8713_7_rec.autos', 0.4161534309387207),
 ('9160_12_sci.electronics', 0.4147428870201111),
 ('0_7_rec.autos', 0.3962128758430481),
 ('4542_4_comp.sys.mac.hardware', 0.38412684202194214),
 ('7500_2_comp.os.ms-windows.misc', 0.37587204575538635),
 ('4345_4_comp.sys.mac.hardware', 0.37281814217567444),
 ('2309_3_comp.sys.ibm.pc.hardware', 0.36781325936317444),
 ('8429_7_rec.autos', 0.36581921577453613),
 ('10121_7_rec.autos', 0.36104828119277954),
 ('7920_7_rec.autos', 0.35777318477630615)]

### Model was able to successfully identify document with similar context. As an improvement the model can be fine tuned to improve accuracy