In [1]:
import pandas as pd
import numpy as np
import gensim
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
from gensim.models import Word2Vec
import os

In [3]:
from sklearn.datasets import fetch_20newsgroups

# Only three of the twenty newsgroup topics are used in this notebook.
categories = ['talk.religion.misc', 'comp.graphics', 'sci.space']

# Announce which categories are about to be loaded (two output lines).
print("Loading 20 newsgroups dataset for categories:", categories, sep="\n")

Loading 20 newsgroups dataset for categories:
['talk.religion.misc', 'comp.graphics', 'sci.space']


In [4]:
# Load every post of the selected categories in a fixed order, asking the loader
# to strip headers, footers and quoted text.
df = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=False,
    remove=('headers', 'footers', 'quotes'),
)
labels = df.target

# Number of distinct target values; this should be 3 in this example.
true_k = np.unique(labels).size
print(true_k)

3


In [5]:
# Total number of target labels that were loaded.
labels.size

2588

In [6]:
# Target value stored for the entry at index 1.
df.target[1]

0

In [7]:
# Number of raw text entries in the loaded dataset.
len(df.data)

2588

In [8]:
# Shallow copy of the raw documents so later steps work on a plain list
# (replaces an element-by-element append loop that did the same thing).
data = list(df.data)

In [9]:
# Peek at the first raw document.
data[0]

'\n\nI think I can. Largely as a result of efforts by people reading this group\nwriting letters and making phone calls the following has happened:\n\n1. NASA reprogrammed funds to keep NASP alive in 1991.\n2. Efforts to kill DC-X and the SSRT progam where twice twarted\n   (Feb. and June of last year).\n3. Gouldin kept his job in spite of heavy lobbying against him.\n\nThis may not be what Mark was thinking of but it shows that the\nreaders of sci.space DO have power and influence.\n\n  Allen\n'

In [10]:
# E-mail addresses are removed FIRST: the noise pattern below blanks out dots,
# so running it beforehand would split "user@host.com" into "user@host com"
# and leave the trailing fragment ("com") behind in the text.
email_re = re.compile(r'\S*@\S*\s?')

# Whitespace characters, digits, '+', ':', '.', ')' and '(' are each replaced
# by a single space. Raw string avoids invalid-escape warnings for \s, \d, \. etc.
noise_re = re.compile(r'[\s+\d+:\.\)\( ]')

# One cleaned string per raw document, in the same order as `data`.
new = [noise_re.sub(' ', email_re.sub('', doc)) for doc in data]

new[0:3]


['  I think I can  Largely as a result of efforts by people reading this group writing letters and making phone calls the following has happened      NASA reprogrammed funds to keep NASP alive in          Efforts to kill DC-X and the SSRT progam where twice twarted     Feb  and June of last year      Gouldin kept his job in spite of heavy lobbying against him   This may not be what Mark was thinking of but it shows that the readers of sci space DO have power and influence     Allen ',
 'In regards to fractal commpression, I have seen   fractal compressed "movies"  They were both fairly impressive   The first one was a    gray scale "movie" of Casablanca, it was    MB and had    minutes of    fps video   It was a little grainy but not bad at all   The second one I saw was only   minutes but it had   bit color with   fps and measured in at    MB   I consider the fractal movies a practical thing to explore   But unlike many  other formats out there, you do end up losing resolution   I don

In [11]:
# Download the NLTK 'punkt' tokenizer package.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [12]:
import nltk
nltk.download('punkt_tab')

# For each cleaned document: lower-case it, collapse every whitespace run to a
# single space, then split it into word tokens. Result is a list of token lists.
tokens = [
    word_tokenize(re.sub(r'\s+', ' ', doc.lower()))
    for doc in new
]


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [13]:
# Inspect the token lists produced for the first two documents.
tokens[0:2]

[['i',
  'think',
  'i',
  'can',
  'largely',
  'as',
  'a',
  'result',
  'of',
  'efforts',
  'by',
  'people',
  'reading',
  'this',
  'group',
  'writing',
  'letters',
  'and',
  'making',
  'phone',
  'calls',
  'the',
  'following',
  'has',
  'happened',
  'nasa',
  'reprogrammed',
  'funds',
  'to',
  'keep',
  'nasp',
  'alive',
  'in',
  'efforts',
  'to',
  'kill',
  'dc-x',
  'and',
  'the',
  'ssrt',
  'progam',
  'where',
  'twice',
  'twarted',
  'feb',
  'and',
  'june',
  'of',
  'last',
  'year',
  'gouldin',
  'kept',
  'his',
  'job',
  'in',
  'spite',
  'of',
  'heavy',
  'lobbying',
  'against',
  'him',
  'this',
  'may',
  'not',
  'be',
  'what',
  'mark',
  'was',
  'thinking',
  'of',
  'but',
  'it',
  'shows',
  'that',
  'the',
  'readers',
  'of',
  'sci',
  'space',
  'do',
  'have',
  'power',
  'and',
  'influence',
  'allen'],
 ['in',
  'regards',
  'to',
  'fractal',
  'commpression',
  ',',
  'i',
  'have',
  'seen',
  'fractal',
  'compressed',

Parameter Explanation:

tokens: The input tokenized sentences (list of lists of words).

vector_size=50: Each word will be represented as a 50-dimensional vector.

window=5: The context window size (words within 5 positions before/after the target word are considered).

sg=1: Use Skip-Gram (instead of CBOW, which is sg=0).

hs=0: Use negative sampling instead of hierarchical softmax.

epochs=10: Train the model for 10 passes (epochs) over the corpus.

In [None]:
from gensim.models import Word2Vec

# Skip-gram Word2Vec (sg=1) without hierarchical softmax (hs=0), trained on the
# tokenized documents: 50-dimensional vectors, context window of 5, 10 epochs.
Gensim_model = Word2Vec(
    sentences=tokens,
    vector_size=50,
    window=5,
    sg=1,
    hs=0,
    epochs=10,
)

In [None]:
# Four nearest neighbours of the query word in the Word2Vec vector space.
w1 = 'software'
Gensim_model.wv.most_similar(positive=[w1], topn=4)

In [None]:
# Four nearest neighbours of the query word in the Word2Vec vector space.
w1 = 'science'
Gensim_model.wv.most_similar(positive=[w1], topn=4)

[('division', 0.7008097767829895),
 ('advancement', 0.6986432671546936),
 ('bachelor', 0.6963627934455872),
 ('astronautical', 0.6944034099578857)]

In [None]:
# Four nearest neighbours of the query word in the Word2Vec vector space.
w1 = 'technology'
Gensim_model.wv.most_similar(positive=[w1], topn=4)

[('aerospace', 0.8437926173210144),
 ('marketing', 0.7785007357597351),
 ('transportation', 0.7733446955680847),
 ('administration', 0.7608449459075928)]

In [None]:
# Raw embedding learned for a single word (looked up by key on the KeyedVectors).
Gensim_model.wv['science']

array([-0.4437847 ,  0.10570253,  0.18103418,  0.37861255, -0.45923668,
        0.31392658,  0.3181613 ,  0.32678303, -0.0749656 , -0.45862103,
       -0.5658499 ,  0.13764642, -0.79453087,  0.27018952, -0.04286365,
        0.82968897,  1.3496867 ,  0.7611589 , -0.48415688, -0.52067834,
       -0.61840653,  0.767492  ,  0.6603273 ,  0.11469961,  0.19282089,
        0.26738393,  0.00806295, -0.774899  , -0.5350636 ,  0.02775862,
        0.25840202,  0.29144746, -0.06131092,  0.3973377 ,  0.26408085,
        0.10315035, -0.2680168 , -0.23911966, -0.05574455, -0.10741802,
        0.19677609, -0.17266019,  0.26986033,  0.3470933 ,  0.21937953,
       -0.29072917,  0.32798764,  0.05832859,  0.48227578,  0.05779702],
      dtype=float32)

In [None]:
# Word2Vec can only look up words that are in its training vocabulary, so a
# query for an unseen word raises KeyError. The error is caught and reported so
# the notebook still runs top to bottom while showing the limitation.
oov_word = 'Gastroenteritis'
try:
    oov_neighbours = Gensim_model.wv.most_similar(oov_word)
except KeyError as err:
    oov_neighbours = None
    print(f"Word2Vec has no vector for {oov_word!r}: {err}")

# Displays the neighbour list when the lookup succeeded, nothing otherwise.
oov_neighbours

In [None]:
from gensim.models import FastText

# Skip-gram FastText model trained on the same tokens
# (religion, space and graphics posts).
model_fast = FastText(
    sentences=tokens,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
    sg=1,
)

In [None]:
# Same query word as the Word2Vec lookup, this time against the FastText model,
# which does return neighbours (FastText can build a vector for an unseen word
# from its character n-grams).
model_fast.wv.most_similar("Gastroenteritis")

[('amateur', 0.9519757628440857),
 ('industrial', 0.9359416365623474),
 ('chemistry', 0.935250997543335),
 ('auxiliary', 0.9334954619407654),
 ('isotropic', 0.9304940104484558),
 ('category', 0.9288355112075806),
 ('activists', 0.9268349409103394),
 ('territory', 0.9260854125022888),
 ('pacastro', 0.9248908162117004),
 ('civilian', 0.9247483611106873)]

In [None]:
# IPython help lookup for the most_similar method (development aid).
?model_fast.wv.most_similar

In [None]:
from gensim.models import FastText
?FastText()

Object `FastText()` not found.
