In [None]:
!pip3 install gensim
!pip3 install pyspellchecker

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
import gensim.downloader
from spellchecker import SpellChecker
import re
import pickle

In [None]:
# Fetch the NLTK resources requested by this notebook, one after another.
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the training CSV
# stored under MyDrive can be read with a regular file path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the training set on the mounted Google Drive.
TRAIN_CSV_PATH = '/content/drive/MyDrive/NLP_proj/NLP_train.csv'

# Load the labelled comments into a DataFrame.
train_df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# (rows, columns) of the loaded training set.
train_df.shape

(8695, 3)

In [None]:
# Peek at the first rows to see the available columns (Id, Comment, Topic).
train_df.head()

Unnamed: 0,Id,Comment,Topic
0,0x840,A few things. You might have negative- frequen...,Biology
1,0xbf0,Is it so hard to believe that there exist part...,Physics
2,0x1dfc,There are bees,Biology
3,0xc7e,I'm a medication technician. And that's alot o...,Biology
4,0xbba,Cesium is such a pretty metal.,Chemistry


## Preprocess

In [None]:
# List the names of the pre-trained embedding models that gensim's downloader can fetch
list(gensim.downloader.info()['models'].keys())

['fasttext-wiki-news-subwords-300',
 'conceptnet-numberbatch-17-06-300',
 'word2vec-ruscorpora-300',
 'word2vec-google-news-300',
 'glove-wiki-gigaword-50',
 'glove-wiki-gigaword-100',
 'glove-wiki-gigaword-200',
 'glove-wiki-gigaword-300',
 'glove-twitter-25',
 'glove-twitter-50',
 'glove-twitter-100',
 'glove-twitter-200',
 '__testing_word2vec-matrix-synopsis']

In [None]:
# Load the pre-trained 'word2vec-google-news-300' embeddings through gensim's
# downloader API; the returned object is used below for word -> vector lookups.
word2vec_model = gensim.downloader.load('word2vec-google-news-300')

In [None]:
def preprocess_sentecnce(sentence, tokenizer, lemmatizer, stop_words):
    """Reduce a raw comment to a '#'-joined string of its unique, cleaned tokens.

    Steps: drop literal ``\\n`` sequences and digits, tokenize, lower-case,
    remove stop words, lemmatize, de-duplicate and sort.

    Args:
        sentence: Raw comment text. Non-string values (e.g. NaN for a missing
            comment) are treated as an empty comment.
        tokenizer: Object exposing ``tokenize(str) -> list[str]``.
        lemmatizer: Object exposing ``lemmatize(str) -> str``.
        stop_words: Collection of lower-case words to discard.

    Returns:
        The unique lemmatized tokens in sorted order joined with '#', or an
        empty string when no token survives.
    """
    if not isinstance(sentence, str):
        return ''

    # Set membership is O(1); reuse the caller's set when one is passed in.
    if isinstance(stop_words, (set, frozenset)):
        stop_set = stop_words
    else:
        stop_set = set(stop_words)

    # NOTE: the raw string matches the two characters backslash + 'n' (an
    # escaped newline left in the text), not a real newline character. Real
    # newlines are presumably handled by the tokenizer - verify for other tokenizers.
    sentence = sentence.replace(r'\n', ' ')
    sentence = re.sub(r'\d+', '', sentence)  # remove numbers

    unique_tokens = set()
    for token in tokenizer.tokenize(sentence):
        # Lower-case BEFORE lemmatizing: the lemmatizer lookup is
        # case-sensitive, so "Bees" would otherwise stay "bees" while "bees"
        # becomes "bee", splitting one word into two vocabulary entries.
        token = token.lower()
        if token in stop_set:
            continue
        unique_tokens.add(lemmatizer.lemmatize(token).lower())

    return '#'.join(sorted(unique_tokens))  # form a string from obtained tokens

In [None]:
# Shared preprocessing resources passed to preprocess_sentecnce below.
stop_words = stopwords.words('english')  # English stop-word list from the NLTK corpus
lemmatizer = WordNetLemmatizer()
# Tokens are runs of ASCII letters or runs of digits; everything else acts as a separator.
# NOTE(review): preprocess_sentecnce strips digits before tokenizing, so the
# `[0-9]+` alternative never matches when used through that function.
tokenizer = RegexpTokenizer(r'([a-zA-Z]+|[0-9]+)')

In [None]:
# Add a column holding, for every comment, its cleaned unique tokens joined by '#'.
# The extra positional arguments are forwarded to the function after each comment.
train_df['Unique_tokens'] = train_df['Comment'].apply(
    preprocess_sentecnce,
    args=(tokenizer, lemmatizer, stop_words),
)

In [None]:
# Spot-check the preprocessing result for one arbitrary row (position 537).
train_df['Unique_tokens'].iloc[537]

'also#communication#en#fiber#free#http#laser#optical#org#part#point#require#setup#space#wiki#wikipedia'

In [None]:
# Confirm the new Unique_tokens column sits next to the original columns.
train_df.head()

Unnamed: 0,Id,Comment,Topic,Unique_tokens
0,0x840,A few things. You might have negative- frequen...,Biology,advantage#allele#alter#alternating#animal#anot...
1,0xbf0,Is it so hard to believe that there exist part...,Physics,anything#believe#detect#exist#far#find#hard#hu...
2,0x1dfc,There are bees,Biology,bee
3,0xc7e,I'm a medication technician. And that's alot o...,Biology,alot#body#care#definitely#die#drug#fine#good#i...
4,0xbba,Cesium is such a pretty metal.,Chemistry,cesium#metal#pretty


In [None]:
# Number of distinct tokens across all comments after preprocessing.
cleaned_tokens = '#'.join(train_df['Unique_tokens'].tolist()).split('#')
len(np.unique(cleaned_tokens))

15107

In [None]:
# Number of distinct whitespace-separated tokens in the raw comments, for comparison.
raw_tokens = ' '.join(train_df['Comment'].tolist()).split()
len(np.unique(raw_tokens))

34079

In [None]:
# Report how many words the pre-trained model has a vector for.
# `KeyedVectors.vocab` was removed in gensim 4.x (accessing it raises
# AttributeError); the number of rows of the embedding matrix gives the same
# count and is available in both gensim 3.x and 4.x.
print(f'The word2vec model conatins vector of {len(word2vec_model.vectors):,} words!')

The word2vec model conatins vector of 3,000,000 words!


In [None]:
# Sorted array of every distinct token produced by preprocessing.
# Comments that reduce to zero tokens are stored as '' and would otherwise
# inject an empty-string "word" into the vocabulary, so empty pieces are skipped.
all_unique_words_list = np.unique([
    token
    for joined_tokens in train_df['Unique_tokens'].tolist()
    for token in joined_tokens.split('#')
    if token
])

In [None]:
# Look up an embedding for every vocabulary token.
#   words          - tokens for which a vector was found
#   words_vectors  - the matching vectors, aligned index-by-index with `words`
#   missing_words  - tokens with no vector even after spell correction
#   cnt            - len(missing_words), kept as a separate counter
cnt = 0
words = []
words_vectors = []
missing_words = []
spell_checker = SpellChecker()

for indx, word in enumerate(all_unique_words_list):
    if indx % 2000 == 0:
        print(f'Stored {indx}/{len(all_unique_words_list)} words already!')

    # `word in model` works on both gensim 3.x and 4.x, unlike `model.vocab`,
    # which was removed in 4.x.
    lookup_word = word
    if lookup_word not in word2vec_model:
        # Fall back to the spell checker's best guess; it may return None
        # when it has no candidate.
        lookup_word = spell_checker.correction(str(word))

    if lookup_word is None or lookup_word not in word2vec_model:
        missing_words.append(word)
        cnt += 1
        continue

    words_vectors.append(word2vec_model[lookup_word])
    # Store the ORIGINAL dataset token (not the corrected spelling) so that
    # every entry of `words` is a token that actually occurs in Unique_tokens
    # and no word is listed twice.
    # NOTE(review): confirm downstream code looks tokens up by their original form.
    words.append(word)

print(f'No vector found for {cnt} of {len(all_unique_words_list)} words.')
print('First missing words:', missing_words[:20])

Stored 0/15107 words already!
aaaaaaaam
armand
bronson
abloblololo
abruptchaos
abyssinian
and
acetylpyridine
acetylsalicylic
acetylsalicys
acsnoqnsqm
alaric
agaricus
aharonov
ahukewimfpsvozahxmttabhr
akron
akshually
alcoholicsanonymous
alcohol's
aldrich
albania
alhambra
alizarine
aluminium
amanitamuscaria
amaurobiidae
amaurobius
aminoethyl
amita
anakin
analyse
analysed
analysing
andrussow
anesthesysed
anfo
anl
anomalocaris
antisolvent
antivaxer
antivaxxers
antumbra
aome
aovvawvtjzwxrnzfodxbunov
arabians
arachnoboards
archaeoglobus
archimagirus
archimedes
argiope
arius
ariolimax
arxiv
armillaria
arnold's
arrestfauci
arse
arstechnica
artantica
articlelanding
arxiv
askdocs
askentomologists
askphysics
askreddit
astrochem
atc
audubon
aurantia
austentising
autocannibalism
autohotkey
automail
automatonrobotics
avagadro
ayvbkafwg
azelaic
azidothymidine
aztec
backreaction
barnard
basidiomycota
beardocide
beautihorrifying
beber
bechbwsuuq
beeman
beermeisters
beest
behaviour
benzoic
berlinsky
ber

In [None]:
# Persist the token list and the aligned vector list to the working directory.
for pickle_path, payload in (
    ('words.pkl', words),
    ('words_vectors.pkl', words_vectors),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(payload, pickle_file)