# Word Embedding (WE)

## Gensim testing

In [1]:
# NOTE(review): presumably checks that gensim is usable before the rest of the
# notebook runs — confirm against nltk.test.gensim_fixt.
from nltk.test.gensim_fixt import setup_module
setup_module()

In [2]:
import nltk


In [3]:
import gensim
from nltk.corpus import brown

# Train a Word2Vec model, with gensim's default hyper-parameters, on the first
# 10,000 sentences of the Brown corpus.
train_set = brown.sents()[:10000]
model = gensim.models.Word2Vec(sentences=train_set)



In [4]:
# Write the trained model to 'brown.embedding', then read it back into a
# separate variable so the following cells exercise the reloaded copy.
model.save('brown.embedding')
new_model = gensim.models.Word2Vec.load('brown.embedding')

In [5]:
# Length of the vector the reloaded model stores for 'university'.
len(new_model.wv['university'])

100

In [6]:
# Sanity check: is the similarity of 'university' and 'school' above 0.3?
new_model.wv.similarity('university', 'school') > 0.3

True

In [7]:
from nltk.data import find

# Resolve the path of the pruned word2vec sample through nltk.data.find and
# load it as text-format (binary=False) keyed vectors.
# NOTE: this rebinds `model`, replacing the Word2Vec model trained on Brown.
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_sample,
    binary=False,
)

In [8]:
# Length of one pretrained vector. Only the last expression of a cell is
# displayed, so this cell holds just the value it reports; evaluate
# `len(model)` in its own cell to see the vocabulary size.
len(model['university'])

300

In [9]:
# Three nearest neighbours of 'university' in the pretrained vectors.
model.most_similar(positive=['university'], topn=3)

[('universities', 0.7003918290138245),
 ('faculty', 0.6780906915664673),
 ('undergraduate', 0.6587096452713013)]

In [10]:
 # Ask the model which of the four words fits least with the others
 # (the recorded output is 'cereal').
 model.doesnt_match('breakfast cereal dinner lunch'.split())

'cereal'

## spaCy testing

In [12]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
     -------------------------------------- 12.8/12.8 MB 321.5 kB/s eta 0:00:00
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [13]:
import spacy
from spacy.lang.en.examples import sentences

# Load the small English pipeline downloaded in the previous cell.
model2 = spacy.load('en_core_web_sm')

In [15]:
# Run the pipeline on the first bundled example sentence and display the result.
doc = model2(sentences[0])
doc

Apple is looking at buying U.K. startup for $1 billion

In [17]:
# One line per token: its text followed by the pos_, dep_ and shape_ attributes.
for token in doc:
    print(token.text, token.pos_, token.dep_, token.shape_)

Apple PROPN nsubj Xxxxx
is AUX aux xx
looking VERB ROOT xxxx
at ADP prep xx
buying VERB pcomp xxxx
U.K. PROPN compound X.X.
startup NOUN dobj xxxx
for ADP prep xxx
$ SYM quantmod $
1 NUM compound d
billion NUM pobj xxxx


In [19]:
# Similarity score for every ordered pair of tokens in the sentence
# (n * n lines of output, including each token paired with itself).
for x in doc:
    for y in doc:
        print(x.text, y.text, x.similarity(y))

Apple Apple 1.0
Apple is -0.03927920013666153
Apple looking -0.1143060103058815
Apple at -0.16858862340450287
Apple buying -0.044184841215610504
Apple U.K. 0.2328547090291977
Apple startup 0.12263332307338715
Apple for -0.1312788426876068
Apple $ -0.006184321828186512
Apple 1 0.12815910577774048
Apple billion -0.03881775960326195
is Apple -0.03927920013666153
is is 1.0
is looking 0.1738957315683365
is at 0.001597534865140915
is buying 0.004276538733392954
is U.K. -0.07689043879508972
is startup -0.07900673896074295
is for -0.09975917637348175
is $ -0.040489550679922104
is 1 0.03606772795319557
is billion -0.1092299073934555
looking Apple -0.1143060103058815
looking is 0.1738957315683365
looking looking 1.0
looking at 0.05799964442849159
looking buying 0.5491272807121277
looking U.K. -0.03640978783369064
looking startup 0.05116286128759384
looking for -0.08290843665599823
looking $ -0.2658574879169464
looking 1 -0.13110095262527466
looking billion 0.12498926371335983
at Apple -0.1685886

  print(x.text , y.text , x.similarity(y))


In [23]:
# Run the pipeline on a few words, the last one a nonsense string, and report
# the vector-related attributes of each token.
new = model2('apple looking billion hghdrj')

for i in new:
    print(i.text, i.has_vector, i.vector_norm, i.is_oov)

apple True 7.164267 True
looking True 8.166646 True
billion True 7.531427 True
hghdrj True 5.694851 True


# Part 2

In [9]:
import pandas as pd

# Load the dataset, skipping malformed rows. `on_bad_lines='skip'` is the
# replacement for `error_bad_lines=False`, which was deprecated in pandas 1.3
# and removed in pandas 2.0.
data = pd.read_csv('Data.csv', encoding='latin1', on_bad_lines='skip')

In [None]:
# Preview the first rows (`head(data)` is R syntax; in pandas it is a method).
data.head()

#### Data Processing 

In [None]:
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# NOTE(review): the name of the column holding the raw text is not visible in
# this notebook; 'text' is an assumption — confirm against Data.csv.
TEXT_COLUMN = 'text'

stop_words = set(stopwords.words("english"))

# word_tokenize works on a single string, so each document is tokenized on its
# own. Missing values become empty documents so rows stay aligned with `data`.
tokenized_text = [
    [word for word in word_tokenize(document) if word.lower() not in stop_words]
    for document in data[TEXT_COLUMN].fillna('').astype(str)
]

# Flat views of the same tokens: `tokens` as one list, `corpus` as the single
# string that the word cloud cell passes to WordCloud.generate.
tokens = [word for document_tokens in tokenized_text for word in document_tokens]
corpus = ' '.join(tokens)

dictionary = corpora.Dictionary(tokenized_text)
# dictionary.dfs maps token id -> number of documents containing that token.
# NOTE(review): use dictionary.cfs instead if total occurrence counts are wanted.
term_frequencies = {dictionary[idx]: freq for idx, freq in dictionary.dfs.items()}
sorted_terms = sorted(term_frequencies.items(), key=lambda x: x[1], reverse=True)
for term, freq in sorted_terms[:10]:
    print(term, freq)


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# NOTE(review): `corpus` is not defined in this cell; it is handed straight to
# WordCloud.generate, so it is presumably one long string built in an earlier
# cell — confirm.
wordcloud = WordCloud(width=600, height=200).generate(corpus)

# Show the rendered cloud as an image without axis decorations.
fig, ax = plt.subplots(figsize=(15, 5))
ax.imshow(wordcloud, interpolation='linear')
ax.axis('off')
plt.show()

## Word Embedding: Word2Vec

In [None]:
import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


# Train Word2Vec on the tokenized documents (one list of tokens per document).
model = Word2Vec(tokenized_text, vector_size=150, window=5, min_count=1, workers=4)

# Represent each document by the mean of its word vectors.
# The loop walks the tokenized documents (iterating a DataFrame would only
# yield its column labels), and the per-document list has its own name so it
# does not overwrite the `vectors` accumulator.
vectors = []
for tokens in tokenized_text:
    word_vectors = [model.wv[word] for word in tokens if word in model.wv]
    if word_vectors:
        vectors.append(np.mean(word_vectors, axis=0))
    else:
        # No known word in this document: leave the entry empty so that
        # pad_sequences fills the whole row with zeros.
        vectors.append([])

# Stack the document vectors into one float32 matrix, one row per document.
padded_vectors = pad_sequences(vectors, padding='post', dtype='float32')

# Labels to predict.
target = data['author']

# Hold out 20% of the documents for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    padded_vectors, target, test_size=0.2, random_state=42
)

# Fit a support-vector classifier with scikit-learn's default settings.
clf = SVC()
clf.fit(X_train, y_train)

# Score the predictions on the held-out documents.
result = clf.predict(X_test)
accuracy = accuracy_score(y_test, result)