<a href="https://colab.research.google.com/github/amazuzu/hackaton_2021/blob/master/day5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Final Research

In [None]:
from sklearn.datasets import fetch_20newsgroups
# Load the 'train' and 'test' subsets of the 20 Newsgroups corpus, in that order.
# The first call downloads the dataset if it is not cached yet.
twenty_train, twenty_test = (
    fetch_20newsgroups(subset=split_name, shuffle=True)
    for split_name in ('train', 'test')
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [None]:
# !unzip /content/sample_data/glove.6B.50d.txt.zip
!unzip /content/drive/MyDrive/hackaton_2021/glove.6B.50d.txt.zip

Archive:  /content/drive/MyDrive/hackaton_2021/glove.6B.50d.txt.zip
  inflating: glove.6B.50d.txt        
  inflating: __MACOSX/._glove.6B.50d.txt  


In [None]:
import re

def preprocess_text(sen):
    """Normalise one raw document into lower-case, space-separated letter runs.

    Steps, in order:
      1. every character that is not an ASCII letter becomes a space;
      2. single-letter tokens that are surrounded by whitespace are dropped;
      3. runs of whitespace collapse to a single space;
      4. the result is lower-cased.

    Args:
        sen: The document as a single ``str``. Passing a list raises
            ``TypeError`` from ``re.sub``; map this function over the items
            instead.

    Returns:
        The cleaned string. A single letter at the very start or very end of
        the text is kept, and a leading/trailing space is not stripped.
    """
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sen)

    # Single character removal.
    # The whitespace after the letter is only looked at, not consumed, so it is
    # still available as the leading whitespace of the next match. Consuming it
    # (as `\s+[a-zA-Z]\s+` does) skips every second letter in a run such as
    # "x a b c y", which would come out as "x b y" instead of "x y".
    sentence = re.sub(r"\s+[a-zA-Z](?=\s)", '', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence.lower()

In [None]:
from sklearn.model_selection import train_test_split

# Clean every document of the training subset.
tt_data = list(map(preprocess_text, twenty_train.data))

# Hold out 25% of the cleaned documents (with their labels); the seed is fixed
# so the split is the same on every run.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    tt_data,
    twenty_train.target,
    test_size=0.25,
    random_state=1000,
)

In [None]:
from keras.preprocessing.text import Tokenizer

# Build the word index from the training sentences only.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

# Encode both splits with that same tokenizer (train first, then test).
X_train, X_test = (
    tokenizer.texts_to_sequences(split_sentences)
    for split_sentences in (sentences_train, sentences_test)
)

In [None]:
# Number of rows for the embedding table: one per word in the tokenizer's
# word index, plus one because index 0 is reserved.
# NOTE(review): the tokenizer is created with num_words=5000, so presumably
# only a small part of these indices ever occurs in the encoded sequences —
# confirm that sizing by the full word index is intended.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

# Fit ONE encoder on the training labels and reuse it for the held-out labels.
# Fitting a second encoder on y_test would derive the class -> integer mapping
# from whatever labels happen to be in that split, so the two mappings could
# silently disagree; with a shared encoder an unseen label raises instead.
encoder = LabelEncoder()
encoder.fit(y_train)
encoded_Y = encoder.transform(y_train)
encoded_Y_test = encoder.transform(y_test)

# Pin the one-hot width to the number of classes the encoder knows, so both
# matrices have the same number of columns even if the highest class index is
# missing from one of the splits.
num_classes = len(encoder.classes_)
dummy_y = np_utils.to_categorical(encoded_Y, num_classes=num_classes)
dummy_y_test = np_utils.to_categorical(encoded_Y_test, num_classes=num_classes)

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Common sequence length fed to the network; shorter sequences are padded at
# the end ('post').
maxlen = 1000

X_train, X_test = (
    pad_sequences(split_sequences, padding='post', maxlen=maxlen)
    for split_sequences in (X_train, X_test)
)

In [None]:
from keras.models import Sequential
from keras import layers
import tensorflow as tf

# Width of each learned word vector.
embedding_dim = 40

print(f'vocab_size={vocab_size} input_length={maxlen}')

# Network: embedding -> global max pooling over the sequence axis
# -> 5-unit ReLU layer -> 20-way softmax.
# (A Flatten() layer in place of the pooling layer was tried and is disabled.)
model = Sequential(
    [
        layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            input_length=maxlen,
            name='embed',
        ),
        layers.GlobalMaxPool1D(),
        layers.Dense(5, activation='relu', name='alpha'),
        layers.Dense(20, activation='softmax', name='beta'),
    ],
    name='xnet',
)

# One-hot targets, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

vocab_size=78986 input_length=1000
Model: "xnet"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embed (Embedding)            (None, 1000, 40)          3159440   
_________________________________________________________________
global_max_pooling1d (Global (None, 40)                0         
_________________________________________________________________
alpha (Dense)                (None, 5)                 205       
_________________________________________________________________
beta (Dense)                 (None, 20)                120       
Total params: 3,159,765
Trainable params: 3,159,765
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train silently for 20 epochs with batches of 20 samples, scoring the
# held-out split after every epoch; the per-epoch metrics are kept in `history`.
history = model.fit(
    x=X_train,
    y=dummy_y,
    batch_size=20,
    epochs=20,
    validation_data=(X_test, dummy_y_test),
    verbose=False,
)

In [None]:
# Report accuracy on the training split first, then on the held-out split.
# After the loop `loss` and `accuracy` hold the held-out values.
for _report_line, _inputs, _targets in (
    ("Training Accuracy: {:.4f}", X_train, dummy_y),
    ("Testing Accuracy:  {:.4f}", X_test, dummy_y_test),
):
    loss, accuracy = model.evaluate(_inputs, _targets, verbose=False)
    print(_report_line.format(accuracy))

Training Accuracy: 0.9855
Testing Accuracy:  0.6889


In [None]:
import numpy as np

def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix for ``word_index`` from a text vector file.

    Each non-blank line of the file is expected to hold a word followed by its
    vector components, separated by whitespace.

    Args:
        filepath: Path to the UTF-8 encoded vector file.
        word_index: Mapping from word to its integer row index (index 0 is
            reserved and never assigned to a word).
        embedding_dim: Number of leading vector components to keep per word.

    Returns:
        A ``(len(word_index) + 1, embedding_dim)`` float array. Rows of words
        that do not occur in the file, and row 0, stay all-zero.

    Raises:
        ValueError: If a matching line carries fewer than ``embedding_dim``
            components (the row assignment cannot be broadcast).
    """
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Decode explicitly as UTF-8: the vocabulary contains non-ASCII words, and
    # the platform's default encoding is not guaranteed to be UTF-8.
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (e.g. trailing newline at end of file): nothing to read.
                continue
            word, vector = fields[0], fields[1:]
            if word in word_index:
                idx = word_index[word]
                # Keep only the first `embedding_dim` components of the vector.
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [None]:
# Look up a pretrained vector for every word the tokenizer knows, keeping the
# first 40 components of each vector found in the GloVe file.
embedding_dim = 40
embedding_matrix = create_embedding_matrix(
    filepath='/content/glove.6B.50d.txt',
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)

In [None]:
# Count the rows that contain at least one non-zero component, i.e. the rows
# that were filled from the vector file, and show them as a share of the vocabulary.
nonzero_elements = int(embedding_matrix.any(axis=1).sum())
nonzero_elements / vocab_size

0.6326437596536095

In [None]:
# Sub-dictionary of the tokenizer's word index restricted to words that
# contain a hyphen.
foodict = {
    word: index
    for word, index in tokenizer.word_index.items()
    if '-' in word
}


In [None]:
def collect_words(filepath):
    """Return the first whitespace-separated token of every non-blank line.

    Used to list the words contained in a text vector file whose lines have
    the form ``word v1 v2 ...``.

    Args:
        filepath: Path to the UTF-8 encoded vector file.

    Returns:
        A list of words in file order (duplicates, if any, are kept).
    """
    words = []

    # Decode explicitly as UTF-8 rather than relying on the platform default.
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            # Only the first token is needed, so stop splitting after it.
            fields = line.split(maxsplit=1)
            if fields:  # skip blank lines instead of failing on them
                words.append(fields[0])

    return words

In [None]:
# List of every word that has a vector in the GloVe file (first token of each line).
all_words = collect_words('/content/glove.6B.50d.txt')

In [None]:
# Print every tokenizer word that has no vector in the GloVe file.
# `all_words` is a list, so `w not in all_words` scans it from the start for
# every vocabulary word; building a set once makes each lookup constant-time
# and lets the loop finish instead of having to be interrupted.
glove_vocabulary = set(all_words)
for w in tokenizer.word_index.keys():
  if w not in glove_vocabulary:
    print(w)

bhj
utexas
bxn
udel
xterm
uchicago
argic
ripem
iastate
optilink
umich
colostate
contrib
rlk
oname
uxa
dseg
cunixb
okz
vnet
fnal
sdpa
vnews
solntze
imho
ingr
mcrcim
zv
uww
fprintf
bontchev
uicvm
acns
uwaterloo
psuvm
clh
utoronto
colormap
nrhj
baalke
yfn
uuencode
uoknor
hernlem
pixmap
gtoal
behanna
wustl
qy
thanx
xfree
kaldis
forsale
xview
spdcc
nsmca
utkvm
msdos
wwiz
ioccc
ccwf
fbihh
msstate
okcforum
qtm
jxp
ulowell
lciii
sternlight
lerc
ucsu
mcovingt
ghj
nuy
steveh
jmd
stderr
rtsg
vmk
noring
openwindows
zisfein
xpert
umanitoba
ultb
useragent
nuntius
gfx
ohanus
appressian
gizw
hadn
rkba
rck
uky
xxdate
beauchaine
umcc
xdm
lhz
uio
arromdee
osrhe
netnews
aisun
kjz
enet
mydisplay
xvoid
vpic
speedstar
klj
intrinsics
husc
bxom
dtmedin
catbyte
argv
ucalgary
rwing
calpoly
dpy
bhjn
skndiv
olchowy
golchowy
strnlght
megatek
atlantaga
mangoe
dividian
sunysb
idbsu
rayshade
maxbyte
buphy
wuarchive
uoregon
nswc
nodak
psilink
dbstu
stdin
enterpoop
nriz
shostack
lyuda
lmsc
nysernet
cbnewsj
alink
tclock


KeyboardInterrupt: ignored

In [None]:
# Show four training sentences (positions 1 to 4) for a quick visual check.
sentences_train[1:5]

["From: tervio@katk.Helsinki.FI (TERVI| MARKO J)\nSubject: Realignment in 2000\nOrganization: University of Helsinki, Computing Centre\nLines: 43\n\n   Well, here it is, NHL in the year 2000.\nI got these from a very reliable source in a dream some years ago and \nalthough I initially thought I had just been taking too many too strong \ndrugs now it seems the realization has really begun...  You can see the \nleague has already started to move to this direction.\n\n   *The Walt Disney Conference*\nAnaheim Mighty Chipmunks    -Franchise name to be changed after each new \nLA Kings                      hockey movie         \nLA Flames                   -We've seen some of that\nSan Jose Sharks\nSan Diego Bruins\nTijuana Red Wings   -Detroit's hockey team will follow its car industry...\nDallas Stars           \nHouston Oilers\nTexas Rangers\nSeattle Canucks\n\n   *The Norm Green Conference*\nAlabama White Hawks\nBiloxi Blues\nTampa Bay Lightning\nMiami Blades\nHelsinki Jets        -You'v

In [None]:
# Two tiny sample strings with digits embedded in words, used to try out preprocess_text.
tst = ['fooa13 bar','baz groo45d']

In [None]:
# Demonstration: preprocess_text works on ONE string, so handing it the whole
# list fails with a TypeError. The error is caught and shown so that the
# notebook still runs top to bottom (Restart & Run All) past this cell.
try:
    preprocess_text(tst)
except TypeError as err:
    print(f'TypeError: {err}')

TypeError: ignored

In [None]:
# Clean the sample strings one at a time.
tst = list(map(preprocess_text, tst))

In [None]:
# Show the cleaned sample strings.
print(tst)

['fooa bar', 'baz groo d']
