As usual we start loading the packages that we will use in our notebook

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder 
from nltk.corpus import stopwords #provides list of english stopwords
stop = stopwords.words('english')

  from ._conv import register_converters as _register_converters


In [27]:
train_df = pd.read_csv('CommentsApril2017.csv').sample(n=50000)

  interactivity=interactivity, compiler=compiler, result=result)


In [28]:
len(train_df)

50000

In [30]:
classes = len(np.unique(train_df['newDesk'])) #we print the length, not a big one but sufficient

A key step is to label encode the target variable from text to number


In [31]:
Y = train_df['newDesk']
encoder = LabelEncoder()
encoder.fit(Y)
Y = encoder.transform(Y)
Y = tf.keras.utils.to_categorical(
    Y,
    num_classes=classes#equals to the number of languages
    
)

As we mentioned in the slides we will perform the previous text processing steps except for stopword removal.

In [32]:
train_df['sentence_lower'] = train_df["commentBody"].str.lower()
train_df['sentence_no_punctuation'] = train_df['sentence_lower'].str.replace('[^\w\s]','')
train_df['sentence_no_punctuation'] = train_df["sentence_no_punctuation"].fillna("fillna")
train_df['sentence_no_punctuation'] = train_df['sentence_no_punctuation'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))


In [33]:
max_features=5000 #we set maximum number of words to 5000
maxlen=100 #we set maximum sequence length to 400

In [34]:
tok = tf.keras.preprocessing.text.Tokenizer(num_words=max_features) #again tokenizer step

In [35]:
tok.fit_on_texts(list(train_df['sentence_no_punctuation'])) #fit to cleaned text

In [56]:
print(len(tok.word_index))
vocab_size = len(tok.word_index)+1
#this represents the number of words that we tokenize different from max_features but necessary for
#the definition of the dimension of the embedding space

112917


In [37]:
train_df = tok.texts_to_sequences(list(train_df['sentence_no_punctuation'])) #this is how we create sequences
train_df = tf.keras.preprocessing.sequence.pad_sequences(train_df, maxlen=maxlen) #let's execute pad step

In [38]:

from sklearn.model_selection import train_test_split #divide into train and test set

In [39]:
X_train, X_test, y_train, y_test = train_test_split(train_df, Y, test_size=0.1, random_state=42)

In [40]:
embedding_dim = 50 #this is the final dimension of the embedding space.


Let's write down the model

In [57]:
model = tf.keras.models.Sequential([
  tf.keras.layers.Embedding(input_dim=vocab_size, #embedding input
                           output_dim=embedding_dim,#embedding output
                           input_length=maxlen), #maximum length of an input sequence
  tf.keras.layers.Flatten(), #flatten layer

  tf.keras.layers.Dense(28, activation=tf.nn.softmax) #ouput layer a Dense layer with 4 probabilities
  #we also define our final activation function which is the softmax function typical for multiclass
  #classifiction problems

])

In [58]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy', #we recommend this loss function you
              metrics=['accuracy'])



In [59]:
model.summary() #here we show the architecture 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 50)           5645900   
_________________________________________________________________
flatten_3 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 28)                140028    
Total params: 5,785,928
Trainable params: 5,785,928
Non-trainable params: 0
_________________________________________________________________


In [60]:
model.fit(np.array(X_train), np.array(y_train), epochs=3) #let's fit the model we are not very interested in fitting 
#well the model but rather use the target variable to understand better the corpus

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1c2b512fda0>

In [61]:
data = np.array(model.layers[0].get_weights())

In [62]:
data.shape

(1, 112918, 50)

In [63]:
data = data.reshape(data.shape[0]*data.shape[1], data.shape[2])

In [64]:
data = pd.DataFrame(data)

In [73]:
len(data)
data[1:].to_csv("data_pietro.csv", sep='\t', index=False, header=False)

In [68]:
meta_data = pd.DataFrame.from_dict(list(tok.word_index))
print(len(meta_data))
print(len(tok.word_index))

112917
112917


In [69]:
meta_data.columns = ['word']
#meta_data['word'] = meta_data['word'].astype(str)

In [70]:
len(meta_data)

112917

In [71]:
meta_data['word'].to_csv("meta_data_pietro.csv", index=False, header=False)

In [72]:
meta_data = pd.read_csv("meta_data_pietro.csv")
print(len(meta_data))

112916


In [25]:
data2 = np.array(model2.layers[0].get_weights())

In [26]:
data2

array([[[ 0.00553126, -0.00665088, -0.00956161, ..., -0.00173408,
         -0.03520825, -0.00941396],
        [-0.04221674, -0.028815  ,  0.02700207, ..., -0.00346949,
          0.03863809, -0.03069094],
        [ 0.01958964,  0.00439131, -0.02145624, ...,  0.01370366,
         -0.02752153, -0.01289343],
        ...,
        [ 0.01398655, -0.0245279 ,  0.01203852, ..., -0.04977977,
          0.04090806, -0.04057592],
        [ 0.02409924, -0.01895559,  0.04764244, ...,  0.00054241,
         -0.04741385,  0.00887945],
        [-0.02240181, -0.01668267,  0.00659778, ..., -0.03599209,
         -0.03164964,  0.02837657]]], dtype=float32)

Remember the train_test_split? now we use the test to evaluate our model

In [84]:
model.evaluate(np.array(X_test), np.array(y_test)) 



[0.013595647582845701, 1.0]

LOOKS LIKE WE HAVE A PERFECT MODEL!! 
LET'S TAKE A LOOK AT THE CONFUSION MATRIX OF OUR EVALUATION SET!!

In [85]:
from sklearn.metrics import confusion_matrix #we import this package from sklearn and output it
predictions = model.predict(X_test) #here we make predictions
cm = confusion_matrix(predictions.argmax(axis=1), y_test.argmax(axis=1))#we generate the confusion matrix

In [86]:
cm #well this is really perfect!

array([[ 91,   0,   0,   0],
       [  0,  88,   0,   0],
       [  0,   0, 102,   0],
       [  0,   0,   0,  83]], dtype=int64)

Let's try brand new text

In [87]:
#these are the codes for each language in order to evaluate properly
print('english', encoder.transform(['english']))
print('french', encoder.transform(['french']))
print('italian', encoder.transform(['italian']))
print('spanish', encoder.transform(['spanish']))

english [0]
french [1]
italian [2]
spanish [3]


In this experiment we will predict the language of the same sentence in the different languages

In [97]:
#new_text = ["tensorflow is a great tool you can find a lot of tutorials from packt"]
#new_text = ["tensorflow est un excellent outil vous pouvez trouver beaucoup de tutoriels de packt"]
#new_text = ["tensorflow è un ottimo strumento puoi trovare molti tutorial di packt"]
new_text = ["tensorflow es una gran herramienta puedes encontrar muchos tutoriales de packt"]


In [98]:
test_text = tok.texts_to_sequences(new_text) #this is how we create sequences
test_text = tf.keras.preprocessing.sequence.pad_sequences(test_text, maxlen=maxlen) #let's execute pad step

In [99]:
np.set_printoptions(suppress=True)
predictions = model.predict(test_text)
print(predictions.argmax())
print(predictions) #spanish you can get confused with italian which makes sense since they are more similar languages

3
[[0.05204275 0.0830349  0.10003749 0.7648848 ]]


In [100]:
import wikipedia

Let's build a brand new data set with only spanish and let's see if we recognize it ...

In [101]:
new_wiki_text = []
wikipedia.set_lang('es')
for i in range(0, 5):
    print(i)
    random = wikipedia.random(1)
       
    try:
        new_wiki_text.append([wikipedia.page(random).summary])
    except wikipedia.exceptions.DisambiguationError as e:
        random = wikipedia.random(1)

0
1
2
3
4


In [102]:
new_wiki_text = pd.DataFrame(new_wiki_text)
new_wiki_text.columns = ['sentence']
new_wiki_text

Unnamed: 0,sentence
0,Pontefract es un pueblo del distrito de Wakefi...
1,Joan de Giorgio Vitelli i Simon (Alguer 1870 -...
2,Glitter es una banda sonora original y el octa...
3,Bajo el apelativo de Cocinas del Sureste Asiát...
4,Neolucanus diffusus es una especie de coleópte...


In [103]:
new_wiki_text['sentence_lower'] = new_wiki_text["sentence"].str.lower()
new_wiki_text['sentence_no_punctuation'] = new_wiki_text['sentence_lower'].str.replace('[^\w\s]','')
new_wiki_text['sentence_no_punctuation'] = new_wiki_text["sentence_no_punctuation"].fillna("fillna")

In [104]:
np.set_printoptions(suppress=True)
test_wiki_text = tok.texts_to_sequences(list(new_wiki_text['sentence_no_punctuation'] )) #this is how we create sequences
test_wiki_text = tf.keras.preprocessing.sequence.pad_sequences(test_wiki_text, maxlen=maxlen) #let's execute pad step

In [105]:
predictions = model.predict(test_wiki_text)
print(predictions)

[[0.00000093 0.00010869 0.00000099 0.9998894 ]
 [0.00038548 0.00102032 0.00080112 0.9977931 ]
 [0.0000383  0.00060026 0.00007764 0.99928385]
 [0.00000019 0.00000787 0.00000026 0.99999166]
 [0.00212952 0.01033464 0.00334356 0.98419225]]


In [106]:
print('english', encoder.transform(['english']))
print('french', encoder.transform(['french']))
print('italian', encoder.transform(['italian']))
print('spanish', encoder.transform(['spanish']))

english [0]
french [1]
italian [2]
spanish [3]


WE DID A GOOD JOB!!