In [None]:
from google.colab import drive
drive.mount('/content/drive')
base_path = '/content/drive/My Drive/IIR_orientation/HW3/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Text Preprocessing

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from ast import literal_eval
import pandas as pd
import numpy as np
import re

In [None]:
def read_data(filename):
  data = pd.read_csv(filename, sep='\t')
  data['tags'] = data['tags'].apply(literal_eval)  # str to list
  return data

In [None]:
train = read_data(base_path+'/train.tsv')
validation = read_data(base_path+'/validation.tsv')
test = pd.read_csv(base_path+'/test.tsv', sep='\t')

In [None]:
train.head()

Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,[r]
1,mysql select all records where a datetime fiel...,"[php, mysql]"
2,How to terminate windows phone 8.1 app,[c#]
3,get current time in a specific country via jquery,"[javascript, jquery]"
4,Configuring Tomcat to Use SSL,[java]


In [None]:
X_train, y_train = train['title'].values, train['tags'].values
X_val, y_val = validation['title'].values, validation['tags'].values
X_test = test['title'].values

#### Task 1 (TextPrepare).
#### Implement the function text_prepare following the instructions.text_prepare

In [None]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
  text = text.lower() # lowercase text
  text = re.sub(REPLACE_BY_SPACE_RE, ' ', text)  # replace REPLACE_BY_SPACE_RE symbols by space in text
  text = re.sub(BAD_SYMBOLS_RE, '', text) # delete symbols which are in BAD_SYMBOLS_RE from text
  
  # delete stopwords from text
  word_tokens = word_tokenize(text)  
  filtered_sentence = [w for w in word_tokens if not w.lower() in STOPWORDS]
  text = ' '.join(filtered_sentence)

  
  return text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def test_text_prepare():
  examples = ["SQL Server - any equivalent of Excel's CHOOSE function?",
              "How to free c++ memory vector<int> * arr?"]
  answers = ["sql server equivalent excels choose function", 
              "free c++ memory vectorint arr"]
  for ex, ans in zip(examples, answers):
    if text_prepare(ex) != ans:
        return "Wrong answer for the case: '%s'" % ex
  return 'Basic tests are passed.'

In [None]:
test_text_prepare()

'Basic tests are passed.'

#### Task 2 (WordsTagsCount)
#### Find 3 most popular tags and 3 most popular words in the train data 

In [None]:
from collections import Counter

# Dictionary of all tags from train corpus with their counts.
tags_counts = {}
# Dictionary of all words from train corpus with their counts.
words_counts = {}

#words_counts are dictionaries like {'some_word_or_tag': frequency}. 
tags_counts = Counter(np.concatenate(y_train))

words_counts = Counter(np.concatenate([re.findall(r'\w+', text_prepare(x)) for x in X_train]))



In [None]:
most_common_tags = sorted(tags_counts.items(), key=lambda x: x[1], reverse=True)[:3]
print(most_common_tags)
most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:3]
print(most_common_words)

[('javascript', 19078), ('c#', 19077), ('java', 18661)]
[('using', 8279), ('c', 8183), ('php', 5624)]


## Transforming text to a vector
#### Machine Learning algorithms work with numeric data and we cannot use the provided text data "as is". There are many ways to transform text data to numeric vectors. In this task you will try to use two of them.

### Bag of words
#### One of the well-known approaches is a bag-of-words representation. To create this transformation, follow the steps:



1.   Find N most popular words in train corpus and numerate them. Now we have a dictionary of the most popular words.
2.  For each title in the corpora create a zero vector with the dimension equals to N.
3.  For each text in the corpora iterate over words which are in the dictionary and increase by 1 the corresponding coordinate.



In [None]:
# Implement the described encoding in the function my_bag_of_words with the size of the dictionary equals to 5000. 
DICT_SIZE = 5000
WORDS_TO_INDEX = {}
INDEX_TO_WORDS = {}
for i, key in enumerate(dict(sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:5000])):
  WORDS_TO_INDEX[key]=i
  INDEX_TO_WORDS[i]=key
ALL_WORDS = WORDS_TO_INDEX.keys()


def my_bag_of_words(text, words_to_index, dict_size):
  """
      text: a string
      dict_size: size of the dictionary
      
      return a vector which is a bag-of-words representation of 'text'
  """
  result_vector = np.zeros(dict_size)
  for word in text.split():
    if word in words_to_index:
      result_vector[words_to_index[word]]=1    
    
    
  return result_vector

In [None]:
def test_my_bag_of_words():
  words_to_index = {'hi': 0, 'you': 1, 'me': 2, 'are': 3}
  examples = ['hi how are you']
  answers = [[1, 1, 0, 1]]
  for ex, ans in zip(examples, answers):
    if (my_bag_of_words(ex, words_to_index, 4) != ans).any():
      return "Wrong answer for the case: '%s'" % ex
  return 'Basic tests are passed.'

In [None]:
print(test_my_bag_of_words())

Basic tests are passed.


In [None]:
from scipy import sparse as sp_sparse

In [None]:
X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])
print('X_train shape ', X_train_mybag.shape)
print('X_val shape ', X_val_mybag.shape)
print('X_test shape ', X_test_mybag.shape)

X_train shape  (100000, 5000)
X_val shape  (30000, 5000)
X_test shape  (20000, 5000)


### Task 3 (BagOfWords). 
#### For the 11th row in X_train_mybag find how many non-zero elements it has.

In [None]:
Counter(X_train_mybag[10].toarray()[0])

Counter({0.0: 4994, 1.0: 6})

### TF-IDF

In [None]:
print('X_train ', X_train.shape) 
print('X_test ', X_test.shape) 
print('X_val ',X_val.shape)

X_train  (100000,)
X_test  (20000,)
X_val  (30000,)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def tfidf_features(X_train, X_val, X_test):
  """
      X_train, X_val, X_test — samples        
      return TF-IDF vectorized representation of each sample and vocabulary
  """
  # Create TF-IDF vectorizer with a proper parameters choice
  # Fit the vectorizer on the train set
  # Transform the train, test, and val sets and return the result
  
  tfidf_vectorizer =  TfidfVectorizer(min_df=5,max_df=0.9,ngram_range=(1,2),token_pattern= '(\S+)')#  '(\S+)'  means any no white space
  X_train=tfidf_vectorizer.fit_transform(X_train)
  X_val=tfidf_vectorizer.transform(X_val)
  X_test=tfidf_vectorizer.transform(X_test)

  
  return X_train, X_val, X_test, tfidf_vectorizer.vocabulary_

In [None]:
X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_val, X_test)
tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}

In [None]:
print('X_train_tfidf ', X_train_tfidf.shape) 
print('X_test_tfidf ', X_test_tfidf.shape) 
print('X_val_tfidf ',X_val_tfidf.shape)


X_train_tfidf  (100000, 26875)
X_test_tfidf  (20000, 26875)
X_val_tfidf  (30000, 26875)


check whether you have c++ or c# in your vocabulary, as they are obviously important tokens in our tags prediction task:

In [None]:
if 'c++' in tfidf_vocab.keys():
  print('c++')
if 'c#' in tfidf_vocab.keys():
  print('c#')

c++
c#


If you can't find it, we need to understand how did it happen that we lost them? It happened during the built-in tokenization of TfidfVectorizer. Luckily, we can influence on this process. Get back to the function above and use '(\S+)' regexp as a token_pattern in the constructor of the vectorizer.

## MultiLabel classifier


*   compare the quality of the bag-of-words and TF-IDF approaches 




In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

mlb = MultiLabelBinarizer(classes=sorted(tags_counts.keys()))
y_train = mlb.fit_transform(y_train) # it chnage the y_train in feature form like alll clases with 0,1 value
y_val = mlb.fit_transform(y_val)

mybag_Classifier = OneVsRestClassifier(LogisticRegression(solver='lbfgs', max_iter=400)).fit(X_train_mybag,y_train)
tfidf_Classifier = OneVsRestClassifier(LogisticRegression(solver='lbfgs', max_iter=400)).fit(X_train_tfidf,y_train)

In [None]:
y_val_predicted_labels_mybag = mybag_Classifier.predict(X_val_mybag)
y_val_predicted_labels_tfidf = tfidf_Classifier.predict(X_val_tfidf)

#### Evaluation


*   accuracy
*   F1-score macro/micro
* Precision macro/micro



In [None]:
from sklearn.metrics import accuracy_score, f1_score, average_precision_score

def print_evaluation(y_val, y_val_pred):
  accuracy = accuracy_score(y_val, y_val_pred)
  f1_score_macro = f1_score(y_val, y_val_pred, average='macro')
  f1_score_micro = f1_score(y_val, y_val_pred, average='micro')
  precision_macro = average_precision_score(y_val, y_val_pred, average='macro')
  precision_micro = average_precision_score(y_val, y_val_pred, average='micro')
  print('accuracy:', accuracy)
  print('F1-score macro:', f1_score_macro)
  print('F1-score micro:', f1_score_micro)
  print('Precision macro:', precision_macro)
  print('Precision micro:', precision_micro)  

In [None]:
print('Bag of words:')
print_evaluation(y_val, y_val_predicted_labels_mybag)
print('TF-IDF:')
print_evaluation(y_val, y_val_predicted_labels_tfidf)

Bag of words:
accuracy: 0.09733333333333333
F1-score macro: 0.1964697202183837
F1-score micro: 0.2893387759589879
Precision macro: 0.10627894537898579
Precision micro: 0.14889076989808317
TF-IDF:
accuracy: 0.28596666666666665
F1-score macro: 0.37940186239086443
F1-score micro: 0.5843944448933885
Precision macro: 0.2478332431308088
Precision micro: 0.40054288565793844


## Word2Vec

#### ex: mean(word embeddings) --> MLP
#### ex: word embeddings --> LSTM

In [None]:
X_train, y_train = train['title'].values, train['tags'].values
X_val, y_val = validation['title'].values, validation['tags'].values
X_test = test['title'].values

In [None]:
import gensim.downloader
print(list(gensim.downloader.info()['models'].keys()))
Word2Vec_model = gensim.downloader.load('glove-wiki-gigaword-50')

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [None]:
def Word2VecFunc(wordList):
  res = []
  for word in wordList:
    try:
      res.append(Word2Vec_model[word])
    except:
      pass
  return res

In [None]:
def MLPDataPreprocessing(X, y):
  DelID = []
  for i,sentence in enumerate(X):
    if not Word2VecFunc(re.findall(r'\w+', text_prepare(sentence))):
      DelID.append(i)
  X_word2vec = np.delete(X, DelID)
  y_word2vec = np.delete(y, DelID)
  
  X_word2vec = np.array([np.mean(Word2VecFunc(re.findall(r'\w+', text_prepare(sentence))),axis=0) for sentence in X_word2vec])
  y_word2vec = mlb.fit_transform(y_word2vec) # it chnage the y_train in feature form like alll clases with 0,1 value

  return X_word2vec, y_word2vec

  

In [None]:
X_train_word2vec, y_train_word2vec = MLPDataPreprocessing(X_train, y_train)
X_val_word2vec, y_val_word2vec = MLPDataPreprocessing(X_val, y_val)

In [None]:
from sklearn.neural_network import MLPClassifier

model = OneVsRestClassifier(MLPClassifier(hidden_layer_sizes=(10,10,5), activation="relu", solver='adam', alpha=0.0001, learning_rate_init=0.001,max_iter=400)).fit(X_train_word2vec, y_train_word2vec)

In [None]:
y_val_predicted = model.predict(X_val_word2vec)

In [None]:
print_evaluation(y_val_word2vec, y_val_predicted)

accuracy: 0.15182965616525496
F1-score macro: 0.2666527655277241
F1-score micro: 0.4026181314262383
Precision macro: 0.14846712475497198
Precision micro: 0.21557978649355372


# MLP嘗試的一些參數:

### 只有一層(10):
- accuracy: 0.14094880980280558
- F1-score macro: 0.21407210680997196
- F1-score micro: 0.36824084243168165
- Precision macro: 0.127640284291824
- Precision micro: 0.19703320701906632

### 兩層(10, 5):
- accuracy: 0.14489939402055643
- F1-score macro: 0.26559877112830266
- F1-score micro: 0.38986502726019256
- Precision macro: 0.15123081713699052
- Precision micro: 0.21051468544977722

### 三層(10, 10, 5):
- accuracy: 0.14697512471123908
- F1-score macro: 0.2792130877669789
- F1-score micro: 0.4030414484031158
- Precision macro: 0.15656189372989365
- Precision micro: 0.2162811328943847



In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=8278, lower=True)
tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)
X_train_lstm_word2vec = pad_sequences(sequences, maxlen=max([len(item) for item in sequences]))
y_train_lstm_word2vec = mlb.fit_transform(y_train)

In [None]:
# LSTM
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense,LSTM,Dropout,GRU,Embedding

embedding_matrix = Word2Vec_model.vectors

model = Sequential()
model.add(Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_matrix.shape[1],
    input_length=max([len(item) for item in sequences]),
    weights=[embedding_matrix],
    trainable=False))
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.2, return_sequences = True))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(100,activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 50)            20000000  
_________________________________________________________________
lstm (LSTM)                  (None, 30, 256)           314368    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dense (Dense)                (None, 100)               12900     
Total params: 20,524,388
Trainable params: 524,388
Non-trainable params: 20,000,000
_________________________________________________________________


In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
callbacks = [
    EarlyStopping(patience=4), 
    ModelCheckpoint(filepath='/content/drive/My Drive/IIR_orientation/HW3/lstm_word2vec_model_2layers_20epochs.h5', save_best_only=True)
]

In [None]:
sequences = tokenizer.texts_to_sequences(X_val)
X_val_lstm_word2vec = pad_sequences(sequences, maxlen=30)
y_val_lstm_word2vec = mlb.fit_transform(y_val)

In [None]:
history = model.fit(X_train_lstm_word2vec, y_train_lstm_word2vec, epochs=20, batch_size=32, validation_data=(X_val_lstm_word2vec, y_val_lstm_word2vec), callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
from tensorflow import keras

model = keras.models.load_model('/content/drive/My Drive/IIR_orientation/HW3/lstm_word2vec_model_2layers_20epochs.h5')



In [None]:
y_val_predicted = model.predict(X_val_lstm_word2vec)
y_val_predicted = np.where(y_val_predicted>0.5, 1, y_val_predicted)
y_val_predicted = np.where(y_val_predicted<0.5, 0, y_val_predicted)  

In [None]:
print_evaluation(y_val_lstm_word2vec, y_val_predicted)

accuracy: 0.2960333333333333
F1-score macro: 0.4242628468837895
F1-score micro: 0.6065254946272776
Precision macro: 0.2889776781597249
Precision micro: 0.4073279783086749


# LSTM嘗試的一些參數
### 只有一層(256)&Epochs(15):
- accuracy: 0.2194
- F1-score macro: 0.32790583598756134
- F1-score micro: 0.5253077975376197
- Precision macro: 0.21710522061134882
- Precision micro: 0.3281097064167475

### 兩層(256, 128)&Epochs(15):
- accuracy: 0.2894
- F1-score macro: 0.41511924342947515
- F1-score micro: 0.6017966676817331
- Precision macro: 0.2821232301586698
- Precision micro: 0.40153045773160667

### 兩層(256, 128)&Epochs(20):
- accuracy: 0.2960333333333333
- F1-score macro: 0.4242628468837895
- F1-score micro: 0.6065254946272776
- Precision macro: 0.2889776781597249
- Precision micro: 0.4073279783086749