## Importing Libraries


In [41]:
import tensorflow as tf
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from sklearn.preprocessing import MultiLabelBinarizer


## Read and Display data


In [42]:
train_df = pd.read_csv("/content/drive/MyDrive/Document Tag Generator/stackoverflow.csv")
train_df.head()

Unnamed: 0.1,Unnamed: 0,Text,Tags
0,2,aspnet site maps has anyone got experience cre...,"['sql', 'asp.net']"
1,4,adding scripting functionality to net applicat...,"['c#', '.net']"
2,5,should i use nested classes in this case i am ...,['c++']
3,6,homegrown consumption of web services i have b...,['.net']
4,8,automatically update version number i would li...,['c#']


In [43]:
train_df['Tags'].value_counts()

['java']                            5894
['c#']                              4409
['android']                         4270
['python']                          4176
['c++']                             3689
                                    ... 
['c#', '.net', 'mysql']                1
['c#', 'java', 'python', 'c++']        1
['php', 'jquery', 'html', 'css']       1
['php', 'mysql', 'c']                  1
['jquery', 'ios', 'iphone']            1
Name: Tags, Length: 438, dtype: int64

In [44]:
train_df['Tags'] = train_df['Tags'].apply(lambda x: ast.literal_eval(x))


In [45]:
# Let's visualize some random training examples
import random

random_index = random.randint(0, len(train_df) - 5)  # create random indexes not higher than the total number of samples
for row in train_df[["Text", "Tags"]][random_index:random_index + 5].itertuples():
    _, text, labels = row
    print(f"Text:\n{text}\n")
    print(f"Labels: {labels}")

    print("---\n")

Text:
how do i compare two timestamps in c i am writing a socket program that maintains fifo queues for two input sockets when deciding which queue to service the program pulls the most recent timestamp from each queue i need a reliable method for comparing two timeval structs i tried using timercmp but my version of gcc does not support it and documentation states that the function is not posix compliantwhat should i do

Labels: ['c']
---

Text:
how to prevent newlineline break within a this is my codeform namepublishphp include locationselectorhtml input typesubmit valuesubmit formwhen thisplayed there is a newline preceding the submitbutton how to eliminate this newlineline breakthe html content of locationselectorhtml istabletrtde12eaocae a aatdtda aocoae a aatdtrtrtd aligncenterselect namepref onchangechangepreftrueoption value99a a12option value0aeoption value1ecoption value2a2coption value3aacoption value4coption value5aa12coption value6ca3coption value7e acoption value8 coption

## One hot encoding of labels


In [46]:
multilabel = MultiLabelBinarizer()
labels = multilabel.fit_transform(train_df['Tags'])
labels

array([[0, 0, 1, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Split data into train and validation set


In [47]:
train_text, val_text, train_labels, val_labels = train_test_split(train_df["Text"].to_numpy(),
                                                                  labels,
                                                                  test_size=0.1,
                                                                  # dedicate 10% of samples to validation set
                                                                  random_state=42)  # random state for reproducibility

In [48]:
len(train_text), len(train_labels), len(val_text), len(val_labels)

(44078, 44078, 4898, 4898)

## Converting text into numbers (Tokenization)


In [49]:
average_sentence_len = round(sum([len(i.split()) for i in train_text]) / len(train_text))


In [50]:
max_vocab_length = 10000
max_sentence_length = average_sentence_len
text_vectorizer = TextVectorization(max_tokens=max_vocab_length,  # how many words in the vocabulary
                                    standardize="lower_and_strip_punctuation",  # how to process text
                                    split="whitespace",  # how to split tokens
                                    ngrams=None,  # create groups of n-words?
                                    output_mode="int",  # how to map tokens to numbers
                                    output_sequence_length=max_sentence_length,
                                    # how long should the output sequence of tokens be?
                                    pad_to_max_tokens=True)

In [51]:
text_vectorizer.adapt(train_text)


In [52]:
random_sentence = random.choice(train_text)
print(f"Original text:\n{random_sentence}\
      \n\nVectorized version:")
text_vectorizer([random_sentence])

Original text:
how to play two sound file at the same time with wpf i use soundplayer to play sound effects in the wpf program however i find that when two sounds effects are played at the same time the new one will replace the old one ie the new will terminate the old and play itself but what i want is to keep playing the old one even when the new one is playedsoundplayer wowsound new soundplayersoundeffectwowwavsoundplayer countingsound new soundplayersoundeffectfunnywavwowsoundplay play like background musiccountingsoundplay from click to generate the sound effect      

Vectorized version:


<tf.Tensor: shape=(1, 139), dtype=int64, numpy=
array([[  27,    4,  736,  133, 1704,   50,   30,    2,   91,   96,   16,
        1187,    3,   40,    1,    4,  736, 1704, 2725,    7,    2, 1187,
         242,  172,    3,  104,   11,   36,  133, 2402, 2725,   35, 3473,
          30,    2,   91,   96,    2,   32,   62,   59,  743,    2,  699,
          62,  341,    2,   32,   59, 3119,    2,  699,    8,  736,  676,
          20,   42,    3,   52,    6,    4,  458, 1413,    2,  699,   62,
         197,   36,    2,   32,   62,    6,    1,    1,   32,    1,    1,
          32,    1,  736,   34,  320,    1,   26,  349,    4,  532,    2,
        1704,  886,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0]])>

In [53]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5]  # most common tokens (notice the [UNK] token for "unknown" words)
bottom_5_words = words_in_vocab[-5:]  # least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}")
print(f"Bottom 5 least common words: {bottom_5_words}")

Number of words in vocab: 10000
Top 5 most common words: ['', '[UNK]', 'the', 'i', 'to']
Bottom 5 least common words: ['fibonacci', 'falsebut', 'fades', 'externally', 'explore']


## Embedding


In [54]:
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length,  # set input shape
                             output_dim=128,  # set size of embedding vector
                             embeddings_initializer="uniform",  # default, intialize randomly
                             input_length=max_sentence_length)  # how long is each input

embedding

<keras.layers.embeddings.Embedding at 0x7f299b87ad90>

In [55]:
# Get a random sentence from training set
random_sentence = random.choice(train_text)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")

# Embed the random sentence (turn it into numerical representation)
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
php call to undefined function mb strlen on custom compiled php with mbstring enabled i have this custom compiled php v533 with the following extensions enabled via configureconfigure prefixusrlocalphp533 withconfigfilepathusrlocalapache2conf withapxs2usrlocalapache2binapxs withbz2 withcurlusrlib withcurlwrappers withfreetypedirusrlocal withgdusrlocal withgettext withgmp withiconvusrlocal withimapusrlocalimap2007e withimapssl withjpegdirusrlocallib withkerberos withlibxmldirusrlib withmcryptusrlocal withmhash withmysqlusrlibmysql withmysqlsockvarlibmysqlmysqlsock withmysqliusrlibmysqlmysql config withopensslusr withpcredirusrlocallib withpear withpngdirusrlocallib withreadline withsqlite withxmlrpc withxslusrlocal withzlibdirusrlocallib withzlibusrlocal withoutpgsql enablebcmath enablecalendar enableexif enableembeddedmysqlishared enableftp enablegdjisconv enablegdnativettf enablembstringall enablembregex enableshared enablesockets enablesoap enablesqliteutf8 enablezendm

<tf.Tensor: shape=(1, 139, 128), dtype=float32, numpy=
array([[[-0.01071844,  0.0046672 ,  0.03429085, ..., -0.04029176,
         -0.02166603,  0.00506119],
        [ 0.03639902, -0.03988792, -0.03639244, ..., -0.02591577,
          0.02700436,  0.00857703],
        [ 0.03170644,  0.01225835,  0.04985449, ...,  0.02632095,
         -0.00350149,  0.04537134],
        ...,
        [ 0.02010592, -0.01643562, -0.01262695, ..., -0.04196341,
         -0.01875532, -0.03415616],
        [ 0.02010592, -0.01643562, -0.01262695, ..., -0.04196341,
         -0.01875532, -0.03415616],
        [ 0.02010592, -0.01643562, -0.01262695, ..., -0.04196341,
         -0.01875532, -0.03415616]]], dtype=float32)>

## Creating Baseline Model


In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

# Create tokenization and modelling
# pipeline => To Sequentially apply a list of transforms
model_0 = Pipeline([
    ("tfidf", TfidfVectorizer()),  # convert words to numbers using tfidf
    ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1))
])


In [57]:
model_0.fit(train_text, train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf', OneVsRestClassifier(estimator=LinearSVC(), n_jobs=1))])

In [58]:
model_0_predictions = model_0.predict(val_text)

In [59]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def calculate_results(y_true, y_pred):
    """
    Calculates model accuracy, precision, recall and f1 score of a binary classification model.

    Args:
    -----
    y_true = true labels in the form of a 1D array
    y_pred = predicted labels in the form of a 1D array

    Returns a dictionary of accuracy, precision, recall, f1-score.
    """

    true_labels = 0
    count = 0
    for i in range(len(y_true[0])):
        true_labels += np.count_nonzero((model_0_predictions == val_labels)[:, i] == True)
        count += len(model_0_predictions)

    element_wise_accuracy = true_labels * 100 / count

    # Calculate model accuracy
    model_accuracy = accuracy_score(y_true, y_pred) * 100
    # Calculate model precision, recall and f1 score using "weighted" average
    model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    model_results = {
        "elementwise_accuracy": element_wise_accuracy,
        "accuracy": model_accuracy,
        "precision": model_precision,
        "recall": model_recall,
        "f1": model_f1}
    return model_results

In [60]:
calculate_results(val_labels, model_0_predictions)

{'accuracy': 54.98162515312372,
 'elementwise_accuracy': 97.09473254389546,
 'f1': 0.7261454279536945,
 'precision': 0.8371819082860844,
 'recall': 0.6455717118307998}

## Simple Dense Model


Dense Layer is simple layer of neurons in which each neuron receives input from all the neurons of previous layer


In [61]:
# Build model with the Functional API
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string") # inputs are 1-dimensional strings
x = text_vectorizer(inputs) # turn the input text into numbers
x = embedding(x) # create an embedding of the numerized numbers
x = layers.GlobalAveragePooling1D()(x) # lower the dimensionality of the embedding (try running the model without this layer and see what happens)

# Activation function decide whether the neuron should be activated or not
outputs = layers.Dense(20, activation="sigmoid")(x) # create the output layer, want binary outputs so use sigmoid activation
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense") # construct the model

In [62]:
# Compile model
model_1.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [63]:
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 139)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 139, 128)          1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 20)                2580      
                                                                 
Total params: 1,282,580
Trainable params: 1,282,580
N

In [64]:
# Fit the model
model_1_history = model_1.fit(train_text, # input sentences can be a list of strings due to text preprocessing layer built-in model
                              train_labels,
                              epochs=5,
                              validation_data=(val_text, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [65]:
model_1_predictions = model_1.predict(val_text)
model_1_predictions

array([[0.07838702, 0.00107896, 0.02205145, ..., 0.00620398, 0.00676614,
        0.12313461],
       [0.00901812, 0.06354684, 0.0009785 , ..., 0.00633049, 0.00142828,
        0.00154325],
       [0.01424709, 0.00052884, 0.00118136, ..., 0.00442037, 0.00088558,
        0.00232303],
       ...,
       [0.01686698, 0.00245553, 0.00854111, ..., 0.0124782 , 0.00549766,
        0.0032863 ],
       [0.00250921, 0.01018035, 0.00139329, ..., 0.0157817 , 0.00706756,
        0.00321338],
       [0.12546796, 0.12481517, 0.07196257, ..., 0.02503598, 0.02118418,
        0.02179417]], dtype=float32)

In [66]:
model_1_predictions = tf.squeeze(tf.round(model_1_predictions)) # squeeze removes single dimensions
model_1_predictions

<tf.Tensor: shape=(4898, 20), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [67]:
calculate_results(val_labels, model_1_predictions)

{'accuracy': 40.7309105757452,
 'elementwise_accuracy': 97.09473254389546,
 'f1': 0.5899267330864266,
 'precision': 0.7838456728554627,
 'recall': 0.48545935228023795}

## Model 2 RNN (LSTM)

In [68]:
# Create LSTM model
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
print(x.shape)
# x = layers.LSTM(64, return_sequences=True)(x) # return vector for each word in the Tweet (you can stack RNN cells as long as return_sequences=True)
x = layers.LSTM(64)(x) # return vector for whole sequence
print(x.shape)
# x = layers.Dense(64, activation="relu")(x) # optional dense layer on top of output of LSTM cell
outputs = layers.Dense(20, activation="sigmoid")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

(None, 139, 128)
(None, 64)


In [69]:
# Compile model
model_2.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [70]:
model_2.summary()

Model: "model_2_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 139)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 139, 128)          1280000   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense_1 (Dense)             (None, 20)                1300      
                                                                 
Total params: 1,330,708
Trainable params: 1,330,708
Non-trainable params: 0
____________________________________________

In [71]:
# Fit model
model_2_history = model_2.fit(train_text,
                              train_labels,
                              epochs=5,
                              validation_data=(val_text, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [72]:
model_2_predictions = tf.squeeze(tf.round(model_2.predict(val_text)))
model_1_predictions

<tf.Tensor: shape=(4898, 20), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [73]:
calculate_results(val_labels, model_2_predictions)

{'accuracy': 62.22948142098815,
 'elementwise_accuracy': 97.09473254389546,
 'f1': 0.7496415029100427,
 'precision': 0.8146992853479141,
 'recall': 0.7115003304692663}

## TensorFlow Hub Pretrained Sentence Encoder


In [74]:
# Example of pretrained embedding with universal sentence encoder - https://tfhub.dev/google/universal-sentence-encoder/4
import tensorflow_hub as hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4") # load Universal Sentence Encoder

In [None]:
embed_samples = embed(["When you call the universal sentence encoder on a sentence, it turns it into numbers."])

print(embed_samples[0][:50])

In [None]:
embed_samples[0].shape

In [None]:
# We can use this encoding layer in place of our text_vectorizer and embedding layer
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape=[], # shape of inputs coming to our model 
                                        dtype=tf.string, # data type of inputs coming to the USE layer
                                        trainable=False, # keep the pretrained weights (we'll create a feature extractor)
                                        name="USE") 

In [None]:
# Create model using the Sequential API
model_6 = tf.keras.Sequential([
  sentence_encoder_layer, # take in sentences and then encode them into an embedding
  layers.Dense(64, activation="relu"),
  layers.Dense(20, activation="sigmoid")
], name="model_6_USE")

# Compile model
model_6.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

model_6.summary()

In [None]:
# Train a classifier on top of pretrained embeddings
model_6_history = model_6.fit(train_text,
                              train_labels,
                              epochs=5,
                              validation_data=(val_text, val_labels))

In [None]:
model_6_predictions = tf.squeeze(tf.round(model_6.predict(val_text)))
model_6_predictions

In [None]:
calculate_results(val_labels, model_6_predictions)

# Document Tag Generator

In [3]:
train_df = pd.read_csv("/content/drive/MyDrive/Document Tag Generator/projects_details.csv")
train_df.head()

Unnamed: 0,title,description,batch,category,project_url,repo_url,page_url,data_url,api_url
0,A GUI for controlling and supervising multiple...,The idea is to create a GUI platform where a u...,E15,Cyber-Physical Systems Projects,https://projects.ce.pdn.ac.lk/3yp/e15/A-GUI-fo...,https://github.com/cepdnaclk/e15-3yp-A-GUI-for...,https://cepdnaclk.github.io/e15-3yp-A-GUI-for-...,https://cepdnaclk.github.io/e15-3yp-A-GUI-for-...,http://api.ce.pdn.ac.lk/projects/v1/3yp/E15/A-...
1,An Efficient System For Waste Collection,This Project aims to implement an Efficient Wa...,E15,Cyber-Physical Systems Projects,https://projects.ce.pdn.ac.lk/3yp/e15/An-Effic...,https://github.com/cepdnaclk/e15-3yp-An-Effici...,https://cepdnaclk.github.io/e15-3yp-An-Efficie...,https://cepdnaclk.github.io/e15-3yp-An-Efficie...,http://api.ce.pdn.ac.lk/projects/v1/3yp/E15/An...
2,An automated system for monitoring and control...,This is a system for automatically controlling...,E15,Cyber-Physical Systems Projects,https://projects.ce.pdn.ac.lk/3yp/e15/An-autom...,https://github.com/cepdnaclk/e15-3yp-An-automa...,https://cepdnaclk.github.io/e15-3yp-An-automat...,https://cepdnaclk.github.io/e15-3yp-An-automat...,http://api.ce.pdn.ac.lk/projects/v1/3yp/E15/An...
3,Automated Bike Sharing System,This project is about building an automated bi...,E15,Cyber-Physical Systems Projects,https://projects.ce.pdn.ac.lk/3yp/e15/Automate...,https://github.com/cepdnaclk/e15-3yp-Automated...,https://cepdnaclk.github.io/e15-3yp-Automated-...,https://cepdnaclk.github.io/e15-3yp-Automated-...,http://api.ce.pdn.ac.lk/projects/v1/3yp/E15/Au...
4,Automated Book Management System Automated Boo...,"In libraries, We have planned to implement a b...",E15,Cyber-Physical Systems Projects,https://projects.ce.pdn.ac.lk/3yp/e15/Automate...,https://github.com/cepdnaclk/e15-3yp-Automated...,https://cepdnaclk.github.io/e15-3yp-Automated-...,https://cepdnaclk.github.io/e15-3yp-Automated-...,http://api.ce.pdn.ac.lk/projects/v1/3yp/E15/Au...


## Function to preprocess text data

*   Remove punctuation and numbers
*   Lowercase
*   Remove stop words



In [4]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')


def preprocessText(text):
    text = re.sub("[^a-zA-Z]", ' ', text)
    text = text.lower().split()
    swords = set(stopwords.words("english"))
    text = [w for w in text if w not in swords]
    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
preprocessText("NEver tell me the odds.")

['never', 'tell', 'odds']

In [6]:
import random

random_index = random.randint(0, len(train_df))
description = train_df.iloc[random_index]["description"]
print("Original Text\n--------------->")
print(description)

print("\n\nPreprocessed Text\n--------------->")
print(preprocessText(description))




Original Text
--------------->
An intelligent API which is capable of passively tracking gyroscopic data to classify sobriety level in real-time that will help minimize risks of drunk-riding accidents for e-scooter sharing systems.


Preprocessed Text
--------------->
['intelligent', 'api', 'capable', 'passively', 'tracking', 'gyroscopic', 'data', 'classify', 'sobriety', 'level', 'real', 'time', 'help', 'minimize', 'risks', 'drunk', 'riding', 'accidents', 'e', 'scooter', 'sharing', 'systems']


## Read Proposed Tags

In [7]:
f = open("/content/drive/MyDrive/Document Tag Generator/tags.txt")
lines = f.readlines()

tags = []
for line in lines:
  tags.append(line.strip().lower())

In [8]:
print(tags)

['machine learning', 'artificial intelligence', 'embedded system', 'gui', 'network', 'internet of things', 'image processing', 'neural network', 'health', 'agriculture']


In [9]:
import tensorflow_hub as hub

universal_sentence_encoder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
wikipedia_embedding = hub.load("https://tfhub.dev/google/Wiki-words-250-with-normalization/2")

In [11]:
def getMostFrequentWords(text, n):
  from collections import Counter

  Counter = Counter(text)
  words_with_count = Counter.most_common(n)

  words = list(t[0] for t in words_with_count)
  return words

In [13]:
getMostFrequentWords(preprocessText(description), 10)

['intelligent',
 'api',
 'capable',
 'passively',
 'tracking',
 'gyroscopic',
 'data',
 'classify',
 'sobriety',
 'level']

In [39]:
generated_tags_file = "generated_tags.txt"

f = open("/content/drive/MyDrive/Document Tag Generator/" + generated_tags_file, "w")

for i in range(len(train_df)):
  print()
  print(train_df.iloc[i]["title"])
  f.write(train_df.iloc[i]["title"] + "\n")
  if train_df.isnull().iloc[i]["description"]:
    continue
  description = train_df.iloc[i]["description"].strip()

  if len(description) == 0:
    continue

  preprocessed_description = preprocessText(description)
  frequentWords = getMostFrequentWords(preprocessed_description, 300)

  # print("Description\n--------------->")
  # print(description)
  matching_tags = []
  for word in frequentWords:
    for tag in tags:
      embedding1 = universal_sentence_encoder([tag])
      embedding2 = universal_sentence_encoder([word])

      if np.inner(embedding1, embedding2) >= 0.5:
        # print(tag, word, np.inner(embedding1, embedding2))
        if tag not in matching_tags:
          # print(tag, word, np.inner(embedding1, embedding2))
          matching_tags.append(tag)

  print(matching_tags)
  f.write(str(matching_tags) + "\n\n")

f.close()


A GUI for controlling and supervising multiple robots remotely
['gui', 'network', 'artificial intelligence']

An Efficient System For Waste Collection
['embedded system']

An automated system for monitoring and controlling the water supply to a large farmland
['embedded system']

Automated Bike Sharing System
['embedded system']

Automated Book Management System Automated Book Carrying Robot
['artificial intelligence', 'embedded system']

Automated Vehicle Parking System
['embedded system']

Automated Water Quality Monitoring System
['agriculture', 'embedded system', 'internet of things']

Automatic Door Lock System
['embedded system']

E Checkup
['health', 'gui', 'embedded system']

Embedded system for detecting adverse gases
['embedded system']

Fire Detection and Alert System
[]

Health Watch
['health']

Hydroponics Automation System
['agriculture', 'artificial intelligence', 'embedded system']

Intelligent Road Traffic Control System
['machine learning']

Monitoring and Tracking S

array([[0.63941824]], dtype=float32)