## Importing Libraries

In [50]:
import tensorflow as tf
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.problem_transform import BinaryRelevance



## Read and Display data

In [6]:
train_df = pd.read_csv("stackoverflow.csv")
train_df.head()

Unnamed: 0.1,Unnamed: 0,Text,Tags
0,2,aspnet site maps has anyone got experience cre...,"['sql', 'asp.net']"
1,4,adding scripting functionality to net applicat...,"['c#', '.net']"
2,5,should i use nested classes in this case i am ...,['c++']
3,6,homegrown consumption of web services i have b...,['.net']
4,8,automatically update version number i would li...,['c#']


In [7]:
train_df['Tags'].value_counts()

['java']                            5894
['c#']                              4409
['android']                         4270
['python']                          4176
['c++']                             3689
                                    ... 
['c#', '.net', 'mysql']                1
['c#', 'java', 'python', 'c++']        1
['php', 'jquery', 'html', 'css']       1
['php', 'mysql', 'c']                  1
['jquery', 'ios', 'iphone']            1
Name: Tags, Length: 438, dtype: int64

In [8]:
train_df['Tags'] = train_df['Tags'].apply(lambda x: ast.literal_eval(x))

In [10]:
# Let's visualize some random training examples
import random

random_index = random.randint(0, len(train_df) - 5)  # create random indexes not higher than the total number of samples
for row in train_df[["Text", "Tags"]][random_index:random_index + 5].itertuples():
    _, text, labels = row
    print(f"Text:\n{text}\n")
    print(f"Labels: {labels}")

    print("---\n")

Text:
what is the use of an ioc framework in an mvc application i am trying to understand the use of an ioc framework like structuremap but i cannot help thinking that these design patterns are just nonsense making code just more complexlet me start with an example where i think an ioc is somewhat usefulli think an ioc can be usefull when dealing with the instantiation of controller classes in an mvc framework in this case i am thinking about the net mvc frameworknormally the instantiation of the controller class is handled by the framework so that means you cannot really pass any parameters to the constructor of your controller classthis is where an ioc framework can come in handy somewhere in an ioc container you specify what class should be instantiated and passed to your controllers constructor when the controller class is invokedthis is also handy when you want to unit test the controller because you can mock the object that is passed to itbut like i said i can somewhat understand

## One hot encoding of labels

In [34]:
multilabel = MultiLabelBinarizer()
labels = multilabel.fit_transform(train_df['Tags'])
labels

array([[0, 0, 1, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Split data into train and validation set

In [35]:
train_text, val_text, train_labels, val_labels = train_test_split(train_df["Text"].to_numpy(),
                                                                  labels,
                                                                  test_size=0.1,
                                                                  # dedicate 10% of samples to validation set
                                                                  random_state=42)  # random state for reproducibility

In [36]:
len(train_text), len(train_labels), len(val_text), len(val_labels)

(44078, 44078, 4898, 4898)

## Converting text into numbers (Tokenization)

In [37]:
average_sentence_len = round(sum([len(i.split()) for i in train_text]) / len(train_text))

In [38]:
max_vocab_length = 10000
max_sentence_length = average_sentence_len
text_vectorizer = TextVectorization(max_tokens=max_vocab_length,  # how many words in the vocabulary
                                    standardize="lower_and_strip_punctuation",  # how to process text
                                    split="whitespace",  # how to split tokens
                                    ngrams=None,  # create groups of n-words?
                                    output_mode="int",  # how to map tokens to numbers
                                    output_sequence_length=max_sentence_length,
                                    # how long should the output sequence of tokens be?
                                    pad_to_max_tokens=True)

In [39]:
text_vectorizer.adapt(train_text)

In [40]:
random_sentence = random.choice(train_text)
print(f"Original text:\n{random_sentence}\
      \n\nVectorized version:")
text_vectorizer([random_sentence])

Original text:
long static strings in shortlived objects this might be a stupid question or just make me look stupid however i would be interested in how to work with long string objects in the context of shortlived objectsthink about long sql queries in cron job or anonymous command or functionlike classes these are very shortlived classes and even will use these long strings once in their lifetime for most of the time what is better to construct a string inline and let it be collected with the instance or make it static final anyway and let them sit in the memory useless until the classes next instantiation      

Vectorized version:


<tf.Tensor: shape=(1, 139), dtype=int64, numpy=
array([[ 260,  113,  564,    7,    1,  268,   12,  377,   23,    5, 2916,
         129,   31,   67,  108,   73,  363, 2916,  172,    3,   38,   23,
         964,    7,   27,    4,   92,   16,  260,   58,  268,    7,    2,
         333,    9,    1,    1,  105,  260,  278, 1003,    7, 5031, 1022,
          31, 1752,  342,   31,    1,  321,  152,   35,  196,    1,  321,
           8,  197,   59,   40,  152,  260,  564,  489,    7,  334, 3538,
          14,  306,    9,    2,   96,   42,    6,  291,    4, 1633,    5,
          58, 1088,    8,  353,   10,   23, 3591,   16,    2,  277,   31,
         108,   10,  113,  324, 1293,    8,  353,  153, 2237,    7,    2,
         229, 3235,  613,    2,  321,  455, 2906,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0]], dtype=int64)

In [41]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5]  # most common tokens (notice the [UNK] token for "unknown" words)
bottom_5_words = words_in_vocab[-5:]  # least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}")
print(f"Bottom 5 least common words: {bottom_5_words}")

Number of words in vocab: 10000
Top 5 most common words: ['', '[UNK]', 'the', 'i', 'to']
Bottom 5 least common words: ['fibonacci', 'falsebut', 'fades', 'externally', 'explore']


## Embedding

In [42]:
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length,  # set input shape
                             output_dim=128,  # set size of embedding vector
                             embeddings_initializer="uniform",  # default, intialize randomly
                             input_length=max_sentence_length)  # how long is each input

embedding

<keras.layers.embeddings.Embedding at 0x1bfa8203490>

In [43]:
# Get a random sentence from training set
random_sentence = random.choice(train_text)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")

# Embed the random sentence (turn it into numerical representation)
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
how to define and work with an array of bits in c i want to create a very large array on which i write 0s and 1s i am trying to simulate a physical process called random sequential adsorption where units of length 2 dimers are deposited onto an ndimensional lattice at a random location without overlapping each other the process stops when there is no more room left on the lattice for depositing more dimers lattice is jammedinitially i start with a lattice of zeroes and the dimers are represented by a pair of 1s as each dimer is deposited the site on the left of the dimer is blocked due to the fact that the dimers cannot overlap so i simulate this process by depositing a triple of 1s on the lattice i need to repeat the entire simulation a large number of times and then work out the average coverage i have already done this using an array of chars for 1d and 2d lattices at the moment i am trying to make the code as efficient as possible before working on the 3d problem and

<tf.Tensor: shape=(1, 139, 128), dtype=float32, numpy=
array([[[ 0.03810194, -0.00194151,  0.02151889, ..., -0.0097611 ,
         -0.03590286,  0.03955403],
        [ 0.01416263, -0.03613079,  0.02529489, ...,  0.04558947,
          0.0273128 , -0.01165075],
        [-0.02942413,  0.00273955, -0.02120998, ...,  0.0380934 ,
          0.02103237, -0.01704551],
        ...,
        [ 0.01416263, -0.03613079,  0.02529489, ...,  0.04558947,
          0.0273128 , -0.01165075],
        [ 0.01301611,  0.04941064, -0.00040913, ..., -0.02778605,
          0.02211258,  0.04483546],
        [-0.0367678 , -0.0084657 , -0.03834645, ...,  0.01718051,
          0.03707523,  0.00561762]]], dtype=float32)>

## Creating Baseline Model

In [66]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

# Create tokenization and modelling
# pipeline => To Sequentially apply a list of transforms
model_0 = Pipeline([
    ("tfidf", TfidfVectorizer()),  # convert words to numbers using tfidf
    ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1))
])


In [67]:
model_0.fit(train_text, train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf', OneVsRestClassifier(estimator=LinearSVC(), n_jobs=1))])

In [68]:
model_0_predictions = model_0.predict(val_text)

In [104]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def calculate_results(y_true, y_pred):
    """
    Calculates model accuracy, precision, recall and f1 score of a binary classification model.

    Args:
    -----
    y_true = true labels in the form of a 1D array
    y_pred = predicted labels in the form of a 1D array

    Returns a dictionary of accuracy, precision, recall, f1-score.
    """

    true_labels = 0
    count = 0
    for i in range(len(y_true[0])):
        true_labels += np.count_nonzero((model_0_predictions == val_labels)[:, i] == True)
        count += len(model_0_predictions)

    element_wise_accuracy = true_labels * 100 / count

    # Calculate model accuracy
    model_accuracy = accuracy_score(y_true, y_pred) * 100
    # Calculate model precision, recall and f1 score using "weighted" average
    model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    model_results = {
        "elementwise_accuracy": element_wise_accuracy,
        "accuracy": model_accuracy,
        "precision": model_precision,
        "recall": model_recall,
        "f1": model_f1}
    return model_results

In [133]:
calculate_results(val_labels, model_0_predictions)

{'elementwise_accuracy': 97.09473254389546,
 'accuracy': 54.98162515312372,
 'precision': 0.8371819082860844,
 'recall': 0.6455717118307998,
 'f1': 0.7261454279536945}

## Simple Dense Model

Dense Layer is simple layer of neurons in which each neuron receives input from all the neurons of previous layer

In [118]:
# Build model with the Functional API
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string") # inputs are 1-dimensional strings
x = text_vectorizer(inputs) # turn the input text into numbers
x = embedding(x) # create an embedding of the numerized numbers
x = layers.GlobalAveragePooling1D()(x) # lower the dimensionality of the embedding (try running the model without this layer and see what happens)

// decide whether the neuron should be activated or not
outputs = layers.Dense(20, activation="sigmoid")(x) # create the output layer, want binary outputs so use sigmoid activation
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense") # construct the model

In [119]:
# Compile model
model_1.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [120]:
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_5 (TextV  (None, 139)              0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 139, 128)          1280000   
                                                                 
 global_average_pooling1d_2   (None, 128)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_2 (Dense)             (None, 20)                2580      
                                                                 
Total params: 1,282,580
Trainable params: 1,282,580
N

In [122]:
# Fit the model
model_1_history = model_1.fit(train_text, # input sentences can be a list of strings due to text preprocessing layer built-in model
                              train_labels,
                              epochs=5,
                              validation_data=(val_text, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [129]:
model_1_predictions = model_1.predict(val_text)
model_1_predictions

array([[0.07564071, 0.00131911, 0.01794747, ..., 0.01041836, 0.00801349,
        0.12972194],
       [0.00993046, 0.07407355, 0.00124362, ..., 0.00667396, 0.00120899,
        0.00145915],
       [0.01438844, 0.00039795, 0.001531  , ..., 0.00373155, 0.00045359,
        0.0022527 ],
       ...,
       [0.01449877, 0.00262266, 0.01206872, ..., 0.01118988, 0.00484034,
        0.00296563],
       [0.00289783, 0.01402417, 0.0007312 , ..., 0.02070445, 0.00832382,
        0.0032801 ],
       [0.12063766, 0.12050229, 0.07319388, ..., 0.03447977, 0.0268411 ,
        0.02078125]], dtype=float32)

In [131]:
model_1_predictions = tf.squeeze(tf.round(model_1_predictions)) # squeeze removes single dimensions
model_1_predictions

<tf.Tensor: shape=(4898, 20), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [132]:
calculate_results(val_labels, model_1_predictions)

{'elementwise_accuracy': 97.09473254389546,
 'accuracy': 41.2004899959167,
 'precision': 0.7880815539187447,
 'recall': 0.4897554527428949,
 'f1': 0.5946402259705589}

## Model 2 RNN (LSTM)

In [139]:
# Create LSTM model
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
print(x.shape)
# x = layers.LSTM(64, return_sequences=True)(x) # return vector for each word in the Tweet (you can stack RNN cells as long as return_sequences=True)
x = layers.LSTM(64)(x) # return vector for whole sequence
print(x.shape)
# x = layers.Dense(64, activation="relu")(x) # optional dense layer on top of output of LSTM cell
outputs = layers.Dense(20, activation="sigmoid")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

(None, 139, 128)
(None, 64)


In [140]:
# Compile model
model_2.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [141]:
model_2.summary()

Model: "model_2_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_5 (TextV  (None, 139)              0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 139, 128)          1280000   
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dense_4 (Dense)             (None, 20)                1300      
                                                                 
Total params: 1,330,708
Trainable params: 1,330,708
Non-trainable params: 0
____________________________________________

In [142]:
# Fit model
model_2_history = model_2.fit(train_text,
                              train_labels,
                              epochs=5,
                              validation_data=(val_text, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [143]:
model_2_predictions = tf.squeeze(tf.round(model_2.predict(val_text)))
model_1_predictions

<tf.Tensor: shape=(4898, 20), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [144]:
calculate_results(val_labels, model_2_predictions)

{'elementwise_accuracy': 97.09473254389546,
 'accuracy': 61.39240506329114,
 'precision': 0.8211625200206847,
 'recall': 0.6981163251817581,
 'f1': 0.7490979114643821}

## TensorFlow Hub Pretrained Sentence Encoder

In [146]:
# Example of pretrained embedding with universal sentence encoder - https://tfhub.dev/google/universal-sentence-encoder/4
import tensorflow_hub as hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4") # load Universal Sentence Encoder
embed_samples = embed([sample_sentence,
                      "When you call the universal sentence encoder on a sentence, it turns it into numbers."])

print(embed_samples[0][:50])

ModuleNotFoundError: No module named 'tensorflow_hub'

In [None]:
import tf
# Compile model
model_3.compile(loss="binary_crossentropy",
                loss=tf.metrics.F1Score,
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])