In [1]:
import re
import io
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow import keras
from tqdm import tqdm
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from gensim.models.keyedvectors import KeyedVectors

DATA_DIR="data/"
WORD2VEC_BIN_PATH=DATA_DIR+"SO_vectors_200.bin"
ANSWERS_CSV_PATH=DATA_DIR+"Answers.csv"
QUESTIONS_CSV_PATH=DATA_DIR+"Questions.csv"
TAGS_CSV_PATH=DATA_DIR+"Tags.csv"


In [2]:
word_vect = KeyedVectors.load_word2vec_format(WORD2VEC_BIN_PATH, binary=True)

In [3]:
idToTagIndex = {}          #dict mapping post ID to a list of tag indices
tagToTagIndex = {}         #dict mapping tag to tag index

In [4]:
import math
def truncate(number, digits) -> float:
    stepper = 10.0 ** digits
    return math.trunc(stepper * number) / stepper

In [5]:
questions_data = None
with io.open(QUESTIONS_CSV_PATH, 'r',encoding='utf-8',errors='ignore') as question_input:
    questions_data = pd.read_csv(question_input, engine='python')

print(questions_data.columns)
questions_data = questions_data[['Id', 'Title', 'Body']]
questions_data.insert(len(questions_data.columns), 'Code', "")

a = re.compile(r'<pre><code>([^<]*)</code></pre>')
b = re.compile(r'<.*?>')
questions_data['Code'] = questions_data['Body'].apply(lambda x: ' '.join(re.findall(a, x)))

def clean(text):
    x = re.sub(a, '', text)
    x = re.sub(b, '', x)
    x = x.replace('\n\n', '\n')
    return x
questions_data['Body'] = questions_data['Title'].str.cat(questions_data['Body'], sep=" ")
questions_data['Body'] = questions_data['Body'].apply(clean)
    
#     questions_data['Body'] = questions_data['Body'].apply(nltk.tokenize.word_tokenize) #need to fix: don't convert
#     # C#, C++ to C
#     questions_data['Body'] = questions_data['Body'].apply(lambda x: [word for word in x if word.  isalnum()])
#     questions_data['Body'] = questions_data['Body'].apply(lambda x: [word.lower() for word in x])

Index(['Id', 'OwnerUserId', 'CreationDate', 'ClosedDate', 'Score', 'Title',
       'Body'],
      dtype='object')


In [6]:
from collections import defaultdict

idToTagIndex = {}          #dict mapping post ID to a list of tag indices
tagToTagIndex = {}         #dict mapping tag to tag index
tagIndexToTag = {}
tagToFrequency = defaultdict(lambda: 0)

with open(TAGS_CSV_PATH) as tag_input:
    tag_data = pd.read_csv(tag_input)
    tagIndex = 0
    for index, row in tqdm(tag_data.iterrows()):
        currId = int(row[0])
        currTag = row[1]
        if currTag not in tagToTagIndex:
            tagToTagIndex[currTag] = tagIndex
            tagIndexToTag[tagIndex] = currTag
            currTagIndex = tagIndex
            tagIndex += 1

        else:
            currTagIndex = tagToTagIndex[currTag]  
        
        tagToFrequency[currTagIndex] += 1        
    
        if currId not in idToTagIndex.keys():
            idToTagIndex[currId] = [tagToTagIndex[row[1]]]
        else:
            idToTagIndex[currId].append(tagToTagIndex[row[1]])
            

3750994it [09:31, 6562.73it/s]


In [7]:
print("Number of examples: ", len(idToTagIndex))

Number of examples:  1264216


In [8]:
# find 10 most common tags

n = 15      #number of top tags

tagToFrequencyList = []

for key, value in tagToFrequency.items():
    temp = [key, value]
    tagToFrequencyList.append(temp)
    
tagToFrequencyList.sort(reverse=True, key=lambda x: x[1])

for tag in tagToFrequencyList[:n]:
    print(f"{tagIndexToTag[tag[0]]} ({tag[0]}): {tag[1]} times")
    
mostCommonTags = {}
for counter, tag in enumerate(tagToFrequencyList[:n]):   #currently takes top 10 tags
    mostCommonTags[tag[0]] = counter


javascript (132): 124155 times
java (89): 115212 times
c# (14): 101186 times
php (76): 98808 times
android (395): 90659 times
jquery (370): 78542 times
python (196): 64601 times
html (58): 58976 times
c++ (18): 47591 times
ios (2045): 47009 times
mysql (77): 42464 times
css (141): 42308 times
sql (7): 35782 times
asp.net (8): 29970 times
objective-c (163): 26922 times


In [9]:
# list(idToTagIndex.values())[:10]
print(mostCommonTags)

{132: 0, 89: 1, 14: 2, 76: 3, 395: 4, 370: 5, 196: 6, 58: 7, 18: 8, 2045: 9, 77: 10, 141: 11, 7: 12, 8: 13, 163: 14}


In [10]:
idToTenTags = {}

for postId, tags in idToTagIndex.items():
    containsTopTenTags = [mostCommonTags[tag] for tag in tags if tag in mostCommonTags.keys()]
    idToTenTags[postId] = containsTopTenTags
    
questions_data['Top-Tags'] = questions_data['Id'].apply(lambda x: idToTenTags[x])

print(questions_data)

               Id                                              Title  \
0              80  SQLStatement.execute() - multiple queries in o...   
1              90  Good branching and merging tutorials for Torto...   
2             120                                  ASP.NET Site Maps   
3             180                 Function for creating color wheels   
4             260  Adding scripting functionality to .NET applica...   
...           ...                                                ...   
1264211  40143210                           URL routing in PHP (MVC)   
1264212  40143300           Bigquery.Jobs.Insert - Resumable Upload?   
1264213  40143340                 Obfuscating code in android studio   
1264214  40143360         How to fire function after v-model change?   
1264215  40143380            npm run mocha test - files being cached   

                                                      Body  \
0        SQLStatement.execute() - multiple queries in o...   
1        Go

In [79]:
text_dataset = tf.data.Dataset.from_tensor_slices(["foo", "bar", "baz"])
print(text_dataset)
#text_ds = tf.data.Dataset.from_tensor_slices(questions_data['Body'].values.tolist())
#print(text_ds)
max_features = 5000  # Maximum vocab size.
max_len = 4  # Sequence length to pad the outputs to.
embedding_dims = 2

# Create the layer.
#vectorize_layer = TextVectorization(
# max_tokens=max_features,
# output_mode='int',
# output_sequence_length=max_len)

vectorize_layer = TextVectorization(max_tokens=20000, output_sequence_length=200)
vectorize_layer.adapt(text_dataset.batch(64))

input_data = [["foo qux bar"], ["qux baz"]]
questions_X = np.ndarray(shape=(2, 200))
questions_X = vectorize_layer([["foo qux bar"]]).numpy()
a = vectorize_layer([["qux baz"]]).numpy()
#print("A: " + str(a))
#print("qx: " + str(questions_X))
questions_X = np.append(questions_X, a)
#print(questions_X)
#print(type(questions_X))

# Now that the vocab layer has been created, call `adapt` on the text-only
# dataset to create the vocabulary. You don't have to batch, but for large
# datasets this means we're not keeping spare copies of the dataset.
#vectorize_layer.adapt(text_dataset.batch(64))

# Create the model that uses the vectorize text layer
model = tf.keras.models.Sequential()

# Start by creating an explicit input layer. It needs to have a shape of
# (1,) (because we need to guarantee that there is exactly one string
# input per batch), and the dtype needs to be 'string'.
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))

# The first layer in our model is the vectorization layer. After this
# layer, we have a tensor of shape (batch_size, max_len) containing vocab
# indices.
model.add(vectorize_layer)

# Now, the model can map strings to integers, and you can add an embedding
# layer to map these integers to learned embeddings.
input_data = [["foo qux bar"], ["qux baz"]]
q = model.predict(input_data, batch_size=2)
#print(q)
#q.numpy()

voc = vectorize_layer.get_vocabulary()
print(len(voc))
word_index = dict(zip(voc, range(len(voc))))

num_tokens = len(voc) + 2
embedding_dim = 200
hits = 0
misses = 0
missed_words = []
print("num_tokens: " + str(num_tokens))

embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in tqdm(word_index.items()):
    try:
        embedding_vector = word_vect.get_vector(word)
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        #if hits == 0:
            #print("word: " + word)
            #print("embedding vector: " + str(embedding_vector))
        hits += 1
    except:
        misses += 1
        missed_words.append(word)
print("Converted %d words (%d misses)" % (hits, misses))
#print("missed words: " + str(missed_words))
print(embedding_matrix)

<TensorSliceDataset shapes: (), types: tf.string>


100%|██████████| 5/5 [00:00<00:00, 10804.49it/s]

5
num_tokens: 7
Converted 3 words (2 misses)
[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 2.85269427  0.34682763  1.36381459 ...  2.50019026  4.65356922
  -1.35075629]
 ...
 [ 2.30001998  0.05065627  3.62873197 ...  1.7731936   1.27082145
  -0.52977061]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]





In [41]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(questions_data['Body'].values.tolist())
vectorizer.adapt(text_ds.batch(512))
vectorizer.get_vocabulary()[:10]

['', '[UNK]', 'the', 'i', 'to', 'a', 'is', 'in', 'and', 'of']

In [47]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

numex = 0
print()
#questions_X = vectorizer(np.array([[s] for s in questions_data['Body'].values])).numpy()

model = tf.keras.models.Sequential()
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model.add(vectorizer) 
input_data = np.array([[s] for s in questions_data['Body'].values])
print("input data created")
questions_X = model.predict(input_data, batch_size=512)


# Now, the model can map strings to integers, and you can add an embedding
# layer to map these integers to learned embeddings.
input_data = [["foo qux bar"], ["qux baz"]]
model.predict(input_data, batch_size=2)


questions_y = mlb.fit_transform(np.array(questions_data['Top-Tags'].values))
print(questions_y[:5])

train_X, test_X, train_y, test_y = train_test_split(questions_X, questions_y, train_size=0.80, random_state=200)


input data created
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]]


In [55]:
print(questions_data['Top-Tags'][4])

[2]


In [56]:
output = vectorizer([["I tried running this line of code, but I'm receiving a null pointer exception"]])
output.numpy()

array([[   3,   80,  182,   10,  139,    9,   29,   17,   31, 1448,    5,
         330,  770,  276,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [80]:
voc = vectorizer.get_vocabulary()
print(len(voc))
word_index = dict(zip(voc, range(len(voc))))

20000


In [81]:
test = ["this", "line", "of", "code"]
[word_index[w] for w in test] 

[10, 139, 9, 29]

In [82]:
num_tokens = len(voc) + 2
embedding_dim = 200
hits = 0
misses = 0
missed_words = []

embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in tqdm(word_index.items()):
    try:
        embedding_vector = word_vect.get_vector(word)
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        #if hits == 0:
            #print("word: " + word)
            #print("embedding vector: " + str(embedding_vector))
        hits += 1
    except:
        misses += 1
        missed_words.append(word)
print("Converted %d words (%d misses)" % (hits, misses))
print(word_vect.get_vector("qux"))
#print("missed words: " + str(missed_words))

100%|██████████| 20000/20000 [00:00<00:00, 261831.44it/s]

Converted 18902 words (1098 misses)
[ 1.11054885e+00 -1.90948397e-01  1.01150262e+00 -1.16011727e+00
 -4.01061118e-01 -1.99401509e-02  1.26393306e+00 -1.05262327e+00
 -7.08893120e-01 -1.16170633e+00 -3.82255346e-01 -6.64718449e-01
  2.39925131e-01 -9.89318669e-01  4.03541863e-01 -3.09030920e-01
 -8.93009543e-01  1.47940707e+00  3.45112324e-01 -5.33240616e-01
 -2.03384962e-02 -8.17787588e-01 -3.95911753e-01 -1.00960648e+00
  1.45841193e+00 -1.91910848e-01  1.96030110e-01 -4.74538594e-01
  2.30615154e-01 -1.33799136e+00 -7.01056361e-01 -8.22254896e-01
 -1.97593376e-01 -9.74523842e-01 -1.74921310e+00 -7.13844419e-01
  1.00619650e+00  1.94554651e+00  5.25697649e-01  2.44234558e-02
  1.18549097e+00  1.38673410e-01  1.57413578e+00  1.35944700e+00
  4.00970370e-01  9.36624467e-01 -1.38397664e-01  9.08933640e-01
 -6.06455147e-01  7.80146241e-01  2.07494706e-01 -1.31700456e+00
 -5.65629125e-01 -7.15693310e-02 -2.10581228e-01  3.94899547e-01
 -3.96223903e-01  1.25231707e+00  7.13234767e-04 -1.07




In [83]:
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

In [84]:
from tensorflow.keras import layers

int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
x = layers.Conv1D(128, 5, activation="relu")(embedded_sequences)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation="sigmoid")(x)
x = layers.Dropout(0.5)(x)
x = layers.Dropout(0.5)(x)
preds = layers.Dense(len(mostCommonTags))(x) #change to all tags
# preds = layers.Softmax(axis=-1)(preds)
preds = layers.Activation(activation="sigmoid")(preds)
model = keras.Model(int_sequences_input, preds)
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_28 (InputLayer)        [(None, None)]            0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 200)         4000400   
_________________________________________________________________
conv1d (Conv1D)              (None, None, 128)         128128    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, None, 128)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 128)         82048     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 128)        

In [86]:
from sklearn.utils import class_weight
from tensorflow.keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy', recall_m])
              # metrics=[recall_m])
weights = {}

for i in range(n):
    weights[i] = (1 / tagToFrequencyList[i][1])

model.fit(train_X, train_y, batch_size=128, epochs=20, validation_data=(test_X, test_y), class_weight=weights)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fc508e427d0>

In [96]:
string_input = keras.Input(shape=(1,), dtype="string")
x = vectorizer(string_input)
preds = model(x)
end_to_end_model = keras.Model(string_input, preds)

probabilities = end_to_end_model.predict(
    #["I wanted to open my app without safari system alert but I found out that is impossible. so i decided to handle this alert event but I couldn't find the way. if I click [open], then safari open App, but if I click [cancel], then 'appCheckTimer' will be executed, then safari moves to 'some page's url'. if there is no way to not open this alert, I want to handle this alert's button event, when user click [cancel], I just want to stay that page. that alert is not opened by me, it's by safari So I can't handle it."]
    # ["My sorting algorithm time is fast"]
    # ["I try to create a person detection model with Tensorflow object detection api. I'm using Tensorflow 2. But at the end, the model has poor accuracy. It is detecting persons on image, but it only predicts between 30-60%."]
    #["Using Linux mint which has multiple packages installed. I have installed opencv and imutils using sudo pip3 install opencv and sudo pip3 install imutils."]
    #["How does system.out.println work in java?"]
    #["Questions about Collections"]
    ["How to install an app from app store"]
)

for i, prob in np.ndenumerate(probabilities):
    print('{:<16}  {:<16}'.format(tagIndexToTag[tagToFrequencyList[i[1]][0]], truncate(prob, 3)))
print(f"\nMost likely tag: {tagIndexToTag[tagToFrequencyList[np.argmax(probabilities)][0]]}")

javascript        0.0             
java              0.031           
c#                0.005           
php               0.0             
android           0.275           
jquery            0.0             
python            0.0             
html              0.0             
c++               0.001           
ios               0.241           
mysql             0.0             
css               0.0             
sql               0.0             
asp.net           0.0             
objective-c       0.118           

Most likely tag: android


In [None]:
# predictions=model.predict([padded_docs_test])
# thresholds=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
# for val in thresholds:
#     pred=predictions.copy()
  
#     pred[pred>=val]=1
#     pred[pred<val]=0
  
#     precision = precision_score(y_test, pred, average='micro')
#     recall = recall_score(y_test, pred, average='micro')
#     f1 = f1_score(y_test, pred, average='micro')
   
#     print("Micro-average quality numbers")
#     print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

NameError: ignored