# Imports

In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
from os.path import join

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from imblearn.over_sampling import RandomOverSampler

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.activations import softmax
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Model

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf

# 1. Evaluation of generated questions

## Models used for evaluation

### Original dataset

In [None]:
data_path = "dataset_6.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,question,answer,difficulty,topic
0,What is supervised machine learning?,Supervised learning is a type of machine learn...,beginner,supervised learning
1,What is regression? Which models can you use t...,Regression is a part of supervised ML. Regress...,beginner,supervised learning
2,What is linear regression? When do we use it?,Linear regression is a model that assumes a li...,beginner,supervised learning
3,What are the main assumptions of linear regres...,There are several assumptions of linear regres...,intermediate,supervised learning
4,What’s the normal distribution? Why do we care...,The normal distribution is a continuous probab...,beginner,supervised learning


### Keyword tokenization

In [None]:
class Vocabulary:
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def build_vocabulary(self, captions):
            for caption in captions:
                words = nltk.word_tokenize(caption.lower())
                for word in words:
                    self.add_word(WordNetLemmatizer().lemmatize(word))

In [None]:
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
vocab = Vocabulary()
vocab.build_vocabulary(list(df['question']))

VOCAB_SIZE = len(vocab.word2idx)

word_occurrence = {}
for idx in range(vocab.idx):
    word_occurrence[vocab.idx2word[idx]] = 0

len(word_occurrence)

i = 0
for caption in list(df['question']):
    words = nltk.word_tokenize(caption.lower())
    for word in words:
        new_word = WordNetLemmatizer().lemmatize(word)
        word_occurrence[new_word] = word_occurrence[new_word] + 1

word_occurrence_no_stopwords = {}
for k, v in word_occurrence.items():
    if any(c.isalpha() for c in k) and len(k) > 1 and k not in stopwords.words('english'):
      word_occurrence_no_stopwords[k] = v

sorted_tokens = sorted(word_occurrence_no_stopwords.items(), key=lambda x: x[1], reverse=True)
sorted_tokens = [('<PAD>', 0)] + sorted_tokens

word2idx = {}
idx2word = {}
idx = 0
with open("keywords.txt") as file:
    for line in file:
        line = line.rstrip()
        word = line.split(' ')[0]
        word2idx[word] = idx
        idx2word[idx] = word
        idx = idx + 1

key_vectors = []
vec_len = 10
for text in list(df['question']):
  vec_rep = []
  words = nltk.word_tokenize(text.lower())
  keywords = []
  for word in words:
      new_word = WordNetLemmatizer().lemmatize(word)
      if new_word in word2idx:
        keywords.append(new_word)
        vec_rep.append(word2idx[new_word])
  if len(vec_rep) >= vec_len:
    vec_rep = vec_rep[:vec_len]
  else:
    vec_rep = vec_rep + [0 for x in range(vec_len - len(vec_rep))]
  vec_rep = np.array(vec_rep)
  key_vectors.append(vec_rep)
key_vectors = np.array(key_vectors)
key_vectors

array([[102,  37,  10, ...,   0,   0,   0],
       [  6,   1,  38, ...,   0,   0,   0],
       [  4,   6,   0, ...,   0,   0,   0],
       ...,
       [ 22,  17,  29, ...,   0,   0,   0],
       [ 22,  17,  62, ...,   0,   0,   0],
       [  3,   2,  31, ...,   0,   0,   0]])

In [None]:
def keyword_tokenize(text):
  vec_len = 10
  vec_rep = []
  words = nltk.word_tokenize(text.lower())
  keywords = []
  for word in words:
      new_word = WordNetLemmatizer().lemmatize(word)
      if new_word in word2idx:
        keywords.append(new_word)
        vec_rep.append(word2idx[new_word])
  if len(vec_rep) >= vec_len:
    vec_rep = vec_rep[:vec_len]
  else:
    vec_rep = vec_rep + [0 for x in range(vec_len - len(vec_rep))]

  return np.array(vec_rep)

#### 1. Datascience question classification

In [None]:
def classify(text):
  sequence = keyword_tokenize(text)
  c = 0
  for elm in sequence:
    if elm != 0:
      c = c + 1
  if c >= (len(sequence)/5.0):
    return "yes"
  else:
    return "no"

In [None]:
print('Is "What is supervised machine learning?" a datascience question? --->', classify("What is supervised machine learning?"))
print('Is "Why is the sky blue?" a datascience question? --->', classify("Why is the sky blue?"))

Is "What is supervised machine learning?" a datascience question? ---> yes
Is "Why is the sky blue?" a datascience question? ---> no


Non-data science questions:

In [None]:
data_path = "question_dataset - question_dataset.csv"
df_questions = pd.read_csv(data_path)
df_questions.head()

Unnamed: 0,question
0,"What is the difference between a ""pocket"" and ..."
1,I was wondering if anyone knows of any way tha...
2,How do you like to spend your time when you’re...
3,What would you like to see on this site?
4,I am interested in finding out if there is a w...


In [None]:
c = 0
for elm in df['question']:
  if classify(elm) == 'yes':
    c = c + 1
for elm in df_questions['question']:
  if classify(elm) == 'no':
    c = c + 1
accuracy = c / (len(df['question']) + len(df_questions['question']))
print("Data science question prediction accuracy:", accuracy)

Data science question prediction accuracy: 0.9161676646706587


#### 2. Difficulty classification

In [None]:
df['difficulty'].value_counts()

Unnamed: 0_level_0,count
difficulty,Unnamed: 1_level_1
intermediate,114
beginner,40
advanced,13


In [None]:
targets = [elm for elm in df.loc[:, 'difficulty']]
ratings = pd.Categorical(targets)
targets = to_categorical(ratings.codes)

X_train, X_test, y_train, y_test = train_test_split(key_vectors, targets, stratify=targets, test_size=0.15)

oversampler = RandomOverSampler()
xTrain_oversampled, yTrain_oversampled = oversampler.fit_resample(X_train, y_train)

##### Transformer

In [None]:
class Transformer(Model):
    def __init__(self):
        super().__init__()
        self.embedding = Embedding(5000, 100)
        self.multiheadattention = MultiHeadAttention(8, 3, value_dim=None, dropout=0.05)
        self.layernorm = LayerNormalization()
        self.add = Add()
        self.feedforward = Sequential([Dense(100, activation='relu'), Dense(100, activation='relu'), Dropout(0.1)])
        self.dense1 = Dense(64, activation='relu')
        self.dense2 = Dense(3, activation=softmax)

    def call(self, inputs):
        x1 = self.embedding(inputs)
        attn_output = self.multiheadattention(x1, x1)
        x1 = self.add([x1, attn_output])
        x1 = self.layernorm(x1)
        x1 = self.feedforward(x1)
        x1 = self.add([x1, attn_output])
        x1 = self.layernorm(x1)

        x2 = self.embedding(inputs)
        attn_output = self.multiheadattention(x2, x2)
        x2 = self.add([x2, attn_output])
        x2 = self.layernorm(x2)
        x2 = self.feedforward(x2)
        x2 = self.add([x2, attn_output])
        x2 = self.layernorm(x2)

        attn_output = self.multiheadattention(x2, x1)
        x = self.add([x2, attn_output])
        x = self.layernorm(x)
        x = self.feedforward(x)
        x = self.add([x, attn_output])
        x = self.layernorm(x)

        x = GlobalAveragePooling1D()(x)
        x = self.dense1(x)
        return self.dense2(x)

In [None]:
# The model has to be built first so that the saved weights could be load into it

difficulty_model = Transformer()
difficulty_model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

difficulty_model.fit(xTrain_oversampled, yTrain_oversampled, epochs=10, validation_split=0.15)
print()
print("Prediction on test set")
predicted = difficulty_model.predict(X_test)
s = 0
i = 0
for pred in predicted:
  pred_list = np.ndarray.tolist(pred)
  pred_ind = pred_list.index(max(pred_list))
  if 1 == y_test[i][pred_ind]:
    s = s + 1
  i = i + 1
accuracy = s / len(y_test)
# test accuracy
print("Test accuracy:", accuracy)

In [None]:
# Loading the model with the highest balanced accuracy score on the test set
difficulty_model.load_weights("difficulty_pred_model_transformer_keyword.weights.h5")

In [None]:
def classify_difficulty(text):
  sequence = keyword_tokenize(text)
  predicted = difficulty_model.predict(np.array([sequence]))
  pred_list = np.ndarray.tolist(predicted[0])
  pred_ind = pred_list.index(max(pred_list))
  if pred_ind == 0:
    return 'advanced'
  elif pred_ind == 1:
    return 'beginner'
  elif pred_ind == 2:
    return 'intermediate'

In [None]:
question = "What is supervised machine learning?"
print(question, "difficulty:", classify_difficulty(question))
question = "What is the normal equation?"
print(question, "difficulty:", classify_difficulty(question))
question = "How do you approach tuning parameters in XGBoost or LightGBM?"
print(question, "difficulty:", classify_difficulty(question))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 378ms/step
What is supervised machine learning? difficulty: beginner
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
What is the normal equation? difficulty: intermediate
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
How do you approach tuning parameters in XGBoost or LightGBM? difficulty: advanced


#### 3. Topic classification

In [None]:
df['topic'].value_counts()

Unnamed: 0_level_0,count
topic,Unnamed: 1_level_1
neural networks,34
feature selection,30
classification,21
unsupervised learning,21
supervised learning,20
text classification,14
regularization,13
recommender systems,7
time series,7


In [None]:
targets = [elm for elm in df.loc[:, 'topic']]
ratings = pd.Categorical(targets)
targets = to_categorical(ratings.codes)

X_train, X_test, y_train, y_test = train_test_split(key_vectors, targets, stratify=targets, test_size=0.15)

oversampler = RandomOverSampler()
xTrain_oversampled, yTrain_oversampled = oversampler.fit_resample(X_train, y_train)

##### Transformer

In [None]:
class Transformer2(Model):
    def __init__(self):
        super().__init__()
        self.embedding = Embedding(5000, 100)
        self.multiheadattention = MultiHeadAttention(8, 3, value_dim=None, dropout=0.05)
        self.layernorm = LayerNormalization()
        self.add = Add()
        self.feedforward = Sequential([Dense(100, activation='relu'), Dense(100, activation='relu'), Dropout(0.1)])
        self.dense1 = Dense(64, activation='relu')
        self.dense2 = Dense(9, activation=softmax)

    def call(self, inputs):
        x1 = self.embedding(inputs)
        attn_output = self.multiheadattention(x1, x1)
        x1 = self.add([x1, attn_output])
        x1 = self.layernorm(x1)
        x1 = self.feedforward(x1)
        x1 = self.add([x1, attn_output])
        x1 = self.layernorm(x1)

        x2 = self.embedding(inputs)
        attn_output = self.multiheadattention(x2, x2)
        x2 = self.add([x2, attn_output])
        x2 = self.layernorm(x2)
        x2 = self.feedforward(x2)
        x2 = self.add([x2, attn_output])
        x2 = self.layernorm(x2)

        attn_output = self.multiheadattention(x2, x1)
        x = self.add([x2, attn_output])
        x = self.layernorm(x)
        x = self.feedforward(x)
        x = self.add([x, attn_output])
        x = self.layernorm(x)

        x = GlobalAveragePooling1D()(x)
        x = self.dense1(x)
        return self.dense2(x)

In [None]:
# The model has to be built first so that the saved weights could be load into it

topic_model = Transformer2()
topic_model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

topic_model.fit(xTrain_oversampled, yTrain_oversampled, epochs=10, validation_split=0.15)
print()
print("Prediction on test set")
predicted = topic_model.predict(X_test)
s = 0
i = 0
for pred in predicted:
  pred_list = np.ndarray.tolist(pred)
  pred_ind = pred_list.index(max(pred_list))
  if 1 == y_test[i][pred_ind]:
    s = s + 1
  i = i + 1
accuracy = s / len(y_test)
# test accuracy
print("Test accuracy:", accuracy)

In [None]:
# Loading the model with the highest balanced accuracy score on the test set
topic_model.load_weights("topic_pred_model_transformer_keyword.weights.h5")

In [None]:
def classify_topic(text):
  sequence = keyword_tokenize(text)
  predicted = topic_model.predict(np.array([sequence]))
  pred_list = np.ndarray.tolist(predicted[0])
  pred_ind = pred_list.index(max(pred_list))
  if pred_ind == 0:
    return 'classification'
  elif pred_ind == 1:
    return 'feature selection'
  elif pred_ind == 2:
    return 'neural networks'
  elif pred_ind == 3:
    return 'recommender systems'
  elif pred_ind == 4:
    return 'regularization'
  elif pred_ind == 5:
    return 'supervised learning'
  elif pred_ind == 6:
    return 'text classification'
  elif pred_ind == 7:
    return 'time series'
  elif pred_ind == 8:
    return 'unsupervised learning'

In [None]:
question = "How do you approach tuning parameters in XGBoost or LightGBM?"
print(question, "topic:", classify_topic(question))
question = "Why do we need one-hot encoding?"
print(question, "topic:", classify_topic(question))
question = "How L1 regularization looks like in a linear model?"
print(question, "topic:", classify_topic(question))
question = "What’s pooling in CNN? Why do we need it?"
print(question, "topic:", classify_topic(question))
question = "Possible approaches to solving the cold start problem?"
print(question, "topic:", classify_topic(question))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 384ms/step
How do you approach tuning parameters in XGBoost or LightGBM? topic: feature selection
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Why do we need one-hot encoding? topic: classification
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
How L1 regularization looks like in a linear model? topic: regularization
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
What’s pooling in CNN? Why do we need it? topic: neural networks
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Possible approaches to solving the cold start problem? topic: recommender systems


## Evaluation of general models

In [None]:
class TransformTokenizer:
    def __init__(self, transform):
        self.transform = transform

    def __call__(self, doc):
        return [self.transform(word) for word in nltk.word_tokenize(doc)]

In [None]:
def create_tf_idf(df, all_questions):
  num_of_features = 1000
  corpus = [elm for elm in df.loc[:, 'question']]
  corpus = [doc.lower() for doc in corpus]
  corpus = corpus + all_questions
  stemmer = nltk.stem.PorterStemmer()
  tfidf_vectorizer_stemming = TfidfVectorizer(tokenizer=TransformTokenizer(stemmer.stem), max_df=1.0, min_df=1, max_features=num_of_features)
  bows = tfidf_vectorizer_stemming.fit_transform(corpus)
  bows = [elm.toarray()[0] for elm in bows]
  bows = np.array(bows)
  return bows, corpus

In [None]:
def norm(x):
  sum = 0
  for elm in x:
    sum = sum + elm*elm
  return np.sqrt(sum)

def cosDist(a, b):
  return 1 - ((a @ b) / (norm(a)*norm(b)))

def minDistance(x, bows, corpus):
  min = 10
  for i in range(len(corpus)):
    if x == i:
      continue
    if cosDist(bows[x], bows[i]) < min:
      min = cosDist(bows[x], bows[i])
  return min

### 1. LoRA

In [None]:
checkpoint_list = range(5, 51, 5)
all_questions = []
for checkpoint in checkpoint_list:
  file_name = 'checkpoint-' + str(checkpoint) + ".txt"
  with open(file_name, 'r') as file:
    for line in file:
          line = line.rstrip()
          if line not in list(df['question']) and line not in all_questions:
            all_questions.append(line.lower())

  bows, corpus = create_tf_idf(df, all_questions)

  with open(file_name, 'r') as file:
      questions = []
      num_questions = 0
      num_unique = 0
      num_data_questions = 0
      cos_count = 0
      # num_right_class = 0
      for line in file:
          line = line.rstrip()
          num_questions = num_questions + 1
          ind = corpus.index(line.lower())
          cos_count = cos_count + minDistance(ind, bows, corpus)
          if line not in list(df['question']) and line not in questions:
            questions.append(line)
            num_unique = num_unique + 1
          if classify(line) == "yes":
            num_data_questions = num_data_questions + 1
          #if classify_difficulty(line) == "advanced":
          #  num_right_class = num_right_class + 1
      all_questions = all_questions + [doc.lower() for doc in questions]
      print("-------------------------------------------------")
      print("Model name:", "lora/checkpoint-" + str(checkpoint))
      print("Average cosine distance of the questions:", cos_count/num_questions)
      print('Diversity:', str(num_questions) + '/100,', str(num_questions) + '%')
      print('Uniqueness:', str(num_unique) + '/' + str(num_questions) + ',', str(round(100*(num_unique/num_questions),2)) + '%')
      print('Data science questions:', str(num_data_questions) + '/' + str(num_questions) + ',', str(round(100*(num_data_questions/num_questions),2)) + '%')
      #print('Right class:', str(num_right_class) + '/' + str(num_questions) + ',', str(round(100*(num_right_class/num_questions),2)) + '%')
      print()
      #print('Total:', str(round(100*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions))/3,2)) + '%')
      print('Total:', str(round(100*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (cos_count/num_questions))/4,2)) + '%')
      #print('Total:', str(round(25*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions)),2)) + '%')
      print()
      print()

-------------------------------------------------
Model name: lora/checkpoint-5
Average cosine distance of the questions: 0.2699890563629779
Diversity: 85/100, 85%
Uniqueness: 85/85, 100.0%
Data science questions: 74/85, 87.06%

Total: 74.76%


-------------------------------------------------
Model name: lora/checkpoint-10
Average cosine distance of the questions: 0.18144549174828925
Diversity: 38/100, 38%
Uniqueness: 33/38, 86.84%
Data science questions: 35/38, 92.11%

Total: 58.77%


-------------------------------------------------
Model name: lora/checkpoint-15
Average cosine distance of the questions: 0.13940076286049363
Diversity: 28/100, 28%
Uniqueness: 16/28, 57.14%
Data science questions: 27/28, 96.43%

Total: 48.88%


-------------------------------------------------
Model name: lora/checkpoint-20
Average cosine distance of the questions: 0.21947975762568286
Diversity: 24/100, 24%
Uniqueness: 10/24, 41.67%
Data science questions: 22/24, 91.67%

Total: 44.82%


--------------

### 2. P-tuning

In [None]:
checkpoint_list = range(38, 381, 38)
all_questions = []
for checkpoint in checkpoint_list:
  file_name = 'checkpoint-' + str(checkpoint) + ".txt"
  with open(file_name, 'r') as file:
    for line in file:
          line = line.rstrip()
          if line not in list(df['question']) and line not in all_questions:
            all_questions.append(line.lower())

  bows, corpus = create_tf_idf(df, all_questions)

  with open(file_name, 'r') as file:
      questions = []
      num_questions = 0
      num_unique = 0
      num_data_questions = 0
      cos_count = 0
      # num_right_class = 0
      for line in file:
          line = line.rstrip()
          num_questions = num_questions + 1
          ind = corpus.index(line.lower())
          cos_count = cos_count + minDistance(ind, bows, corpus)
          if line not in list(df['question']) and line not in questions:
            questions.append(line)
            num_unique = num_unique + 1
          if classify(line) == "yes":
            num_data_questions = num_data_questions + 1
          #if classify_difficulty(line) == "advanced":
          #  num_right_class = num_right_class + 1

      print("-------------------------------------------------")
      print("Model name:", "p-tuning/checkpoint-" + str(checkpoint))
      print("Average cosine distance of the questions:", cos_count/num_questions)
      print('Diversity:', str(num_questions) + '/100,', str(num_questions) + '%')
      print('Uniqueness:', str(num_unique) + '/' + str(num_questions) + ',', str(round(100*(num_unique/num_questions),2)) + '%')
      print('Data science questions:', str(num_data_questions) + '/' + str(num_questions) + ',', str(round(100*(num_data_questions/num_questions),2)) + '%')
      #print('Right class:', str(num_right_class) + '/' + str(num_questions) + ',', str(round(100*(num_right_class/num_questions),2)) + '%')
      print()
      #print('Total:', str(round(100*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions))/3,2)) + '%')
      print('Total:', str(round(100*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (cos_count/num_questions))/4,2)) + '%')
      #print('Total:', str(round(25*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions)),2)) + '%')
      print()
      print()

-------------------------------------------------
Model name: p-tuning/checkpoint-38
Average cosine distance of the questions: 0.4743797718640525
Diversity: 64/100, 64%
Uniqueness: 62/64, 96.88%
Data science questions: 1/64, 1.56%

Total: 52.47%


-------------------------------------------------
Model name: p-tuning/checkpoint-76
Average cosine distance of the questions: 0.2863181196069845
Diversity: 98/100, 98%
Uniqueness: 98/98, 100.0%
Data science questions: 48/98, 48.98%

Total: 68.9%


-------------------------------------------------
Model name: p-tuning/checkpoint-114
Average cosine distance of the questions: 0.17136301375159474
Diversity: 91/100, 91%
Uniqueness: 91/91, 100.0%
Data science questions: 51/91, 56.04%

Total: 66.05%


-------------------------------------------------
Model name: p-tuning/checkpoint-152
Average cosine distance of the questions: 0.17685976050233126
Diversity: 92/100, 92%
Uniqueness: 92/92, 100.0%
Data science questions: 48/92, 52.17%

Total: 65.46%



### 3. Prefix tuning

In [None]:
checkpoint_list = range(38, 381, 38)
all_questions = []
for checkpoint in checkpoint_list:
  file_name = 'checkpoint-' + str(checkpoint) + ".txt"
  with open(file_name, 'r') as file:
    for line in file:
          line = line.rstrip()
          if line not in list(df['question']) and line not in all_questions:
            all_questions.append(line.lower())

  bows, corpus = create_tf_idf(df, all_questions)

  with open(file_name, 'r') as file:
      questions = []
      num_questions = 0
      num_unique = 0
      num_data_questions = 0
      cos_count = 0
      # num_right_class = 0
      for line in file:
          line = line.rstrip()
          num_questions = num_questions + 1
          ind = corpus.index(line.lower())
          cos_count = cos_count + minDistance(ind, bows, corpus)
          if line not in list(df['question']) and line not in questions:
            questions.append(line)
            num_unique = num_unique + 1
          if classify(line) == "yes":
            num_data_questions = num_data_questions + 1
          #if classify_difficulty(line) == "advanced":
          #  num_right_class = num_right_class + 1

      print("-------------------------------------------------")
      print("Model name:", "prefix/checkpoint-" + str(checkpoint))
      print("Average cosine distance of the questions:", cos_count/num_questions)
      print('Diversity:', str(num_questions) + '/100,', str(num_questions) + '%')
      print('Uniqueness:', str(num_unique) + '/' + str(num_questions) + ',', str(round(100*(num_unique/num_questions),2)) + '%')
      print('Data science questions:', str(num_data_questions) + '/' + str(num_questions) + ',', str(round(100*(num_data_questions/num_questions),2)) + '%')
      #print('Right class:', str(num_right_class) + '/' + str(num_questions) + ',', str(round(100*(num_right_class/num_questions),2)) + '%')
      print()
      #print('Total:', str(round(100*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions))/3,2)) + '%')
      print('Total:', str(round(100*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (cos_count/num_questions))/4,2)) + '%')
      #print('Total:', str(round(25*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions)),2)) + '%')
      print()
      print()

-------------------------------------------------
Model name: prefix/checkpoint-38
Average cosine distance of the questions: 0.25178050478247144
Diversity: 42/100, 42%
Uniqueness: 42/42, 100.0%
Data science questions: 0/42, 0.0%

Total: 41.79%


-------------------------------------------------
Model name: prefix/checkpoint-76
Average cosine distance of the questions: 0.14976286791254795
Diversity: 78/100, 78%
Uniqueness: 78/78, 100.0%
Data science questions: 6/78, 7.69%

Total: 50.17%


-------------------------------------------------
Model name: prefix/checkpoint-114
Average cosine distance of the questions: 0.11784262015223605
Diversity: 68/100, 68%
Uniqueness: 68/68, 100.0%
Data science questions: 9/68, 13.24%

Total: 48.25%


-------------------------------------------------
Model name: prefix/checkpoint-152
Average cosine distance of the questions: 0.11923519321980965
Diversity: 81/100, 81%
Uniqueness: 81/81, 100.0%
Data science questions: 14/81, 17.28%

Total: 52.55%


--------

## Evaluation of difficulty models

### 1. Advanced difficulty

In [None]:
checkpoint_list = range(1, 11, 1)
all_questions = []
for checkpoint in checkpoint_list:
  file_name = 'checkpoint-' + str(checkpoint) + ".txt"
  with open(file_name, 'r') as file:
    for line in file:
          line = line.rstrip()
          if line not in list(df['question']) and line not in all_questions:
            all_questions.append(line.lower())

  bows, corpus = create_tf_idf(df, all_questions)

  with open(file_name, 'r') as file:
      questions = []
      num_questions = 0
      num_unique = 0
      num_data_questions = 0
      num_right_class = 0
      cos_count = 0
      for line in file:
          line = line.rstrip()
          num_questions = num_questions + 1
          ind = corpus.index(line.lower())
          cos_count = cos_count + minDistance(ind, bows, corpus)
          if line not in list(df['question']) and line not in questions:
            questions.append(line)
            num_unique = num_unique + 1
          if classify(line) == "yes":
            num_data_questions = num_data_questions + 1
          if classify_difficulty(line) == "advanced":
            num_right_class = num_right_class + 1

      print("-------------------------------------------------")
      print("Model name:", "advanced/lora/checkpoint-" + str(checkpoint))
      print("Average cosine distance of the questions:", cos_count/num_questions)
      print('Diversity:', str(num_questions) + '/100,', str(num_questions) + '%')
      print('Uniqueness:', str(num_unique) + '/' + str(num_questions) + ',', str(round(100*(num_unique/num_questions),2)) + '%')
      print('Data science questions:', str(num_data_questions) + '/' + str(num_questions) + ',', str(round(100*(num_data_questions/num_questions),2)) + '%')
      print('Right class:', str(num_right_class) + '/' + str(num_questions) + ',', str(round(100*(num_right_class/num_questions),2)) + '%')
      print()
      #print('Total:', str(round(100*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions))/3,2)) + '%')
      print('Total:', str(round(25*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions)),2)) + '%')
      print('Total:', str(round(20*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions) + (cos_count/num_questions)),2)) + '%')
      print()
      print()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 233ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

### 2. Beginner difficulty

In [None]:
checkpoint_list = range(2, 21, 2)
all_questions = []
for checkpoint in checkpoint_list:
  file_name = 'checkpoint-' + str(checkpoint) + ".txt"
  with open(file_name, 'r') as file:
    for line in file:
          line = line.rstrip()
          if line not in list(df['question']) and line not in all_questions:
            all_questions.append(line.lower())

  bows, corpus = create_tf_idf(df, all_questions)

  with open(file_name, 'r') as file:
      questions = []
      num_questions = 0
      num_unique = 0
      num_data_questions = 0
      num_right_class = 0
      cos_count = 0
      for line in file:
          line = line.rstrip()
          num_questions = num_questions + 1
          ind = corpus.index(line.lower())
          cos_count = cos_count + minDistance(ind, bows, corpus)
          if line not in list(df['question']) and line not in questions:
            questions.append(line)
            num_unique = num_unique + 1
          if classify(line) == "yes":
            num_data_questions = num_data_questions + 1
          if classify_difficulty(line) == "beginner":
            num_right_class = num_right_class + 1

      print("-------------------------------------------------")
      print("Model name:", "advanced/lora/checkpoint-" + str(checkpoint))
      print("Average cosine distance of the questions:", cos_count/num_questions)
      print('Diversity:', str(num_questions) + '/100,', str(num_questions) + '%')
      print('Uniqueness:', str(num_unique) + '/' + str(num_questions) + ',', str(round(100*(num_unique/num_questions),2)) + '%')
      print('Data science questions:', str(num_data_questions) + '/' + str(num_questions) + ',', str(round(100*(num_data_questions/num_questions),2)) + '%')
      print('Right class:', str(num_right_class) + '/' + str(num_questions) + ',', str(round(100*(num_right_class/num_questions),2)) + '%')
      print()
      #print('Total:', str(round(100*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions))/3,2)) + '%')
      print('Total:', str(round(25*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions)),2)) + '%')
      print('Total:', str(round(20*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions) + (cos_count/num_questions)),2)) + '%')
      print()
      print()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 276ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m

### 3. Intermediate difficulty

In [None]:
checkpoint_list = range(4, 41, 4)
all_questions = []
for checkpoint in checkpoint_list:
  file_name = 'checkpoint-' + str(checkpoint) + ".txt"
  with open(file_name, 'r') as file:
    for line in file:
          line = line.rstrip()
          if line not in list(df['question']) and line not in all_questions:
            all_questions.append(line.lower())

  bows, corpus = create_tf_idf(df, all_questions)

  with open(file_name, 'r') as file:
      questions = []
      num_questions = 0
      num_unique = 0
      num_data_questions = 0
      num_right_class = 0
      cos_count = 0
      for line in file:
          line = line.rstrip()
          num_questions = num_questions + 1
          ind = corpus.index(line.lower())
          cos_count = cos_count + minDistance(ind, bows, corpus)
          if line not in list(df['question']) and line not in questions:
            questions.append(line)
            num_unique = num_unique + 1
          if classify(line) == "yes":
            num_data_questions = num_data_questions + 1
          if classify_difficulty(line) == "intermediate":
            num_right_class = num_right_class + 1

      print("-------------------------------------------------")
      print("Model name:", "advanced/lora/checkpoint-" + str(checkpoint))
      print("Average cosine distance of the questions:", cos_count/num_questions)
      print('Diversity:', str(num_questions) + '/100,', str(num_questions) + '%')
      print('Uniqueness:', str(num_unique) + '/' + str(num_questions) + ',', str(round(100*(num_unique/num_questions),2)) + '%')
      print('Data science questions:', str(num_data_questions) + '/' + str(num_questions) + ',', str(round(100*(num_data_questions/num_questions),2)) + '%')
      print('Right class:', str(num_right_class) + '/' + str(num_questions) + ',', str(round(100*(num_right_class/num_questions),2)) + '%')
      print()
      #print('Total:', str(round(100*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions))/3,2)) + '%')
      print('Total:', str(round(25*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions)),2)) + '%')
      print('Total:', str(round(20*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions) + (cos_count/num_questions)),2)) + '%')
      print()
      print()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 245ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 160ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m

## Evaluation of topic models

### 1. Classification topic

In [None]:
topic = 'classification'
checkpoint_list = list(range(1,6))
all_questions = []
for checkpoint in checkpoint_list:
  file_name = 'checkpoint-' + str(checkpoint) + ".txt"
  with open(file_name, 'r') as file:
    for line in file:
          line = line.rstrip()
          if line not in list(df['question']) and line not in all_questions:
            all_questions.append(line.lower())

  bows, corpus = create_tf_idf(df, all_questions)

  with open(file_name, 'r') as file:
      questions = []
      num_questions = 0
      num_unique = 0
      num_data_questions = 0
      num_right_class = 0
      cos_count = 0
      for line in file:
          line = line.rstrip()
          num_questions = num_questions + 1
          ind = corpus.index(line.lower())
          cos_count = cos_count + minDistance(ind, bows, corpus)
          if line not in list(df['question']) and line not in questions:
            questions.append(line)
            num_unique = num_unique + 1
          if classify(line) == "yes":
            num_data_questions = num_data_questions + 1
          if classify_topic(line) == topic:
            num_right_class = num_right_class + 1

      print("-------------------------------------------------")
      print("Model name:", "topic/"+topic+"/checkpoint-" + str(checkpoint))
      print("Average cosine distance of the questions:", cos_count/num_questions)
      print('Diversity:', str(num_questions) + '/100,', str(num_questions) + '%')
      print('Uniqueness:', str(num_unique) + '/' + str(num_questions) + ',', str(round(100*(num_unique/num_questions),2)) + '%')
      print('Data science questions:', str(num_data_questions) + '/' + str(num_questions) + ',', str(round(100*(num_data_questions/num_questions),2)) + '%')
      print('Right class:', str(num_right_class) + '/' + str(num_questions) + ',', str(round(100*(num_right_class/num_questions),2)) + '%')
      print()
      #print('Total:', str(round(100*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions))/3,2)) + '%')
      print('Total:', str(round(25*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions)),2)) + '%')
      print('Total:', str(round(20*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions) + (cos_count/num_questions)),2)) + '%')
      print()
      print()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34

### 2. Feature selection

In [None]:
topic = 'feature selection'
checkpoint_list = list(range(1,6))
all_questions = []
for checkpoint in checkpoint_list:
  file_name = 'checkpoint-' + str(checkpoint) + ".txt"
  with open(file_name, 'r') as file:
    for line in file:
          line = line.rstrip()
          if line not in list(df['question']) and line not in all_questions:
            all_questions.append(line.lower())

  bows, corpus = create_tf_idf(df, all_questions)

  with open(file_name, 'r') as file:
      questions = []
      num_questions = 0
      num_unique = 0
      num_data_questions = 0
      num_right_class = 0
      cos_count = 0
      for line in file:
          line = line.rstrip()
          num_questions = num_questions + 1
          ind = corpus.index(line.lower())
          cos_count = cos_count + minDistance(ind, bows, corpus)
          if line not in list(df['question']) and line not in questions:
            questions.append(line)
            num_unique = num_unique + 1
          if classify(line) == "yes":
            num_data_questions = num_data_questions + 1
          if classify_topic(line) == topic:
            num_right_class = num_right_class + 1

      print("-------------------------------------------------")
      print("Model name:", "topic/"+topic+"/checkpoint-" + str(checkpoint))
      print("Average cosine distance of the questions:", cos_count/num_questions)
      print('Diversity:', str(num_questions) + '/100,', str(num_questions) + '%')
      print('Uniqueness:', str(num_unique) + '/' + str(num_questions) + ',', str(round(100*(num_unique/num_questions),2)) + '%')
      print('Data science questions:', str(num_data_questions) + '/' + str(num_questions) + ',', str(round(100*(num_data_questions/num_questions),2)) + '%')
      print('Right class:', str(num_right_class) + '/' + str(num_questions) + ',', str(round(100*(num_right_class/num_questions),2)) + '%')
      print()
      #print('Total:', str(round(100*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions))/3,2)) + '%')
      print('Total:', str(round(25*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions)),2)) + '%')
      print('Total:', str(round(20*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions) + (cos_count/num_questions)),2)) + '%')
      print()
      print()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39

### 3. Neural networks

In [None]:
topic = 'neural networks'
checkpoint_list = range(1, 6, 1)
all_questions = []
for checkpoint in checkpoint_list:
  file_name = 'checkpoint-' + str(checkpoint) + ".txt"
  with open(file_name, 'r') as file:
    for line in file:
          line = line.rstrip()
          if line not in list(df['question']) and line not in all_questions:
            all_questions.append(line.lower())

  bows, corpus = create_tf_idf(df, all_questions)

  with open(file_name, 'r') as file:
      questions = []
      num_questions = 0
      num_unique = 0
      num_data_questions = 0
      num_right_class = 0
      cos_count = 0
      for line in file:
          line = line.rstrip()
          num_questions = num_questions + 1
          ind = corpus.index(line.lower())
          cos_count = cos_count + minDistance(ind, bows, corpus)
          if line not in list(df['question']) and line not in questions:
            questions.append(line)
            num_unique = num_unique + 1
          if classify(line) == "yes":
            num_data_questions = num_data_questions + 1
          if classify_topic(line) == topic:
            num_right_class = num_right_class + 1

      print("-------------------------------------------------")
      print("Model name:", "topic/"+topic+"/checkpoint-" + str(checkpoint))
      print("Average cosine distance of the questions:", cos_count/num_questions)
      print('Diversity:', str(num_questions) + '/100,', str(num_questions) + '%')
      print('Uniqueness:', str(num_unique) + '/' + str(num_questions) + ',', str(round(100*(num_unique/num_questions),2)) + '%')
      print('Data science questions:', str(num_data_questions) + '/' + str(num_questions) + ',', str(round(100*(num_data_questions/num_questions),2)) + '%')
      print('Right class:', str(num_right_class) + '/' + str(num_questions) + ',', str(round(100*(num_right_class/num_questions),2)) + '%')
      print()
      #print('Total:', str(round(100*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions))/3,2)) + '%')
      print('Total:', str(round(25*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions)),2)) + '%')
      print('Total:', str(round(20*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions) + (cos_count/num_questions)),2)) + '%')
      print()
      print()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 204ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 202ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0

### 4. Recommender systems

In [None]:
topic = 'recommender systems'
checkpoint_list = range(1, 6, 1)
all_questions = []
for checkpoint in checkpoint_list:
  file_name = 'checkpoint-' + str(checkpoint) + ".txt"
  with open(file_name, 'r') as file:
    for line in file:
          line = line.rstrip()
          if line not in list(df['question']) and line not in all_questions:
            all_questions.append(line.lower())

  bows, corpus = create_tf_idf(df, all_questions)

  with open(file_name, 'r') as file:
      questions = []
      num_questions = 0
      num_unique = 0
      num_data_questions = 0
      num_right_class = 0
      cos_count = 0
      for line in file:
          line = line.rstrip()
          num_questions = num_questions + 1
          ind = corpus.index(line.lower())
          cos_count = cos_count + minDistance(ind, bows, corpus)
          if line not in list(df['question']) and line not in questions:
            questions.append(line)
            num_unique = num_unique + 1
          if classify(line) == "yes":
            num_data_questions = num_data_questions + 1
          if classify_topic(line) == topic:
            num_right_class = num_right_class + 1

      print("-------------------------------------------------")
      print("Model name:", "topic/"+topic+"/checkpoint-" + str(checkpoint))
      print("Average cosine distance of the questions:", cos_count/num_questions)
      print('Diversity:', str(num_questions) + '/100,', str(num_questions) + '%')
      print('Uniqueness:', str(num_unique) + '/' + str(num_questions) + ',', str(round(100*(num_unique/num_questions),2)) + '%')
      print('Data science questions:', str(num_data_questions) + '/' + str(num_questions) + ',', str(round(100*(num_data_questions/num_questions),2)) + '%')
      print('Right class:', str(num_right_class) + '/' + str(num_questions) + ',', str(round(100*(num_right_class/num_questions),2)) + '%')
      print()
      #print('Total:', str(round(100*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions))/3,2)) + '%')
      print('Total:', str(round(25*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions)),2)) + '%')
      print('Total:', str(round(20*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions) + (cos_count/num_questions)),2)) + '%')
      print()
      print()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0

### 5. Regularization

In [None]:
topic = 'regularization'
checkpoint_list = range(1, 6, 1)
all_questions = []
for checkpoint in checkpoint_list:
  file_name = 'checkpoint-' + str(checkpoint) + ".txt"
  with open(file_name, 'r') as file:
    for line in file:
          line = line.rstrip()
          if line not in list(df['question']) and line not in all_questions:
            all_questions.append(line.lower())

  bows, corpus = create_tf_idf(df, all_questions)

  with open(file_name, 'r') as file:
      questions = []
      num_questions = 0
      num_unique = 0
      num_data_questions = 0
      num_right_class = 0
      cos_count = 0
      for line in file:
          line = line.rstrip()
          num_questions = num_questions + 1
          ind = corpus.index(line.lower())
          cos_count = cos_count + minDistance(ind, bows, corpus)
          if line not in list(df['question']) and line not in questions:
            questions.append(line)
            num_unique = num_unique + 1
          if classify(line) == "yes":
            num_data_questions = num_data_questions + 1
          if classify_topic(line) == topic:
            num_right_class = num_right_class + 1

      print("-------------------------------------------------")
      print("Model name:", "topic/"+topic+"/checkpoint-" + str(checkpoint))
      print("Average cosine distance of the questions:", cos_count/num_questions)
      print('Diversity:', str(num_questions) + '/100,', str(num_questions) + '%')
      print('Uniqueness:', str(num_unique) + '/' + str(num_questions) + ',', str(round(100*(num_unique/num_questions),2)) + '%')
      print('Data science questions:', str(num_data_questions) + '/' + str(num_questions) + ',', str(round(100*(num_data_questions/num_questions),2)) + '%')
      print('Right class:', str(num_right_class) + '/' + str(num_questions) + ',', str(round(100*(num_right_class/num_questions),2)) + '%')
      print()
      #print('Total:', str(round(100*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions))/3,2)) + '%')
      print('Total:', str(round(25*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions)),2)) + '%')
      print('Total:', str(round(20*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions) + (cos_count/num_questions)),2)) + '%')
      print()
      print()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8

### 6. Supervised learning

In [None]:
topic = 'supervised learning'
checkpoint_list = range(1, 6, 1)
all_questions = []
for checkpoint in checkpoint_list:
  file_name = 'checkpoint-' + str(checkpoint) + ".txt"
  with open(file_name, 'r') as file:
    for line in file:
          line = line.rstrip()
          if line not in list(df['question']) and line not in all_questions:
            all_questions.append(line.lower())

  bows, corpus = create_tf_idf(df, all_questions)

  with open(file_name, 'r') as file:
      questions = []
      num_questions = 0
      num_unique = 0
      num_data_questions = 0
      num_right_class = 0
      cos_count = 0
      for line in file:
          line = line.rstrip()
          num_questions = num_questions + 1
          ind = corpus.index(line.lower())
          cos_count = cos_count + minDistance(ind, bows, corpus)
          if line not in list(df['question']) and line not in questions:
            questions.append(line)
            num_unique = num_unique + 1
          if classify(line) == "yes":
            num_data_questions = num_data_questions + 1
          if classify_topic(line) == topic:
            num_right_class = num_right_class + 1

      print("-------------------------------------------------")
      print("Model name:", "topic/"+topic+"/checkpoint-" + str(checkpoint))
      print("Average cosine distance of the questions:", cos_count/num_questions)
      print('Diversity:', str(num_questions) + '/100,', str(num_questions) + '%')
      print('Uniqueness:', str(num_unique) + '/' + str(num_questions) + ',', str(round(100*(num_unique/num_questions),2)) + '%')
      print('Data science questions:', str(num_data_questions) + '/' + str(num_questions) + ',', str(round(100*(num_data_questions/num_questions),2)) + '%')
      print('Right class:', str(num_right_class) + '/' + str(num_questions) + ',', str(round(100*(num_right_class/num_questions),2)) + '%')
      print()
      #print('Total:', str(round(100*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions))/3,2)) + '%')
      print('Total:', str(round(25*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions)),2)) + '%')
      print('Total:', str(round(20*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions) + (cos_count/num_questions)),2)) + '%')
      print()
      print()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 256ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 189ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[

### 7. Text classification

In [None]:
topic = 'text classification'
checkpoint_list = range(1, 6, 1)
all_questions = []
for checkpoint in checkpoint_list:
  file_name = 'checkpoint-' + str(checkpoint) + ".txt"
  with open(file_name, 'r') as file:
    for line in file:
          line = line.rstrip()
          if line not in list(df['question']) and line not in all_questions:
            all_questions.append(line.lower())

  bows, corpus = create_tf_idf(df, all_questions)

  with open(file_name, 'r') as file:
      questions = []
      num_questions = 0
      num_unique = 0
      num_data_questions = 0
      num_right_class = 0
      cos_count = 0
      for line in file:
          line = line.rstrip()
          num_questions = num_questions + 1
          ind = corpus.index(line.lower())
          cos_count = cos_count + minDistance(ind, bows, corpus)
          if line not in list(df['question']) and line not in questions:
            questions.append(line)
            num_unique = num_unique + 1
          if classify(line) == "yes":
            num_data_questions = num_data_questions + 1
          if classify_topic(line) == topic:
            num_right_class = num_right_class + 1

      print("-------------------------------------------------")
      print("Model name:", "topic/"+topic+"/checkpoint-" + str(checkpoint))
      print("Average cosine distance of the questions:", cos_count/num_questions)
      print('Diversity:', str(num_questions) + '/100,', str(num_questions) + '%')
      print('Uniqueness:', str(num_unique) + '/' + str(num_questions) + ',', str(round(100*(num_unique/num_questions),2)) + '%')
      print('Data science questions:', str(num_data_questions) + '/' + str(num_questions) + ',', str(round(100*(num_data_questions/num_questions),2)) + '%')
      print('Right class:', str(num_right_class) + '/' + str(num_questions) + ',', str(round(100*(num_right_class/num_questions),2)) + '%')
      print()
      #print('Total:', str(round(100*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions))/3,2)) + '%')
      print('Total:', str(round(25*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions)),2)) + '%')
      print('Total:', str(round(20*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions) + (cos_count/num_questions)),2)) + '%')
      print()
      print()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

### 8. Time series

In [None]:
topic = 'time series'
checkpoint_list = range(1, 6, 1)
all_questions = []
for checkpoint in checkpoint_list:
  file_name = 'checkpoint-' + str(checkpoint) + ".txt"
  with open(file_name, 'r') as file:
    for line in file:
          line = line.rstrip()
          if line not in list(df['question']) and line not in all_questions:
            all_questions.append(line.lower())

  bows, corpus = create_tf_idf(df, all_questions)

  with open(file_name, 'r') as file:
      questions = []
      num_questions = 0
      num_unique = 0
      num_data_questions = 0
      num_right_class = 0
      cos_count = 0
      for line in file:
          line = line.rstrip()
          num_questions = num_questions + 1
          ind = corpus.index(line.lower())
          cos_count = cos_count + minDistance(ind, bows, corpus)
          if line not in list(df['question']) and line not in questions:
            questions.append(line)
            num_unique = num_unique + 1
          if classify(line) == "yes":
            num_data_questions = num_data_questions + 1
          if classify_topic(line) == topic:
            num_right_class = num_right_class + 1

      print("-------------------------------------------------")
      print("Model name:", "topic/"+topic+"/checkpoint-" + str(checkpoint))
      print("Average cosine distance of the questions:", cos_count/num_questions)
      print('Diversity:', str(num_questions) + '/100,', str(num_questions) + '%')
      print('Uniqueness:', str(num_unique) + '/' + str(num_questions) + ',', str(round(100*(num_unique/num_questions),2)) + '%')
      print('Data science questions:', str(num_data_questions) + '/' + str(num_questions) + ',', str(round(100*(num_data_questions/num_questions),2)) + '%')
      print('Right class:', str(num_right_class) + '/' + str(num_questions) + ',', str(round(100*(num_right_class/num_questions),2)) + '%')
      print()
      #print('Total:', str(round(100*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions))/3,2)) + '%')
      print('Total:', str(round(25*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions)),2)) + '%')
      print('Total:', str(round(20*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions) + (cos_count/num_questions)),2)) + '%')
      print()
      print()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 212ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 257ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

### 9. Unsupervised learning

In [None]:
topic = 'unsupervised learning'
checkpoint_list = range(1, 6, 1)
all_questions = []
for checkpoint in checkpoint_list:
  file_name = 'checkpoint-' + str(checkpoint) + ".txt"
  with open(file_name, 'r') as file:
    for line in file:
          line = line.rstrip()
          if line not in list(df['question']) and line not in all_questions:
            all_questions.append(line.lower())

  bows, corpus = create_tf_idf(df, all_questions)

  with open(file_name, 'r') as file:
      questions = []
      num_questions = 0
      num_unique = 0
      num_data_questions = 0
      num_right_class = 0
      cos_count = 0
      for line in file:
          line = line.rstrip()
          num_questions = num_questions + 1
          ind = corpus.index(line.lower())
          cos_count = cos_count + minDistance(ind, bows, corpus)
          if line not in list(df['question']) and line not in questions:
            questions.append(line)
            num_unique = num_unique + 1
          if classify(line) == "yes":
            num_data_questions = num_data_questions + 1
          if classify_topic(line) == topic:
            num_right_class = num_right_class + 1

      print("-------------------------------------------------")
      print("Model name:", "topic/"+topic+"/checkpoint-" + str(checkpoint))
      print("Average cosine distance of the questions:", cos_count/num_questions)
      print('Diversity:', str(num_questions) + '/100,', str(num_questions) + '%')
      print('Uniqueness:', str(num_unique) + '/' + str(num_questions) + ',', str(round(100*(num_unique/num_questions),2)) + '%')
      print('Data science questions:', str(num_data_questions) + '/' + str(num_questions) + ',', str(round(100*(num_data_questions/num_questions),2)) + '%')
      print('Right class:', str(num_right_class) + '/' + str(num_questions) + ',', str(round(100*(num_right_class/num_questions),2)) + '%')
      print()
      #print('Total:', str(round(100*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions))/3,2)) + '%')
      print('Total:', str(round(25*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions)),2)) + '%')
      print('Total:', str(round(20*((num_questions/100) + (num_unique/num_questions) + (num_data_questions/num_questions) + (num_right_class/num_questions) + (cos_count/num_questions)),2)) + '%')
      print()
      print()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 145ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0

# 2. Extended dataset creation

In [None]:
data_path = "dataset_extended_difficulty.csv"
df = pd.read_csv(data_path)

class Vocabulary:
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def build_vocabulary(self, captions):
            for caption in captions:
                words = nltk.word_tokenize(caption.lower())
                for word in words:
                    self.add_word(WordNetLemmatizer().lemmatize(word))

vocab = Vocabulary()
vocab.build_vocabulary(list(df['question']))
VOCAB_SIZE = len(vocab.word2idx)

word_occurrence = {}
for idx in range(vocab.idx):
    word_occurrence[vocab.idx2word[idx]] = 0

i = 0
for caption in list(df['question']):
    words = nltk.word_tokenize(caption.lower())
    for word in words:
        new_word = WordNetLemmatizer().lemmatize(word)
        word_occurrence[new_word] = word_occurrence[new_word] + 1

word_occurrence_no_stopwords = {}
for k, v in word_occurrence.items():
    if any(c.isalpha() for c in k) and len(k) > 1 and k not in stopwords.words('english'):
      word_occurrence_no_stopwords[k] = v

sorted_tokens = sorted(word_occurrence_no_stopwords.items(), key=lambda x: x[1], reverse=True)
sorted_tokens = [('<PAD>', 0)] + sorted_tokens

word2idx = {}
idx2word = {}
idx = 0
with open("keywords.txt") as file:
    #for word, occurence in sorted_tokens:
    for line in file:
        line = line.rstrip()
        word = line.split(' ')[0]
        word2idx[word] = idx
        idx2word[idx] = word
        idx = idx + 1

key_vectors = []
vec_len = 10
for text in list(df['question']):
  vec_rep = []
  words = nltk.word_tokenize(text.lower())
  keywords = []
  for word in words:
      new_word = WordNetLemmatizer().lemmatize(word)
      if new_word in word2idx:
        keywords.append(new_word)
        vec_rep.append(word2idx[new_word])
  if len(vec_rep) >= vec_len:
    vec_rep = vec_rep[:vec_len]
  else:
    vec_rep = vec_rep + [0 for x in range(vec_len - len(vec_rep))]

  vec_rep = np.array(vec_rep)
  key_vectors.append(vec_rep)
key_vectors = np.array(key_vectors)

print(key_vectors)
print()

targets = [elm for elm in df.loc[:, 'difficulty']]
ratings = pd.Categorical(targets)
targets = to_categorical(ratings.codes)

X_train, X_test, y_train, y_test = train_test_split(key_vectors, targets, stratify=targets, test_size=0.15)

oversampler = RandomOverSampler()
xTrain_oversampled, yTrain_oversampled = oversampler.fit_resample(X_train, y_train)

print("Original data: advanced: " + str(len([x for x in y_train if x[0] == 1])) + ", beginner: " + str(len([x for x in y_train if x[1] == 1])) + ", intermediate: " + str(len([x for x in y_train if x[2] == 1])))
print("Oversampled data: advanced: " + str(len([x for x in yTrain_oversampled if x[0] == 1])) + ", beginner: " + str(len([x for x in yTrain_oversampled if x[1] == 1])) + ", intermediate: " + str(len([x for x in yTrain_oversampled if x[2] == 1])))

[[  9  34  19 ...   0   0   0]
 [ 29  22   0 ...   0   0   0]
 [148  60  88 ...   0   0   0]
 ...
 [179  16  16 ...   0   0   0]
 [146 147  38 ...   0   0   0]
 [  4   6   0 ...   0   0   0]]

Original data: advanced: 102, beginner: 102, intermediate: 119
Oversampled data: advanced: 119, beginner: 119, intermediate: 119


In [None]:
difficulty_model = Sequential([
    Embedding(200, 64),
    LSTM(units=100, dropout=0.1),
    Dense(64, activation='relu'),
    Dense(3, activation=softmax)
])

print("Training of model: ")
print()
difficulty_model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
difficulty_model.fit(xTrain_oversampled, yTrain_oversampled, epochs=10, validation_split=0.15)

difficulty_model.load_weights("difficulty_pred_model_extended_lstm_2.weights.h5")

Training of model: 

Epoch 1/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 135ms/step - accuracy: 0.3596 - loss: 1.0984 - val_accuracy: 0.1296 - val_loss: 1.1170
Epoch 2/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 46ms/step - accuracy: 0.4227 - loss: 1.0845 - val_accuracy: 0.3704 - val_loss: 1.0854
Epoch 3/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 0.5172 - loss: 1.0393 - val_accuracy: 0.8333 - val_loss: 0.7935
Epoch 4/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.6308 - loss: 0.7417 - val_accuracy: 0.8519 - val_loss: 0.4562
Epoch 5/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.7909 - loss: 0.5452 - val_accuracy: 0.9074 - val_loss: 0.2772
Epoch 6/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 70ms/step - accuracy: 0.8787 - loss: 0.3431 - val_accuracy: 0.9259 - val_loss: 0.2345
Epoch 7/10


In [None]:
def classify_difficulty(text):
  sequence = keyword_tokenize(text)
  predicted = difficulty_model.predict(np.array([sequence]))
  pred_list = np.ndarray.tolist(predicted[0])
  pred_ind = pred_list.index(max(pred_list))
  if pred_ind == 0:
    return 'advanced'
  elif pred_ind == 1:
    return 'beginner'
  elif pred_ind == 2:
    return 'intermediate'

In [None]:
data_path = "dataset_extended_topic.csv"
df = pd.read_csv(data_path)

class Vocabulary:
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def build_vocabulary(self, captions):
            for caption in captions:
                words = nltk.word_tokenize(caption.lower())
                for word in words:
                    self.add_word(WordNetLemmatizer().lemmatize(word))

vocab = Vocabulary()
vocab.build_vocabulary(list(df['question']))
VOCAB_SIZE = len(vocab.word2idx)

word_occurrence = {}
for idx in range(vocab.idx):
    word_occurrence[vocab.idx2word[idx]] = 0

i = 0
for caption in list(df['question']):
    words = nltk.word_tokenize(caption.lower())
    for word in words:
        new_word = WordNetLemmatizer().lemmatize(word)
        word_occurrence[new_word] = word_occurrence[new_word] + 1

word_occurrence_no_stopwords = {}
for k, v in word_occurrence.items():
    if any(c.isalpha() for c in k) and len(k) > 1 and k not in stopwords.words('english'):
      word_occurrence_no_stopwords[k] = v

sorted_tokens = sorted(word_occurrence_no_stopwords.items(), key=lambda x: x[1], reverse=True)
sorted_tokens = [('<PAD>', 0)] + sorted_tokens

word2idx = {}
idx2word = {}
idx = 0
with open("keywords.txt") as file:
    #for word, occurence in sorted_tokens:
    for line in file:
        line = line.rstrip()
        word = line.split(' ')[0]
        word2idx[word] = idx
        idx2word[idx] = word
        idx = idx + 1

key_vectors = []
vec_len = 10
for text in list(df['question']):
  vec_rep = []
  words = nltk.word_tokenize(text.lower())
  keywords = []
  for word in words:
      new_word = WordNetLemmatizer().lemmatize(word)
      if new_word in word2idx:
        keywords.append(new_word)
        vec_rep.append(word2idx[new_word])
  if len(vec_rep) >= vec_len:
    vec_rep = vec_rep[:vec_len]
  else:
    vec_rep = vec_rep + [0 for x in range(vec_len - len(vec_rep))]

  vec_rep = np.array(vec_rep)
  key_vectors.append(vec_rep)
key_vectors = np.array(key_vectors)

print(key_vectors)
print()

targets = [elm for elm in df.loc[:, 'topic']]
ratings = pd.Categorical(targets)
targets = to_categorical(ratings.codes)

X_train, X_test, y_train, y_test = train_test_split(key_vectors, targets, stratify=targets, test_size=0.15)

oversampler = RandomOverSampler()
xTrain_oversampled, yTrain_oversampled = oversampler.fit_resample(X_train, y_train)

[[193 194   0 ...   0   0   0]
 [ 48  66   7 ...   0   0   0]
 [ 72  37  10 ...   0   0   0]
 ...
 [ 23  42   6 ...   0   0   0]
 [ 27 192  68 ...   0   0   0]
 [ 42   6   0 ...   0   0   0]]



In [None]:
class Transformer3(Model):
    def __init__(self):
        super().__init__()
        self.embedding = Embedding(200, 100)
        self.multiheadattention = MultiHeadAttention(8, 3, value_dim=None, dropout=0.05)
        self.layernorm = LayerNormalization()
        self.add = Add()
        self.feedforward = Sequential([Dense(100, activation='relu'), Dense(100, activation='relu'), Dropout(0.1)])
        self.dense1 = Dense(64, activation='relu')
        self.dense2 = Dense(9, activation=softmax)

    def call(self, inputs):
        x1 = self.embedding(inputs)
        attn_output = self.multiheadattention(x1, x1)
        x1 = self.add([x1, attn_output])
        x1 = self.layernorm(x1)
        x1 = self.feedforward(x1)
        x1 = self.add([x1, attn_output])
        x1 = self.layernorm(x1)

        x2 = self.embedding(inputs)
        attn_output = self.multiheadattention(x2, x2)
        x2 = self.add([x2, attn_output])
        x2 = self.layernorm(x2)
        x2 = self.feedforward(x2)
        x2 = self.add([x2, attn_output])
        x2 = self.layernorm(x2)

        attn_output = self.multiheadattention(x2, x1)
        x = self.add([x2, attn_output])
        x = self.layernorm(x)
        x = self.feedforward(x)
        x = self.add([x, attn_output])
        x = self.layernorm(x)

        x = GlobalAveragePooling1D()(x)
        x = self.dense1(x)
        return self.dense2(x)

In [None]:
topic_model = Transformer3()
topic_model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

topic_model.fit(xTrain_oversampled, yTrain_oversampled, epochs=7, validation_split=0.15)

topic_model.load_weights("topic_pred_model_extended_transformer_2.weights.h5")

Epoch 1/7
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 133ms/step - accuracy: 0.3181 - loss: 1.9666 - val_accuracy: 0.9595 - val_loss: 0.9032
Epoch 2/7
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 66ms/step - accuracy: 0.8171 - loss: 0.8391 - val_accuracy: 0.9730 - val_loss: 0.1754
Epoch 3/7
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 65ms/step - accuracy: 0.9274 - loss: 0.2316 - val_accuracy: 1.0000 - val_loss: 0.0260
Epoch 4/7
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 58ms/step - accuracy: 0.9771 - loss: 0.0864 - val_accuracy: 1.0000 - val_loss: 0.0190
Epoch 5/7
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 99ms/step - accuracy: 0.9861 - loss: 0.0417 - val_accuracy: 1.0000 - val_loss: 0.0067
Epoch 6/7
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 57ms/step - accuracy: 0.9909 - loss: 0.0415 - val_accuracy: 1.0000 - val_loss: 0.0127
Epoch 7/7
[1m27/27[0m [32m━━━━━━━━━

In [None]:
def classify_topic(text):
  sequence = keyword_tokenize(text)
  predicted = topic_model.predict(np.array([sequence]))
  pred_list = np.ndarray.tolist(predicted[0])
  pred_ind = pred_list.index(max(pred_list))
  if pred_ind == 0:
    return 'classification'
  elif pred_ind == 1:
    return 'feature selection'
  elif pred_ind == 2:
    return 'neural networks'
  elif pred_ind == 3:
    return 'recommender systems'
  elif pred_ind == 4:
    return 'regularization'
  elif pred_ind == 5:
    return 'supervised learning'
  elif pred_ind == 6:
    return 'text classification'
  elif pred_ind == 7:
    return 'time series'
  elif pred_ind == 8:
    return 'unsupervised learning'

## 1. Difficulty extended dataset

In [None]:
questions = set(df["question"])
print(len(questions))

166


In [None]:
# advanced questions
filename = "advanced.txt"
advanced_questions = set()
with open(filename) as file:
    for line in file:
        line = line.rstrip()
        if classify(line) == "yes" and classify_difficulty(line) == "advanced" and line not in questions:
          advanced_questions.add(line)
print(len(advanced_questions))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37

In [None]:
# beginner questions
questions.update(advanced_questions)
filename = "beginner.txt"
beginner_questions = set()
with open(filename) as file:
    for line in file:
        line = line.rstrip()
        if classify(line) == "yes" and classify_difficulty(line) == "beginner" and line not in questions:
          beginner_questions.add(line)
print(len(beginner_questions))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62

In [None]:
# intermediate questions
questions.update(beginner_questions)
filename = "intermediate.txt"
intermediate_questions = set()
with open(filename) as file:
    for line in file:
        line = line.rstrip()
        if classify(line) == "yes" and classify_difficulty(line) == "intermediate" and line not in questions:
          intermediate_questions.add(line)
print(len(intermediate_questions))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36

In [None]:
questions = list(df['question'])
difficulty = list(df['difficulty'])

questions = questions + list(advanced_questions)
difficulty = difficulty + ["advanced" for x in advanced_questions]
questions = questions + list(beginner_questions)
difficulty = difficulty + ["beginner" for x in beginner_questions]
questions = questions + list(intermediate_questions)
difficulty = difficulty + ["intermediate" for x in intermediate_questions]

difficulty_df = pd.DataFrame()
difficulty_df['question'] = questions
difficulty_df['difficulty'] = difficulty
difficulty_df

Unnamed: 0,question,difficulty
0,What is supervised machine learning?,beginner
1,What is regression? Which models can you use t...,beginner
2,What is linear regression? When do we use it?,beginner
3,What are the main assumptions of linear regres...,intermediate
4,What’s the normal distribution? Why do we care...,beginner
...,...,...
375,If you have a series with only one variable “y...,intermediate
376,What are the main parameters in the gradient d...,intermediate
377,What if we want to build a neural network for ...,intermediate
378,Which regularization techniques for neural net...,intermediate


In [None]:
difficulty_df['difficulty'].value_counts()

Unnamed: 0_level_0,count
difficulty,Unnamed: 1_level_1
intermediate,140
beginner,120
advanced,120


In [None]:
difficulty_df = difficulty_df.sample(frac=1).reset_index(drop=True)
difficulty_df

Unnamed: 0,question,difficulty
0,Which feature selection techniques do you know?,intermediate
1,What is a time series?,beginner
2,Which hyper-parameter tuning strategies (in ge...,intermediate
3,How can we use CNN model for text classification?,advanced
4,How to predict the best time series?,beginner
...,...,...
375,What are the different types of machine learni...,beginner
376,Is accuracy always a good metric?,beginner
377,"If you have a sentence with multiple words, yo...",intermediate
378,Can you tell us more about how you would use X...,advanced


In [None]:
difficulty_df.to_csv('dataset_extended_difficulty.csv', sep='\t', encoding='utf-8', index=False, header=True)

In [None]:
data_path = "dataset_extended_difficulty.csv"
difficulty_df = pd.read_csv(data_path)
print(len(difficulty_df["question"]))

380


## 2. Topic extended dataset

In [None]:
questions = set(df["question"])
print(len(questions))

166


In [None]:
df['topic'].value_counts()

Unnamed: 0_level_0,count
topic,Unnamed: 1_level_1
neural networks,34
feature selection,30
classification,21
unsupervised learning,21
supervised learning,20
text classification,14
regularization,13
recommender systems,7
time series,7


In [None]:
# classification questions
filename = "classification.txt"
classification_questions = set()
supervised_learning_questions = set()
recommender_systems_questions = set()
unsupervised_learning_questions = set()
with open(filename) as file:
    for line in file:
        line = line.rstrip()
        if classify(line) == "yes" and classify_topic(line) == "classification" and line not in questions:
          classification_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "supervised learning" and line not in questions:
          supervised_learning_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "recommender systems" and line not in questions:
          recommender_systems_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "unsupervised learning" and line not in questions:
          unsupervised_learning_questions.add(line)
print(len(classification_questions))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53

In [None]:
# feature selection questions
questions.update(classification_questions)
filename = "feature selection.txt"
feature_selection_questions = set()
with open(filename) as file:
    for line in file:
        line = line.rstrip()
        if classify(line) == "yes" and classify_topic(line) == "feature selection" and line not in questions:
          feature_selection_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "supervised learning" and line not in questions:
          supervised_learning_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "classification" and line not in questions:
          classification_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "recommender systems" and line not in questions:
          recommender_systems_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "unsupervised learning" and line not in questions:
          unsupervised_learning_questions.add(line)
print(len(feature_selection_questions))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50

In [None]:
# neural networks questions
questions.update(feature_selection_questions)
filename = "neural networks.txt"
neural_networks_questions = set()
with open(filename) as file:
    for line in file:
        line = line.rstrip()
        if classify(line) == "yes" and classify_topic(line) == "neural networks" and line not in questions:
          neural_networks_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "supervised learning" and line not in questions:
          supervised_learning_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "classification" and line not in questions:
          classification_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "recommender systems" and line not in questions:
          recommender_systems_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "unsupervised learning" and line not in questions:
          unsupervised_learning_questions.add(line)
print(len(neural_networks_questions))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43

In [None]:
# recommender systems questions
questions.update(neural_networks_questions)
filename = "recommender systems.txt"
with open(filename) as file:
    for line in file:
        line = line.rstrip()
        if classify(line) == "yes" and classify_topic(line) == "recommender systems" and line not in questions:
          recommender_systems_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "supervised learning" and line not in questions:
          supervised_learning_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "classification" and line not in questions:
          classification_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "unsupervised learning" and line not in questions:
          unsupervised_learning_questions.add(line)
print(len(recommender_systems_questions))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56

In [None]:
# regularization questions
questions.update(recommender_systems_questions)
filename = "regularization.txt"
regularization_questions = set()
with open(filename) as file:
    for line in file:
        line = line.rstrip()
        if classify(line) == "yes" and classify_topic(line) == "regularization" and line not in questions:
          regularization_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "supervised learning" and line not in questions:
          supervised_learning_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "classification" and line not in questions:
          classification_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "recommender systems" and line not in questions:
          recommender_systems_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "unsupervised learning" and line not in questions:
          unsupervised_learning_questions.add(line)
print(len(regularization_questions))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48

In [None]:
# text classification questions
questions.update(regularization_questions)
filename = "text classification.txt"
text_classification_questions = set()
with open(filename) as file:
    for line in file:
        line = line.rstrip()
        if classify(line) == "yes" and classify_topic(line) == "text classification" and line not in questions:
          text_classification_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "supervised learning" and line not in questions:
          supervised_learning_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "classification" and line not in questions:
          classification_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "recommender systems" and line not in questions:
          recommender_systems_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "unsupervised learning" and line not in questions:
          unsupervised_learning_questions.add(line)
print(len(text_classification_questions))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39

In [None]:
# time series questions
questions.update(text_classification_questions)
time_series_questions = set()
filename = "time series.txt"
with open(filename) as file:
    for line in file:
        line = line.rstrip()
        if classify(line) == "yes" and classify_topic(line) == "time series" and line not in questions:
          time_series_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "supervised learning" and line not in questions:
          supervised_learning_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "classification" and line not in questions:
          classification_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "recommender systems" and line not in questions:
          recommender_systems_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "unsupervised learning" and line not in questions:
          unsupervised_learning_questions.add(line)
print(len(time_series_questions))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55

In [None]:
# unsupervised learning questions
questions.update(time_series_questions)
filename = "unsupervised learning.txt"
with open(filename) as file:
    for line in file:
        line = line.rstrip()
        if classify(line) == "yes" and classify_topic(line) == "unsupervised learning" and line not in questions:
          unsupervised_learning_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "supervised learning" and line not in questions:
          supervised_learning_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "classification" and line not in questions:
          classification_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "recommender systems" and line not in questions:
          recommender_systems_questions.add(line)
print(len(unsupervised_learning_questions))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51

In [None]:
# supervised learning questions
questions.update(time_series_questions)
filename = "supervised learning.txt"
with open(filename) as file:
    for line in file:
        line = line.rstrip()
        if classify(line) == "yes" and classify_topic(line) == "supervised learning" and line not in questions:
          supervised_learning_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "classification" and line not in questions:
          classification_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "recommender systems" and line not in questions:
          recommender_systems_questions.add(line)
        elif classify(line) == "yes" and classify_topic(line) == "unsupervised learning" and line not in questions:
          unsupervised_learning_questions.add(line)
print(len(supervised_learning_questions))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51

In [None]:
# supervised learning questions
recommender_systems_questions = set()
filename = "unsupervised learning.txt"
with open(filename) as file:
    for line in file:
        line = line.rstrip()
        if classify(line) == "yes" and classify_topic(line) == "unsupervised learning" and line not in questions:
          recommender_systems_questions.add(line)
print(len(recommender_systems_questions))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38

In [None]:
questions = list(df['question'])
topic = list(df['topic'])

questions = questions + list(classification_questions)
topic = topic + ["classification" for x in classification_questions]
questions = questions + list(feature_selection_questions)
topic = topic + ["feature selection" for x in feature_selection_questions]
questions = questions + list(neural_networks_questions)
topic = topic + ["neural networks" for x in neural_networks_questions]

questions = questions + list(recommender_systems_questions)
topic = topic + ["recommender systems" for x in recommender_systems_questions]
questions = questions + list(regularization_questions)
topic = topic + ["regularization" for x in regularization_questions]
questions = questions + list(text_classification_questions)
topic = topic + ["text classification" for x in text_classification_questions]

questions = questions + list(time_series_questions)
topic = topic + ["time series" for x in time_series_questions]
questions = questions + list(unsupervised_learning_questions)
topic = topic + ["unsupervised learning" for x in unsupervised_learning_questions]
questions = questions + list(supervised_learning_questions)
topic = topic + ["supervised learning" for x in supervised_learning_questions]

topic_df = pd.DataFrame()
topic_df['question'] = questions
topic_df['topic'] = topic
topic_df

Unnamed: 0,question,topic
0,What is supervised machine learning?,supervised learning
1,What is regression? Which models can you use t...,supervised learning
2,What is linear regression? When do we use it?,supervised learning
3,What are the main assumptions of linear regres...,supervised learning
4,What’s the normal distribution? Why do we care...,supervised learning
...,...,...
811,What do we need to know about bias-variance tr...,supervised learning
812,How does the linear model generalize to new data?,supervised learning
813,What do you know about linear regression?,supervised learning
814,What is the difference between “normal” and “s...,supervised learning


In [None]:
topic_df = pd.DataFrame()
topic_df['question'] = questions
topic_df['topic'] = topic
topic_df

Unnamed: 0,question,topic
0,What is supervised machine learning?,supervised learning
1,What is regression? Which models can you use t...,supervised learning
2,What is linear regression? When do we use it?,supervised learning
3,What are the main assumptions of linear regres...,supervised learning
4,What’s the normal distribution? Why do we care...,supervised learning
...,...,...
813,What do you know about linear regression?,supervised learning
814,What is the difference between “normal” and “s...,supervised learning
815,What is the purpose of K-fold cross-validation?,supervised learning
816,What are N-grams? When do we use N-grams?,text classification


In [None]:
topic_df['topic'].value_counts()

Unnamed: 0_level_0,count
topic,Unnamed: 1_level_1
unsupervised learning,128
supervised learning,118
regularization,95
feature selection,95
recommender systems,86
classification,83
neural networks,71
text classification,71
time series,71


In [None]:
topic_df = topic_df.sample(frac=1).reset_index(drop=True)
topic_df

Unnamed: 0,question,topic
0,What is collaborative filtering?,recommender systems
1,What kind of CNN architectures for classificat...,neural networks
2,How can we deal with the imbalanced class dist...,supervised learning
3,What is time series and how it is different fr...,time series
4,What’s a convolutional layer?,neural networks
...,...,...
813,What do you mean by transfer learning?,unsupervised learning
814,What is Adam? What’s the main difference betwe...,neural networks
815,What is the difference between Logistic Regres...,classification
816,What are the common approaches to building an ...,recommender systems


In [None]:
topic_df.to_csv('dataset_extended_topic.csv', sep='\t', encoding='utf-8', index=False, header=True)

In [None]:
data_path = "dataset_extended_topic.csv"
topic_df2 = pd.read_csv(data_path)
print(len(topic_df2["question"]))

818


# Merging the datasets

## Original dataset

In [35]:
data_path = "dataset_6.csv"
df_original = pd.read_csv(data_path)
df_original

Unnamed: 0,question,answer,difficulty,topic
0,What is supervised machine learning?,Supervised learning is a type of machine learn...,beginner,supervised learning
1,What is regression? Which models can you use t...,Regression is a part of supervised ML. Regress...,beginner,supervised learning
2,What is linear regression? When do we use it?,Linear regression is a model that assumes a li...,beginner,supervised learning
3,What are the main assumptions of linear regres...,There are several assumptions of linear regres...,intermediate,supervised learning
4,What’s the normal distribution? Why do we care...,The normal distribution is a continuous probab...,beginner,supervised learning
...,...,...,...,...
162,Which models do you know for solving time seri...,* Simple Exponential Smoothing: approximate th...,intermediate,time series
163,"If there’s a trend in our series, how we can r...",We can explicitly model the trend (and/or seas...,intermediate,time series
164,You have a series with only one variable “y” m...,We want to look at the correlation between dif...,intermediate,time series
165,You have a series with a variable “y” and a se...,Given the assumption that the set of features ...,intermediate,time series


### Non-data science questions

In [36]:
data_path = "question_dataset.csv"
df_non = pd.read_csv(data_path)
df_non

Unnamed: 0,question
0,"What is the difference between a ""pocket"" and ..."
1,I was wondering if anyone knows of any way tha...
2,How do you like to spend your time when you’re...
3,What would you like to see on this site?
4,I am interested in finding out if there is a w...
...,...
162,What is the name of this type of tree?
163,"2+2=3\nThe answer to this question is simple, ..."
164,What do you think is the most important thing ...
165,Do you know of anyone who has had a positive p...


In [37]:
df_non['answer'] = [pd.NA for e in df_non['question']]
df_non['difficulty'] = [pd.NA for e in df_non['question']]
df_non['topic'] = [pd.NA for e in df_non['question']]
df_non['data_science_question'] = [0 for e in df_non['question']]
df_original['data_science_question'] = [1 for e in df_original['question']]
df_non

Unnamed: 0,question,answer,difficulty,topic,data_science_question
0,"What is the difference between a ""pocket"" and ...",,,,0
1,I was wondering if anyone knows of any way tha...,,,,0
2,How do you like to spend your time when you’re...,,,,0
3,What would you like to see on this site?,,,,0
4,I am interested in finding out if there is a w...,,,,0
...,...,...,...,...,...
162,What is the name of this type of tree?,,,,0
163,"2+2=3\nThe answer to this question is simple, ...",,,,0
164,What do you think is the most important thing ...,,,,0
165,Do you know of anyone who has had a positive p...,,,,0


In [39]:
df_non.index = pd.RangeIndex(start=167, stop=(2*167), step=1)

### Merged original and non-data science questions

In [40]:
df_extended = pd.concat([df_original, df_non])
df_extended

Unnamed: 0,question,answer,difficulty,topic,data_science_question
0,What is supervised machine learning?,Supervised learning is a type of machine learn...,beginner,supervised learning,1
1,What is regression? Which models can you use t...,Regression is a part of supervised ML. Regress...,beginner,supervised learning,1
2,What is linear regression? When do we use it?,Linear regression is a model that assumes a li...,beginner,supervised learning,1
3,What are the main assumptions of linear regres...,There are several assumptions of linear regres...,intermediate,supervised learning,1
4,What’s the normal distribution? Why do we care...,The normal distribution is a continuous probab...,beginner,supervised learning,1
...,...,...,...,...,...
329,What is the name of this type of tree?,,,,0
330,"2+2=3\nThe answer to this question is simple, ...",,,,0
331,What do you think is the most important thing ...,,,,0
332,Do you know of anyone who has had a positive p...,,,,0


## Difficulty extended dataset

In [41]:
data_path = "dataset_extended_difficulty.csv"
df_diff = pd.read_csv(data_path)
df_diff

Unnamed: 0,question,difficulty
0,Which feature selection techniques do you know?,intermediate
1,What is a time series?,beginner
2,Which hyper-parameter tuning strategies (in ge...,intermediate
3,How can we use CNN model for text classification?,advanced
4,How to predict the best time series?,beginner
...,...,...
375,What are the different types of machine learni...,beginner
376,Is accuracy always a good metric?,beginner
377,"If you have a sentence with multiple words, yo...",intermediate
378,Can you tell us more about how you would use X...,advanced


In [42]:
df_diff['answer'] = ['Answer here' for e in df_diff['question']]
df_diff['data_science_question'] = [1 for e in df_diff['question']]
df_diff

Unnamed: 0,question,difficulty,answer,data_science_question
0,Which feature selection techniques do you know?,intermediate,Answer here,1
1,What is a time series?,beginner,Answer here,1
2,Which hyper-parameter tuning strategies (in ge...,intermediate,Answer here,1
3,How can we use CNN model for text classification?,advanced,Answer here,1
4,How to predict the best time series?,beginner,Answer here,1
...,...,...,...,...
375,What are the different types of machine learni...,beginner,Answer here,1
376,Is accuracy always a good metric?,beginner,Answer here,1
377,"If you have a sentence with multiple words, yo...",intermediate,Answer here,1
378,Can you tell us more about how you would use X...,advanced,Answer here,1


### Adding the topic labels

In [None]:
topics = []
for elm in df_diff['question']:
  topics.append(classify_topic(elm))
df_diff['topic'] = topics
df_diff

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 233ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0

Unnamed: 0,question,difficulty,answer,data_science_question,topic
0,Which feature selection techniques do you know?,intermediate,Answer here,1,feature selection
1,What is a time series?,beginner,Answer here,1,time series
2,Which hyper-parameter tuning strategies (in ge...,intermediate,Answer here,1,feature selection
3,How can we use CNN model for text classification?,advanced,Answer here,1,text classification
4,How to predict the best time series?,beginner,Answer here,1,time series
...,...,...,...,...,...
375,What are the different types of machine learni...,beginner,Answer here,1,supervised learning
376,Is accuracy always a good metric?,beginner,Answer here,1,classification
377,"If you have a sentence with multiple words, yo...",intermediate,Answer here,1,text classification
378,Can you tell us more about how you would use X...,advanced,Answer here,1,recommender systems


In [46]:
to_drop = []
for i in range(len(df_diff['question'])):
  if df_diff['question'][i] in list(df_original['question']):
    to_drop.append(i)
len(to_drop)

167

In [47]:
df_diff = df_diff.drop(to_drop)

### The count of the elements in each topic class

In [None]:
df_diff['topic'].value_counts()

Unnamed: 0_level_0,count
topic,Unnamed: 1_level_1
text classification,68
supervised learning,38
feature selection,34
unsupervised learning,26
classification,19
neural networks,15
time series,6
recommender systems,4
regularization,3


In [48]:
df_diff.index = pd.RangeIndex(start=len(df_extended), stop=len(df_extended)+len(df_diff), step=1)

### Merged original dataset + difficulty extended dataset + non-data science questions

In [None]:
df_extended = pd.concat([df_extended, df_diff])
df_extended

Unnamed: 0,question,answer,difficulty,topic,data_science_question
0,What is supervised machine learning?,Supervised learning is a type of machine learn...,beginner,supervised learning,1
1,What is regression? Which models can you use t...,Regression is a part of supervised ML. Regress...,beginner,supervised learning,1
2,What is linear regression? When do we use it?,Linear regression is a model that assumes a li...,beginner,supervised learning,1
3,What are the main assumptions of linear regres...,There are several assumptions of linear regres...,intermediate,supervised learning,1
4,What’s the normal distribution? Why do we care...,The normal distribution is a continuous probab...,beginner,supervised learning,1
...,...,...,...,...,...
542,When do we need to use random forests?,Answer here,beginner,feature selection,1
543,What is the best way to get good feature selec...,Answer here,beginner,feature selection,1
544,How do you evaluate regression models?,Answer here,beginner,supervised learning,1
545,What are the different types of machine learni...,Answer here,beginner,supervised learning,1


## Topic extended dataset

In [50]:
data_path = "dataset_extended_topic.csv"
df_topic = pd.read_csv(data_path)
df_topic

Unnamed: 0,question,topic
0,What is collaborative filtering?,recommender systems
1,What kind of CNN architectures for classificat...,neural networks
2,How can we deal with the imbalanced class dist...,supervised learning
3,What is time series and how it is different fr...,time series
4,What’s a convolutional layer?,neural networks
...,...,...
813,What do you mean by transfer learning?,unsupervised learning
814,What is Adam? What’s the main difference betwe...,neural networks
815,What is the difference between Logistic Regres...,classification
816,What are the common approaches to building an ...,recommender systems


In [51]:
to_drop = []
for i in range(len(df_topic['question'])):
  if df_topic['question'][i] in list(df_extended['question']):
    to_drop.append(i)
len(to_drop)

187

In [52]:
df_topic = df_topic.drop(to_drop)

In [53]:
df_topic.index = pd.RangeIndex(start=len(df_extended), stop=len(df_extended)+len(df_topic), step=1)

In [54]:
df_topic['answer'] = ['Answer here' for e in df_topic['question']]
df_topic['data_science_question'] = [1 for e in df_topic['question']]
df_topic

Unnamed: 0,question,topic,answer,data_science_question
547,How can we deal with the imbalanced class dist...,supervised learning,Answer here,1
548,What is time series and how it is different fr...,time series,Answer here,1
549,What is neural nets?,neural networks,Answer here,1
550,What do we need to know about bias-variance tr...,supervised learning,Answer here,1
551,What is the K-fold cross-validation technique?,supervised learning,Answer here,1
...,...,...,...,...
1173,What do we mean by “Deep Learning”?,unsupervised learning,Answer here,1
1174,1.What is data science and how does it differ ...,supervised learning,Answer here,1
1175,What do you mean by transfer learning?,unsupervised learning,Answer here,1
1176,What is the difference between Logistic Regres...,classification,Answer here,1


### Adding the difficulty labels

In [None]:
difficulties = []
for elm in df_topic['question']:
  difficulties.append(classify_difficulty(elm))
df_topic['difficulty'] = difficulties
df_topic

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 530ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m

Unnamed: 0,question,topic,answer,data_science_question,difficulty
547,How can we deal with the imbalanced class dist...,supervised learning,Answer here,1,beginner
548,What is time series and how it is different fr...,time series,Answer here,1,beginner
549,What is neural nets?,neural networks,Answer here,1,intermediate
550,What do we need to know about bias-variance tr...,supervised learning,Answer here,1,beginner
551,What is the K-fold cross-validation technique?,supervised learning,Answer here,1,beginner
...,...,...,...,...,...
1173,What do we mean by “Deep Learning”?,unsupervised learning,Answer here,1,intermediate
1174,1.What is data science and how does it differ ...,supervised learning,Answer here,1,beginner
1175,What do you mean by transfer learning?,unsupervised learning,Answer here,1,intermediate
1176,What is the difference between Logistic Regres...,classification,Answer here,1,beginner


### The count of the elements in each difficulty class

In [None]:
df_topic['difficulty'].value_counts()

Unnamed: 0_level_0,count
difficulty,Unnamed: 1_level_1
intermediate,337
beginner,238
advanced,56


### Merged original, difficulty extended and topic extended dataset + non-data science questions

In [None]:
df_extended = pd.concat([df_extended, df_topic])
df_extended

Unnamed: 0,question,answer,difficulty,topic,data_science_question
0,What is supervised machine learning?,Supervised learning is a type of machine learn...,beginner,supervised learning,1
1,What is regression? Which models can you use t...,Regression is a part of supervised ML. Regress...,beginner,supervised learning,1
2,What is linear regression? When do we use it?,Linear regression is a model that assumes a li...,beginner,supervised learning,1
3,What are the main assumptions of linear regres...,There are several assumptions of linear regres...,intermediate,supervised learning,1
4,What’s the normal distribution? Why do we care...,The normal distribution is a continuous probab...,beginner,supervised learning,1
...,...,...,...,...,...
1173,What do we mean by “Deep Learning”?,Answer here,intermediate,unsupervised learning,1
1174,1.What is data science and how does it differ ...,Answer here,beginner,supervised learning,1
1175,What do you mean by transfer learning?,Answer here,intermediate,unsupervised learning,1
1176,What is the difference between Logistic Regres...,Answer here,beginner,classification,1


## Non-data science questions dataset

In [57]:
data_path = "dataset_non_data_science.csv"
df_non = pd.read_csv(data_path)
df_non

Unnamed: 0,question
0,Do you have any advice for someone who is inte...
1,I have a client who is in the process of selli...
2,how do I know what to eat?
3,"What is the difference between an ""open"" and a..."
4,What is the most important thing for you as a ...
...,...
690,Do you have a favorite place to go on vacation?
691,What are the odds of you being a member of a c...
692,Is there a way to change the width of a column...
693,How do you keep track of all the things you ne...


In [58]:
df_non['answer'] = [pd.NA for e in df_non['question']]
df_non['difficulty'] = [pd.NA for e in df_non['question']]
df_non['topic'] = [pd.NA for e in df_non['question']]
df_non['data_science_question'] = [0 for e in df_non['question']]
df_non

Unnamed: 0,question,answer,difficulty,topic,data_science_question
0,Do you have any advice for someone who is inte...,,,,0
1,I have a client who is in the process of selli...,,,,0
2,how do I know what to eat?,,,,0
3,"What is the difference between an ""open"" and a...",,,,0
4,What is the most important thing for you as a ...,,,,0
...,...,...,...,...,...
690,Do you have a favorite place to go on vacation?,,,,0
691,What are the odds of you being a member of a c...,,,,0
692,Is there a way to change the width of a column...,,,,0
693,How do you keep track of all the things you ne...,,,,0


In [66]:
to_drop = []
for i in range(len(df_non['question'])):
  if df_non['question'][i] in list(df_extended['question']):
    to_drop.append(i)
len(to_drop)

167


In [60]:
df_non = df_non.drop(to_drop)

In [61]:
df_non.index = pd.RangeIndex(start=len(df_extended), stop=len(df_extended)+len(df_non), step=1)

## Final merged dataset

Created by merging these dataset:

*   original dataset
*   difficulty extended dataset
*   topic extended dataset
*   non-data science questions dataset

In [None]:
df_extended = pd.concat([df_extended, df_non])
df_extended

Unnamed: 0,question,answer,difficulty,topic,data_science_question
0,What is supervised machine learning?,Supervised learning is a type of machine learn...,beginner,supervised learning,1
1,What is regression? Which models can you use t...,Regression is a part of supervised ML. Regress...,beginner,supervised learning,1
2,What is linear regression? When do we use it?,Linear regression is a model that assumes a li...,beginner,supervised learning,1
3,What are the main assumptions of linear regres...,There are several assumptions of linear regres...,intermediate,supervised learning,1
4,What’s the normal distribution? Why do we care...,The normal distribution is a continuous probab...,beginner,supervised learning,1
...,...,...,...,...,...
1701,If you don’t have a computer at home how do yo...,,,,0
1702,How many of you have been in a situation where...,,,,0
1703,How do you get your hands on a copy of the book?,,,,0
1704,What is your favorite thing to do when you are...,,,,0


In [None]:
df_extended = df_extended.sample(frac=1).reset_index(drop=True)
df_extended

Unnamed: 0,question,answer,difficulty,topic,data_science_question
0,How do we decide when to stop training a neura...,Simply stop training when the validation error...,beginner,neural networks,1
1,Do you have any advice for someone who is inte...,,,,0
2,What are text classification models?,Answer here,advanced,text classification,1
3,I have a client who is in the process of selli...,,,,0
4,"If you have a sentence with multiple words, yo...",Approaches ranked from simple to more complex:...,intermediate,text classification,1
...,...,...,...,...,...
1701,How can we use the pre-trained word embeddings...,Answer here,advanced,text classification,1
1702,How do you evaluate how well your models perform?,Answer here,intermediate,regularization,1
1703,How many people in the world speak Spanish as ...,,,,0
1704,What is the difference between time series and...,Answer here,intermediate,time series,1


In [None]:
df_extended.to_csv('dataset_extended_merged.csv', sep='\t', encoding='utf-8', index=False, header=True)

### Data science question class counts

In [None]:
df_extended['data_science_question'].value_counts()

Unnamed: 0_level_0,count
data_science_question,Unnamed: 1_level_1
1,1011
0,695


### Difficulty class counts

In [None]:
df_extended['difficulty'].value_counts()

Unnamed: 0_level_0,count
difficulty,Unnamed: 1_level_1
intermediate,477
beginner,358
advanced,176


### Topic class counts

In [None]:
df_extended['topic'].value_counts()

Unnamed: 0_level_0,count
topic,Unnamed: 1_level_1
unsupervised learning,151
supervised learning,147
text classification,138
feature selection,128
regularization,98
classification,97
recommender systems,90
neural networks,85
time series,77


## Merged dataset (data science only questions)

In [None]:
df_data_science = df_extended[df_extended['data_science_question'] == 1]
df_data_science

Unnamed: 0,question,answer,difficulty,topic,data_science_question
0,How do we decide when to stop training a neura...,Simply stop training when the validation error...,beginner,neural networks,1
2,What are text classification models?,Answer here,advanced,text classification,1
4,"If you have a sentence with multiple words, yo...",Approaches ranked from simple to more complex:...,intermediate,text classification,1
6,How does time series differ from the usual reg...,Answer here,beginner,time series,1
7,What is the importance of machine learning in ...,Answer here,intermediate,unsupervised learning,1
...,...,...,...,...,...
1700,How we can use neural nets for computer vision?,Neural nets used in the area of computer visio...,intermediate,neural networks,1
1701,How can we use the pre-trained word embeddings...,Answer here,advanced,text classification,1
1702,How do you evaluate how well your models perform?,Answer here,intermediate,regularization,1
1704,What is the difference between time series and...,Answer here,intermediate,time series,1


In [None]:
df_data_science = df_data_science.reset_index(drop=True)

In [None]:
df_data_science = df_data_science.drop(columns=['data_science_question'])

In [None]:
df_data_science.to_csv('dataset_extended_merged_data_science_only.csv', sep='\t', encoding='utf-8', index=False, header=True)