In [54]:
import sys
import gc
import re
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
from tqdm import tqdm

from tensorflow.keras.preprocessing.text import Tokenizer

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

# Data preprocessing

In [55]:
questions = pd.read_csv('./data/Questions.csv', encoding="ISO-8859-1")
# answers = pd.read_csv('./data/Answers.csv', encoding="ISO-8859-1")
tags = pd.read_csv('./data/Tags.csv', encoding="ISO-8859-1")
gc.collect() # for reruns of the notebook

38200

In [56]:
print(f'[DF size] questions shape: {questions.shape}')
print(f'[DF size] tags shape: {tags.shape}')

questions.head()
# answers.head()
tags.head()

[DF size] questions shape: (1264216, 7)
[DF size] tags shape: (3750994, 2)


Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn


## There is a need to discard unneeded data
`answers` is not relevant to our target problem. It may aid tagging accuracy & F-score, but it will not be available in real life  

In [57]:
# Memory footprint
print('memory footprint of question. Two ways to compute')
sys.getsizeof(questions)
questions.memory_usage(deep=True).values.sum()

# print('memory footprint of answers')
# answers.memory_usage(deep=True).values.sum()

print('memory footprint of tags')
tags.memory_usage(deep=True).values.sum()

# del answers

memory footprint of question. Two ways to compute


2164588917

2164588901

memory footprint of tags


272589961

In [58]:
## This too decreases the memory footprint
questions = questions.set_index('Id').sort_index()
tags = tags.set_index('Id').sort_index()

In [59]:
df = questions.merge(tags, on='Id', how='left')
questions.shape
tags.shape
df.shape
df.memory_usage(deep=True).values.sum()
df.head(5)
del df # will remake later

(1264216, 6)

(3750994, 1)

(3750994, 7)

7015361526

Unnamed: 0_level_0,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body,Tag
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,flex
80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,actionscript-3
80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,air
90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,svn
90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,tortoisesvn


It is too soon to merge, but it's memory impact is interesting

Memory footprint of semi-preprocessed data is 7015361526, which is larger by 4578182664 because the two conponent are: 2164588901 & 272589961

In [60]:
print(f'Number of questions {questions.shape[0]}') 
        # base  1264216
min_score = 0 # 1185767
min_score = 1 # 594057
# min_score = 3 # 188015
print(f'Number of questions with score >= {min_score} is {questions[questions["Score"]>=min_score].shape[0]}')

Number of questions 1264216
Number of questions with score >= 1 is 594057


## Discarding low scored data
Having a minimum score of 1 would eliminate half of data, relative to the total.

I assume a score of 0 means the post was either ignored by the community and/or is _controversial_

In [61]:
print(f'[DF size] questions shape before removing low scoring {min_score} data')
questions = questions[questions["Score"]>=min_score]
print(f'[DF size] questions shape after removing low scoring {min_score} data')
questions.head(5)

# Sync IDs; discard tags with no questions
questions_idx = questions.index.values
tags = tags.loc[questions_idx]

[DF size] questions shape before removing low scoring 1 data
[DF size] questions shape after removing low scoring 1 data


Unnamed: 0_level_0,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


### Plot Distribution of tags

In [62]:
tags.describe()

# # basic statistics
# # tags.Tag.value_counts()
# 100 * tags.Tag.value_counts()[0] / np.sum(tags.Tag.value_counts())
# import seaborn as sns
# _=plt.figure(figsize=(16,6));
# _=plt.title("Questions Distribution According To Tags");
# _=sns.barplot(x=tags.Tag.value_counts().index.values[:101], y=tags.Tag.value_counts().values[:101]);
# _=plt.xticks(rotation=90);
# _=plt.ylabel("Number of questions")
# plt.savefig('Questions Distribution According To Tags.png')

Unnamed: 0,Tag
count,1802266
unique,32057
top,java
freq,53895


## Remove the tag with less than N training examples - as the dataset is spare in this sense

In [63]:
min_numb = 2000 # Number of unique tags after removal is 101, if min_score = 1.
# min_numb = 624 # Number of unique tags after removal is 100+1, if min_score = 3.

# tags.Tag.value_counts()
print(f'Number of unique tags before removal {tags.Tag.unique().shape[0]}')

counts = tags.Tag.value_counts()
idx = (counts>=min_numb)
if min_numb==624: idx['exception'] = True # found manaul via set difference 
idx = idx[np.where(idx)[0]].index

uniques = tags[tags.Tag.isin(idx)].Tag.unique() # or tags.where(tags.Tag.isin(idx)).dropna()['Tag'].unique()
print(f'Number of unique tags after removal {uniques.shape[0]}')
print(f'Amount of categories (tags) remaining {uniques.shape[0]/counts.shape[0]:.6f}%')
print(f'Amount of data remaining {tags[tags.Tag.isin(idx)].shape[0]/tags.shape[0]:.6f}%')
tags = tags[tags.Tag.isin(idx)]
counts = tags.Tag.value_counts()

print(f'\nMetrics tags counts')
print(f'Max {counts.values.max():.2f}')
print(f'Min {counts.values.min():.2f}')
print(f'Mean {counts.values.mean():.2f}')
print(f'Stddev {counts.values.std():.2f}')
print(f'The top 50 tags {counts[:50].index.values}')

Number of unique tags before removal 32058
Number of unique tags after removal 101
Amount of categories (tags) remaining 0.003151%
Amount of data remaining 0.449453%

Metrics tags counts
Max 53895.00
Min 2033.00
Mean 8022.67
Stddev 10899.97
The top 50 tags ['java' 'javascript' 'c#' 'android' 'php' 'python' 'jquery' 'c++' 'html'
 'ios' 'css' 'mysql' 'sql' '.net' 'asp.net' 'objective-c' 'ruby-on-rails'
 'c' 'iphone' 'ruby' 'r' 'angularjs' 'sql-server' 'arrays' 'regex' 'json'
 'asp.net-mvc' 'node.js' 'linux' 'xml' 'django' 'wpf' 'ajax' 'swift'
 'windows' 'xcode' 'database' 'string' 'multithreading' 'git' 'eclipse'
 'spring' 'html5' 'algorithm' 'performance' 'osx' 'bash' 'excel' 'vb.net'
 'scala']


## Preprocess the questions
### Remove excess questions (which have no considered tag), and unneeded `question` columns

In [64]:
# remove excess questions and columns
print(f'Number of unique tags indices: {tags.index.nunique()}') # 1056968
tags_idx = np.unique(tags.index.values)

# check if the tags_idx are in the questions_idx
assert(set(tags_idx) - set(questions.index.values) == set())

questions.head(1)

print(f'Number of questions before: {questions.shape}')
questions = questions.loc[tags_idx]
questions.drop(columns=['OwnerUserId', 'CreationDate', 'ClosedDate', 'Score'], inplace=True)
print(f'Number of questions after: {questions.shape}')

Number of unique tags indices: 498996


Unnamed: 0_level_0,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...


Number of questions before: (594057, 6)
Number of questions after: (498996, 2)


In [65]:
## Run the below a few times to see the `Title` column. I did not see anything of concern.
# questions.Title.sample(20)

def preprocess_text(sentence):
     # HTML decoding
    sentence = BeautifulSoup(sentence, "lxml").get_text() #.text
    
    # Remove punctuations and numbers
    # sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    # sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    # Trailing whitespace
    sentence = re.sub(r'[ \t]+(\r?$)', ' ', sentence)
    sentence = re.sub(r'[ \t]+(\r?$)', ' ', sentence)
    return sentence

# Test
tmp = questions.Body.sample(1).values[0]

print(f'\nraw text is \n{tmp}')
print(f'\npreprocess text is \n{preprocess_text(tmp)}')


raw text is 
<p>highcharts gets my hour time wrong</p>

<p>I'm from venezuela just in case. I doing a real time system where I get in my db the time,seconds and miliseconds like
10:39:09:2</p>

<p>I apply the strtotime($time) then I sended by json to charted </p>

<p>and in my highcharts i got in the </p>

<pre><code>xAxis: {
  type: 'datetime',
  title: {
    text: 'Tiempo'
  }
</code></pre>

<p>the utc function is already false</p>

<pre><code>Highcharts.setOptions({
  global: {
    useUTC: false
  }
});
</code></pre>

<p>and my function to get the json is </p>

<pre><code>function requestData_Frecuencia() {
  $.ajax({
    url: 'Datos_Frecuencia.php',
    success: function (point) {
      var series = chart_Frecuencia.series[0],
        shift = series.data.length &gt; 400;
      //add point
      chart_Frecuencia.series[0].addPoint(eval(point), true, shift);
      setTimeout(requestData_Frecuencia, 500);
    },
    cache: false
  });
}
</code></pre>

<p>PS. is been a while since I w

In [52]:
# takes 5 mins to run
add_space =[' ' for i in range(questions.Body.shape[0])] # add space, so we have "foo bar", not "foobar" 
questions['Formatted'] = questions.Title + add_space + questions.Body # concat title and body
questions['Formatted'] = questions['Formatted'].apply(preprocess_text)

questions.drop(columns=['Title', 'Body'], inplace=True)

In [53]:
questions.head(1)
questions.to_csv('./data/questions_preprocessed_min3.csv', index=True)
tags.head(1)
tags.to_csv('./data/tags_preprocessed_min3.csv', index=True)

Unnamed: 0_level_0,Formatted
Id,Unnamed: 1_level_1
120,ASP NET Site Maps Has anyone got experience cr...


Unnamed: 0_level_0,Tag
Id,Unnamed: 1_level_1
120,sql


# Duplicate code

In [20]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer() # TODO sparse_output=True  I can add more classes

list_to_encode = tags.Tag.values
mlb.fit(list_to_encode.reshape(-1,1))
# # MultiLabelBinarizer on the DataFrame, `mlb.fit(tags.Tag)`. Did not work as intended

## faster to just recompute
# if not os.path.exists('./data/encoded.pkl'):
#     print('Encoding tags, then saving..')
encoded = dict() # used later
for i in tqdm(tags.index.unique()): 
    # went from 1750000 to 1050000 because I forgot to do only uniques
    # 500,000 after I removed low voted posts
    input_ = tags.loc[i, 'Tag']
    if (type(input_) is str): # single value is of type string. 
        input_ = np.array(input_)
    else:   
        input_ = input_.values
    
    ouput_ = mlb.transform(input_.reshape(-1,1))
    ouput_ = list(np.sum(ouput_, axis=0))
    encoded[i] = ouput_
#     pickle.dump(encoded, open('./data/encoded.pkl', 'wb'))
# else:
#     encoded = pickle.load(open('./data/encoded.pkl', 'rb')) 1751371

MultiLabelBinarizer()

100%|██████████| 498996/498996 [01:59<00:00, 4168.11it/s]


In [21]:
list_of_questions = []
list_of_tags = []
for i in tqdm(questions['Formatted'].index.unique()):
    list_of_questions.append(questions['Formatted'][i])
    list_of_tags.append(encoded[i])

print(f'length is {len(list_of_questions)}')

100%|██████████| 498996/498996 [00:02<00:00, 171339.43it/s]

length is 498996





In [22]:
## cases computer to run out of ram and go into paging. Despite my PC having 24gb. 
# np.save('./data/list_of_questions.npy', list_of_questions)
# np.save('./data/list_of_tags.npy', list_of_tags)

In [23]:
def sizeof_fmt(num, suffix='B'):
    ''' by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified'''
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

for name, size in sorted(((name, sys.getsizeof(value)) for name, value in locals().items()),
                         key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

gc.collect()

                     questions: 539.2 MiB
                          tags: 86.9 MiB
                       encoded: 20.0 MiB
                 questions_idx:  4.5 MiB
             list_of_questions:  4.0 MiB
                  list_of_tags:  4.0 MiB
                      tags_idx:  3.8 MiB
                           tmp: 10.3 KiB
                            _8:  8.1 KiB
                        counts:  7.1 KiB


0

In [24]:
print(f'Sample question: {list_of_questions[0]}\n')
print(f'Number of tags: {sum(list_of_tags[0])}')

Sample question: ASP NET Site Maps Has anyone got experience creating SQL based ASP NET site map providers ve got the default XML file web sitemap working properly with my Menu and SiteMapPath controls but ll need way for the users of my site to create and modify pages dynamically need to tie page viewing permissions into the standard ASP NET membership system as well 

Number of tags: 2


### [optional] lemmatize

In [None]:
## dict based mapping... bad idea because foo must be a string in nlp(foo). it would be called vocab_size number of times
# mapping = dict()
# for doc in X_train:
#     sentences = doc.split(' ')
#     for word in sentences:
#         mapping[word] = word
# list_of_vocab = list(mapping.values())

## Still very slow... 
# subset = X_train[0:100]
# tmp = " ".join([foo for foo in X_train]) 
# nlp = spacy.load('en_core_web_sm')
# nlp.max_length = 1000000 * 10
# doc = nlp(tmp)

In [None]:
# ## optional lemmatization 
# import spacy
# nlp = spacy.load('en_core_web_sm')

# # idx=20
# # X_train[idx]
# for i in tqdm(range(len(X_train))):
#     doc = nlp(X_train[i])
#     X_train[i] = " ".join([token.lemma_ for token in doc]) 
# # X_train[idx]

In [81]:
# uniqueWords = [] 
# for q in list_of_questions[:1000]:
#     for word in q.split():
#       if not word in uniqueWords:
#           uniqueWords.append(word);
          
# len(uniqueWords)

10261