# Background

This notebook is based on the article "Building a Topic Modeling Pipeline with spaCy and Gensim": https://towardsdatascience.com/building-a-topic-modeling-pipeline-with-spacy-and-gensim-c5dc03ffc619

# Initialization

In [100]:
%matplotlib inline

import os
from pathlib import Path
import numpy as np
import datetime

import pandas as pd
pd.set_option("display.max_rows",10)

# IPython

from IPython.display import display, Markdown, HTML
from IPython.display import Image

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# http://stackoverflow.com/questions/21971449/how-do-i-increase-the-cell-width-of-the-jupyter-ipython-notebook-in-my-browser
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))


# Autoload Python Code
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Data 

Source: the StackSample dataset on Kaggle, https://www.kaggle.com/stackoverflow/stacksample

Dataset with the text of 10% of questions and answers from the Stack Overflow programming Q&A website.

This is organized as three tables:

* Questions contains the title, body, creation date, closed date (if applicable), score, and owner ID for all non-deleted Stack Overflow questions whose Id is a multiple of 10.

* Answers contains the body, creation date, score, and owner ID for each of the answers to these questions. The ParentId column links back to the Questions table.

* Tags contains the tags on each of these questions.

In [51]:
%pip install rich devtools

Collecting rich
  Downloading rich-11.0.0-py3-none-any.whl (215 kB)
     |████████████████████████████████| 215 kB 2.0 MB/s            
Collecting devtools
  Downloading devtools-0.8.0-py3-none-any.whl (14 kB)
Collecting commonmark<0.10.0,>=0.9.0
  Using cached commonmark-0.9.1-py2.py3-none-any.whl (51 kB)
Collecting colorama<0.5.0,>=0.4.0
  Using cached colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting asttokens<3.0.0,>=2.0.0
  Downloading asttokens-2.0.5-py2.py3-none-any.whl (20 kB)
Collecting executing<1.0.0,>=0.8.0
  Downloading executing-0.8.2-py2.py3-none-any.whl (16 kB)
Installing collected packages: executing, commonmark, colorama, asttokens, rich, devtools
Successfully installed asttokens-2.0.5 colorama-0.4.4 commonmark-0.9.1 devtools-0.8.0 executing-0.8.2 rich-11.0.0
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.

In [52]:
from pathlib import Path
from devtools import debug

In [4]:
# Directory holding Questions.csv, Answers.csv and Tags.csv.
# Set the DT_NLP1_DATA_DIR environment variable to point at another location;
# without it the original local path is used.
data_path = Path(os.environ.get('DT_NLP1_DATA_DIR', '/Users/bkraft/data/dt_nlp1'))

#### Questions

In [18]:
# Read the Questions table from the data directory, decoding it as ISO-8859-1.
questions = pd.read_csv(data_path.joinpath('Questions.csv'), encoding="ISO-8859-1")

In [39]:
# Number of distinct question Ids, followed by the table ordered by Id.
questions['Id'].nunique()
questions.sort_values('Id')

1264216

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...
...,...,...,...,...,...,...,...
1264211,40143210,5610777.0,2016-10-19T23:38:01Z,,0,URL routing in PHP (MVC),<p>I am building a custom MVC project and I ha...
1264212,40143300,3791161.0,2016-10-19T23:48:09Z,,0,Bigquery.Jobs.Insert - Resumable Upload?,<p>The API docs show that you should be able t...
1264213,40143340,7028647.0,2016-10-19T23:52:50Z,,1,Obfuscating code in android studio,<p>Under minifyEnabled I changed from false to...
1264214,40143360,871677.0,2016-10-19T23:55:24Z,,0,How to fire function after v-model change?,<p>I have input which I use to filter my array...


In [57]:
# Select the row(s) of the question whose Id is 80.
questions.loc[questions['Id'] == 80]

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...


#### Answers

In [11]:
# Read the Answers table from the data directory, decoding it as ISO-8859-1.
answers = pd.read_csv(data_path.joinpath('Answers.csv'), encoding="ISO-8859-1")

In [38]:
# Number of distinct answer Ids, followed by the table ordered by ParentId.
answers['Id'].nunique()
answers.sort_values('ParentId')

2014516

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
1,124,26.0,2008-08-01T16:09:47Z,80,12,<p>I wound up using this. It is a kind of a ha...
219787,3770976,364174.0,2010-09-22T15:37:30Z,80,1,<p>What about making your delimiter something ...
336,10008,1109.0,2008-08-13T16:09:09Z,80,6,"<p>The <a href=""http://en.wikipedia.org/wiki/S..."
0,92,61.0,2008-08-01T14:45:37Z,90,13,"<p><a href=""http://svnbook.red-bean.com/"">Vers..."
10748,202317,20709.0,2008-10-14T18:41:45Z,90,2,"<p>You can also try <em><a href=""http://www.co..."
...,...,...,...,...,...,...
2014515,40143389,4464432.0,2016-10-19T23:58:58Z,40142910,0,<p>Try add <code>retrun false</code> in the <c...
2014501,40143139,6107989.0,2016-10-19T23:30:12Z,40142940,1,<p>Alternative to @space_voyager this code sup...
2014495,40142997,4605946.0,2016-10-19T23:15:02Z,40142940,0,<p>Here's how you can do it:</p>\n\n<pre><code...
2014508,40143212,1491895.0,2016-10-19T23:38:17Z,40143190,5,<p>Use a here-doc:</p>\n\n<pre><code>result=$(...


#### Tags

In [36]:
# Read the Tags table from the data directory, decoding it as ISO-8859-1.
# By default read_csv turns strings such as "null", "nan" or "NA" into NaN.
# Those can be real tag names, so the default NA parsing is switched off and
# every Tag stays a string.
# NOTE(review): this assumes the Id column has no empty fields - confirm.
tags = pd.read_csv(
    data_path / 'Tags.csv',
    encoding="ISO-8859-1",
    keep_default_na=False,
)

In [40]:
# Number of distinct question Ids that carry tags, followed by the table ordered by Id.
tags['Id'].nunique()
tags.sort_values('Id')

1264216

Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn
...,...,...
3750989,40143360,javascript
3750990,40143360,vue.js
3750992,40143380,mocha
3750991,40143380,npm


In [227]:
class StackOverflow:
    """Render one Stack Overflow question, its tags and its answers as HTML.

    The instance holds the three tables and displays the rows that belong to
    a given question Id in the notebook output.
    """

    def __init__(self, questions, answers, tags):
        """Store the three tables.

        Parameters
        ----------
        questions : DataFrame
            Must provide the columns ``Id``, ``Title`` and ``Body``.
        answers : DataFrame
            Must provide the columns ``ParentId``, ``Score`` and ``Body``.
        tags : DataFrame
            Must provide the columns ``Id`` and ``Tag``.
        """
        self.questions = questions
        self.answers = answers
        self.tags = tags

    @staticmethod
    def extract_tags(t):
        """Return the ``Tag`` values of ``t`` joined with ``', '``.

        Missing values (NaN/None) are skipped and the remaining values are
        converted with ``str``, so a NaN in the column no longer makes the
        join raise ``TypeError``.
        """
        return ', '.join(str(tag) for tag in t.Tag.dropna())

    def display_question(self, id):
        """Display the title and body of the question whose ``Id`` equals ``id``.

        The title is HTML-escaped before it is placed inside the heading; the
        body is passed to ``HTML`` unchanged. When no question matches, a
        short notice is shown instead of raising ``IndexError``.
        """
        from html import escape  # local import keeps this cell self-contained

        matches = self.questions[self.questions.Id == id]

        display(HTML("<h1>Question</h1>"))
        if matches.empty:
            display(HTML(f"<p>No question found with Id {escape(str(id))}.</p>"))
            return

        # Only the first matching row is shown.
        title = matches.Title.iloc[0]
        body = matches.Body.iloc[0]
        display(HTML(f"<h3>{escape(str(title))}</h3>"))
        display(HTML("<br>"))
        display(HTML(body))

    def display_answer(self, id):
        """Display every answer whose ``ParentId`` equals ``id``, highest score first.

        Each answer gets a heading with its 1-based rank and its score,
        followed by its body passed to ``HTML`` unchanged.
        """
        matches = self.answers[self.answers.ParentId == id]
        ranked = matches.sort_values(by='Score', ascending=False).reset_index(drop=True)

        display(HTML("<h1>Answers</h1>"))
        for rank, answer in enumerate(ranked.itertuples(index=False), start=1):
            display(HTML(f"<h4>Answer {rank} ({answer.Score})</h4>"))
            display(HTML(answer.Body))

    def display_tag(self, id):
        """Display the tags attached to the question ``id`` as one comma-separated line."""
        from html import escape  # local import keeps this cell self-contained

        matches = self.tags[self.tags.Id == id]
        tag_csv = self.extract_tags(matches)

        display(HTML("<h1>Tags:</h1>"))
        display(HTML(f"<h4>{escape(tag_csv)}</h4>"))
        display(HTML("<br>"))

    def display(self, id):
        """Display tags, question and answers for the question ``id``, in that order."""
        # Inside the methods the bare name `display` still refers to the
        # module-level IPython function, not to this method.
        self.display_tag(id)
        self.display_question(id)
        self.display_answer(id)


In [228]:
# Bundle the three loaded tables into one viewer object.
so = StackOverflow(questions=questions, answers=answers, tags=tags)

In [233]:
# Distinct Ids among the first five question rows.
questions['Id'].head(5).unique()

array([ 80,  90, 120, 180, 260])

In [234]:
# Show tags, question and answers for question 180.
so.display(180)