In [1]:
import numpy as np 
import pandas as pd
import html2text
import time
from timeit import default_timer as timer
from datetime import timedelta
import pickle

### Data Ingestion

In [2]:
questions = pd.read_csv("Questions.csv", encoding = "ISO-8859-1")
answers   = pd.read_csv("Answers.csv", encoding = "ISO-8859-1")
tags      = pd.read_csv("Tags.csv", encoding = "ISO-8859-1")
new_questions = questions.to_numpy()
new_answers   = answers.to_numpy()

In [22]:
answers.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,497,50.0,2008-08-02T16:56:53Z,469,4,<p>open up a terminal (Applications-&gt;Utilit...
1,518,153.0,2008-08-02T17:42:28Z,469,2,<p>I haven't been able to find anything that d...
2,536,161.0,2008-08-02T18:49:07Z,502,9,<p>You can use ImageMagick's convert utility f...
3,538,156.0,2008-08-02T18:56:56Z,535,23,<p>One possibility is Hudson. It's written in...
4,541,157.0,2008-08-02T19:06:40Z,535,20,"<p>We run <a href=""http://buildbot.net/trac"">B..."


### Data Structure Definition

In [8]:
class Thread:
    
    def __init__(self, q_title, q_text, q_id, q_score, u_id):
        self.q_title = h.handle(q_title)
        self.q_text  = h.handle(q_text)
        self.q_id    = q_id
        self.q_score = q_score
        self.u_id    = u_id
        self.answers = []
    
    def __str__(self):
        return "Question: " + self.q_title + "\n" + self.q_text + "\n" + str(self.answers)
        
    def __repr__(self):
        return "Question: " + self.q_title + "\n" + self.q_text + "\n" + str(self.answers)
    
    def add_answer(self, answer):
        self.answers.append(answer)

class Answer:
    
    def __init__(self, a_text, a_id, a_score, u_id):
        self.a_text  = h.handle(a_text)
        self.a_id    = a_id
        self.a_score = a_score
        self.u_id    = u_id
        
    def __str__(self):
        return "Answer: "+  self.a_text
        
    def __repr__(self):
        return "Answer: "+  self.a_text

### Pre-Processing:

After reading in the data from the .csv files. We generate a list of Thread objects, this list contains the answers linked to it's respective questions.

We clean the html tags from the text retaining the \[code\]code here\[/code\] tags, this will be used further on while doing pos tagging to ignore code content, since code is not a recognizable part of speech.

The format of the Thread object is as follows

    Thread :{
        q_title : "Question Title",
        q_text  : "Question Text",
        q_id    : "Question StackOverflow Id",
        q_score : "Question Score", #(retained for future purposes - Sentiment Analysis?)
        q_uid   : "User StackOverflow Id",
        answers : \[
            Answer:{
                a_text  : "Answer Text",
                a_id    : "Answer StackOverflow Id",
                a_score : "Answer Score", #(retained for future purposes - Sentiment Analysis?)
                u_id    : "User StackOverflow Id",
            }
    
        \],
    }

In [17]:
start_t = time.process_time()
threads = []
start = 0
for i in range(len(questions)):
    if i % 10000 == 0:
        h = html2text.HTML2Text()
        h.mark_code = True
        print(i)
    Q = new_questions[i]
    thread = Thread(Q[4], Q[5], Q[0], Q[3], Q[1])
    for j in range(start, len(answers)):
        A = new_answers[j]
        if A[3] == thread.q_id:
            answer = Answer(A[5], A[0], A[4], A[1])
            thread.answers.append(answer)
        elif A[3] > thread.q_id:
            start = j - 1
            break
    threads.append(thread)
with open("data.pkl", "wb") as out:
    pickle.dump(threads, out)
threads = []
end = time.process_time()
print("Start ", start_t, "End", end)
print("Total time = ", (end - start_t))

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
Start  7223.53125 End 11773.46875
Total time =  4549.9375


### Recovering the data pkl that contains the processed Threads object

In [19]:
with open("data.pkl", "rb") as pkl_in:
    threads = pickle.load(pkl_in)

### POS Tagging

In [23]:
threads[19]

Question: How do you set up Python scripts to work in Apache 2.0?




I tried to follow a couple of googled up tutorials on setting up mod_python,
but failed every time. Do you have a good, step-by step, rock-solid howto?

My dev box is OS X, production - Centos.


[Answer: 

There are two main ways of running Python on Apache. The simplest would be to
use CGI and write normal Python scripts while the second is using a web
framework like Django or Pylons.

Using CGI is straightforward. Make sure your Apache config file has a cgi-bin
set up. If not, follow their documentation
(<http://httpd.apache.org/docs/2.0/howto/cgi.html>). At that point all you
need to do is place your Python scripts in the cgi-bin directory and the
standard output will become the HTTP response. Refer to Python's documentation
for further info (<https://docs.python.org/library/cgi.html>).

If you want to use a web framework you'll need to setup mod_python or FastCGI.
These steps are dependent on which framework you