## Database creation and cleaning for Stack Overflow dataset
#### The current notebook needs the "posts_stackoverflow.xml" file inside the raw_dataset folder in order to run properly.
The dataset is read, parsed and exported into three CSV files (train, validation and test).
Each question is labeled from its "score" property as either 1 (good) or 0 (bad). The labels are calculated with a linear interpolation over all questions, which maps every score onto a range between 0.5 and 5.5 stars. Based on that, every question above 3 stars is labeled as 1 and every question below as 0.

In [None]:
# INSTALL LIBRARIES AND DOWNLOAD FILES
# !pip install py7zr
# !wget https://archive.org/download/stackexchange/electronics.stackexchange.com.7z

In [None]:
# UNZIP FILE
# The extraction code below is wrapped in a triple-quoted string, so it is
# only a string expression and is never executed. Remove the quotes to run it.
# NOTE(review): the archive path used here ('/content/electronics.7z') does not
# match the file name in the wget URL of the download cell
# (electronics.stackexchange.com.7z) - confirm the path before enabling.
'''
import py7zr
print(py7zr.__version__)
import py7zr
with py7zr.SevenZipFile('/content/electronics.7z', mode='r') as z:
    z.extractall("/content/data")
'''

In [None]:
#Parse XML to CSV (cleaning data)
import xml.etree.ElementTree as ET
import csv
import random
import numpy as np
import matplotlib.pyplot as plt
import random
import re

In [None]:
#Find stars for given score
class Scores:
    """Map raw question scores onto a continuous star scale.

    Scores are collected with append(). process() then orders the distinct
    score values and gives each one a star value between ``min_star`` and
    ``max_star``, proportional to the share of collected scores that lie
    below it (half of the scores equal to it are counted as "below").

    Attributes:
        min_star / max_star: bounds of the star scale (0.5 and 5.5).
        total_scores: number of values passed to append().
        scores_counts: dict mapping a score value to how often it was seen.
        keys: sorted list of distinct score values (filled by process()).
        stars: dict mapping a score value to its star value (filled by
            process()).
        processed_already: True when keys/stars reflect the current counts.
    """

    def __init__(self):
        #------------------------------
        self.min_star = 0.5
        self.max_star = 5.5
        #------------------------------
        self.total_scores = 0
        self.scores_counts = {}
        # Derived state. Created here so that get_star() never hits a missing
        # attribute when it is called before process().
        self.processed_already = False
        self.keys = []
        self.stars = {}

    def append(self, likes):
        """Record one occurrence of the score value ``likes``."""
        self.scores_counts[likes] = self.scores_counts.get(likes, 0) + 1
        self.total_scores += 1
        # The star table no longer matches the counts; rebuild on next use.
        self.processed_already = False

    def process(self):
        """Order the collected scores and compute the star value of each.

        Calling it explicitly is still supported; get_star() also calls it
        on demand when the table is missing or out of date.
        """
        self.keys = sorted(self.scores_counts)
        total_count = sum(self.scores_counts.values())
        span = self.max_star - self.min_star

        self.stars = {}
        partial_count = 0
        for k in self.keys:
            count = self.scores_counts[k]
            # Midpoint of this score's slice of the cumulative distribution,
            # scaled onto [min_star, max_star].
            self.stars[k] = self.min_star + ((partial_count + 0.5 * count) / total_count) * span
            partial_count += count
        self.processed_already = True

    def get_star(self, likes):
        """Return the star value for the score ``likes``.

        A score that was never collected gets the star value of the smallest
        collected score greater than it, or ``max_star`` when no collected
        score is greater (including when nothing was collected at all).
        """
        from bisect import bisect_right

        if not self.processed_already:
            self.process()

        if likes in self.stars:
            return self.stars[likes]

        # keys is sorted, so the first key greater than likes is found by
        # binary search instead of scanning the whole list.
        position = bisect_right(self.keys, likes)
        if position < len(self.keys):
            return self.stars[self.keys[position]]
        return self.max_star
       


In [None]:
# First pass over the dump: collect the score of every question created
# between 2010 and 2019 into one Scores accumulator per creation year.
data_size = 20094655  # stop after this many collected questions
# scores = Scores()
scores = {str(year):Scores() for year in range(2010,2019+1)}
i = 0  # number of questions collected so far
for event, elem in ET.iterparse("../raw_dataset/posts_stackoverflow.xml"):
    #Add score if type is question
    if elem.tag == "row" and event == "end" and elem.attrib["PostTypeId"] == "1":
        # CreationDate looks like "YYYY-MM-DDTHH:MM:SS..."; keep only the year.
        year = str(elem.attrib["CreationDate"]).split("T")[0].split("-")[0]
        # Stops at the first question from 2020 or later.
        # NOTE(review): assumes rows are ordered by creation date - confirm.
        if int(year)>=2020:
            break
        if int(year)>=2010:
            scores[year].append(int(elem.attrib["Score"]))
            i += 1
            # Report progress only right after the counter advanced, so a
            # value is printed once instead of once per following element.
            if i % 1000000 == 0:
                print(i)
    # Drop the element's attributes and children once it has been handled.
    elem.clear()
    if i == data_size:
        break

# scores.process()
print("Processing scores...")
for scorer in scores.values():
    scorer.process()


In [None]:
# Inspect the 2019 scorer: for every distinct score print the score, how many
# questions had it, and the star value assigned to it.
scorer_2019 = scores["2019"]
for k in scorer_2019.keys:
    print(k,":",scorer_2019.scores_counts[k], scorer_2019.get_star(k))

# Spot-check two arbitrary score values.
print("test other value:", scorer_2019.get_star(1))
print("test other value:", scorer_2019.get_star(13000))

# Expand the per-score star values back to one entry per question so the
# histogram shows the distribution over questions.
stars = []
for k in scorer_2019.keys:
    star = scorer_2019.get_star(k)
    stars.extend([star] * scorer_2019.scores_counts[k])


plt.hist(stars)
plt.show()

In [None]:
data_size = 20094655
data_split = 30000

# 5% of the sampled row positions go to validation and another 5%, disjoint
# from the first, go to test. Everything else is training data.
holdout_size = int(data_split*0.05)
idx_val = set(random.sample(range(data_split), holdout_size))
idx_test = set()
while len(idx_test) < holdout_size:
    # Draw until the position is used by neither hold-out set.
    n = random.randint(0, data_split-1)
    if n not in idx_val and n not in idx_test:
        idx_test.add(n)

# Sanity checks: set sizes and overlap (the overlap must be 0).
print(len(idx_test))
print(len(idx_val))
print(len(idx_test & idx_val))

# Label every position (0 = train, 1 = test, 2 = val) and count each group.
z = np.zeros(data_split)
z[list(idx_test)] = 1
z[list(idx_val)] = 2
for label in (0, 1, 2):
    print(np.count_nonzero(z == label))

In [None]:
#Parse XML to CSV (cleaning data)

#Creates list of tags given the string
def createTags(tags):
    """Turn a tag string such as "<a><b>" into the pipe-separated "a|b".

    The first and last character of ``tags`` are dropped and every "><"
    between two tags becomes a "|".
    """
    inner = tags[1:-1]
    return inner.replace("><", "|")

#Clean sentence (remove non alpha chars)
def cleanSentence(sentence):
    """Return ``sentence`` reduced to lowercase letters and spaces.

    Anything that looks like a markup tag ("<...>") is removed first. After
    that letters are lowercased, spaces are kept, hyphens are turned into
    spaces and every other character is dropped.
    """
    without_markup = re.sub(r'<.*?>', '', sentence)
    pieces = []
    for ch in without_markup:
        if ch.isalpha():
            pieces.append(ch.lower())
        elif ch == " " or ch == "-":
            pieces.append(" ")
    return ''.join(pieces)

#Opens CSV files to write parsed rows
# Second pass over the dump: every 650th question is sampled and, if it was
# created between 2010 and 2019, written to the train, test or validation CSV
# depending on its position among the written rows.
# i counts every question seen before 2020, j counts the rows written so far.
i, j = 0, 0
header = ["stars","title","tags","score","creation_date"]
with open('../processed_files/data_stackoverflow_train.csv', mode='w', newline='',encoding="utf8", buffering=1) as data_file, \
     open('../processed_files/data_stackoverflow_test.csv', mode='w', newline='',encoding="utf8", buffering=1) as data_file_test, \
     open('../processed_files/data_stackoverflow_val.csv', mode='w', newline='',encoding="utf8", buffering=1) as data_file_val:
    data_writer = csv.writer(data_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    data_writer_test = csv.writer(data_file_test, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    data_writer_val = csv.writer(data_file_val, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    #Headers for train, test and val
    for writer in (data_writer, data_writer_test, data_writer_val):
        writer.writerow(header)
    #Loop to iterate through every element on the XML file
    # Same input file as the pass that filled the per-year `scores` table;
    # the previous "Posts.xml" path did not match it.
    for event, elem in ET.iterparse("../raw_dataset/posts_stackoverflow.xml"):
        #Write row if tag is row and post type is 1 (meaning it is a question)
        if elem.tag == "row" and event == "end" and elem.attrib["PostTypeId"] == "1":
            year = str(elem.attrib["CreationDate"]).split("T")[0].split("-")[0]
            # Stops at the first question from 2020 or later.
            # NOTE(review): assumes rows are ordered by creation date - confirm.
            if int(year)>=2020:
                break
            if int(year)>=2010 and i % 650 == 0:
                score = int(elem.attrib["Score"])
                # Build the row once; only the destination file differs.
                row = [
                    scores[year].get_star(score),
                    cleanSentence(elem.attrib["Title"]),
                    createTags(elem.attrib["Tags"].lower()),
                    elem.attrib["Score"],
                    elem.attrib["CreationDate"],
                ]
                if j in idx_test:
                    data_writer_test.writerow(row)
                elif j in idx_val:
                    data_writer_val.writerow(row)
                else:
                    data_writer.writerow(row)
                j += 1
            i += 1
            # Report progress only right after the counter advanced, so each
            # value is printed once and 0 is never printed.
            if i % 1000000 == 0:
                print(i)
        elem.clear()
        if j == data_split:
            break


FileNotFoundError: ignored

In [None]:
#Find stars for given score
def stars(score, ranges):
    """Return an integer star rating from 1 to 5 for ``score``.

    ``ranges`` holds four thresholds. The rating is the 1-based position of
    the first threshold that ``score`` is strictly below, or 5 when it is
    below none of them.
    """
    for position in range(4):
        if score < ranges[position]:
            return position + 1
    return 5

#Create array with all scores
# Note: this rebinds the name `scores` to a plain list of integers.
scores = []
i = 0  # number of question scores collected so far
for event, elem in ET.iterparse("Posts.xml"):
    #Add score if type is question
    is_question = elem.tag == "row" and event == "end" and elem.attrib["PostTypeId"] == "1"
    if is_question:
        scores.append(int(elem.attrib["Score"]))
        i += 1
    elem.clear()
    if i == data_size:
        break
#Sort the list of scores
scores = sorted(scores)
l = len(scores)
# Thresholds are the scores found at 20%, 40%, 60% and 80% of the sorted list.
ranges = [scores[int(fraction*l)] for fraction in (0.2, 0.4, 0.6, 0.8)]

# New section