In [7]:
import os
import json
import re
from preprocessing import Preprocess as preprocess_class
from copy import deepcopy

In [8]:
# Single shared instance of the project's Preprocess class; the run loop
# calls its parse_string method on the cleaned body and title.
preprocess_object=preprocess_class()

In [None]:
# Directory prefix for the input files.
PREFIX_INPUT_FILE_PATH = "data_files/pp_nov_28/result/"
# Directory prefix for the output files.
PREFIX_OUTPUT_FILE_PATH = "data_files/cleaned_pp_nov_28/"

In [87]:
# Inclusive lower / upper bound of the post_<id>.json file ids to visit.
lb_file_idx, ub_file_idx = 0, 102

# Counter that the run loop prints after each file.
potential_dups_yet = 0

### Strip HTML tags (opening, closing and self-closing) while keeping the text between them

In [9]:
#https://stackoverflow.com/a/11229866
self_contained_ref_regex = r"<[^>]*>"
self_contained_ref_pattern = re.compile(self_contained_ref_regex)


def clean_tags_prevent_content(curr_text):
    """Replace every ``<...>`` span in ``curr_text`` with a single space.

    Only the markup is replaced; whatever sits between an opening and a
    closing tag is left in place.
    """
    return self_contained_ref_pattern.sub(" ", curr_text)

##### Tests

In [80]:
# Sample inputs: a paired tag, plain text, and a self-closing tag.
tc_cases = [
    "< Hello> There </Hello>",
    "billy butcher",
    "<Doncaster/>",
]

In [82]:
# Show each sample next to its tag-stripped form.
for curr_tc in tc_cases:
    print(f"tc is  {curr_tc}")
    print(f"Output  is  {clean_tags_prevent_content(curr_tc)}")
    print("**********")

tc is  < Hello> There </Hello>
Output  is    There  
**********
tc is  billy butcher
Output  is  billy butcher
**********
tc is  <Doncaster/>
Output  is   
**********


### Cleaning newlines and tabs

In [10]:
def clean_newline_stuff(curr_text):
    """Replace newline, carriage-return and tab characters with a space.

    A space is substituted rather than the empty string so that two words
    separated only by a line break or a tab are not fused into one token.

    Args:
        curr_text: the string to clean.

    Returns:
        ``curr_text`` with each of those characters replaced by one space.
    """
    for whitespace_char in ("\n", "\r", "\t"):
        curr_text = curr_text.replace(whitespace_char, " ")
    return curr_text

#### Fetching question tags

In [11]:
def fetch_tags_list(tag_text):
    """Turn a tag string such as ``<c#><exception>`` into ``['c#', 'exception']``.

    The text is cut at every ``>``, empty pieces are discarded, and the
    first character of each remaining piece (the opening ``<``) is dropped.
    """
    return [piece[1:] for piece in tag_text.split(">") if piece != ""]

In [83]:
# Sample tag strings: several tags, tags with punctuation, and a single tag.
tc_cases = [
    '<performance><unix><awk><aix>',
    '<c#><exception><error-handling>',
    "<Doncaster>",
]

In [84]:
# Show each sample tag string next to the list extracted from it.
for curr_tc in tc_cases:
    print(f"tc is  {curr_tc}")
    print(f"Output  is  {fetch_tags_list(curr_tc)}")
    print("**********")

tc is  <performance><unix><awk><aix>
Output  is  ['performance', 'unix', 'awk', 'aix']
**********
tc is  <c#><exception><error-handling>
Output  is  ['c#', 'exception', 'error-handling']
**********
tc is  <Doncaster>
Output  is  ['Doncaster']
**********


### Removing all variations of the word "DUPLICATE" from title

In [12]:
# Case-insensitive match for the bare word "duplicate".
dup_regex = re.compile("duplicate", flags=re.IGNORECASE)


def rem_dup(text):
    """Return ``text`` with every occurrence of "duplicate" (any casing) replaced by a space."""
    without_marker = dup_regex.sub(" ", text)
    return without_marker

### Removing "code"/blockquote tag and the content inside it

In [93]:
# re.DOTALL lets "." cross line breaks; without it a <code> element whose
# content spans several lines (the usual <pre><code> listing) never matches.
code_paired_refs_regex=r"<code(.*?)<\/code>"
code_paired_refs_pattern=re.compile(code_paired_refs_regex, re.DOTALL)
def rem_code(text):
    """Replace every ``<code ...</code>`` element, content included, with a space.

    Args:
        text: HTML text to clean.

    Returns:
        ``text`` with each code element (single- or multi-line) replaced
        by one space; all other markup is left untouched.
    """
    return code_paired_refs_pattern.sub(" ", text)

In [98]:
# re.DOTALL lets "." cross line breaks; without it a blockquote whose
# content spans several lines never matches.
blockquote_paired_refs_regex=r"<blockquote(.*?)<\/blockquote>"
blockquote_paired_refs_pattern=re.compile(blockquote_paired_refs_regex, re.DOTALL)
def rem_blockquote(text):
    """Replace every ``<blockquote ...</blockquote>`` element, content included, with a space.

    Args:
        text: HTML text to clean.

    Returns:
        ``text`` with each blockquote element (single- or multi-line)
        replaced by one space.
    """
    return blockquote_paired_refs_pattern.sub(" ", text)

In [90]:
def fetch_question_satisfying_a_condition(fn, input_dir=None, first_file_id=None, last_file_id=None):
    """Return a copy of the first question for which ``fn(question)`` is truthy.

    Files ``post_<id>.json`` are scanned in increasing id order; posts whose
    ``PostTypeId`` is not ``"1"`` (i.e. not questions) are skipped.

    Args:
        fn: predicate called with one post dict.
        input_dir: directory holding the ``post_<id>.json`` files.
            Defaults to the module-level ``PREFIX_INPUT_FILE_PATH``.
        first_file_id: first file id to scan. Defaults to ``lb_file_idx``.
        last_file_id: last file id to scan (inclusive). Defaults to ``ub_file_idx``.

    Returns:
        A deep copy of the first matching post dict, or ``None`` when no
        question in the scanned files satisfies ``fn``.
    """
    # Resolve defaults at call time so later changes to the globals are honoured.
    if input_dir is None:
        input_dir = PREFIX_INPUT_FILE_PATH
    if first_file_id is None:
        first_file_id = lb_file_idx
    if last_file_id is None:
        last_file_id = ub_file_idx

    for file_id in range(first_file_id, last_file_id + 1):
        print("Starting file with id: ", file_id)
        with open(os.path.join(input_dir, f"post_{file_id}.json"), 'r') as fd:
            posts = json.load(fd)
        for post in posts.values():
            # we do not want to process answers
            if post["PostTypeId"] != "1":
                continue
            if fn(post):
                print("Found")
                return deepcopy(post)
    return None

In [95]:
def code_tag_in_body(curr_obj):
    """Tell whether the post's 'Body' contains at least one match of the code pattern."""
    return code_paired_refs_pattern.search(curr_obj['Body']) is not None

In [99]:
def blockquote_tag_in_body(curr_obj):
    """Tell whether the post's 'Body' contains at least one match of the blockquote pattern."""
    return blockquote_paired_refs_pattern.search(curr_obj['Body']) is not None

In [96]:
# Display the first question whose body matches the code pattern.
fetch_question_satisfying_a_condition(code_tag_in_body)

Starting file with id:  0
Found


{'AcceptedAnswerId': '7',
 'AnswerCount': '13',
 'Body': "<p>I'm new to C# and I want to use a track-bar to change a form's opacity.</p>\n\n<p>This is my code:</p>\n\n<pre><code>decimal trans = trackBar1.Value / 5000;\nthis.Opacity = trans;\n</code></pre>\n\n<p>When I try to build it, I get this error:</p>\n\n<blockquote>\n  <p>Cannot implicitly convert type 'decimal' to 'double'</p>\n</blockquote>\n\n<p>I tried making <code>trans</code> a double, but then the control doesn't work. This code worked fine for me in VB.NET. </p>\n\n<p>What do I need to do differently?</p>\n",
 'CommentCount': '19',
 'CreationDate': '2008-07-31T21:42:52.667',
 'FavoriteCount': '14',
 'Id': '4',
 'LastActivityDate': '2012-07-24T16:45:40.937',
 'LastEditDate': '2012-05-04T08:55:46.677',
 'LastEditorDisplayName': 'Rich B',
 'LastEditorUserId': '1039608',
 'OwnerUserId': '8',
 'PostTypeId': '1',
 'Score': '139',
 'Tags': '<c#><winforms><forms><type-conversion><opacity>',
 'Title': "When setting a form's opacit

In [None]:
# Display the first question whose body matches the blockquote pattern.
fetch_question_satisfying_a_condition(blockquote_tag_in_body)

In [88]:
def fetch_question_id_object(q_id, input_dir=None, first_file_id=None, last_file_id=None):
    """Return a copy of the question whose ``'Id'`` equals ``q_id``.

    Files ``post_<id>.json`` are scanned in increasing id order; posts whose
    ``PostTypeId`` is not ``"1"`` (i.e. not questions) are skipped.

    Args:
        q_id: question id as a string (ids are stored as strings in the files).
        input_dir: directory holding the ``post_<id>.json`` files.
            Defaults to the module-level ``PREFIX_INPUT_FILE_PATH``.
        first_file_id: first file id to scan. Defaults to ``lb_file_idx``.
        last_file_id: last file id to scan (inclusive). Defaults to ``ub_file_idx``.

    Returns:
        A deep copy of the matching post dict, or ``None`` if it is not found.

    Raises:
        AssertionError: if ``q_id`` is not a string.
    """
    assert isinstance(q_id, str)
    # Resolve defaults at call time so later changes to the globals are honoured.
    if input_dir is None:
        input_dir = PREFIX_INPUT_FILE_PATH
    if first_file_id is None:
        first_file_id = lb_file_idx
    if last_file_id is None:
        last_file_id = ub_file_idx

    for file_id in range(first_file_id, last_file_id + 1):
        print("Starting file with id: ", file_id)
        with open(os.path.join(input_dir, f"post_{file_id}.json"), 'r') as fd:
            posts = json.load(fd)
        for post in posts.values():
            # we do not want to process answers
            if post["PostTypeId"] != "1":
                continue
            if post['Id'] == q_id:
                print("Found")
                return deepcopy(post)
    return None

In [89]:
# Look up the question with Id 777711 (ids are passed as strings).
fetch_question_id_object("777711")

Starting file with id:  0
Starting file with id:  1
Starting file with id:  2
Starting file with id:  3
Starting file with id:  4
Starting file with id:  5
Found
Starting file with id:  6
Starting file with id:  7


KeyboardInterrupt: 

#### Just investigate the PostTypeIDs present (DEV)

In [23]:
def check_post_types(posts=None):
    """Print the ``PostTypeId`` of every post that is not of type ``'1'``.

    Args:
        posts: mapping of post id to post dict. When omitted, the global
            ``df`` is used, so a file must already have been loaded into
            ``df`` before calling without arguments.
    """
    if posts is None:
        posts = df
    for post in posts.values():
        if post['PostTypeId'] != '1':
            print(post['PostTypeId'])

### Detect the `[duplicate]` marker in a title

In [70]:
# Bracketed "[duplicate]" marker: any casing, optional whitespace inside the brackets.
test_reg = re.compile(r"\[\s*duplicate\s*\]", flags=re.IGNORECASE)

In [76]:
# Sample titles: marker with inner whitespace, bare marker, marker inside text, no marker.
testing_dup_title_string = [
    "[  duplicaTe]",
    "[   duplicaTE       ]",
    "[duplicaTE]",
    "abcdef [duplicaTE] dupl sis os",
    "skjsis9",
]

In [77]:
# Print each sample title, the markers found in it, and a separator line.
for curr_tc in testing_dup_title_string:
    matches = test_reg.findall(curr_tc)
    print(curr_tc, matches, "##########", sep="\n")

[  duplicaTe]
['[  duplicaTe]']
##########
[   duplicaTE       ]
['[   duplicaTE       ]']
##########
[duplicaTE]
['[duplicaTE]']
##########
abcdef [duplicaTE] dupl sis os
['[duplicaTE]']
##########
skjsis9
[]
##########


### Find all questions with [duplicate] in their title among all files

In [78]:
def find_dups_in_title(input_dir=None, first_file_id=0, last_file_id=102, pattern=None):
    """Print every question whose title matches the duplicate-marker pattern.

    For each match the post key, the post dict and a separator line are
    printed. Posts whose ``PostTypeId`` is not ``"1"`` are skipped.

    Args:
        input_dir: directory holding the ``post_<id>.json`` files.
            Defaults to the module-level ``PREFIX_INPUT_FILE_PATH``.
        first_file_id: first file id to scan (default 0).
        last_file_id: last file id to scan, inclusive (default 102).
        pattern: compiled regex applied to each title. Defaults to the
            module-level ``test_reg``.

    Returns:
        None; results are reported on standard output only.
    """
    if input_dir is None:
        input_dir = PREFIX_INPUT_FILE_PATH
    if pattern is None:
        pattern = test_reg

    for file_id in range(first_file_id, last_file_id + 1):
        print("Starting file with id: ", file_id)
        with open(os.path.join(input_dir, f"post_{file_id}.json"), 'r') as fd:
            posts = json.load(fd)
        for post_key, post in posts.items():
            # we do not want to process answers
            if post["PostTypeId"] != "1":
                continue
            if pattern.search(post['Title']):
                print(post_key)
                print(post)
                print("$$$$$$$$$$$$$$$$$$")

In [79]:
# Scan the files and print every question whose title matches test_reg.
find_dups_in_title()

Starting file with id:  0
Starting file with id:  1
Starting file with id:  2
Starting file with id:  3
Starting file with id:  4
Starting file with id:  5
777711
{'AcceptedAnswerId': '777721', 'AnswerCount': '4', 'Body': '<blockquote>\n  <p>Duplicate:\n  <a href="http://stackoverflow.com/questions/163434/are-nulls-in-a-relational-database-okay">http://stackoverflow.com/questions/163434/are-nulls-in-a-relational-database-okay</a></p>\n</blockquote>\n\n<p>I dodged a heated debate concerning nulls in the database today.\nMy opinion is that null is an excellent indicator of unspecified values. Every one else in the team, that has an opinion, thinks zero and empty strings are the way to go.</p>\n\n<p>Are they lazy or am I to strict?</p>\n', 'CommentCount': '1', 'CreationDate': '2009-04-22T15:07:54.137', 'Id': '777711', 'LastActivityDate': '2009-04-22T15:11:00.667', 'LastEditDate': '2009-04-22T15:10:22.710', 'LastEditorDisplayName': '', 'LastEditorUserId': '44389', 'OwnerUserId': '21761', '

## Run loop

In [None]:

# Clean every question in every input file and write one cleaned JSON file
# per input file into the output directory.

# Make sure the destination exists before the first file is written.
os.makedirs(PREFIX_OUTPUT_FILE_PATH, exist_ok=True)

for file_id in range(lb_file_idx, ub_file_idx+1):
    print("Starting file with id: ", file_id)
    with open(os.path.join(PREFIX_INPUT_FILE_PATH, f"post_{file_id}.json"),'r') as fd:
        df=json.load(fd)
    new_df=dict()
    for curr_key, curr_val in df.items():
        # we do not want to process answers
        if curr_val["PostTypeId"]!="1":
            continue
        # Work on a private copy so the loaded data stays untouched.
        new_val=deepcopy(curr_val)

        # Order matters: drop code elements (with their content) first,
        # then strip the remaining markup, then the line breaks and tabs.
        new_val['cleaned_body']=rem_code(new_val['Body'])
        new_val['cleaned_body']=clean_tags_prevent_content(new_val['cleaned_body'])
        new_val['cleaned_body']=clean_newline_stuff(new_val['cleaned_body'])

        new_val['cleaned_title']=rem_dup(new_val['Title'])

        new_val['body_vec']=preprocess_object.parse_string(new_val['cleaned_body'])
        new_val['title_vec']=preprocess_object.parse_string(new_val['cleaned_title'])

        # NOTE: no 'dups_list' field is written; the lookup that used to fill
        # it relied on a `dups_dict` that is not defined in the visible cells.
        try:
            new_val['tags_list']=fetch_tags_list(new_val['Tags'])
        except (KeyError, AttributeError, TypeError):
            # Missing or non-string 'Tags': keep the post, record no tags,
            # and print it so it can be inspected.
            new_val['tags_list']=[]
            print(curr_val)
        # new_val is already an independent copy, so it is stored directly.
        new_df[curr_key]=new_val
    print("Finished ", file_id)
    print("dups potential ", potential_dups_yet)
    print("-----------")
    with open(os.path.join(PREFIX_OUTPUT_FILE_PATH, f"post_{file_id}.json"),'w') as fd:
        json.dump(new_df, fd, indent=1)

In [None]:
#new_df.keys()

In [None]:
# Print every non-empty 'dups_list'. The field is read with .get() because
# posts may not carry it at all, in which case indexing would raise KeyError.
for key in new_df:
    if new_df[key].get('dups_list', [])!=[]:
        print(new_df[key]['dups_list'])