In [67]:
import os
import json
import re
from preprocessing import Preprocess as preprocess_class
from copy import deepcopy

In [68]:
# Shared Preprocess instance. In this notebook it is only referenced by the
# commented-out vectorisation lines inside clean_post.
# NOTE(review): what the constructor loads or does is not visible here - confirm before removing.
preprocess_object=preprocess_class()

### Decide the input and output paths

In [69]:
# Directory holding the input post_<id>.json files that the loops in this notebook open for reading.
PREFIX_FILE_PATH="data_files/pp_nov_28/result/"
# Directory into which the final loop writes the cleaned post_<id>.json files.
PREFIX_OUTPUT_FILE_PATH="data_files/cleaned_pp_nov_28/"

In [70]:
# Bounds of the file ids to visit. Both ends are inclusive: the loops iterate
# over range(lb_file_idx, ub_file_idx+1).
lb_file_idx=0
ub_file_idx=102

### Strip HTML tags (opening, closing and self-closing) while keeping the text between them

In [71]:
# Pattern idea taken from https://stackoverflow.com/a/11229866
# Matches one tag: a "<", then everything up to and including the next ">".
self_contained_ref_regex=r"<[^>]*>"
self_contained_ref_pattern=re.compile(self_contained_ref_regex)

def clean_tags_prevent_content(curr_text):
    """Replace every ``<...>`` tag in ``curr_text`` with a single space.

    Only the tags themselves are substituted; the text that sits between an
    opening and a closing tag is left in place.
    """
    return re.sub(self_contained_ref_pattern, " ", curr_text)

##### Tests

In [72]:
# Sample inputs for clean_tags_prevent_content: a paired tag around text,
# plain text without tags, and a lone self-closing tag.
tc_cases=["< Hello> There </Hello>",
            "billy butcher",
              "<Doncaster/>"
            ]

In [73]:
# Print each sample next to the result of stripping its tags.
for curr_tc in tc_cases:
    print("tc is ",curr_tc)
    print("Output  is ",clean_tags_prevent_content(curr_tc) )
    print("**********")

tc is  < Hello> There </Hello>
Output  is    There  
**********
tc is  billy butcher
Output  is  billy butcher
**********
tc is  <Doncaster/>
Output  is   
**********


### Cleaning newlines and tabs

In [74]:
def clean_newline_stuff(curr_text):
    """Replace every newline and tab in ``curr_text`` with a single space.

    The characters are swapped for a space rather than deleted so that words
    separated only by a line break or a tab (e.g. ``"today.\\nMy opinion"``)
    do not get fused into one token.

    Args:
        curr_text: the string to normalise.

    Returns:
        A new string without ``"\\n"`` or ``"\\t"`` characters.
    """
    return curr_text.replace("\n"," ").replace("\t"," ")

#### Fetching question tags

In [75]:
def fetch_tags_list(tag_text):
    """Turn a tag string such as ``'<c#><exception>'`` into ``['c#', 'exception']``.

    The string is cut at every ``>``; empty pieces are discarded and the first
    character of each remaining piece (the ``<`` in well-formed input) is dropped.
    """
    return [chunk[1:] for chunk in tag_text.split(">") if chunk!=""]

In [76]:
# Sample tag strings for fetch_tags_list (this rebinds the tc_cases name used by an earlier cell).
tc_cases=['<performance><unix><awk><aix>',
            '<c#><exception><error-handling>',
              "<Doncaster>"
            ]

In [77]:
# Print each sample tag string next to the list fetch_tags_list extracts from it.
for curr_tc in tc_cases:
    print("tc is ",curr_tc)
    print("Output  is ",fetch_tags_list(curr_tc) )
    print("**********")

tc is  <performance><unix><awk><aix>
Output  is  ['performance', 'unix', 'awk', 'aix']
**********
tc is  <c#><exception><error-handling>
Output  is  ['c#', 'exception', 'error-handling']
**********
tc is  <Doncaster>
Output  is  ['Doncaster']
**********


### Removing all variations of the word "DUPLICATE" from title

In [78]:
# Case-insensitive match for a "[duplicate]" marker; whitespace is allowed
# just inside either bracket.
test_dup_reg=re.compile(r"\[\s*duplicate\s*\]", re.IGNORECASE)

# Sample titles exercised by the demo loop of this cell.
testing_dup_title_string=[
    "abcd [  duplicaTe] fghi j ",
    "[   duplicaTE       ]",
    "[duplicaTE]",
    "abcdef [duplicaTE] dupl sis os",
    "skjsis9",
    "Is null harmful? [Duplicate]",
]

def rem_dup(text):
    """Return ``text`` with every ``[duplicate]`` marker replaced by one space."""
    return re.sub(test_dup_reg, " ", text)

# For each sample title: show the title, the markers the regex finds in it,
# and the title after the markers have been replaced.
for curr_tc in testing_dup_title_string:
    matches=test_dup_reg.findall(curr_tc)
    print(curr_tc)
    print(matches)
    print("Rem part is ", rem_dup(curr_tc))
    print("##########")

abcd [  duplicaTe] fghi j 
['[  duplicaTe]']
Rem part is  abcd   fghi j 
##########
[   duplicaTE       ]
['[   duplicaTE       ]']
Rem part is   
##########
[duplicaTE]
['[duplicaTE]']
Rem part is   
##########
abcdef [duplicaTE] dupl sis os
['[duplicaTE]']
Rem part is  abcdef   dupl sis os
##########
skjsis9
[]
Rem part is  skjsis9
##########
Is null harmful? [Duplicate]
['[Duplicate]']
Rem part is  Is null harmful?  
##########


### Removing "code"/blockquote tag and the content inside it

In [79]:
# A "<code" opener, then (lazily) any characters including newlines, up to
# the nearest "</code>".
code_paired_refs_regex=r"<code((.|\n)*?)<\/code>"
code_paired_refs_pattern=re.compile(code_paired_refs_regex)

def rem_code(text):
    """Replace each ``<code ...</code>`` span of ``text`` (tags and content) with one space."""
    return re.sub(code_paired_refs_pattern, " ", text)

##### test code block removal

In [80]:
# Sample inputs for rem_code: a full question body (written with backslash
# line continuations inside the string literal), a lone <code> block, and a
# short string holding two inline <code> spans.
tc=["<p>I'm new to C# and I want to use a track-bar to change a form's opacity\
            .</p>\n\n<p>This is my code:</p>\n\n<pre>\
        <code>decimal trans = trackBar1.Value / 5000;\nthis.Opacity = trans;\n</code>\
                </pre>\n\n<p>When I try to build it, I get this error:</p>\n\n<blockquote>\n  <p>Cannot implicitly convert type 'decimal' to 'double'</p>\n</blockquote>\n\n<p>I tried making\ <code>trans</code> \
    a double, but then the control doesn't work. This code worked fine for me in VB.NET. </p>\n\n<p>What do I need to do differently?</p>\n",
    "<code>decimal trans = trackBar1.Value / 5000;\nthis.Opacity = trans;\n</code>",
    "Homelander <code> Hi Hi </code> Bi <code>Hello there</code>"
   ]

In [81]:
# Print each sample, a separator, and then the sample with its <code> spans removed.
for curr_tc in tc:
    print(curr_tc)
    print(":::::::::")
    print(rem_code(curr_tc))
    print("##################################")

<p>I'm new to C# and I want to use a track-bar to change a form's opacity            .</p>

<p>This is my code:</p>

<pre>        <code>decimal trans = trackBar1.Value / 5000;
this.Opacity = trans;
</code>                </pre>

<p>When I try to build it, I get this error:</p>

<blockquote>
  <p>Cannot implicitly convert type 'decimal' to 'double'</p>
</blockquote>

<p>I tried making\ <code>trans</code>     a double, but then the control doesn't work. This code worked fine for me in VB.NET. </p>

<p>What do I need to do differently?</p>

:::::::::
<p>I'm new to C# and I want to use a track-bar to change a form's opacity            .</p>

<p>This is my code:</p>

<pre>                         </pre>

<p>When I try to build it, I get this error:</p>

<blockquote>
  <p>Cannot implicitly convert type 'decimal' to 'double'</p>
</blockquote>

<p>I tried making\       a double, but then the control doesn't work. This code worked fine for me in VB.NET. </p>

<p>What do I need to do differently

##### -----------

In [82]:
# Case-insensitive URL matcher. A match starts with "http://" / "https://",
# a "www" (optionally followed by up to three digits) and a dot, or a bare
# "name.tld/" prefix; balanced parentheses inside the URL are allowed and
# the match may not end on common punctuation.
url_regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
def fetch_urls(curr_str):
    """Return the URLs found in ``curr_str`` in order of appearance (``[]`` when there are none)."""
    # Group 1 is the outermost capture of the pattern, i.e. the complete URL.
    return [url_match.group(1) for url_match in re.finditer(url_regex, curr_str)]

In [83]:
# A "<blockquote" opener, then (lazily) any characters including newlines,
# up to the nearest "</blockquote>".
blockquote_paired_refs_regex=r"<blockquote((.|\n)*?)<\/blockquote>"
blockquote_paired_refs_pattern=re.compile(blockquote_paired_refs_regex)

def rem_blockquote(text):
    """Replace blockquotes that contain a URL with a single space.

    The URL check is made separately for each ``<blockquote ...</blockquote>``
    span: a span in which ``fetch_urls`` finds at least one URL is replaced
    (tags and content) by ``" "``, while a span without any URL is kept
    untouched. A URL that appears elsewhere in ``text`` therefore no longer
    causes link-free blockquotes (for instance quoted error messages) to be
    dropped.

    Args:
        text: the HTML string to clean.

    Returns:
        ``text`` with the URL-bearing blockquotes replaced; unchanged when it
        holds no blockquote or none of them contains a URL.
    """
    def _drop_if_linked(match):
        # Decide on the matched blockquote only, not on the whole text.
        quoted=match.group(0)
        return " " if len(fetch_urls(quoted))>0 else quoted

    return blockquote_paired_refs_pattern.sub(_drop_if_linked, text)

In [84]:
# Sample inputs for rem_blockquote: the first and third blockquotes contain
# links, the second one contains only an error message and no link.
tc=['<blockquote>\n  <h2>Duplicate</h2>\n  \n  <p><a href=\"http://stackoverflow.com/questions/403539/what-are-extension-methods\">What are Extension Methods?</a><br />\n  <a href=\"http://stackoverflow.com/questions/403619/usage-of-extension-methods\">Usage of Extension Methods</a><br />\n  <a href=\"http://stackoverflow.com/questions/487904/what-advantages-of-extension-methods-have-you-found\">What Advantages of Extension Methods have you found?</a>  </p>\n</blockquote>\n',
   "<blockquote>\n  <p>Cannot implicitly convert type 'decimal' to 'double'</p>\n</blockquote>",
   "<blockquote>\n  <p>Duplicate:\n  <a href=\"http://stackoverflow.com/questions/163434/are-nulls-in-a-relational-database-okay\">http://stackoverflow.com/questions/163434/are-nulls-in-a-relational-database-okay</a></p>\n</blockquote>\n"]
# Print each sample, a separator, and then what rem_blockquote returns for it.
for curr_tc in tc:
    print(curr_tc)
    print(":::::::::")
    print("ans is : ",rem_blockquote(curr_tc))
    print("##################################")


<blockquote>
  <h2>Duplicate</h2>
  
  <p><a href="http://stackoverflow.com/questions/403539/what-are-extension-methods">What are Extension Methods?</a><br />
  <a href="http://stackoverflow.com/questions/403619/usage-of-extension-methods">Usage of Extension Methods</a><br />
  <a href="http://stackoverflow.com/questions/487904/what-advantages-of-extension-methods-have-you-found">What Advantages of Extension Methods have you found?</a>  </p>
</blockquote>

:::::::::
ans is :   

##################################
<blockquote>
  <p>Cannot implicitly convert type 'decimal' to 'double'</p>
</blockquote>
:::::::::
ans is :  <blockquote>
  <p>Cannot implicitly convert type 'decimal' to 'double'</p>
</blockquote>
##################################
<blockquote>
  <p>Duplicate:
  <a href="http://stackoverflow.com/questions/163434/are-nulls-in-a-relational-database-okay">http://stackoverflow.com/questions/163434/are-nulls-in-a-relational-database-okay</a></p>
</blockquote>

:::::::::
ans is :  

## =========================================================

In [85]:
def fetch_question_satisfying_a_condition(fn):
    """Return a deep copy of the first question post for which ``fn(post)`` is truthy.

    Files ``post_<id>.json`` under ``PREFIX_FILE_PATH`` are read one by one for
    ids ``lb_file_idx`` .. ``ub_file_idx`` (inclusive); posts whose
    ``"PostTypeId"`` is not ``"1"`` are skipped. Scanning stops at the first
    hit. ``None`` is returned when no post matches.
    """
    for file_id in range(lb_file_idx, ub_file_idx+1):
        print("Starting file with id: ", file_id)
        with open(PREFIX_FILE_PATH+f"/post_{file_id}.json",'r') as fd:
            posts=json.load(fd)
        for post in posts.values():
            # we do not want to process answers
            if post["PostTypeId"]!="1":
                continue
            if fn(post):
                found=deepcopy(post)
                print("Found")
                return found
    return None

In [86]:
def code_tag_in_body(curr_obj):
    """Tell whether ``curr_obj['Body']`` contains at least one ``<code ...</code>`` span."""
    return code_paired_refs_pattern.search(curr_obj['Body']) is not None

In [87]:
def blockquote_tag_in_body(curr_obj):
    """Tell whether ``curr_obj['Body']`` contains at least one ``<blockquote ...</blockquote>`` span."""
    return blockquote_paired_refs_pattern.search(curr_obj['Body']) is not None

In [88]:
def fetch_question_id_object(q_id):
    """Return a deep copy of the question post whose ``'Id'`` equals ``q_id``.

    The scan itself (file range, skipping posts whose ``"PostTypeId"`` is not
    ``"1"``, progress prints, stopping at the first hit) is delegated to
    ``fetch_question_satisfying_a_condition`` instead of being repeated here.

    Args:
        q_id: the post id, as a string; it is compared with ``==`` against
            each post's ``'Id'`` value.

    Returns:
        A deep copy of the matching post dict, or ``None`` when no question
        with that id is found.

    Raises:
        AssertionError: if ``q_id`` is not a ``str``.
    """
    assert isinstance(q_id, str)
    return fetch_question_satisfying_a_condition(lambda post: post['Id']==q_id)

## ===============================================

In [89]:
# Look up the first question whose body contains a <code> span; the returned post is the cell's output.
fetch_question_satisfying_a_condition(code_tag_in_body)

Starting file with id:  0
Found


{'AcceptedAnswerId': '7',
 'AnswerCount': '13',
 'Body': "<p>I'm new to C# and I want to use a track-bar to change a form's opacity.</p>\n\n<p>This is my code:</p>\n\n<pre><code>decimal trans = trackBar1.Value / 5000;\nthis.Opacity = trans;\n</code></pre>\n\n<p>When I try to build it, I get this error:</p>\n\n<blockquote>\n  <p>Cannot implicitly convert type 'decimal' to 'double'</p>\n</blockquote>\n\n<p>I tried making <code>trans</code> a double, but then the control doesn't work. This code worked fine for me in VB.NET. </p>\n\n<p>What do I need to do differently?</p>\n",
 'CommentCount': '19',
 'CreationDate': '2008-07-31T21:42:52.667',
 'FavoriteCount': '14',
 'Id': '4',
 'LastActivityDate': '2012-07-24T16:45:40.937',
 'LastEditDate': '2012-05-04T08:55:46.677',
 'LastEditorDisplayName': 'Rich B',
 'LastEditorUserId': '1039608',
 'OwnerUserId': '8',
 'PostTypeId': '1',
 'Score': '139',
 'Tags': '<c#><winforms><forms><type-conversion><opacity>',
 'Title': "When setting a form's opacit

In [90]:
# Look up the first question whose body contains a <blockquote> span; the returned post is the cell's output.
fetch_question_satisfying_a_condition(blockquote_tag_in_body)

Starting file with id:  0
Found


{'AcceptedAnswerId': '7',
 'AnswerCount': '13',
 'Body': "<p>I'm new to C# and I want to use a track-bar to change a form's opacity.</p>\n\n<p>This is my code:</p>\n\n<pre><code>decimal trans = trackBar1.Value / 5000;\nthis.Opacity = trans;\n</code></pre>\n\n<p>When I try to build it, I get this error:</p>\n\n<blockquote>\n  <p>Cannot implicitly convert type 'decimal' to 'double'</p>\n</blockquote>\n\n<p>I tried making <code>trans</code> a double, but then the control doesn't work. This code worked fine for me in VB.NET. </p>\n\n<p>What do I need to do differently?</p>\n",
 'CommentCount': '19',
 'CreationDate': '2008-07-31T21:42:52.667',
 'FavoriteCount': '14',
 'Id': '4',
 'LastActivityDate': '2012-07-24T16:45:40.937',
 'LastEditDate': '2012-05-04T08:55:46.677',
 'LastEditorDisplayName': 'Rich B',
 'LastEditorUserId': '1039608',
 'OwnerUserId': '8',
 'PostTypeId': '1',
 'Score': '139',
 'Tags': '<c#><winforms><forms><type-conversion><opacity>',
 'Title': "When setting a form's opacit

In [91]:
# Fetch the question with id "777711" (the id is passed as a string, as the function asserts).
fetch_question_id_object("777711")

Starting file with id:  0
Starting file with id:  1
Starting file with id:  2
Starting file with id:  3
Starting file with id:  4
Starting file with id:  5
Found


{'AcceptedAnswerId': '777721',
 'AnswerCount': '4',
 'Body': '<blockquote>\n  <p>Duplicate:\n  <a href="http://stackoverflow.com/questions/163434/are-nulls-in-a-relational-database-okay">http://stackoverflow.com/questions/163434/are-nulls-in-a-relational-database-okay</a></p>\n</blockquote>\n\n<p>I dodged a heated debate concerning nulls in the database today.\nMy opinion is that null is an excellent indicator of unspecified values. Every one else in the team, that has an opinion, thinks zero and empty strings are the way to go.</p>\n\n<p>Are they lazy or am I to strict?</p>\n',
 'CommentCount': '1',
 'CreationDate': '2009-04-22T15:07:54.137',
 'Id': '777711',
 'LastActivityDate': '2009-04-22T15:11:00.667',
 'LastEditDate': '2009-04-22T15:10:22.710',
 'LastEditorDisplayName': '',
 'LastEditorUserId': '44389',
 'OwnerUserId': '21761',
 'PostTypeId': '1',
 'Score': '0',
 'Tags': '<asp.net><sql><database><null>',
 'Title': 'Is null harmful? [Duplicate]',
 'ViewCount': '175'}

#### Just investigate the PostTypeIDs present (DEV)

In [92]:
def check_post_types(posts=None):
    """Print the ``'PostTypeId'`` of every post whose type is not ``'1'``.

    Args:
        posts: mapping of post key -> post dict to inspect. When omitted, the
            notebook-level ``df`` is used, which must already have been
            assigned by a cell that loaded a post file; otherwise a
            ``NameError`` is raised.
    """
    if posts is None:
        # Backward-compatible fallback to the previously implicit global.
        posts=df
    for curr_val in posts.values():
        if curr_val['PostTypeId']!='1':
            print(curr_val['PostTypeId'])

## ========================================

### Now, first find a duplicate and verify Gurkirat's claim of just 2 duplicates using the paper's method

In [93]:
def find_dups_in_title(lb_file_idx=0, ub_file_idx=102):
    """Print every question whose title carries a ``[duplicate]`` marker.

    Files ``post_<id>.json`` under ``PREFIX_FILE_PATH`` are read for ids
    ``lb_file_idx`` .. ``ub_file_idx`` (inclusive). Posts whose
    ``"PostTypeId"`` is not ``"1"`` are skipped. For each remaining post
    whose ``'Title'`` matches ``test_dup_reg``, the post key, the post itself
    and a separator line are printed.

    Args:
        lb_file_idx: first file id to read (defaults to the previously
            hard-coded 0).
        ub_file_idx: last file id to read, inclusive (defaults to the
            previously hard-coded 102).
    """
    for file_id in range(lb_file_idx, ub_file_idx+1):
        print("Starting file with id: ", file_id)
        with open(PREFIX_FILE_PATH+f"/post_{file_id}.json",'r') as fd:
            posts=json.load(fd)
        for curr_key, curr_val in posts.items():
            # we do not want to process answers
            if curr_val["PostTypeId"]!="1":
                continue
            # Uses test_dup_reg: the name referenced here before (`test_reg`)
            # is not defined in the visible notebook and raised NameError.
            if test_dup_reg.search(curr_val['Title']):
                print(curr_key)
                print(curr_val)
                print("$$$$$$$$$$$$$$$$$$")

In [94]:
#find_dups_in_title()

## ================================================

## Run loop

In [95]:
def clean_post(post_obj):
    """Return a deep copy of ``post_obj`` extended with cleaned fields.

    Added keys:
        ``'cleaned_body'``: ``post_obj['Body']`` passed, in this order,
            through ``rem_blockquote``, ``rem_code``,
            ``clean_tags_prevent_content`` and ``clean_newline_stuff``.
        ``'cleaned_title'``: ``post_obj['Title']`` passed through ``rem_dup``.
        ``'tags_list'``: ``fetch_tags_list(post_obj['Tags'])``, or ``[]`` when
            the tags are missing or not a string (the offending post is
            printed in that case).

    Args:
        post_obj: post dict; ``'Body'`` and ``'Title'`` must be present.

    Returns:
        The new dict; ``post_obj`` itself is not modified.
    """
    new_val=deepcopy(post_obj)

    # Blockquotes and code spans are removed before the generic tag stripper
    # runs, because both rely on their tags still being present.
    cleaned_body=rem_blockquote(new_val['Body'])
    cleaned_body=rem_code(cleaned_body)
    cleaned_body=clean_tags_prevent_content(cleaned_body)
    new_val['cleaned_body']=clean_newline_stuff(cleaned_body)

    new_val['cleaned_title']=rem_dup(new_val['Title'])

    # Vectorisation is currently disabled:
    #new_val['body_vec']=preprocess_object.parse_string(new_val['cleaned_body'])
    #new_val['title_vec']=preprocess_object.parse_string(new_val['cleaned_title'])

    try:
        new_val['tags_list']=fetch_tags_list(new_val['Tags'])
    except (KeyError, AttributeError, TypeError):
        # Missing 'Tags' key, or a value that is not a string: fall back to
        # an empty list and show the post that triggered it. The argument is
        # printed (not `curr_val`, which is not a name of this function).
        new_val['tags_list']=[]
        print(post_obj)
    return new_val

### Initial test

In [96]:
# Four sample questions for a first end-to-end check of clean_post: one found
# via the <code> predicate, one via the <blockquote> predicate, and two
# fetched by explicit id.
test_arr=[fetch_question_satisfying_a_condition(code_tag_in_body),
         fetch_question_satisfying_a_condition(blockquote_tag_in_body),
          fetch_question_id_object('777711'),
          fetch_question_id_object('783926')
                                               ]

Starting file with id:  0
Found
Starting file with id:  0
Found
Starting file with id:  0
Starting file with id:  1
Starting file with id:  2
Starting file with id:  3
Starting file with id:  4
Starting file with id:  5
Found
Starting file with id:  0
Starting file with id:  1
Starting file with id:  2
Starting file with id:  3
Starting file with id:  4
Starting file with id:  5
Found


In [97]:
# Pair every sample post with its cleaned version and save the pairs as JSON
# for manual inspection.
test_ans=[]
for curr_tc in test_arr:
    curr_d={'input': deepcopy(curr_tc), 'output': clean_post(curr_tc)}
    test_ans.append(curr_d)
with open("sample_of_cleanings.json",'w') as fd:
    json.dump(test_ans, fd, indent=4)

In [98]:
------------------

SyntaxError: invalid syntax (<ipython-input-98-2644e9e677df>, line 1)

### Final loop

In [None]:

# Make sure the destination directory exists before the first file is written.
os.makedirs(PREFIX_OUTPUT_FILE_PATH, exist_ok=True)

# Clean every question of every input file and write one output file per input file.
for file_id in range(lb_file_idx, ub_file_idx+1):
    print("Starting file with id: ", file_id)
    with open(PREFIX_FILE_PATH+f"/post_{file_id}.json",'r') as fd:
        df=json.load(fd)
    new_df=dict()
    for curr_key, curr_val in df.items():
        # we do not want to process answers
        if curr_val["PostTypeId"]!="1":
            continue

        new_df[curr_key]=clean_post(curr_val)
    print("Finished ", file_id)
    # The former print of `potential_dups_yet` was dropped: that name is never
    # assigned at notebook level, so the loop raised NameError after the first
    # file and before anything was written.
    print("-----------")
    with open(PREFIX_OUTPUT_FILE_PATH+f"/post_{file_id}.json",'w') as fd:
        json.dump(new_df, fd, indent=1)