# create_dataset.py
# Builds an aggregated StackExchange text dataset from exported query results.
import pandas as pd
import numpy as np
import re
import csv
import os
'''
Read files from data/QueryResults_XY.csv and aggregate
'''
def files(path):
    """Yield the names of regular files found directly inside *path*.

    Directories and other non-file entries are skipped; the scan is
    not recursive.
    """
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if not os.path.isfile(full_path):
            continue
        yield entry
# Aggregate every data/QueryResults_*.csv into one frame, then split the
# columns into title / post / comment rows sharing a unified schema:
# (post_id, parent_id, comment_id, text, category).
path = "../data/"

# Collect matching frames and concatenate once at the end: calling
# pd.concat inside the loop re-copies the accumulated frame on every
# iteration (quadratic in total rows).
frames = []
for file in files(path):
    if "QueryResults_" in file:
        part = pd.read_csv(os.path.join(path, file))
        frames.append(part)
        print(os.path.join(path, file), part.shape)
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# One row per unique (PostId, Title); parent/comment ids do not apply to titles.
titles = df[['PostId', 'ParentId', 'CommentId', 'Title']].drop_duplicates(subset=['PostId', 'Title']).dropna(subset=['PostId', 'Title']).reset_index(drop=True).copy()
titles['category'] = 'title'
titles.rename(columns={'PostId': 'post_id', 'ParentId': 'parent_id', 'CommentId': 'comment_id', 'Title': 'text'}, inplace=True)
titles['comment_id'] = None
titles['parent_id'] = None

# One row per unique post body; a comment id does not apply to posts.
posts = df[['PostId', 'ParentId', 'CommentId', 'Body']].drop_duplicates(subset=['PostId', 'ParentId', 'Body']).dropna(subset=['PostId', 'Body']).reset_index(drop=True)
posts['category'] = 'post'
posts.rename(columns={'PostId': 'post_id', 'ParentId': 'parent_id', 'CommentId': 'comment_id', 'Body': 'text'}, inplace=True)
posts['comment_id'] = None

# One row per unique comment; the parent id is cleared for comment rows.
comments = df[['PostId', 'ParentId', 'CommentId', 'Text']].drop_duplicates(subset=['PostId', 'CommentId', 'Text']).dropna(subset=['PostId', 'CommentId', 'Text']).reset_index(drop=True)
comments['category'] = 'comment'
comments.rename(columns={'PostId': 'post_id', 'ParentId': 'parent_id', 'CommentId': 'comment_id', 'Text': 'text'}, inplace=True)
comments['parent_id'] = None

data = pd.concat([titles, posts, comments], sort=False)
# NOTE(review): this CSV is written before the regex clean-up that appears
# later in the file, so the saved text is raw/uncleaned — confirm intended.
data.to_csv('../data/stackexchange_812k.csv', quoting=csv.QUOTE_ALL, index=False)
# ------------------------------------------------------------------------------
'''
clean up with regex
'''
# Patterns are raw strings: "\S" in a plain literal is an invalid escape
# sequence (SyntaxWarning since Python 3.12, slated to become an error).
# rm html tags
data['text'] = data.text.apply(lambda t: re.sub(r"<[^>]*>", ' ', t))
# rm line returns
data['text'] = data.text.apply(lambda t: re.sub(r"[\r\n]+", ' ', t))
# rm urls
data['text'] = data.text.apply(lambda t: re.sub(r"http\S+", ' ', t))
# strip leading / trailing whitespace
data['text'] = data.text.apply(lambda t: t.strip())
# ---------------------------------------------------------------------------
# Version 01
# ---------------------------------------------------------------------------
df = pd.read_csv('../data/StatsExchange.csv')
'''
Flatten dataset
'''
df.rename(columns={'Title': 'title', 'Body': 'body', 'Text': 'text'}, inplace=True)
# title, text and body kept as separate items, deduplicated independently.
title = df.title.unique()
body = df.body.unique()
text = df.text.unique()
data = pd.DataFrame(np.concatenate((text, body, title)), columns=['fulltext']).dropna()
'''
Alternative data aggregation
Aggregate title + Body + multiple text
'''
# aggregate all comment texts belonging to the same post into one string
# NOTE(review): ','.join raises TypeError if any 'text' value is NaN —
# presumably the export has no missing comment text; confirm upstream.
post = df.groupby(by='PostId')['text'].apply(lambda x: ','.join(x)).reset_index()
# merge unique title / body per post with the aggregated texts
data = df.drop_duplicates(subset='PostId')[['PostId', 'title', 'body']].merge(post, on='PostId')
data.fillna('', inplace=True)
data['fulltext'] = data.apply(lambda d: ' '.join([d.title, d.body, d.text]), axis=1)
# remove html tags, line returns and urls; raw strings avoid the invalid
# "\S" escape sequence (SyntaxWarning in Python 3.12+).
data['fulltext'] = data.fulltext.apply(lambda t: re.sub(r"<[^>]*>", ' ', t))
data['fulltext'] = data.fulltext.apply(lambda t: re.sub(r"[\r\n]+", ' ', t))
data['fulltext'] = data.fulltext.apply(lambda t: re.sub(r"http\S+", ' ', t))
data['fulltext'] = data.fulltext.apply(lambda t: t.strip())
# keep only rows with meaningful text (more than 2 characters), shortest first
data['len_text'] = data.fulltext.apply(len)
data.sort_values(by='len_text', inplace=True)
data = data[data.len_text > 2]
data.to_csv('../data/fulltext_71k.csv', quoting=csv.QUOTE_ALL, index=False)
# remove latex : $\\overline{x}\\pm \\sigma Z_{\\alpha/2} $
# remove urls
# remove mentions @