In [1]:
import pandas as pd
import os
import re

path = '../../data/submissions/raw/2019/2019-1.csv'



In [10]:
def clean_data(path, year, month):
    # Read CSV
    submissions = pd.read_csv(path, index_col = 0)
    print(len(submissions))
    # Drop unnecessary columns
    submissions = submissions.drop(columns = ["is_image", "author", "url", "num_comments", "num_upvotes"], axis = 1)

    # Remove rows with null bodies
    submissions = submissions.dropna()
    print(len(submissions))
    # Remove rows with [deleted] posts
    submissions = submissions[submissions.body != "[deleted]"]
    print(len(submissions))
    # Remove rows with [\n] bodies
    submissions = submissions[submissions.body != " \n"]
    print(len(submissions))
    # Remove rows with [removed] bodies
    submissions = submissions[submissions.body != "[removed]"]
    print(len(submissions))
    # Replace the \n special character with a space
    submissions = submissions.replace(r'\n', ' ', regex=True)
    
    # Replace links (Tested, too noisy)
    submissions['body'] = submissions['body'].map(lambda x: re.sub(r'.*http\S+', '', x))
    
    # Remove all non alphanumeric characters
    submissions['body'] = submissions['body'].map(lambda x: re.sub('[^0-9a-zA-Z \-\']+', ' ', x))
    
    # Lowercase all words
    submissions['body'] = submissions['body'].map(lambda x: x.lower())
    
    # Reset dataframe index
    submissions = submissions.reset_index(drop = True)
    
    # Save as CSV
    path = "../data/submissions/filtered/" + str(year) + "/" + str(year) + "-" + str(month) + ".csv"
    submissions.to_csv(path)
# clean_data(path, 2019, 1)

In [12]:
def clean_loop():
    for year in range(2014, 2023):
        if year == 2022:
            for month in range(1, 2):
                path = "../data/submissions/raw/" + str(year) + "/" + str(year) + "-" + str(month) + ".csv"
                clean_data(path, year, month)
        else:
            for month in range(1, 13):
                path = "../data/submissions/raw/" + str(year) + "/" + str(year) + "-" + str(month) + ".csv"
                clean_data(path, year, month)
def clean_merged_csv():
    main_file = "../data/submissions/raw/all_submissions.csv"
    clean_data(main_file, 2022, "submission")
clean_loop()

5
1
1
1
1
3
0
0
0
0
2
0
0
0
0
2
0
0
0
0
10
1
1
1
1
2
0
0
0
0
7
0
0
0
0
4
0
0
0
0
4
0
0
0
0
8
0
0
0
0
5
0
0
0
0
7
2
2
2
2
15
1
1
1
1
22
2
2
2
2
28
1
1
1
1
35
2
2
2
2
25
1
1
1
1
36
2
2
2
2
14
2
2
2
2
26
4
4
4
4
28
3
3
3
3
17
0
0
0
0
26
0
0
0
0
16
0
0
0
0
33
4
3
3
3
50
0
0
0
0
48
2
2
2
2
41
2
2
2
2
27
1
1
1
1
26
3
2
2
2
27
0
0
0
0
38
2
2
2
2
34
4
3
3
3
25
3
3
3
3
24
3
2
2
2
29
6
2
2
2
21
4
2
2
2
11
3
3
3
3
7
1
0
0
0
14
2
1
1
1
8
2
1
1
1
13
3
2
2
2
13
2
1
1
1
4
1
1
1
1
2
0
0
0
0
10
7
6
5
4
15
4
4
4
4
10
3
2
2
2
18
6
5
5
5
17
2
0
0
0
18
5
3
2
2
22
9
9
9
8
22
5
5
5
5
19
4
4
3
3
31
4
4
4
4
48
24
24
24
24
55
22
22
21
21
126
73
73
70
65
131
75
74
70
70
141
73
73
71
70
285
179
179
170
170
309
185
185
178
178
390
261
261
255
254
463
308
308
298
298
514
350
349
340
340
526
357
355
348
347
600
400
400
388
387
686
465
464
448
446
742
521
521
512
511
925
650
650
628
626
950
643
641
616
615
1103
796
795
761
758
1162
844
842
812
810
1078
785
784
762
754
1108
800
794
772
768
905
587
565
545
538
916
659


In [9]:
def merge_csv(csv1, csv2, name):
    a = pd.read_csv(csv1, index_col = 0)
    b = pd.read_csv(csv2, index_col = 0)
    merged = pd.concat([a, b], axis = 0, ignore_index = True)
    merged.to_csv(name)
    
def merge_all_filtered():
    main_file = "../data/submissions/filtered/all_submissions.csv"
    for year in range(2014, 2023):
        if year == 2022:
            for month in range(1, 2):
                path = "../data/submissions/filtered/" + str(year) + "/" + str(year) + "-" + str(month) + ".csv"
                merge_csv(main_file, path, main_file)
        else:
            for month in range(1, 13):
                path = "../data/submissions/filtered/" + str(year) + "/" + str(year) + "-" + str(month) + ".csv"
                merge_csv(main_file, path, main_file)
def merge_all_unfiltered():
    main_file = "../data/submissions/raw/all_submissions.csv"
    for year in range(2014, 2023):
        if year == 2022:
            for month in range(1, 2):
                path = "../data/submissions/raw/" + str(year) + "/" + str(year) + "-" + str(month) + ".csv"
                merge_csv(main_file, path, main_file)
        else:
            for month in range(1, 13):
                path = "../data/submissions/raw/" + str(year) + "/" + str(year) + "-" + str(month) + ".csv"
                merge_csv(main_file, path, main_file)
merge_all_unfiltered()

  merge_csv(main_file, path, main_file)
  merge_csv(main_file, path, main_file)


In [14]:
def summary_statistics(path):
    df = pd.read_csv(path, index_col = 0)
    max_len = df.body.map(lambda x: len(x.split())).max()
    position = df.body.map(lambda x: len(x.split())).argmax()
    print(df['body'][position])
    min_len = df.body.map(lambda x: len(x.split())).min()
    average_len = count = df['body'].str.split().apply(len).mean()
    return max_len, min_len, average_len
path = "../data/submissions/filtered/all_submissions.csv"
max_len, min_len, average_len = summary_statistics(path)
print("Max string = " + str(max_len))
print("Min string = " + str(min_len))
print("Average Length = " + str(average_len))


gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang gucci gang

Max string = 7074
Min string = 0
Average Length = 157.33600694444445


In [15]:
def add_details_to_ids(ids_path, raw_path):  
    ids = pd.read_csv(ids_path, index_col = 0)
    raw = pd.read_csv(raw_path, index_col = 0)
    merged = pd.merge(ids, raw, left_on="id", right_on="id", how="left")
    merged.to_csv("75th_labels.csv")
ids_path = "../data/submissions/raw/75th_labels.csv"
raw_path = "../data/submissions/raw/all_submissions.csv"
add_details_to_ids(ids_path, raw_path)


  add_details_to_ids(ids_path, raw_path)
