-
Notifications
You must be signed in to change notification settings - Fork 0
/
stats_reddit.py
39 lines (27 loc) · 915 Bytes
/
stats_reddit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import os
import pickle
from collections import defaultdict, Counter
from pprint import pprint
import pandas as pd
from tqdm.auto import tqdm
def load_dataset(path):
    """Read the two objects pickled back-to-back in the file at *path*.

    The first unpickled object is returned as ``X`` and the second as ``y``.
    The rest of this script iterates them in lockstep as (text, label) pairs.

    NOTE: ``pickle.load`` can execute arbitrary code while deserializing.
    Only point this at dataset files you produced yourself or fully trust.

    Raises:
        OSError: if the file cannot be opened.
        pickle.UnpicklingError / EOFError: if the file does not contain two
            valid pickled objects.
    """
    with open(path, mode='rb') as inputfile:
        X = pickle.load(inputfile)
        y = pickle.load(inputfile)
    return X, y


def count_tokens_by_label(pairs):
    """Sum whitespace-separated token counts per label.

    Args:
        pairs: iterable of ``(text, label)`` tuples. ``text`` must provide a
            ``split()`` method (e.g. ``str``); ``label`` must be hashable.

    Returns:
        A ``defaultdict(int)`` mapping each label seen to the total number of
        tokens over all of its texts. A label whose texts are all empty is
        still present, with a count of 0.
    """
    totals = defaultdict(int)
    for text, label in pairs:
        totals[label] += len(text.split())
    return totals


if __name__ == '__main__':
    # The same statistics are printed for every dataset, in this order.
    for dataset_file in ("data/reddit.pkl", "data/redditEN.pkl"):
        X, y = load_dataset(dataset_file)

        # tqdm only wraps the iteration to show progress; it does not alter
        # the (text, label) pairs handed to the counter.
        token_counts = count_tokens_by_label(tqdm(zip(X, y)))
        label_counts = Counter(y)

        pprint(label_counts)
        pprint(token_counts)