In [1]:
import os
from tqdm import tqdm
import peewee

from fyp.crypto import Crypto


In [2]:
# Shared Crypto helper: load_db/unload_db call its age_decrypt_file /
# age_encrypt_file methods on the database files.
crypto = Crypto()


In [3]:
# Directory that holds encrypted_<name>.db / decrypted_<name>.db.
# Set the FYP_BASE_DIR environment variable to run the notebook on another
# machine; when it is unset the original location is used unchanged.
# Joining with "" guarantees a trailing separator, which the plain string
# concatenation used for the file paths relies on.
base = os.path.join(
    os.environ.get("FYP_BASE_DIR", '/its/home/ep396/Documents/FYP/'), ""
)
# Dataset name that is interpolated into both database file names.
name = "dataset"


In [4]:
def load_db(name, base):
    """Decrypt ``encrypted_<name>.db`` into ``decrypted_<name>.db``.

    Args:
        name: Dataset name interpolated into both file names.
        base: Directory containing the database files. A trailing path
            separator is optional.

    Decryption is delegated to the notebook-level ``crypto`` object via
    ``crypto.age_decrypt_file(encrypted_path, decrypted_path)``.
    """
    # os.path.join yields the same paths as the old concatenation when
    # ``base`` ends with a separator, and also works when it does not.
    encrypted_path = os.path.join(base, f"encrypted_{name}.db")
    decrypted_path = os.path.join(base, f"decrypted_{name}.db")
    crypto.age_decrypt_file(encrypted_path, decrypted_path)


In [5]:
def unload_db(name, base):
    """Re-encrypt ``decrypted_<name>.db`` and delete the decrypted copy.

    Args:
        name: Dataset name interpolated into both file names.
        base: Directory containing the database files. A trailing path
            separator is optional.

    Encryption is delegated to the notebook-level ``crypto`` object via
    ``crypto.age_encrypt_file(decrypted_path, encrypted_path)``.
    """
    # os.path.join yields the same paths as the old concatenation when
    # ``base`` ends with a separator, and also works when it does not.
    encrypted_path = os.path.join(base, f"encrypted_{name}.db")
    decrypted_path = os.path.join(base, f"decrypted_{name}.db")
    crypto.age_encrypt_file(decrypted_path, encrypted_path)

    # Runs only after the encrypt call returned; if that call raises, the
    # decrypted file is left in place.
    os.remove(decrypted_path)


In [6]:
# Decrypt the database to decrypted_<name>.db before querying.
# NOTE(review): presumably this is the file the fyp.db_dataset models read — confirm.
load_db(name, base)


In [7]:
from fyp.db_dataset import Tweet, ReferencedTweet


In [8]:
# Aggregate query: one row per author_id, with the tweet count exposed as `ct`.
query = Tweet.select(
    Tweet.author_id,
    peewee.fn.COUNT(Tweet.tweet_id).alias('ct'),
).group_by(Tweet.author_id)


In [9]:
# Materialise the aggregate rows as (author_id, tweet_count) tuples.
counts_by_user = [(row.author_id, row.ct) for row in tqdm(query)]


100%|██████████| 14300/14300 [00:00<00:00, 1142405.00it/s]


In [10]:
# Keep only the authors that have at least 50 tweets.
filtered_counts_by_user = [pair for pair in counts_by_user if pair[1] >= 50]
# Author ids of the surviving users, in the same order.
users_to_keep = [pair[0] for pair in filtered_counts_by_user]

# Displayed as the cell result: number of users kept.
len(filtered_counts_by_user)


7281

In [11]:
# DESTRUCTIVE: builds a DELETE over every Tweet row whose author_id is NOT in
# users_to_keep (`<<` is peewee's IN operator, `~` negates the condition).
# The name holds the delete query itself, not the rows.
# NOTE(review): the IN list carries one value per kept user — confirm this
# stays within the database's bound-parameter limit for larger datasets.
relations_to_delete = Tweet.delete().where(~(Tweet.author_id << users_to_keep))
# Last expression of the cell, so execute()'s return value is displayed
# (presumably the number of rows removed — confirm against the peewee docs).
relations_to_delete.execute()


142267

In [12]:
# Rebuild the per-author count query (tweet count exposed as `ct`) so it can be
# evaluated again now that rows have been deleted.
query = Tweet.select(
    Tweet.author_id,
    peewee.fn.COUNT(Tweet.tweet_id).alias('ct'),
).group_by(Tweet.author_id)

In [13]:
# (author_id, tweet_count) tuples for the rows returned by the current query.
new_counts_by_user = [(row.author_id, row.ct) for row in tqdm(query)]


100%|██████████| 7281/7281 [00:00<00:00, 1183625.73it/s]


In [14]:
# Percentage of tweets lost: 100 minus (tweets in new_counts_by_user as a
# share of tweets in counts_by_user).
"reduced by %s" % (
    100
    - (
        sum(count for _author, count in new_counts_by_user)
        / sum(count for _author, count in counts_by_user)
    )
    * 100
)


'reduced by 7.059135836773578'

In [15]:
from fyp.db_dataset import DataSplit


In [16]:
from sklearn.model_selection import train_test_split


In [17]:
# One fresh, empty tweet-id list per remaining author.
user_tweet_dict = {author_id: [] for author_id, _count in new_counts_by_user}


In [18]:
# Only the two columns needed to group tweet ids by author.
query = Tweet.select(Tweet.tweet_id, Tweet.author_id)

# Appends in place: re-running this cell without first rebuilding
# user_tweet_dict adds every tweet id a second time.
for row in tqdm(query):
    author_tweets = user_tweet_dict[row.author_id]
    author_tweets.append(row.tweet_id)


100%|██████████| 1873093/1873093 [00:02<00:00, 688351.43it/s]


In [19]:
# Fixed seed so the shuffled train/test assignment is the same on every run
# for the same input lists.
# NOTE(review): the lists come from a query with no ORDER BY, so full
# reproducibility also assumes a stable row order — confirm.
SPLIT_SEED = 42

# (tweet_id, split_type) rows; split_type 0 = train, 1 = test.
inserts = []

for tweets in tqdm(user_tweet_dict.values()):
    # Split per user so each author contributes ~70% of their tweets to
    # train and the remainder to test.
    train_ids, test_ids = train_test_split(
        tweets, train_size=0.7, shuffle=True, random_state=SPLIT_SEED
    )
    inserts.extend((tweet_id, 0) for tweet_id in train_ids)
    inserts.extend((tweet_id, 1) for tweet_id in test_ids)


100%|██████████| 7281/7281 [00:02<00:00, 3543.70it/s]


In [20]:
# Cut `inserts` into n contiguous batches whose sizes differ by at most one:
# the first m batches get k + 1 rows, the rest get k.
n = 50
k, m = divmod(len(inserts), n)
# bounds[i] is the start index of batch i; bounds[n] == len(inserts).
bounds = [i * k + min(i, m) for i in range(n + 1)]
insert_splited = [inserts[lo:hi] for lo, hi in zip(bounds, bounds[1:])]


In [21]:
# Columns that each (tweet_id, split_type) tuple maps onto, in order.
split_fields = [DataSplit.tweet_original_id, DataSplit.split_type]

# Run every batch inside one transaction: if any insert fails, the whole
# write is rolled back instead of leaving DataSplit partially populated.
with DataSplit._meta.database.atomic():
    for batch in tqdm(insert_splited):
        DataSplit.insert_many(batch, fields=split_fields).execute()


100%|██████████| 50/50 [07:40<00:00,  9.21s/it]


In [22]:
# Row count per split_type, exposed as `ct`.
query = DataSplit.select(
    DataSplit.split_type,
    peewee.fn.COUNT(DataSplit.tweet_original_id).alias('ct'),
).group_by(DataSplit.split_type)


In [23]:
# The denominator does not change between rows, so compute it once rather
# than re-summing the whole list on every iteration.
total_tweets = sum(count for _author, count in new_counts_by_user)

for row in query:
    # split_type followed by the number of rows in that split.
    print(row.split_type, row.ct)
    # That split's share of all tweets, as a percentage.
    print((row.ct / total_tweets) * 100)


0 1307801
69.82039866680405
1 565292
30.179601333195947


In [24]:
# Total tweet count across all users in new_counts_by_user.
sum(count for _author, count in new_counts_by_user)

1873093

In [25]:
# Re-encrypt the database and remove the decrypted copy from disk.
unload_db(name, base)
