In [9]:
import nltk
from tqdm import tqdm
import os
import html
import re
import contractions
from copy import deepcopy
from nltk import ngrams


from fyp.crypto import Crypto


In [None]:
# https://link.springer.com/content/pdf/10.1007/11892755_87.pdf


In [10]:
# Encryption helper; load_db/unload_db call its age_decrypt_file/age_encrypt_file methods.
crypto = Crypto()
# Directory holding the database files. It is concatenated verbatim with the
# file name, so it must end with "/".
# NOTE(review): absolute, user-specific path — consider making it configurable.
base = '/its/home/ep396/Documents/FYP/'
# Dataset name; the files are "encrypted_<name>.db" and "decrypted_<name>.db".
name = "dataset"


In [11]:
def load_db(name, base):
    """Decrypt ``encrypted_<name>.db`` into ``decrypted_<name>.db``.

    Both paths are built by prefixing ``base`` verbatim, so ``base`` must
    already end with a path separator. The work is delegated to the
    module-level ``crypto`` object.

    Args:
        name: Dataset name used in both file names.
        base: Directory prefix for both files.
    """
    encrypted_path = base + f"encrypted_{name}.db"
    decrypted_path = base + f"decrypted_{name}.db"
    crypto.age_decrypt_file(encrypted_path, decrypted_path)


In [12]:
def unload_db(name, base):
    """Re-encrypt ``decrypted_<name>.db`` and delete the decrypted copy.

    Both paths are built by prefixing ``base`` verbatim, so ``base`` must
    already end with a path separator. The decrypted file is removed only
    after the ``crypto.age_encrypt_file`` call has returned.

    Args:
        name: Dataset name used in both file names.
        base: Directory prefix for both files.
    """
    encrypted_path = base + f"encrypted_{name}.db"
    decrypted_path = base + f"decrypted_{name}.db"

    crypto.age_encrypt_file(decrypted_path, encrypted_path)
    os.remove(decrypted_path)


In [13]:
# Decrypt the dataset to base + "decrypted_<name>.db".
# NOTE(review): presumably this is the file the fyp.db_dataset models read — confirm.
load_db(name, base)


In [14]:
from fyp.db_dataset import Tweet, ReferencedTweet, DataSplit


In [15]:
# Fetch the NLTK resources used by this notebook; per the log below, a package
# that is already up to date is not downloaded again.
nltk.download("stopwords")  # English stop-word list
nltk.download('punkt')      # tokenizer models used by nltk.word_tokenize
nltk.download('wordnet')    # WordNet data for WordNetLemmatizer
nltk.download('omw-1.4')    # Open Multilingual WordNet data


[nltk_data] Downloading package stopwords to
[nltk_data]     /its/home/ep396/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /its/home/ep396/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /its/home/ep396/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /its/home/ep396/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [32]:
def initial_phase(initial_set_s):
    """Count, for every bigram, the number of tweets that contain it.

    Each tweet is cleaned with ``clean_text`` and tokenised with
    ``pre_processing_words`` before its bigrams are extracted. A bigram that
    occurs several times in the same tweet still contributes only 1 for that
    tweet, so the result is a document frequency.

    Args:
        initial_set_s: Mapping of tweet id -> raw tweet text.

    Returns:
        dict mapping each bigram (a 2-tuple of tokens) to the number of tweets
        containing it, with keys in first-seen order.
    """
    frequency = {}

    for tweet_text in initial_set_s.values():
        tokens = pre_processing_words(clean_text(tweet_text))
        # dict.fromkeys de-duplicates the tweet's bigrams while keeping their
        # order, so one pass over the corpus replaces the nested list scans.
        for bigram in dict.fromkeys(nltk.bigrams(tokens)):
            frequency[bigram] = frequency.get(bigram, 0) + 1

    return frequency


In [19]:
# WordNet lemmatizer and English stop-word list.
# NOTE(review): neither name is referenced by any other cell visible in this
# notebook; pre_processing_words currently only tokenises.
wnl = nltk.stem.WordNetLemmatizer()
sw = nltk.corpus.stopwords.words('english')


In [20]:
def pre_processing_words(text):
    """Split ``text`` into tokens with ``nltk.word_tokenize`` and return them."""
    return nltk.word_tokenize(text)


In [25]:
def clean_text(text):
    """Decode HTML entities and strip links and @mentions from a tweet.

    Args:
        text: Raw tweet text.

    Returns:
        The text with HTML entities unescaped, ``http(s)://`` URLs removed and
        ``@handle`` mentions (1-15 word characters) removed.
    """
    # html.unescape returns a new string; the result must be kept, otherwise
    # entities such as "&amp;" survive into tokenisation.
    unescaped = html.unescape(text)
    removed_links = re.sub(r"https?://\S+", '', unescaped)
    # The character immediately before the "@" (if any) is part of the match
    # and is removed together with the mention.
    removed_mentions = re.sub(r"(^|[^@\w])@(\w{1,15})\b", '', removed_links)
    return removed_mentions


In [22]:
# Use the author of the first tweet returned by the query as the initial user.
init_user_author_id = Tweet.select().first().author_id
# Select only the id and text columns of that author's tweets.
init_user_tweets_query = Tweet.select(Tweet.tweet_id, Tweet.text).where(Tweet.author_id == init_user_author_id)
# Map tweet_id -> text; tqdm only reports iteration progress.
init_user_corpus = {tweet.tweet_id:tweet.text for tweet in tqdm(init_user_tweets_query)}


100%|██████████| 1581/1581 [00:00<00:00, 801256.00it/s]


In [23]:
# Preview: 2.5% of the floored mean tweet length.
(sum(map(len, init_user_corpus.values())) // len(init_user_corpus)) * 0.025


6.775

In [30]:
# 2.5% of the floored mean tweet length (same expression as the preview cell).
frequency_threshold = (
    sum(map(len, init_user_corpus.values())) // len(init_user_corpus)
) * 0.025


In [31]:
# Per-bigram tweet counts for the initial user's corpus.
# NOTE(review): no threshold is applied here, so this holds every bigram, not
# only the frequent ones.
frequent_pairs = initial_phase(init_user_corpus)


126.70549838244915


In [None]:
# NOTE(review): import sits mid-notebook; consider moving it to the import cell at the top.
import json
# Pretty-print the (bigram, count) pairs, highest count first. json.dumps
# serialises the tuples as JSON arrays.
print(json.dumps(sorted(frequent_pairs.items(), key=lambda x:x[1], reverse=True), indent=4))


In [None]:
def discovery_phase(original_grams, corpus):
    """Collect maximal sequences by repeatedly expanding and lengthening grams.

    Each round looks at every gram that is not already contained in a
    collected sequence:

    * a gram with frequency > 1 is passed to ``expand`` and the result is
      collected; the gram is dropped when the expansion equals the gram;
    * a gram with frequency <= 1 is dropped.

    The surviving grams are then lengthened with ``form_grams_plus_one`` and
    the loop repeats until no grams remain.

    NOTE(review): ``expand`` is still a stub in this notebook, so the
    collected values are not meaningful until it is implemented.

    Args:
        original_grams: Mapping of gram (tuple of tokens) -> frequency. It is
            copied, so the caller's dict is not modified.
        corpus: Mapping of tweet id -> tweet text, forwarded to
            ``form_grams_plus_one``.

    Returns:
        List of the sequences returned by ``expand``.
    """
    max_list = []
    grams = deepcopy(original_grams)
    while len(grams) > 0:
        # Iterate over a snapshot: entries are popped from ``grams`` inside
        # the loop, and mutating a dict while iterating it raises RuntimeError.
        for g, frequency in list(grams.items()):
            if not_a_subsequence(g, max_list):
                if frequency > 1:
                    # Named ``maximal`` to avoid shadowing the builtin ``max``.
                    maximal = expand(g)
                    max_list.append(maximal)
                    if maximal == g:
                        grams.pop(g, None)
                else:
                    grams.pop(g, None)

        grams = form_grams_plus_one(grams, corpus)

    return max_list


In [43]:
# Small hand-made fixture: bigram -> frequency, plus a one-tweet corpus.
# NOTE(review): presumably for exercising form_grams_plus_one / discovery_phase;
# the frequencies are illustrative and do not match test_corpus.
test_bigrams = {("hello", "world") : 10, ("world", "hi"): 12, ("hi", "hello"): 1}
test_corpus = {1: "hello world hi hello world hello world hi"}


In [48]:
def form_grams_plus_one(grams, corpus):
    """Join k-grams into (k+1)-grams and count the tweets containing each.

    Two k-grams are joined when the last k-1 tokens of the left one equal the
    first k-1 tokens of the right one; the result is the left gram followed by
    the right gram's final token. For bigrams this is the same "last token ==
    first token" rule as before.

    Each candidate is generated once (previously every candidate was produced
    twice, doubling all counts). Tweets are cleaned and tokenised the same way
    as in ``initial_phase`` so the counts are comparable.

    Args:
        grams: Mapping whose keys are grams (tuples of tokens); the values are
            not used.
        corpus: Mapping of tweet id -> raw tweet text.

    Returns:
        dict mapping each joined gram that occurs in at least one tweet to the
        number of tweets containing it.
    """
    # A dict serves as an insertion-ordered set of candidate grams.
    candidates = {}
    for left in grams:
        for right in grams:
            if left[1:] == right[:-1]:
                candidates[left + right[-1:]] = None

    if not candidates:
        return {}

    # Tokenise each tweet once and keep the set of its n-grams for lookup.
    sizes = {len(candidate) for candidate in candidates}
    tweet_ngram_sets = []
    for tweet_text in corpus.values():
        tokens = pre_processing_words(clean_text(tweet_text))
        present = set()
        for size in sizes:
            present.update(ngrams(tokens, size))
        tweet_ngram_sets.append(present)

    n_grams_frequencies = {}
    for candidate in candidates:
        count = sum(candidate in present for present in tweet_ngram_sets)
        if count:
            n_grams_frequencies[candidate] = count

    return n_grams_frequencies


In [63]:
def combine_grams(left_gram, right_gram):
    """Concatenate two grams, dropping the first token of ``right_gram``.

    Args:
        left_gram: Iterable of tokens kept in full.
        right_gram: Non-empty iterable of tokens; its first token is skipped.

    Returns:
        Tuple of all ``left_gram`` tokens followed by ``right_gram[1:]``.

    Raises:
        IndexError: If ``right_gram`` is empty.
    """
    prefix = [*left_gram]
    suffix = [*right_gram]
    del suffix[0]  # the token assumed to overlap with the end of left_gram
    return (*prefix, *suffix)


In [57]:
def not_a_subsequence(g, max_list):
    """Return True when ``g`` is not contained in any sequence of ``max_list``.

    Containment means ``g`` appears as a contiguous run inside the sequence,
    as tested by ``is_a_in_x``. The previous implementation had the test
    inverted and returned False when ``g`` was *absent* from a sequence.

    Args:
        g: Gram (tuple of tokens) to test.
        max_list: Sequences collected so far.

    Returns:
        False if at least one sequence in ``max_list`` contains ``g``;
        True otherwise, including when ``max_list`` is empty.
    """
    # Sequences shorter than ``g`` cannot contain it; skip them cheaply.
    return not any(
        len(g) <= len(sequence) and is_a_in_x(g, sequence)
        for sequence in max_list
    )

In [58]:
def is_a_in_x(A, X):
    """Tell whether ``A`` occurs as a contiguous slice of ``X``.

    Every window of ``len(A)`` items in ``X`` is compared to ``A`` with
    ``==``, so both must be the same sequence type to match.

    Returns:
        True if some window equals ``A``, otherwise False. An ``A`` longer
        than ``X`` gives False.
    """
    width = len(A)
    last_start = len(X) - width
    return any(A == X[start:start + width] for start in range(last_start + 1))


In [None]:
def expand(p):
    """Unfinished stub intended to expand the gram ``p``.

    NOTE(review): only the length of ``p`` is computed and nothing is
    returned, so callers such as ``discovery_phase`` currently get ``None``.
    TODO: implement the expansion and return the expanded sequence.
    """
    l = len(p)
    

True

In [15]:
# unload_db(name, base)
