In [1]:
import pandas as pd
import re
import numpy as np
from src.utils.generative import Generator
from src.utils.tokenizer import Tokenizer
from src.utils.embedding import Embedding
from src.utils.utils import *

In [2]:
def print_full(x):
    pd.set_option('display.max_columns', 10000)  # or 1000
    print(x)
    pd.reset_option('display.max_rows')

In [3]:
comments = pd.read_csv('data/the-reddit-dataset-dataset-comments.csv')[['permalink','body']]
posts = pd.read_csv('data/the-reddit-dataset-dataset-posts.csv')[['permalink','selftext','title']]

In [4]:
comments.dropna(inplace=True)
posts.dropna(inplace=True)

In [5]:
comments['post_id'] = comments['permalink'].apply(lambda x: extract_post_id(x))
posts['post_id'] = posts['permalink'].apply(lambda x: extract_post_id(x))
comments['true_label'] = True

In [6]:
for i, row in comments.iterrows():
    choice = np.random.choice([False, True], p=[0.1, 0.9])
    if not choice:  # 10% probability
        # Choose a random row from the DataFrame
        random_row = comments.sample()
        # Replace the ID of the current row with the ID of the random row
        comments.at[i, 'post_id'] = random_row['post_id'].values[0]
    comments.at[i, 'true_label'] = choice

In [7]:

# Create a tokenizer
tokenizer = Tokenizer()

# # Tokenize the dataset
print('Tokenizing sentences')
posts['tokenized_conclusion'] = posts['title'].apply(lambda x: tokenizer(x))
posts['tokenized_premisse'] = posts['selftext'].apply(lambda x: tokenizer(x))
comments['tokenized_text'] = comments['body'].apply(lambda x: tokenizer(x))


Tokenizing sentences


In [8]:
merged_df = pd.merge(posts,comments,how='right',on='post_id')

In [9]:
merged_df.dropna(inplace=True)

In [10]:

# Create an Embedding object
embedding = Embedding()
print('Creating word representation')
# # Perform the embeddings
merged_df['wc'] = merged_df.apply(lambda row: calc_word_sim(row['tokenized_conclusion'],row['tokenized_text'],embedding),axis=1)
merged_df['wp'] = merged_df.apply(lambda row: calc_word_sim(row['tokenized_premisse'],row['tokenized_text'],embedding),axis=1)

Creating word representation


In [11]:
merged_df

Unnamed: 0,permalink_x,selftext,title,post_id,tokenized_conclusion,tokenized_premisse,permalink_y,body,true_label,tokenized_text,wc,wp
0,https://old.reddit.com/r/datasets/comments/t45...,I’m looking for a dataset that I can use to id...,[request] looking for a dataset that i can use...,t45uk7,"[[, request, ], look, for, a, dataset, that, i...","[i, ’, m, look, for, a, dataset, that, i, can,...",https://old.reddit.com/r/datasets/comments/t45...,Spatial problem: Suitability of new locations ...,True,"[spatial, problem, :, suitabl, of, new, locat,...",30,43
1,https://old.reddit.com/r/datasets/comments/sg9...,I collected news articles over the past 2 year...,Brainstorm some ideas with me for this Article...,sg9lv8,"[brainstorm, some, idea, with, me, for, thi, a...","[i, collect, news, articl, over, the, past, 2,...",https://old.reddit.com/r/datasets/comments/sg9...,Have you tried toying around with GDELT or Ali...,True,"[have, you, tri, toy, around, with, gdelt, or,...",19,459
4,https://old.reddit.com/r/datasets/comments/t49...,"Hi, I have a dataset with countries by 3 lette...",3 letter country code to full country name,t49fq0,"[3, letter, countri, code, to, full, countri, ...","[hi, ,, i, have, a, dataset, with, countri, by...",https://old.reddit.com/r/datasets/comments/t49...,I was about to write and say this shouldn't be...,True,"[i, wa, about, to, write, and, say, thi, shoul...",30,53
5,https://old.reddit.com/r/datasets/comments/t47...,I want to classify if an image contains a cont...,Looking for datasets that contain images of co...,t47wiw,"[look, for, dataset, that, contain, imag, of, ...","[i, want, to, classifi, if, an, imag, contain,...",https://old.reddit.com/r/datasets/comments/t47...,I'm not exactly sure how many contracts the E...,True,"[i, 'm, not, exactli, sure, how, mani, contrac...",56,65
6,https://old.reddit.com/r/datasets/comments/t49...,"Hi, I have a dataset with countries by 3 lette...",3 letter country code to full country name,t49fq0,"[3, letter, countri, code, to, full, countri, ...","[hi, ,, i, have, a, dataset, with, countri, by...",https://old.reddit.com/r/datasets/comments/t49...,"nevermind, found it\n\nfor anyone in need:\n\n...",True,"[nevermind, ,, found, it, for, anyon, in, need...",20,47
...,...,...,...,...,...,...,...,...,...,...,...,...
54819,https://old.reddit.com/r/datasets/comments/3bx...,I am currently doing a massive analysis of Red...,I have every publicly available Reddit comment...,3bxlg7,"[i, have, everi, publicli, avail, reddit, comm...","[i, am, current, do, a, massiv, analysi, of, r...",https://old.reddit.com/r/datasets/comments/bn9...,"If you divide population by CO2, you get some ...",False,"[if, you, divid, popul, by, co2, ,, you, get, ...",70,1163
54835,https://old.reddit.com/r/datasets/comments/bmn...,Doesn't need to be anything fancy: list of cit...,Ask /r/datasets: good resource for city/town a...,bmn4z,"[ask, /r/dataset, :, good, resourc, for, city/...","[doe, n't, need, to, be, anyth, fanci, :, list...",https://old.reddit.com/r/datasets/comments/bmn...,[deleted],True,"[[, delet, ]]",16,47
54838,https://old.reddit.com/r/datasets/comments/bmn...,Doesn't need to be anything fancy: list of cit...,Ask /r/datasets: good resource for city/town a...,bmn4z,"[ask, /r/dataset, :, good, resourc, for, city/...","[doe, n't, need, to, be, anyth, fanci, :, list...",https://old.reddit.com/r/datasets/comments/bmn...,I found this that's turned out to be a great r...,True,"[i, found, thi, that, 's, turn, out, to, be, a...",31,50
54840,https://old.reddit.com/r/datasets/comments/bmn...,Doesn't need to be anything fancy: list of cit...,Ask /r/datasets: good resource for city/town a...,bmn4z,"[ask, /r/dataset, :, good, resourc, for, city/...","[doe, n't, need, to, be, anyth, fanci, :, list...",https://old.reddit.com/r/datasets/comments/bmn...,This was posted in another thread.\r\n\r\nhttp...,True,"[thi, wa, post, in, anoth, thread, ., http, :,...",21,48


In [12]:
print('Creating embedding representation')
# # Perform the embeddings
merged_df['ec'] = merged_df.apply(lambda row: compute_wmd(row['tokenized_conclusion'],row['tokenized_text'],embedding.model),axis=1)
merged_df['ep'] = merged_df.apply(lambda row: compute_wmd(row['tokenized_premisse'],row['tokenized_text'],embedding.model),axis=1)

Creating embedding representation


In [13]:
print('Creating embedding representation')
# # Perform the embeddings
merged_df['fmodel'] = merged_df.apply(lambda row: calc_sim_dissim(row['wc'],row['wp'],row['ec'],row['ep']),axis=1)
merged_df['fmodel'] = merged_df.apply(lambda row: calc_sim_dissim(row['wc'],row['wp'],row['ec'],row['ep']),axis=1)

Creating embedding representation


  return alpha*(w_plus + e_plus) - (1-alpha)*(w_up+e_up)
  return alpha*(w_plus + e_plus) - (1-alpha)*(w_up+e_up)


In [18]:
t_test_for_mean = compute_t_test(merged_df[merged_df['true_label']]['fmodel'],merged_df[~merged_df['true_label']]['fmodel'])
print('T-test para diferença de médias H_0 = Médias Iguais \n p-value:',t_test_for_mean)

T-test para diferença de médias H_0 = Médias Iguais 
 p-value: 0.027114685900800076
