# Data Prep for Contrastive Learning

Reference: \
https://github.com/jahuerta92/authorship-embeddings/blob/main/gather_data.ipynb \
https://github.com/jahuerta92/authorship-embeddings/blob/main/clean_data.ipynb \
https://github.com/LLNL/LUAR/blob/main/scripts

Note: id denotes author_id rather than text_id

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from multiprocessing import Pool
from transformers import AutoTokenizer

CHUNK_SIZE = 512
MODEL_TYPE = "roberta-large"
FILE_PATH = '/data/baixiang/dataset/'
tokenizer = AutoTokenizer.from_pretrained(MODEL_TYPE)

In [2]:
def split_data(row):
    eid, values = row
    input_ids = tokenizer(values.text).input_ids
    chunked = [input_ids[chunk: chunk + CHUNK_SIZE] for chunk in range(0, len(input_ids), CHUNK_SIZE)]
    decoded_chunked = tokenizer.batch_decode(chunked)
    return pd.DataFrame({'id': [eid]*len(chunked), 'pretokenized_text': chunked, 'decoded_text': decoded_chunked})
                         
    
def build_chunk_dataframe(text_data, metadata=None, cores=10):
    with Pool(cores) as p:
        chunks = list(tqdm(p.imap_unordered(split_data, text_data.iterrows()), total=len(text_data)))
    if metadata is not None:
        return pd.concat(chunks).merge(metadata, on='id')
    else:
        return pd.concat(chunks)

    
def clean_non_unique(data):
    nunique_ids = (data.id.value_counts() > 1)
    nunique_ids = nunique_ids[nunique_ids].index
    return data[data.id.isin(nunique_ids)]


def rm_duplicate_fn(df, column_ls):
    print('# duplicates:', df.text.duplicated().sum(), 'sanity check:', df.shape[0] - len(set(df.text)))
    print('Before removing duplicates, df.shape:', df.shape)
    df = df.drop_duplicates(subset=column_ls, keep='first').reset_index(drop=True)
    print('New df.shape:', df.shape)
    return df


def train_test_split_by_author(df, test_size=0.1, random_state=42):
    unique_authors = df.id.unique()
    in_test = np.random.choice(unique_authors, int(len(unique_authors) * test_size), replace=False)
    return df[~df.id.isin(in_test)], df[df.id.isin(in_test)]

## Blog
https://www.kaggle.com/datasets/rtatman/blog-authorship-corpus

In [3]:
blog_data = pd.read_csv(FILE_PATH+"blogtext.csv")
blog_data.head(2)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...


In [4]:
# replace user id values with "blog_n" where n is a number beggining at 0
n_values = len(blog_data.id.unique())
author_mapping = {k: v for k, v in zip(blog_data.id.unique(), range(n_values))}
blog_data['id'] = blog_data['id'].apply(lambda x: 'blog_' + str(author_mapping[x]))
blog_data

Unnamed: 0,id,gender,age,topic,sign,date,text
0,blog_0,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,blog_0,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,blog_0,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,blog_0,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,blog_1,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...
...,...,...,...,...,...,...,...
681279,blog_19319,male,23,Student,Taurus,"01,July,2004","Dear Susan, I could write some really ..."
681280,blog_19319,male,23,Student,Taurus,"01,July,2004","Dear Susan, 'I have the second yeast i..."
681281,blog_19319,male,23,Student,Taurus,"01,July,2004","Dear Susan, Your 'boyfriend' is fuckin..."
681282,blog_19319,male,23,Student,Taurus,"01,July,2004","Dear Susan: Just to clarify, I am as..."


In [5]:
len(np.unique(blog_data['id'])), len(np.unique(blog_data['text']))

(19320, 611652)

In [6]:
# Finding and removing duplicate rows
blog_data[blog_data[['id', 'text', 'date']].duplicated(keep=False)].sort_values('text')

Unnamed: 0,id,gender,age,topic,sign,date,text
651906,blog_18413,male,27,Technology,Taurus,"14,June,2004",
651920,blog_18413,male,27,Technology,Taurus,"02,June,2004",
212416,blog_5943,male,36,Advertising,Sagittarius,"01,August,2004",
212417,blog_5943,male,36,Advertising,Sagittarius,"01,August,2004",
651889,blog_18413,male,27,Technology,Taurus,"11,May,2004",
...,...,...,...,...,...,...,...
471113,blog_13165,male,14,Student,Sagittarius,"27,May,2004",People...I figured that Grellow would crumble...
471107,blog_13165,male,14,Student,Sagittarius,"31,May,2004","Today... I am with Sydney, and we decided to ..."
471123,blog_13165,male,14,Student,Sagittarius,"31,May,2004","Today... I am with Sydney, and we decided to ..."
471120,blog_13165,male,14,Student,Sagittarius,"26,May,2004",Tom Felton is the reason why everything wrong...


In [7]:
blog_corpus = rm_duplicate_fn(blog_data, ['id', 'text', 'date'])

# duplicates: 69632 sanity check: 69632
Before removing duplicates, df.shape: (681284, 7)
New df.shape: (676598, 7)


In [8]:
blog_corpus.text = blog_corpus.text.apply(lambda x: x.strip())
clean_blog_corpus = blog_corpus[['id', 'text']].groupby("id").agg(lambda x: '<\s>'.join(x))
meta_blog_corpus = blog_corpus[['id', 'age', 'topic', 'gender']].groupby("id").agg(lambda x: list(x)[0])
full_blog_corpus = meta_blog_corpus.merge(clean_blog_corpus, on='id')
full_blog_corpus

Unnamed: 0_level_0,age,topic,gender,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
blog_0,15,Student,male,"Info has been found (+/- 100 pages, and 4.5 MB..."
blog_1,33,InvestmentBanking,male,Thanks to Yahoo!'s Toolbar I can now 'capture'...
blog_10,25,indUnk,female,"Even though I am exhausted after today, I must..."
blog_100,26,indUnk,male,Hello again. This is the offical No Action bl...
blog_1000,16,Student,male,My 'band' got in its first fight tonight. most...
...,...,...,...,...
blog_9995,17,Communications-Media,male,"Good morning folks, How are me brothers and s..."
blog_9996,23,indUnk,female,"NEWater Ok, that's just gross. Another pot..."
blog_9997,26,Education,male,I love salsa. It's one of the greatest foods e...
blog_9998,13,Law,male,"Hey all, This is Jared, this is my first post ..."


In [9]:
clean_blog_corpus.shape, meta_blog_corpus.shape

((19320, 1), (19320, 3))

In [10]:
chunked_blog_data = build_chunk_dataframe(full_blog_corpus, meta_blog_corpus)
nunique_blog_data = clean_non_unique(chunked_blog_data)
nunique_blog_data

  0%|          | 0/19320 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1227 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1726 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (3686 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (5819 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2760 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length fo

Unnamed: 0,id,pretokenized_text,decoded_text,age,topic,gender
0,blog_10,"[0, 8170, 600, 38, 524, 17067, 71, 452, 6, 38,...","<s>Even though I am exhausted after today, I m...",25,indUnk,female
1,blog_10,"[216, 24, 4, 152, 7105, 16, 3680, 684, 25, 121...",know it. This hell is otherwise known as U Vi...,25,indUnk,female
2,blog_10,"[17027, 12, 560, 12, 1610, 18, 2850, 12179, 33...","groom-to-be's scrotal bling, but not the Fout...",25,indUnk,female
3,blog_100,"[0, 31414, 456, 4, 1437, 152, 16, 5, 160, 3569...",<s>Hello again. This is the offical No Action...,26,indUnk,male
4,blog_100,"[5, 7884, 20774, 29, 31, 14, 6, 25, 157, 25, 5...","the singalongs from that, as well as the mino...",26,indUnk,male
...,...,...,...,...,...,...
381429,blog_9996,"[941, 187, 38, 437, 45, 10, 13853, 226, 14669,...",especially since I'm not a Lindsay Lohan fan....,23,indUnk,female
381430,blog_9996,"[13137, 1437, 46471, 17860, 229, 5602, 8, 4077...",twins urlLink Kami and Karli have not been ...,23,indUnk,female
381431,blog_9996,"[2600, 127, 7833, 5548, 255, 2747, 998, 35, 10...",reading my absolute favourite TAR website: Te...,23,indUnk,female
381432,blog_9996,"[75, 29, 1437, 157, 6, 667, 66, 10, 92, 29618,...","'ts well, trying out a new haircut and all i ...",23,indUnk,female


In [11]:
tmp = full_blog_corpus.iloc[0].text
input_ids = tokenizer(tmp).input_ids
print(len(input_ids))
[len(input_ids[chunk: chunk + CHUNK_SIZE]) for chunk in range(0, len(input_ids), CHUNK_SIZE)]
tokenizer.decode(input_ids[:CHUNK_SIZE])

Token indices sequence length is longer than the specified maximum sequence length for this model (5819 > 512). Running this sequence through the model will result in indexing errors


5819


"<s>Info has been found (+/- 100 pages, and 4.5 MB of.pdf files) Now i have to wait untill our team leader has processed it and learns html.<\\s>These are the team members:   Drewes van der Laag           urlLink mail  Ruiyu Xie                     urlLink mail  Bryan Aaldering (me)          urlLink mail<\\s>In het kader van kernfusie op aarde:  MAAK JE EIGEN WATERSTOFBOM   How to build an H-Bomb From: ascott@tartarus.uwa.edu.au (Andrew Scott) Newsgroups: rec.humor Subject: How To Build An H-Bomb (humorous!) Date: 7 Feb 1994 07:41:14 GMT Organization: The University of Western Australia  Original file dated 12th November 1990. Seemed to be a transcript of a 'Seven Days' article. Poorly formatted and corrupted. I have added the text between 'examine under a microscope' and'malleable, like gold,' as it was missing. If anyone has the full text, please distribute. I am not responsible for the accuracy of this information. Converted to HTML by Dionisio@InfiNet.com 11/13/98. (Did a little sp

In [12]:
# full_blog_corpus merges texts by authors, chunked_blog_data divide merged text into chunks of size 512
# nunique_blog_data remove authors who have only 1 chunk
blog_data.shape, full_blog_corpus.shape, chunked_blog_data.shape, nunique_blog_data.shape

((681284, 7), (19320, 4), (381434, 6), (381306, 6))

In [13]:
nunique_blog_data.id.value_counts()

blog_12861    1253
blog_8404     1120
blog_3960     1108
blog_9660      884
blog_17014     847
              ... 
blog_14983       2
blog_7449        2
blog_3639        2
blog_7450        2
blog_17707       2
Name: id, Length: 19192, dtype: int64

In [38]:
# nunique_blog_data.to_csv(FILE_PATH+"blog_as_csv_preprocessed.csv", index=False)
# blog_data = pd.read_csv(FILE_PATH+"blog_as_csv_preprocessed.csv")
blog_train, blog_test = train_test_split_by_author(nunique_blog_data)
blog_train.to_csv(FILE_PATH+"blog_train.csv", index=False)
blog_test.to_csv(FILE_PATH+"blog_test.csv", index=False)
len(blog_train), len(blog_test)

(345171, 36135)

## Mail
https://www.kaggle.com/datasets/wcukierski/enron-email-dataset

In [15]:
emails_df = pd.read_csv(FILE_PATH+"enron-emails.csv")
emails_df.head(2)

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...


In [16]:
import email
def get_text_from_email(msg):
    '''To get the content from email objects'''
    parts = []
    for part in msg.walk():
        if part.get_content_type() == 'text/plain':
            parts.append(part.get_payload())
    return ''.join(parts)


def split_email_addresses(line):
    '''To separate multiple email addresses'''
    if line:
        addrs = line.split(',')
        addrs = frozenset(map(lambda x: x.strip(), addrs))
    else:
        addrs = None
    return addrs


# Parse the emails into a list email objects
messages = list(map(email.message_from_string, emails_df['message'])) 
for key in messages[0].keys():
    emails_df[key] = [doc[key] for doc in messages]
emails_df['Text'] = list(map(get_text_from_email, messages))
emails_df['From'] = emails_df['From'].map(split_email_addresses)
emails_df['To'] = emails_df['To'].map(split_email_addresses)
del messages
emails_df = emails_df[['From', 'To', 'Text', 'Date', 'message']]
emails_df.head(2)

Unnamed: 0,From,To,Text,Date,message
0,(phillip.allen@enron.com),(tim.belden@enron.com),Here is our forecast\n\n,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",Message-ID: <18782981.1075855378110.JavaMail.e...
1,(phillip.allen@enron.com),(john.lavorato@enron.com),Traveling to have a business meeting takes the...,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",Message-ID: <15464986.1075855378456.JavaMail.e...


In [17]:
for i in emails_df.index:
    sender = emails_df.loc[i, 'From']
    receiver = emails_df.loc[i, 'To']
    if type(sender) is list and len(sender) > 1:
        print('More than 1 sender:', sender)
    
    # if receiver is None:
    #     receiver = 'nan'
    # # elif len(emails_df.loc[i, 'To']) > 1:
    # #     print('More than 1 receiver:', emails_df.loc[i, 'To'])
    
emails_df['From'] = emails_df["From"].apply(lambda x: list(x)[0])
# emails_df['To'] = emails_df["To"].apply(lambda x: ' '.join(list(x)))#.astype("unicode")
emails_df.head(2)

Unnamed: 0,From,To,Text,Date,message
0,phillip.allen@enron.com,(tim.belden@enron.com),Here is our forecast\n\n,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",Message-ID: <18782981.1075855378110.JavaMail.e...
1,phillip.allen@enron.com,(john.lavorato@enron.com),Traveling to have a business meeting takes the...,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",Message-ID: <15464986.1075855378456.JavaMail.e...


In [18]:
# Finding and removing duplicate rows
emails_df[emails_df[['From', 'To', 'Text', 'Date']].duplicated(keep=False)].sort_values('Text')

Unnamed: 0,From,To,Text,Date,message
362497,debra.perlingiere@enron.com,(veronica.espinoza@enron.com),\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t...,"Thu, 20 Jul 2000 06:57:00 -0700 (PDT)",Message-ID: <29042400.1075842319607.JavaMail.e...
359528,debra.perlingiere@enron.com,(veronica.espinoza@enron.com),\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t...,"Thu, 20 Jul 2000 06:57:00 -0700 (PDT)",Message-ID: <21091781.1075842295609.JavaMail.e...
254750,news@real-net.net,(pkeavey@ect.enron.com),\t\t\t\t\t\t\t\t\t\t[IMAGE]\n\t\t\t\t\t\t\t\t\...,"Tue, 28 Nov 2000 08:52:00 -0800 (PST)",Message-ID: <10664196.1075855642165.JavaMail.e...
253583,news@real-net.net,(pkeavey@ect.enron.com),\t\t\t\t\t\t\t\t\t\t[IMAGE]\n\t\t\t\t\t\t\t\t\...,"Tue, 28 Nov 2000 08:52:00 -0800 (PST)",Message-ID: <5846818.1075855635419.JavaMail.ev...
253196,news@real-net.net,(pkeavey@ect.enron.com),\t\t\t\t\t\t\t\t\t\t[IMAGE]\n\t\t\t\t\t\t\t\t\...,"Tue, 28 Nov 2000 08:52:00 -0800 (PST)",Message-ID: <13427260.1075855626376.JavaMail.e...
...,...,...,...,...,...
12515,bushnews@georgewbush.com,(ebass@enron.com),~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~...,"Sat, 2 Dec 2000 21:10:00 -0800 (PST)",Message-ID: <16755277.1075854579182.JavaMail.e...
13871,bushnews@georgewbush.com,(ebass@enron.com),~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~...,"Sat, 2 Dec 2000 21:10:00 -0800 (PST)",Message-ID: <13935886.1075854652367.JavaMail.e...
12612,bushnews@georgewbush.com,(ebass@enron.com),~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~...,"Tue, 28 Nov 2000 17:48:00 -0800 (PST)",Message-ID: <19045496.1075854582239.JavaMail.e...
15164,bushnews@georgewbush.com,(ebass@enron.com),~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~...,"Tue, 28 Nov 2000 17:48:00 -0800 (PST)",Message-ID: <15114726.1075854649305.JavaMail.e...


In [19]:
emails_df = emails_df.drop_duplicates(subset=['From', 'To', 'Text', 'Date'], keep='first').reset_index(drop=True)
emails_df.shape

(255451, 5)

In [20]:
mail_corpus = emails_df.copy()
mail_corpus.columns = ['user', 'receiver', 'text', 'date', 'message_old']
unique_author = mail_corpus['user'].unique()
email_mapping = {k: v for k, v in zip(unique_author, range(len(unique_author)))}
mail_corpus['id'] = mail_corpus['user'].apply(lambda x: 'mail_'+str(email_mapping[x]))
mail_corpus

Unnamed: 0,user,receiver,text,date,message_old,id
0,phillip.allen@enron.com,(tim.belden@enron.com),Here is our forecast\n\n,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",Message-ID: <18782981.1075855378110.JavaMail.e...,mail_0
1,phillip.allen@enron.com,(john.lavorato@enron.com),Traveling to have a business meeting takes the...,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",Message-ID: <15464986.1075855378456.JavaMail.e...,mail_0
2,phillip.allen@enron.com,(leah.arsdall@enron.com),test successful. way to go!!!,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",Message-ID: <24216240.1075855687451.JavaMail.e...,mail_0
3,phillip.allen@enron.com,(randall.gay@enron.com),"Randy,\n\n Can you send me a schedule of the s...","Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",Message-ID: <13505866.1075863688222.JavaMail.e...,mail_0
4,phillip.allen@enron.com,(greg.piper@enron.com),Let's shoot for Tuesday at 11:45.,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",Message-ID: <30922949.1075863688243.JavaMail.e...,mail_0
...,...,...,...,...,...,...
255446,john.zufferli@enron.com,(kori.loibl@enron.com),This is a trade with OIL-SPEC-HEDGE-NG (John L...,"Wed, 28 Nov 2001 13:30:11 -0800 (PST)",Message-ID: <26807948.1075842029936.JavaMail.e...,mail_5460
255447,john.zufferli@enron.com,(john.lavorato@enron.com),Some of my position is with the Alberta Term b...,"Wed, 28 Nov 2001 12:47:48 -0800 (PST)",Message-ID: <25835861.1075842029959.JavaMail.e...,mail_5460
255448,john.zufferli@enron.com,(dawn.doucet@enron.com),2\n\n -----Original Message-----\nFrom: \tDouc...,"Wed, 28 Nov 2001 07:20:00 -0800 (PST)",Message-ID: <28979867.1075842029988.JavaMail.e...,mail_5460
255449,john.zufferli@enron.com,(jeanie.slone@enron.com),Analyst\t\t\t\t\tRank\n\nStephane Brodeur\t\t\...,"Tue, 27 Nov 2001 11:52:45 -0800 (PST)",Message-ID: <22052556.1075842030013.JavaMail.e...,mail_5460


In [21]:
import re
def clean_text(text):
    clean_mail = re.sub(r'(\\+r)?(\\+n)+', '\n', text)
    clean_mail = re.sub(r'\\+t', '\t', clean_mail)
    clean_mail = '\n'.join(clean_mail.strip().split('\n')[15:-1])
    clean_mail = re.sub(r'X-.+:.*\n', '<s>', clean_mail)
    clean_mail = re.sub(r'From:.*\n', '', clean_mail)
    clean_mail = re.sub(r"\\'", "'", clean_mail)
    return clean_mail

mail_corpus['clean_text'] = mail_corpus['message_old'].apply(clean_text)
mail_corpus

Unnamed: 0,user,receiver,text,date,message_old,id,clean_text
0,phillip.allen@enron.com,(tim.belden@enron.com),Here is our forecast\n\n,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",Message-ID: <18782981.1075855378110.JavaMail.e...,mail_0,
1,phillip.allen@enron.com,(john.lavorato@enron.com),Traveling to have a business meeting takes the...,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",Message-ID: <15464986.1075855378456.JavaMail.e...,mail_0,\nTraveling to have a business meeting takes t...
2,phillip.allen@enron.com,(leah.arsdall@enron.com),test successful. way to go!!!,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",Message-ID: <24216240.1075855687451.JavaMail.e...,mail_0,
3,phillip.allen@enron.com,(randall.gay@enron.com),"Randy,\n\n Can you send me a schedule of the s...","Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",Message-ID: <13505866.1075863688222.JavaMail.e...,mail_0,"\nRandy,\n\n Can you send me a schedule of the..."
4,phillip.allen@enron.com,(greg.piper@enron.com),Let's shoot for Tuesday at 11:45.,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",Message-ID: <30922949.1075863688243.JavaMail.e...,mail_0,
...,...,...,...,...,...,...,...
255446,john.zufferli@enron.com,(kori.loibl@enron.com),This is a trade with OIL-SPEC-HEDGE-NG (John L...,"Wed, 28 Nov 2001 13:30:11 -0800 (PST)",Message-ID: <26807948.1075842029936.JavaMail.e...,mail_5460,\nThis is a trade with OIL-SPEC-HEDGE-NG (John...
255447,john.zufferli@enron.com,(john.lavorato@enron.com),Some of my position is with the Alberta Term b...,"Wed, 28 Nov 2001 12:47:48 -0800 (PST)",Message-ID: <25835861.1075842029959.JavaMail.e...,mail_5460,
255448,john.zufferli@enron.com,(dawn.doucet@enron.com),2\n\n -----Original Message-----\nFrom: \tDouc...,"Wed, 28 Nov 2001 07:20:00 -0800 (PST)",Message-ID: <28979867.1075842029988.JavaMail.e...,mail_5460,\n2\n\n -----Original Message-----\nSent:\tWed...
255449,john.zufferli@enron.com,(jeanie.slone@enron.com),Analyst\t\t\t\t\tRank\n\nStephane Brodeur\t\t\...,"Tue, 27 Nov 2001 11:52:45 -0800 (PST)",Message-ID: <22052556.1075842030013.JavaMail.e...,mail_5460,\nAnalyst\t\t\t\t\tRank\n\nStephane Brodeur\t\...


In [22]:
mail_corpus.clean_text[0]

''

In [23]:
# mail_corpus.columns = ['user', 'old_text', 'id', 'text']
mail_corpus.text = mail_corpus.text.apply(lambda x: x.strip())
clean_mail_corpus = mail_corpus[['id', 'text']].groupby("id").agg(lambda x: '<\s>'.join(x))

chunked_mail_data = build_chunk_dataframe(clean_mail_corpus, None)
nunique_mail_data = clean_non_unique(chunked_mail_data)
nunique_mail_data.reset_index(drop=True, inplace=True)
nunique_mail_data

  0%|          | 0/20328 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (558 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (898 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1454 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (903 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2133 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for t

Unnamed: 0,id,pretokenized_text,decoded_text
0,mail_10002,"[0, 1185, 9918, 12180, 13, 10, 2041, 8, 6449, ...",<s>You Could Search for a Year and Find Nothin...
1,mail_10002,"[50140, 50118, 50118, 2709, 901, 3522, 2578, 4...",\n\n\n\nFor More Information Click Below Now\n...
2,mail_10000,"[0, 4014, 13181, 104, 10616, 20685, 7224, 500,...",<s>STOCKS ARE DOWN ACROSS THE BOARD\n ...
3,mail_10000,"[5298, 7, 1437, 50118, 34335, 6, 51, 10557, 7,...","symptoms to \ndevelop, they intend to diagnos..."
4,mail_10009,"[0, 3293, 347, 491, 14718, 29, 108, 1426, 155,...",<s>OTC News Alerts' Last 3 Picks have gained 3...
...,...,...,...
299088,mail_9524,"[1232, 31, 2086, 960, 6, 16, 7, 489, 7458, 521...","families from losing everything, is to keep m..."
299089,mail_9524,"[10, 948, 9, 688, 6, 229, 15964, 34, 1224, 31,...","a matter of weeks, Kmart has turned from a po..."
299090,mail_9524,"[5214, 50118, 29, 7834, 6, 3854, 8, 647, 1743,...","=\ns inventory, distribution and sales systems..."
299091,mail_9524,"[13147, 4, 5457, 3546, 50118, 50118, 15075, 11...",Reserved. =09\n\nCity - City Diary - Enron go...


In [24]:
# nunique_mail_data.to_csv(FILE_PATH+"mail_as_csv_preprocessed.csv", index=False)
mail_train, mail_test = train_test_split_by_author(nunique_mail_data)
mail_train.to_csv(FILE_PATH+"mail_train.csv", index=False)
mail_test.to_csv(FILE_PATH+"mail_test.csv", index=False)
len(mail_train), len(mail_test)

(276158, 22935)

In [19]:
nunique_mail_data.decoded_text[1]

1    \n\n\n\nFor More Information Click Below Now\n...
1     symptoms to \ndevelop, they intend to diagnos...
1     Application to Catastrophe Insurance\nhttp://...
1     to the accuracy or the\ncompleteness of the d...
1    cocktails.\n\nJeff<\s>Yes, put 20 on black if ...
                           ...                        
1     Volatility Matrix      (Click on imag=\ne to ...
1    ty, Robert; Allwein, Robert; Goodell, Scott; O...
1     **\n-Upgrade information\n-Important updates\...
1     LET UTILITIES OUT OF CALPX\nThe California Pu...
1     were answered in full during the meeting.  I ...
Name: decoded_text, Length: 11100, dtype: object

## Reddit

https://github.com/LLNL/LUAR/blob/main/scripts/download_reddit_data.sh

data.jsonl (62G) download https://storage.googleapis.com/naacl21_account_linking/raw_mud.tar.gz

300 million Reddit posts from 1 million users published over an entire year to be used to train our proposed model.
This Million User Dataset (MUD) consists of all posts by authors who published at least 100 and at most 1000 posts between July 2015 and June 2016, where the lower bound ensures a sufficiently long history from which to sample, and the upper bound is intended to reduce the impact of bot and spam account. Together with its text content, each comment is labeled by its publication time and the subreddit to which it was posted, a categorical feature roughly indicating its topic.

Data source: Khan, Aleem, et al. "A deep metric learning approach to account linking." NAACL (2021). https://arxiv.org/pdf/2105.07263.pdf
The data by are drawed from the existing Pushshift Reddit corpus (Baumgartner, Jason, et al. "The pushshift reddit dataset." AAAI. Vol. 14. 2020. https://arxiv.org/pdf/2001.08435.pdf)

| Number of users contributing                       | 1,071,477   |
|----------------------------------------------------|-------------|
| Number of posts                                    | 321,659,421 |
| Mean post length                                   | 42.5 tokens |
| Mean number of posts contributed by a user         | 300.2       |
| Mean number of subreddits accessed by a user       | 22.1        |
| Mean number of months a user was active            | 9.9         |
| Percentage of posts containing more than 64 tokens | 17.37%      | 

In [3]:
%%time
# data.jsonl (62G) download https://storage.googleapis.com/naacl21_account_linking/raw_mud.tar.gz
df = pd.read_json(FILE_PATH+'reddit_mud/data.jsonl', lines=True)
print(f"{df.shape[0]:,}")

1,071,477
CPU times: user 6min 29s, sys: 2min 19s, total: 8min 49s
Wall time: 8min 48s


In [4]:
%%time
df = df[['syms', 'author_id', 'action_type']]
df.rename(columns={'syms': 'text', 'author_id': 'id', 'action_type': 'topic'}, inplace=True)
df = df.explode(['text', 'topic'], ignore_index=True)
print(f"{df.shape[0]:,}")

321,659,421
CPU times: user 54.8 s, sys: 12.6 s, total: 1min 7s
Wall time: 1min 7s


In [5]:
%%time
print('# duplicates:', f"{df.text.duplicated().sum():,}", 'sanity check:', f"{df.shape[0] - len(set(df.text)):,}")
print('Before removing duplicates, # rows:', f"{df.shape[0]:,}")
df = df.drop_duplicates(subset=['id', 'text', 'topic'], keep='first').reset_index(drop=True)
print('New # rows::', f"{df.shape[0]:,}")

# duplicates: 23,811,501 sanity check: 23,811,501
Before removing duplicates, # rows: 321,659,421
New # rows:: 318,647,132
CPU times: user 16min 56s, sys: 3min 8s, total: 20min 5s
Wall time: 20min 2s


In [6]:
%%time
print('# rows with empty text:', (df['text'].values == '').sum())
df = df[df['text'] != '']
print(f"{df.shape[0]:,}")

# rows with empty text: 5285
318,641,847
CPU times: user 59 s, sys: 12.4 s, total: 1min 11s
Wall time: 1min 11s


In [7]:
author_mapping = {k: v for k, v in zip(df.id.unique(), range(len(df.id.unique())))}
df['id'] = df['id'].apply(lambda x: 'reddit_' + str(author_mapping[x]))
df

Unnamed: 0,text,id,topic
0,I dash attack waaaayyyyy too often,reddit_0,smashbros
1,SO HAPPY ARMADA CAME THROUGH AGAINST THE PUNK ...,reddit_0,smashbros
2,Anyone have a replay of this? I missed it,reddit_0,smashbros
3,You look like a dead goldfish that was left in...,reddit_0,RoastMe
4,Poké Floats was legal??? Better not tell twitc...,reddit_0,smashbros
...,...,...,...
318647127,&gt; campest dab,reddit_1071475,gifs
318647128,50m + whatever it cost to bribe the judge,reddit_1071475,todayilearned
318647129,"Too late, the deed is done",reddit_1071475,gifs
318647130,"teehee, the wedding will be fabulous either way",reddit_1071475,todayilearned


In [8]:
%%time
df.text = df.text.apply(lambda x: x.strip())
clean_reddit_corpus = df[['id', 'text']].groupby("id").agg(lambda x: '<\s>'.join(x))
meta_reddit_corpus = df[['id', 'topic']].groupby("id").agg(lambda x: [e for e in x])
full_reddit_corpus = meta_reddit_corpus.merge(clean_reddit_corpus, on='id')
full_reddit_corpus

CPU times: user 4min 28s, sys: 1min 15s, total: 5min 44s
Wall time: 5min 43s


Unnamed: 0_level_0,topic,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
reddit_0,"[smashbros, smashbros, smashbros, RoastMe, sma...",I dash attack waaaayyyyy too often<\s>SO HAPPY...
reddit_1,"[DCcomics, totalwar, totalwar, totalwar, total...","Awesome, I'll be sure to check them out, thank..."
reddit_10,"[baseball, politics, politics, printSF, printS...",Back in Texas. Buddy had a kid in an up and c...
reddit_100,"[StreetFighter, StreetFighter, StreetFighter, ...",That's an old term.<\s>I'm also somewhat new a...
reddit_1000,"[relationships, relationships, relationships, ...",I'm so sorry your friend died. As for this sit...
...,...,...
reddit_999995,"[roblox, roblox, roblox, roblox, roblox, roblo...",Lmfaoooo<\s>The Fedora Tipping is actually an ...
reddit_999996,"[relationships, relationships, relationships, ...",I do this too. I tell ten unrelated stories in...
reddit_999997,"[FantasyWarTactics, FantasyWarTactics, Fantasy...",Thats oddd i tested with my Dominique second s...
reddit_999998,"[AskReddit, Fitness, Fitness, AskReddit, AskRe...","Painful, very drawn out, but final breakup wit..."


In [None]:
%%time
# Time estimation: 3h 30min
chunked_reddit_data = build_chunk_dataframe(full_reddit_corpus, meta_reddit_corpus)
nunique_reddit_data = clean_non_unique(chunked_reddit_data)
nunique_reddit_data

  0%|          | 0/1071476 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2655 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (3354 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (7782 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (6495 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (7665 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length fo

In [10]:
nunique_reddit_data[['id', 'pretokenized_text', 'decoded_text']]

Unnamed: 0,id,pretokenized_text,decoded_text
0,reddit_1000000,"[0, 8170, 71, 5, 7726, 3607, 5, 5466, 6, 89, 3...",<s>Even after the rocket leaves the atmosphere...
1,reddit_1000000,"[476, 804, 49069, 37457, 29, 15698, 10105, 70,...",power online.<\s>Because all SPCs that end in...
2,reddit_1000000,"[5, 1272, 52, 33, 32, 31, 15503, 4458, 14, 126...",the problems we have are from furries that me...
3,reddit_1000000,"[18656, 49069, 37457, 29, 15698, 31593, 995, 2...",reens.<\s>Fallout 4 will not full-screen for m...
4,reddit_1000000,"[15698, 100, 16134, 3999, 1571, 5751, 4, 289, ...",>Iddntity theft. Hacking of social media. Some...
...,...,...,...
30120872,reddit_999993,"[49069, 37457, 29, 15698, 100, 524, 14, 621, 4...",.<\s>I am that person. My parents influenced m...
30120873,reddit_999993,"[611, 14118, 4, 1437, 50118, 50118, 100, 4157,...",chbag. \n\nI hate myself but at the very least...
30120874,reddit_999993,"[21031, 10, 9310, 4, 50118, 50118, 40992, 110,...",Take a shower.\n\nCut your nails.\n\nDance to ...
30120875,reddit_999993,"[49069, 37457, 29, 15698, 11243, 16, 430, 6, 1...",".<\s>Everyone is different, no problem. \n\nM..."


In [11]:
print(f"{full_reddit_corpus.shape[0]:,}", f"{chunked_reddit_data.shape[0]:,}", f"{nunique_reddit_data.shape[0]:,}")

1,071,476 30,120,877 30,120,438


In [14]:
FILE_PATH

'/data/baixiang/dataset/'

In [17]:
%%time
nunique_reddit_data.reset_index(drop=True, inplace=True)
nunique_reddit_data[['id', 'pretokenized_text', 'decoded_text']].to_csv(FILE_PATH+'reddit_mud/reddit_processed.csv', index=False)
nunique_reddit_data[['id', 'decoded_text']].to_csv(FILE_PATH+'reddit_mud/reddit_pro_text_only.csv', index=False)  # smaller file size
# df_reddit = pd.read_csv(FILE_PATH+"reddit_processed.csv")

CPU times: user 48min 4s, sys: 3min 52s, total: 51min 56s
Wall time: 51min 57s


In [18]:
%%time
reddit_train, reddit_test = train_test_split_by_author(nunique_reddit_data)
reddit_train[['id', 'decoded_text']].to_csv(FILE_PATH+"reddit_mud/reddit_train.csv", index=False)
reddit_test[['id', 'decoded_text']].to_csv(FILE_PATH+"reddit_mud/reddit_test.csv", index=False)
# reddit_train[['id', 'pretokenized_text', 'decoded_text']].to_csv(FILE_PATH+"reddit_mud/reddit_train_plus_token.csv", index=False)
# reddit_test[['id', 'pretokenized_text', 'decoded_text']].to_csv(FILE_PATH+"reddit_mud/reddit_test_plus_token.csv", index=False)

CPU times: user 11min 29s, sys: 53.9 s, total: 12min 23s
Wall time: 12min 23s


In [19]:
print(f"{reddit_train.shape[0]:,}", f"{reddit_test.shape[0]:,}")

27,106,354 3,014,084


In [20]:
# %%time
# reddit_train = pd.read_csv(FILE_PATH+"reddit_mud/reddit_train.csv")
# reddit_train.shape

### Topic (action_type) Analysis

In [37]:
val_cnt = df.topic.value_counts()
val_cnt[:20]

topic
AskReddit          22892662
leagueoflegends     5106245
politics            3630520
funny               3445469
pics                3042891
pcmasterrace        2948537
worldnews           2915803
videos              2902798
GlobalOffensive     2718675
nba                 2551179
DotA2               2517609
news                2515962
todayilearned       2408060
nfl                 2353151
DestinyTheGame      2350909
soccer              2345857
gaming              2211573
movies              2097312
SquaredCircle       2093007
The_Donald          1778814
Name: count, dtype: int64

In [49]:
# Check if letter case influences the number of topics
len(val_cnt), len(np.unique(df.topic)), len(np.unique(df.topic.str.lower()))

(125186, 125186, 125186)

### TODO: Language Analysis

In [8]:
df1 = df[:1000].copy()

In [21]:
import langid
langid.classify("War doesn't show who's right, just who's left.")[0], langid.classify("C'est l'heure où les videurs deviennent gentils")[0]

('en', 'fr')

In [22]:
%%time
df1['lang'] = df1['text'].apply(lambda x: langid.classify(x)[0])
df1[df1['lang'] == 'en'].shape

CPU times: user 766 ms, sys: 4.13 ms, total: 770 ms
Wall time: 769 ms


(969, 5)

In [None]:
%%time
df['lang'] = df['text'].apply(lambda x: langid.classify(x)[0])
df[df['lang'] == 'en'].shape

In [14]:
import nltk
from nltk.corpus import words

def get_english_word_rate(row):
    row_words = row.text.lower().split() #decoded_text
    word_count = len(row_words)
    english_words = 0
    for w in row_words:
        if w in words.words():
            english_words += 1
    return english_words / word_count

In [17]:
%%time
# takes a long time to run
df1['english_word_rate'] = df1.apply(get_english_word_rate, axis=1)
df1[df1['english_word_rate'] > 0.75].shape

CPU times: user 17min 23s, sys: 11.2 s, total: 17min 34s
Wall time: 17min 35s


(337, 4)

In [20]:
df1

Unnamed: 0,text,id,topic,english_word_rate
0,I dash attack waaaayyyyy too often,Sorry_I_Am_Canadian,smashbros,0.833333
1,SO HAPPY ARMADA CAME THROUGH AGAINST THE PUNK ...,Sorry_I_Am_Canadian,smashbros,1.000000
2,Anyone have a replay of this? I missed it,Sorry_I_Am_Canadian,smashbros,0.777778
3,You look like a dead goldfish that was left in...,Sorry_I_Am_Canadian,RoastMe,0.950000
4,Poké Floats was legal??? Better not tell twitc...,Sorry_I_Am_Canadian,smashbros,0.666667
...,...,...,...,...
995,Me too :( and I was really excited about this ...,goingHAMandcheese,GirlGamers,0.850000
996,"Well, you're my hero. \nMany thanks!",goingHAMandcheese,GirlGamers,0.333333
997,Hope all is well now.,goingHAMandcheese,AskReddit,0.800000
998,"If you get a break, use it to take a walk inst...",goingHAMandcheese,AskReddit,0.782609


In [27]:
from langdetect import detect
detect("War doesn't show who's right, just who's left."), detect("C'est l'heure où les videurs deviennent gentils")

('en', 'fr')

In [19]:
%%time
df1['lang'] = df1['text'].apply(lambda x: detect(x))
df1

LangDetectException: No features in text.

In [23]:
import fasttext
model = fasttext.load_model('lid.176.ftz')
print(model.predict('الشمس تشرق', k=2)) 

ModuleNotFoundError: No module named 'fasttext'

In [25]:
from textblob import TextBlob
TextBlob("War doesn't show who's right, sjust who's left.").detect_language()

HTTPError: HTTP Error 400: Bad Request

In [None]:
%%time
df1['lang'] = df1['text'].apply(lambda x: langid.classify(x)[0])
print(df1.head())
df1[df1['lang'] == 'en'].shape
TextBlob(i).detect_language()

## TODO: Book

In [None]:
meta = pd.read_csv('data/nlp/gutenberg/metadata/metadata.csv')

available_texts = list(os.listdir('data/nlp/gutenberg/data/text'))

clean_meta = meta[~meta.author.isin({'Anonymous', 'Various'})]
clean_meta = clean_meta[clean_meta.language.apply(lambda x: 'en' in x)]
clean_meta = clean_meta[clean_meta.id.apply(lambda x: f'{x}_text.txt' in available_texts)]

In [None]:
book_data_meta = book_data.merge(clean_meta, on=['id']).drop('type', axis=1).dropna()
n_values = len(book_data_meta.author.unique())
book_mapping = {k: v for k, v in zip(book_data_meta.author.unique(), range(n_values))}

book_data_meta['id_2'] = book_data_meta.author.apply(lambda x: 'book_' + str(book_mapping[x]))
book_data_meta