In [14]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


from string import punctuation

In [None]:
# Read dataset/SolanaNFTs_top.csv into df, splitting rows on '\n' only.
df = pd.read_csv(
    'dataset/SolanaNFTs_top.csv',
    lineterminator='\n',
)

In [None]:
##Step 1: Remove rows whose title is [removed], [deleted] or missing (NaN)

# drop rows whose title is the literal placeholder [removed]
mask_removed = df['title'] == '[removed]'
df = df.loc[~mask_removed,:].reset_index(drop = True)

# drop rows whose title is the literal placeholder [deleted]
mask_deleted = df['title']== '[deleted]'
df = df.loc[~mask_deleted,:].reset_index(drop = True)

# drop rows whose title is missing.
# With its default settings read_csv turns empty / "NaN" cells into real NaN
# values, and NaN never compares equal to the string 'NaN', so the missing
# check has to use isna(). The string comparison is kept as well so files
# loaded with NaN parsing switched off behave as before.
# NOTE(review): if some rows legitimately have no title, they are dropped here - confirm that is wanted.
mask_NaN = df['title'].isna() | (df['title'] == 'NaN')
df = df.loc[~mask_NaN,:].reset_index(drop = True)

In [157]:
##Step 2: Convert to lower case

# Lower-case both text columns; every cell is cast to str first so that
# non-string cells do not break the call to .lower()
for text_col in ('title', 'selftext'):
    df[text_col] = df[text_col].astype(str).map(str.lower)

In [158]:
##Step 3: Replace newline and carriage return characters

def replace_newline(s):
    '''Return s with each newline and each carriage return replaced by one space.'''
    for line_break in ('\n', "\r"):
        s = re.sub(line_break, ' ', s)
    return s

# run replace_newline over every value of both text columns
for text_col in ('title', 'selftext'):
    df[text_col] = df[text_col].map(replace_newline)

In [159]:
##Step 4: Replace common english slang with full words

# (short form, expansion) pairs, tried in this order against lower-cased text.
# Keeping each pair together avoids the two parallel lists drifting out of step.
_SLANG_EXPANSIONS = (
    ("i'd", "i would"), ("you'd", "you would"), ("we'd", "we would"), ("they'd", "they would"),
    ("i'll", "i will"), ("you'll", "you will"), ("we'll", "we will"), ("they'll", "they will"),
    ("i'm", "i am"), ("y'all", "you all"),
    ("you're", "you are"), ("we're", "we are"), ("they're", "they are"),
    ("i've", "i have"), ("you've", "you have"), ("we've", "we have"), ("they've", "they have"),
    ("didn't", "did not"), ("shouldn't", "should not"), ("couldn't", "could not"),
    ("wouldn't", "would not"), ("won't", "will not"), ("can't", "cannot"),
    ("mustn't", "must not"), ("ain't", "am not"), ("isn't", "is not"),
    ("that's", "that is"), ("doesn't", "does not"), ("it'd", "it would"), ("it'll", "it will"),
    ("don't", "do not"), ("there's", "there is"), ("it's", "it is"),
    ("imo", "in my opinion"),
)

# Compiled once; each short form must stand alone as a whole word.
_SLANG_PATTERNS = tuple(
    (re.compile(r"\b%s\b" % re.escape(short)), full)
    for short, full in _SLANG_EXPANSIONS
)

# "/month" and "/ month", with or without spaces before the slash. This cannot be
# written with a leading \b: that only matches when a word character touches the
# slash and then glues the output onto it ("5/month" -> "5per month").
_SLANG_PER_MONTH = re.compile(r"\s*/\s*month\b")


def replace_slang(s):
    '''Expand common contractions and shorthand in s to their full wording.

    s is expected to be lower-cased already, since the patterns are lower-case.
    Contractions such as "don't" and the abbreviation "imo" are expanded when
    they appear as whole words; "/month" and "/ month" become " per month".
    Returns the rewritten string.
    '''
    for pattern, full in _SLANG_PATTERNS:
        s = pattern.sub(full, s)

    # The leading space keeps the number and "per" apart; runs of spaces are
    # collapsed by the later cleaning steps.
    return _SLANG_PER_MONTH.sub(" per month", s)

# run replace_slang over every value of both text columns
for text_col in ('title', 'selftext'):
    df[text_col] = df[text_col].map(replace_slang)

In [160]:
##Step 5: Remove urls

# remove web addresses from a piece of text
def remove_urls(s):
    '''Replace web addresses in s with a single space and collapse repeated spaces.

    A whitespace-delimited token is removed when it contains "http" followed by
    at least one more character (which also covers "https"), or when it contains
    ".us", ".co.uk" or ".com" ending at a word boundary. The word boundary means
    bare domains such as "opensea.com" are removed, while ordinary words after a
    full stop with no space ("good.usually") are left alone.
    Returns the cleaned string; leading/trailing spaces are not stripped here.
    '''
    s = re.sub(r"http\S+", " ", s)
    s = re.sub(r"\S+\.us\b\S*", " ", s)
    s = re.sub(r"\S+\.co\.uk\b\S*", " ", s)
    s = re.sub(r"\S+\.com\b\S*", " ", s)
    # each removal leaves a space behind, so squeeze runs of spaces back to one
    s = re.sub(r" +", " ", s)
    return s

# run remove_urls over every value of both text columns
for text_col in ('title', 'selftext'):
    df[text_col] = df[text_col].map(remove_urls)

In [161]:
##Step 6: Strip non-ASCII characters (encode to ASCII ignoring errors, then decode) to remove unknown characters that have crept into the comments

def encode_decode(s):
    '''Return s with every character that cannot be encoded as ASCII dropped.'''
    ascii_only = s.encode('ascii', errors = 'ignore')
    return ascii_only.decode('utf-8')

# strip non-ASCII characters from every value of both text columns
for text_col in ('title', 'selftext'):
    df[text_col] = df[text_col].map(encode_decode)

In [162]:
##Step 7: Remove apostrophes, quotes, stars (*), leftover HTML entities and extra spaces

# remove apostrophes, quotes, stars and leftover HTML entities
def remove_apostrophe(s):
    '''Strip quote characters and a few HTML leftovers from s.

    Single and double quotes are deleted; "*" and the entity "&gt;" (with or
    without its semicolon) become a space; "&amp;#37;" becomes "%" and "&amp;"
    becomes "&". Runs of spaces are then collapsed to one and the result is
    stripped of leading/trailing whitespace.
    '''
    s = re.sub(r"'", '', s)
    s = re.sub(r'"', "", s)
    # raw string: "\*" in a normal literal is an invalid escape sequence
    s = re.sub(r"\*", " ", s)
    s = re.sub(r"&gt;", " ", s)
    s = re.sub(r"&gt", " ", s)
    # the percent entity must be handled before the bare "&amp;" rule below
    s = re.sub(r"&amp;#37;", "%", s)
    s = re.sub(r"&amp;", "&", s)
    s = re.sub(r" +", " ", s)
    return s.strip()

# run remove_apostrophe over every value of both text columns
for text_col in ('title', 'selftext'):
    df[text_col] = df[text_col].map(remove_apostrophe)

In [163]:
##Step 8: Some slangs weren't caught last time.

# (short form, expansion) pairs for contractions whose apostrophe is already gone,
# tried in this order. "they'll" and "at&amp;t" are kept for text that has not been
# through the apostrophe/entity clean-up; "theyll" and "at&t" are the forms that
# actually occur once apostrophes are deleted and "&amp;" is decoded to "&".
_SLANG_AGAIN_EXPANSIONS = (
    ("id", "i would"), ("youd", "you would"), ("theyd", "they would"),
    ("youll", "you will"), ("they'll", "they will"), ("theyll", "they will"),
    ("im", "i am"), ("yall", "you all"), ("youre", "you are"), ("theyre", "they are"),
    ("ive", "i have"), ("youve", "you have"), ("weve", "we have"), ("theyve", "they have"),
    ("didnt", "did not"), ("shouldnt", "should not"), ("couldnt", "could not"),
    ("wouldnt", "would not"), ("wont", "will not"), ("cant", "cannot"),
    ("mustnt", "must not"), ("aint", "am not"), ("isnt", "is not"),
    ("thats", "that is"), ("doesnt", "does not"), ("itd", "it would"), ("ill", "i will"),
    ("w/ live", "with live"), ("w/live", "with live"), ("w/no", "with no"),
    ("itll", "it will"), ("w/ no", "with no"),
    ("dont", "do not"), ("theres", "there is"),
    ("at&amp;t", "att"), ("at&t", "att"),
    ("its", "it is"),
)

# Compiled once; each short form must stand alone as a whole word.
_SLANG_AGAIN_PATTERNS = tuple(
    (re.compile(r"\b%s\b" % re.escape(short)), full)
    for short, full in _SLANG_AGAIN_EXPANSIONS
)

# "/month" and "/ month", with or without spaces before the slash. A leading \b
# would only match when a word character touches the slash and would glue the
# output onto it ("5/month" -> "5per month").
_SLANG_AGAIN_PER_MONTH = re.compile(r"\s*/\s*month\b")


def replace_slang_again(s):
    '''Expand apostrophe-less contractions and shorthand in s to their full wording.

    s is expected to be lower-cased. Whole-word matches of forms such as "dont",
    "im" or "w/no" are expanded, "at&t" becomes "att", and "/month" / "/ month"
    become " per month". Note that real words spelled like a stripped contraction
    ("id", "ill", "its", "wont", "cant") are rewritten as well.
    Returns the rewritten string.
    '''
    for pattern, full in _SLANG_AGAIN_PATTERNS:
        s = pattern.sub(full, s)

    # The leading space keeps the number and "per" apart; runs of spaces are
    # collapsed by the later cleaning steps.
    return _SLANG_AGAIN_PER_MONTH.sub(" per month", s)

# run replace_slang_again over every value of both text columns
for text_col in ('title', 'selftext'):
    df[text_col] = df[text_col].map(replace_slang_again)

In [164]:
##Step 9: Remove parentheses, square brackets and other markup symbols (|, ^, #)

# remove brackets and other markup symbols
def remove_parantheses(s):
    '''Replace bracket and markup characters in s with spaces.

    Each of ( ) [ ] | ^ # becomes a space; runs of spaces are then collapsed to
    one and leading/trailing whitespace is stripped.
    '''
    # one raw-string character class instead of seven separate substitutions
    # (the non-raw "\(" style literals were invalid escape sequences)
    s = re.sub(r"[()\[\]|^#]", " ", s)
    s = re.sub(r" +", " ", s)
    return s.strip()

# run remove_parantheses over every value of both text columns
for text_col in ('title', 'selftext'):
    df[text_col] = df[text_col].map(remove_parantheses)

In [165]:
##Step 10: Add space after full stops where it is directly followed by a character other than a space

# Many comments have full stops not followed by any spaces. Lets correct this.
def fullstop_space(s):
    '''Insert a space after every "." or "," that is directly followed by text.

    No space is added when the next character is whitespace, another "." (so
    ellipses stay intact) or a digit (so "1.5" and "1,000" stay intact).
    Returns the corrected string.
    '''
    # The negated class must be closed before the lookahead ends; written as
    # [^\s\.[0-9]] it closed early and demanded a literal "]" after the next
    # character, so the substitution practically never fired.
    s = re.sub(r'(?<=[.,])(?=[^\s.0-9])', r' ', s)
    return s

# run fullstop_space over every value of both text columns
for text_col in ('title', 'selftext'):
    df[text_col] = df[text_col].map(fullstop_space)

In [166]:
##Step 11: Remove empty comments
# These are comments that had just an unknown character in them

# Flag rows where either text column ended up empty. The two checks are
# combined with | ; assigning them one after the other to the same name would
# silently throw the title check away.
# NOTE(review): "either column empty" is assumed to be the intent - confirm (vs. both empty).
mask_empty = (df['title'] == '') | (df['selftext'] == '')
df = df.loc[~mask_empty, :]

In [167]:
# Write the cleaned frame to disk without the row index column.
df.to_csv(
    'dataset/preproc_SolanaNFTs_top.csv',
    index = False,
)

In [188]:
# Read dataset/SolanaNFTs_top.csv again; this rebinds df, so the cleaned frame
# built by the earlier steps is no longer reachable through that name.
df = pd.read_csv(
    'dataset/SolanaNFTs_top.csv',
    lineterminator='\n',
)
# df = df.drop(columns=['Unnamed: 0'])
# (rows, columns) of the freshly loaded frame
df.shape

(217, 15)

In [None]:


# Append extra words to the existing stop-word collection.
# NOTE(review): stop_list is not created in any cell shown here - confirm it is
# defined before this cell runs, otherwise this raises NameError on a fresh kernel.
stop_list += [
    'would', 'said', 'say', 'year', 'day', 'also', 'first', 'last',
    'one', 'two', 'people', 'told', 'new', 'could', 'singapore', 'three',
    'may', 'like', 'world', 'since', 'nft', 'metaverse', 'fashion', 'amp',
    'digital', 'project', 'nfts', 'eyesoffashion',
]

In [3]:
# Read combined_top_data.csv into df (rebinding the name), splitting rows on '\n' only.
df = pd.read_csv(
    'combined_top_data.csv',
    lineterminator='\n',
)
# df = df.drop(columns=['Unnamed: 0'])
# (rows, columns) of the loaded frame
df.shape

  df = pd.read_csv('combined_top_data.csv', lineterminator='\n')


(325132, 16)

In [9]:
# Show only the rows whose link_flair_richtext is not the literal string "[]".
df.loc[df['link_flair_richtext'].ne("[]")]

Unnamed: 0,0,subreddit,title,selftext,id,date_created,author,link_flair_richtext,permalink,ups,downs,num_comments,score,total_awards_received,submission,p or c
643,0,subreddit,title,selftext,id,date_created,author,link_flair_richtext,permalink,ups,downs,num_comments,score,total_awards_received,submission,p or c
4728,0,subreddit,title,selftext,id,date_created,author,link_flair_richtext,permalink,ups,downs,num_comments,score,total_awards_received,submission,p or c
5915,0,subreddit,title,selftext,id,date_created,author,link_flair_richtext,permalink,ups,downs,num_comments,score,total_awards_received,submission,p or c
6512,0,subreddit,title,selftext,id,date_created,author,link_flair_richtext,permalink,ups,downs,num_comments,score,total_awards_received,submission,p or c
6513,1,opensea,F.A.Q thread,"Hey everyone, I wanted to make this sticky FAQ...",m6o42y,2021-03-17 00:53:09,profgrosvenor,"[{'e': 'text', 't': 'Discussion - General'}]",/r/opensea/comments/m6o42y/faq_thread/,1150,0,1886,1150,76,m6o42y,post
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324909,3834,CryptoArt,,Link here: https://app.rarible.com/token/0x60f...,gj4zksf,2021-01-13 18:26:22,kingfathom,"[{'e': 'text', 't': 'verified-artist'}]",/r/CryptoArt/comments/kwmiox/dipping_my_toe_in...,2,0,,2,0,kwmiox,comment
324910,3835,CryptoArt,,"Very nice, love the colors! What program did y...",gj7f0jv,2021-01-14 07:02:32,Iteration__,"[{'e': 'text', 't': 'verified-artist'}]",/r/CryptoArt/comments/kwmiox/dipping_my_toe_in...,2,0,,2,0,kwmiox,comment
324912,3837,CryptoArt,,Hey thanks! It's a program called Mandelbulb 3...,gj8nxra,2021-01-14 16:16:42,kingfathom,"[{'e': 'text', 't': 'verified-artist'}]",/r/CryptoArt/comments/kwmiox/dipping_my_toe_in...,2,0,,2,0,kwmiox,comment
324913,3838,CryptoArt,,Hey thanks! I'll follow back. \nAnd yeah for s...,gjb75c8,2021-01-15 04:21:10,kingfathom,"[{'e': 'text', 't': 'verified-artist'}]",/r/CryptoArt/comments/kwmiox/dipping_my_toe_in...,2,0,,2,0,kwmiox,comment
