# Imports

In [None]:
import re, string, unicodedata
import nltk
import contractions
import inflect
# from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
import pandas as pd
import os
import numpy as np
import time
import seaborn as sns 
import scipy 
import statistics
import matplotlib.pyplot as plt

# Data addresses

In [None]:
# Input directory holding the raw per-table CSV exports, and output directory
# for the cleaned data.
origDataAddr = 'data/original/TheRedPill/'
outAddr = 'data/preprocessed/TheRedPill/'

# exist_ok=True creates the directory tree when missing and is a no-op when it
# is already there, without a separate (race-prone) existence check.
os.makedirs(outAddr, exist_ok=True)

# Table names

In [None]:
# Monthly BigQuery post tables, in chronological order: 2015_12, every month
# of 2016 through 2018, then 2019_01. Each name keeps its surrounding backticks.
tableNames = ["`fh-bigquery.reddit_posts.2015_12`"]

for year in range(2016, 2018+1):
    for month in range(1, 12+1):
        month = "{:02d}".format(month)  # zero-pad the month to two digits
        tableNames.append("`fh-bigquery.reddit_posts.{}_{}`".format(year, month))

tableNames.append('`fh-bigquery.reddit_posts.2019_01`')

tableNames

# Read reddit data

In [None]:
# Load one CSV per table into dfDict, keyed by the table name without its
# surrounding backticks. A table that cannot be read is reported and skipped,
# so a single missing or malformed file does not abort the whole run.
dfDict = {}
for tableName in tableNames:
    print (tableName)
    filename = tableName[1:-1]  # strip the enclosing backticks
    try:
        df = pd.read_csv(origDataAddr + filename + '.csv', index_col=0)
    except (OSError, ValueError) as err:
        # OSError covers a missing/unreadable file; ValueError covers pandas'
        # parser and empty-data errors as well as text decoding failures.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        print ('Error', err)
        continue
    print (len(df))
    dfDict[filename] = df

In [None]:
# Stack every loaded table into a single frame with a fresh 0..n-1 index.
df = pd.concat(dfDict.values(), ignore_index=True)

In [None]:
# Print the row count of the combined frame; the bare `df` on the last line is
# there so the frame is displayed when this runs as a notebook cell.
print (len(df))
df

# Use only relevant columns, replace name column with t3_id

In [None]:
# Reduce every table to the columns used downstream and rebuild `name` as the
# post id prefixed with 't3_'.
cols = [ 'name', 'id', 'title', 'selftext' ]
for tablename in dfDict:
    print (tablename)
    # .copy() yields an independent frame, so the column assignment below is
    # not a chained assignment on a slice (avoids SettingWithCopyWarning).
    df = dfDict[tablename][cols].copy()
    df['name'] = 't3_'+df['id']
    dfDict[tablename] = df
    print (len(df))

In [None]:
# Re-stack the column-reduced tables into one frame with a fresh index, report
# its size, and leave it as the cell's displayed value.
df = pd.concat(dfDict.values(), ignore_index=True)
print (len(df))
df

# Suggested text prep flow


In [None]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

# Shared ekphrasis pre-processor; later cells call
# `text_processor.pre_process_doc` on each title and selftext.
text_processor = TextPreProcessor(
    # terms that will be normalized
    # NOTE(review): 'url' is listed twice below; presumably harmless, but
    # confirm against the ekphrasis documentation before relying on it.
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own.
    # The tokenizer should take a string as input and return a list of tokens.
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text
    # with other expressions. You can pass more than one dictionary.
    dicts=[emoticons]
)


# Preprocess title, selftext, comment

In [None]:
# Run the ekphrasis pipeline over the title and selftext of every table and
# store the results in the new *_ekphrasis columns.
for tablename in dfDict:
    print (tablename)

    start = time.time()
    df = dfDict[tablename]
    print (len(df))
    df = df.dropna(subset=['name'])
    print ('After dropping nan names have ', len(df))
    # .copy() makes the filtered result an independent frame, so adding columns
    # below does not raise SettingWithCopyWarning.
    df = df.drop_duplicates(subset=['name'], keep='first').copy()
    print ('After dropping duplicate names ', len(df))

    # Missing texts become '' so the text processor always receives a str.
    # NOTE: only NaN is replaced here; '[removed]' / '[deleted]' placeholder
    # bodies are passed through unchanged.
    df['title_ekphrasis'] = df['title'].fillna('')
    df['selftext_ekphrasis'] = df['selftext'].fillna('')
    print ('After replaceing nans texts with emptry string ')

    df['title_ekphrasis'] = df['title_ekphrasis'].apply(text_processor.pre_process_doc)
    df['title_ekphrasis'] = df['title_ekphrasis'].fillna('')

    print ('After title prep')
    df['selftext_ekphrasis'] = df['selftext_ekphrasis'].apply(text_processor.pre_process_doc)
    df['selftext_ekphrasis'] = df['selftext_ekphrasis'].fillna('')

    print ('After selef text prep')

    dfDict[tablename] = df
#     df.to_csv(outAddr + tablename + '.csv')

    end = time.time()
    print('Time ', end - start)
#     break


In [None]:
# Combine the ekphrasis-processed tables into one freshly indexed frame,
# print its size, and show it.
df = pd.concat(dfDict.values(), ignore_index=True)
print (len(df))
df

# Find list of hashtags used

In [None]:
def findOccurrences(s, ch):
    """Return the positions in `s` whose element equals `ch`, in ascending order.

    An empty list is returned when nothing matches.
    """
    positions = []
    for position, element in enumerate(s):
        if element == ch:
            positions.append(position)
    return positions

# Scan every post body for '#' characters. For each one, print the token that
# starts at the '#' (plus the following token when there is one) and count the
# '#'-token in `hashtags`, so the dict actually holds the tags that were found.
hashtags = {}
for tablename in dfDict:
    df = dfDict[tablename]
    for text in df['selftext']:
        if not isinstance(text, str):
            continue  # skip non-string values (e.g. NaN for a missing body)
        for find in findOccurrences(text, '#'):
            # Only the first two tokens are needed, so cap the split instead
            # of tokenizing the whole remaining text for every '#'.
            tokens = text[find : ].split(None, 2)
            if len(tokens) > 1:
                print (tokens[0], tokens[1])
            else:
                print (tokens[0])
            hashtags[tokens[0]] = hashtags.get(tokens[0], 0) + 1


In [None]:
# Display the `hashtags` dict created by the hashtag-scanning cell.
hashtags

# Remove certain things in tags

In [None]:
def cleanEkphrasis(sample):
    """Join a token sequence with spaces and blank out selected annotation tags.

    `sample` is an iterable of string tokens. Every occurrence of the tags
    listed below is replaced by a single space; all other text, including any
    other tags, is left untouched. Returns the resulting string.
    """
    dropped_tags = (
        '<allcaps>',
        '</allcaps>',
        '<repeated>',
        '<elongated>',
        '<emphasis>',
        '<url>',
    )

    cleaned = ' '.join(sample)
    for tag in dropped_tags:
        cleaned = cleaned.replace(tag, ' ')
    return cleaned

In [None]:
# Columns worth persisting (only referenced by the commented-out save below).
cols = [ 'name', 'title_preprocessed', 'selftext_preprocessed', 'title_ekphrasis', 'selftext_ekphrasis' ]

# Turn the ekphrasis token lists into plain strings with selected annotation
# tags removed, stored in the new *_preprocessed columns.
for tablename in dfDict:
    print (tablename)

    start = time.time()
    df = dfDict[tablename]
    print (len(df))

    # The *_preprocessed columns are derived directly from the *_ekphrasis
    # columns; the raw title/selftext are not read here.
    df['title_preprocessed'] = df['title_ekphrasis'].apply(cleanEkphrasis)
    df['title_preprocessed'] = df['title_preprocessed'].fillna('')

    print ('After title prep')
    df['selftext_preprocessed'] = df['selftext_ekphrasis'].apply(cleanEkphrasis)
    df['selftext_preprocessed'] = df['selftext_preprocessed'].fillna('')

    print ('After self text prep')

    dfDict[tablename] = df
#     df[cols].to_csv(outAddr + tablename + '.csv') #to save space on server, only save relevant columns
#     df.to_csv(outAddr + tablename + '.csv')

    end = time.time()
    print('Time ', end - start)
#     break


In [None]:
# Combine the cleaned tables into one freshly indexed frame, print its size,
# and show it.
df = pd.concat(dfDict.values(), ignore_index=True)
print (len(df))
df

# Average length distribution of posts before len limiting, after cleaning

In [None]:
# Full post text = cleaned title, a single space, then the cleaned body.
title_part = df['title_preprocessed']
body_part = df['selftext_preprocessed']
df['text_preprocessed'] = title_part + ' ' + body_part

In [None]:
# `import scipy` alone does not guarantee that the `scipy.stats` submodule is
# loaded, so import it explicitly before using it.
from scipy import stats

# Character length of every combined post text, before any length filtering.
x = [ len(i) for i in df['text_preprocessed'] ]
print (len(x))
print ('Mode ', statistics.mode(x))
print ('Median ', statistics.median(x))
print (stats.describe(x))
plt.title('TheRedPill length distribution before len limit after clean')
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept
# because the installed seaborn version is not visible from this file.
sns.distplot(x)

# heuristics for better text classification
1. Impose length limits: drop posts whose body text (selftext, not including the title) is shorter than 256 or longer than 4096 characters AFTER cleaning, i.e. keep only posts with 256 <= len <= 4096.
2. Drop those without selftext 

In [None]:
# Start the filtering stage from a fresh, re-indexed stack of all tables;
# print its size and show it.
df = pd.concat(dfDict.values(), ignore_index=True)
print (len(df))
df

In [None]:
# Keep only the posts that have a body (selftext is not missing).
print ('before dropping those whose selftext is None ', len(df))
has_selftext = df['selftext'].notna()
df = df[has_selftext]
print ('After dropping those whose selftext is None ', len(df))

In [None]:
# Keep only the posts whose `name` is present.
print ('Dropping all those without names ', len(df))
has_name = df['name'].notna()
df = df[has_name]
print ('After ', len(df))

In [None]:
# De-duplicate on `name`, keeping the first occurrence of each.
print ('Dropping all those with duplicate names ', len(df))
repeated_name = df.duplicated(subset=['name'], keep='first')
df = df[~repeated_name]
print ('After ', len(df))

In [None]:
# Character count of each cleaned body, used by the length filters.
df['len_selftext_preprocessed'] = df['selftext_preprocessed'].apply(len)

In [None]:
# Upper bound: keep bodies of at most 4096 characters, then renumber the rows.
print ('before dropping coz too long selftext ', len(df))
not_too_long = df['len_selftext_preprocessed'] <= 4096
len_controlled_df = df[not_too_long].reset_index(drop=True)
print ('after dropping ', len(len_controlled_df))

In [None]:
# Lower bound: keep bodies of at least 256 characters, then renumber the rows.
print ('before dropping coz too short ', len(len_controlled_df))
long_enough = len_controlled_df['len_selftext_preprocessed'] >= 256
len_controlled_df = len_controlled_df[long_enough].reset_index(drop=True)
print ('after dropping ', len(len_controlled_df))


# Combine title, selftext

In [None]:
# Full post text = cleaned title, a single space, then the cleaned body.
title_part = len_controlled_df['title_preprocessed']
body_part = len_controlled_df['selftext_preprocessed']
len_controlled_df['text_preprocessed'] = title_part + ' ' + body_part

In [None]:
# Number of posts remaining after the length filters.
print (len(len_controlled_df))

# Average length distribution of posts after len limiting, after cleaning

In [None]:
# `import scipy` alone does not guarantee that the `scipy.stats` submodule is
# loaded, so import it explicitly before using it.
from scipy import stats

# Character length of every combined post text, after the length filtering.
x = [ len(i) for i in len_controlled_df['text_preprocessed'] ]
print (len(x))
print ('Mode ', statistics.mode(x))
print ('Median ', statistics.median(x))
print (stats.describe(x))
plt.title('TheRedPill length distribution after len limit after clean')
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept
# because the installed seaborn version is not visible from this file.
sns.distplot(x)

# Save df

In [None]:
# From here on `df` refers to the length-filtered frame (same object, not a
# copy); print its row count.
df = len_controlled_df
print (len(df))

In [None]:
# Write the final frame to the output directory.
cleaned_csv_path = os.path.join(outAddr, 'TheRedPillCleaned.csv')
df.to_csv(cleaned_csv_path)

In [None]:
# Display the output directory the CSV was written to.
outAddr

In [None]:
# Display the combined title + body text column of the saved frame.
df['text_preprocessed']