# Clean users and posts compiled datasets 

For both the users and posts datasets:
- Pare down features
- Drop nulls and deal with missing data
- Remove usernames and HTML artifacts
- Join users to post dataset
- Export the cleaned data as 2 separate CSV files: modeling (for train and test) and validation

In [1]:
import numpy as np
import pandas as pd
import re

# Raise pandas' display limits: show up to 100 columns and up to 500 characters per cell
pd.options.display.max_columns = 100
pd.options.display.max_colwidth = 500

In [2]:
# Load the raw posts sample, then display its (rows, columns) shape
raw_posts_path = 'data/posts_01_raw_sample.csv'
posts = pd.read_csv(raw_posts_path)
posts.shape

(100000, 38)

In [3]:
# Preview the first three raw rows
posts.head(n=3)

Unnamed: 0,comments,body,bodywithurls,createdAt,createdAtformatted,creator,datatype,depth,depthRaw,followers,following,hashtags,id,lastseents,links,media,posts,sensitive,shareLink,upvotes,urls,username,verified,article,impressions,preview,reposts,state,parent,color,commentDepth,controversy,downvotes,post,score,isPrimary,conversation,replyingTo
0,0,Possibly........\n\n•Chynaa.\n•Soros.\n•Globalists.\n•Deep State.\n\nAnd a whole host of brain washed Idiots - investing in their own future safety.,Possibly........\n\n•Chynaa.\n•Soros.\n•Globalists.\n•Deep State.\n\nAnd a whole host of brain washed Idiots - investing in their own future safety.\n,20200900000000.0,2020-09-01 18:05:07 UTC,e0ccf0acef0a43fa9ea7a447debdc781,comments,2.0,2.0,701.0,525.0,[],08511f2c61514e0f805e299e467f4727,2020-12-26T12:16:52.766453+00:00,[],168.0,1100.0,0.0,https://parler.com/comment/08511f2c61514e0f805e299e467f4727,0.0,[],Dd061973,0.0,,,,,,692baa94e4c845df829fa5b333a0e61b,#808080,1.0,0.0,0.0,19a9db6ce1c040f1accff10028d90cb8,0.0,0.0,,
1,0,Right!,Right!\n,20200720000000.0,2020-07-24 20:58:59 UTC,781e9ee94ab242f294627d69ee1e74ac,comments,1.0,1.0,10.0,12.0,[],52059b3edb194960b7f1db5fa577f2d9,2021-01-09T18:36:02.804212+00:00,[],0.0,18.0,0.0,https://parler.com/comment/52059b3edb194960b7f1db5fa577f2d9,0.0,[],AlisonHMcvay,0.0,,,,,,77528f6960b34bb691405135b27f9782,#a60303,0.0,0.0,0.0,77528f6960b34bb691405135b27f9782,0.0,1.0,,
2,0,Cuomo is an egotistical asshole. His day is coming.,Cuomo is an egotistical asshole. His day is coming.\n,20201130000000.0,2020-11-29 16:25:55 UTC,9c46ba5cdb7445b28d1e301ad873bb75,comments,1.0,1.0,3100.0,5700.0,[],5d3df500ce124a99bf93d15518b732ee,2021-01-09T16:02:03.575467+00:00,[],1.0,7900.0,0.0,https://parler.com/comment/5d3df500ce124a99bf93d15518b732ee,1.0,[],Mlaster206,0.0,,,,,,7c187094ac5c4ed2a408d6288f923b4e,#a60303,0.0,0.0,0.0,7c187094ac5c4ed2a408d6288f923b4e,1.0,1.0,,


## Drop columns

In [4]:
# Discard columns that are not used in the rest of this notebook
columns_to_drop = ['bodywithurls', 'createdAt', 'color', 'shareLink', 'urls']
posts = posts.drop(columns=columns_to_drop)

In [5]:
# Normalise every column label to lowercase (e.g. 'depthRaw' -> 'depthraw')
posts.columns = [label.lower() for label in posts.columns]

## Clean up text with regex 

In [6]:
# remove usernames: replace "@" plus any run of ASCII letters/digits with a single space
# (the "*" quantifier means a lone "@" is replaced as well).
# The pattern is a raw string because "\@" is not a valid Python string escape and
# raises an invalid-escape-sequence warning on recent Python versions; the regex
# itself is unchanged.
# NOTE: str(x) coerces every value to text, so a missing body (NaN) becomes the
# literal string 'nan' and no longer counts as null afterwards.
username_pattern = re.compile(r"\@[a-zA-Z0-9]*")
posts['body'] = posts['body'].map(lambda x: username_pattern.sub(' ', str(x)))

In [7]:
# Replace newline / carriage-return characters and the listed HTML entity
# leftovers with a single space. Values are passed through str() first, so
# non-string entries are converted to text before substitution.
artifact_pattern = re.compile("\n|\r|&amp;#x200B;|&amp;")
posts['body'] = posts['body'].map(lambda text: artifact_pattern.sub(' ', str(text)))

## Remove rows where body text contains the following words 
- parler

In [8]:
# Keep only the rows whose body does NOT contain the substring 'parler'.
# - regex=False matches the text literally instead of compiling it as a pattern.
# - na=False treats a missing body as "no match", so the mask is purely boolean
#   and can be inverted with ~ (replacing the `== False` comparison).
# NOTE(review): the match is case-sensitive, so bodies containing 'Parler' or
# 'PARLER' are kept — confirm whether case=False is what is actually wanted.
mentions_parler = posts['body'].str.contains('parler', regex=False, na=False)
posts = posts[~mentions_parler]

In [9]:
# Preview the first two rows after text cleaning and filtering
posts.head(n=2)

Unnamed: 0,comments,body,createdatformatted,creator,datatype,depth,depthraw,followers,following,hashtags,id,lastseents,links,media,posts,sensitive,upvotes,username,verified,article,impressions,preview,reposts,state,parent,commentdepth,controversy,downvotes,post,score,isprimary,conversation,replyingto
0,0,Possibly........ •Chynaa. •Soros. •Globalists. •Deep State. And a whole host of brain washed Idiots - investing in their own future safety.,2020-09-01 18:05:07 UTC,e0ccf0acef0a43fa9ea7a447debdc781,comments,2.0,2.0,701.0,525.0,[],08511f2c61514e0f805e299e467f4727,2020-12-26T12:16:52.766453+00:00,[],168.0,1100.0,0.0,0.0,Dd061973,0.0,,,,,,692baa94e4c845df829fa5b333a0e61b,1.0,0.0,0.0,19a9db6ce1c040f1accff10028d90cb8,0.0,0.0,,
1,0,Right!,2020-07-24 20:58:59 UTC,781e9ee94ab242f294627d69ee1e74ac,comments,1.0,1.0,10.0,12.0,[],52059b3edb194960b7f1db5fa577f2d9,2021-01-09T18:36:02.804212+00:00,[],0.0,18.0,0.0,0.0,AlisonHMcvay,0.0,,,,,,77528f6960b34bb691405135b27f9782,0.0,0.0,0.0,77528f6960b34bb691405135b27f9782,0.0,1.0,,


In [13]:
# Percentage of missing values per column, sorted ascending
# (the mean of a boolean null mask is the fraction of nulls)
posts.isna().mean().mul(100).sort_values()

comments               0.000000
verified               0.000000
username               0.000000
sensitive              0.000000
posts                  0.000000
media                  0.000000
links                  0.000000
lastseents             0.000000
id                     0.000000
upvotes                0.000000
following              0.000000
followers              0.000000
depthraw               0.000000
depth                  0.000000
datatype               0.000000
creator                0.000000
createdatformatted     0.000000
body                   0.000000
hashtags               0.000000
parent                13.583560
score                 22.592341
post                  22.592341
downvotes             22.592341
controversy           22.592341
commentdepth          22.592341
isprimary             32.367805
preview               77.407659
reposts               77.407659
impressions           77.407659
article               78.104459
state                 78.836708
conversa

In [16]:
# Remove the 'conversation' and 'replyingto' columns before export
extra_columns = ['conversation', 'replyingto']
posts = posts.drop(columns=extra_columns)

## Export cleaned posts csv

In [17]:
# Write the cleaned posts to CSV, leaving out the DataFrame index
cleaned_posts_path = 'data/posts_01_cleaned_sample.csv'
posts.to_csv(cleaned_posts_path, index=False)