In [1]:
# Dataset reading, preprocessing and saving as csv

import json
import pandas as pd 
import numpy as np

# Read the newline-delimited JSON export (one JSON record per line) into a DataFrame.
# The path is written as a raw string so the Windows backslashes are taken
# literally; the earlier literal depended on invalid escape sequences
# (backslash-D, backslash-p), which newer Python versions warn about.
# The resulting path string is unchanged.
parler_df = pd.read_json(r'D:\bachelors_thesis\Datasets\parler_data\parler_data000000000014.ndjson', lines=True)

In [2]:
# keep only the post text column

print(parler_df.columns)
print('Dimension of whole dataframe: ' + str(parler_df.shape) + '\n') # df.shape -> (rows, columns)

# Select 'body' by name instead of dropping every other column by position.
# The positional slice (2:38) plus dropping 'comments' only leaves 'body' when
# the input has exactly the expected column order and count; with any other
# layout it silently keeps a different column. Selecting by name also makes
# this cell safe to re-run. .copy() gives an independent single-column frame
# so later column assignments do not act on a slice of the full data.
parler_df = parler_df[['body']].copy()

print(parler_df.columns)
print('Dimension of dataframe after removing columns: ' + str(parler_df.shape) + '\n')

Index(['comments', 'body', 'bodywithurls', 'createdAt', 'createdAtformatted',
       'creator', 'datatype', 'depth', 'depthRaw', 'followers', 'following',
       'hashtags', 'id', 'lastseents', 'links', 'media', 'posts', 'sensitive',
       'shareLink', 'upvotes', 'urls', 'username', 'verified', 'article',
       'impressions', 'preview', 'reposts', 'state', 'parent', 'color',
       'commentDepth', 'controversy', 'downvotes', 'post', 'score',
       'isPrimary', 'conversation', 'replyingTo'],
      dtype='object')
Dimension of whole dataframe: (1096458, 38)

Index(['body'], dtype='object')
Dimension of dataframe after removing columns: (1096458, 1)



In [3]:
# filter out null values

print('Dimension of dataframe: ' + str(parler_df.shape)) 
print(parler_df['body']) 

# Empty strings are turned into NaN so that dropna() can remove them.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on parler_df['body'] mutates an intermediate Series, which pandas warns about
# and which does not update the frame when Copy-on-Write is active.
parler_df['body'] = parler_df['body'].replace("", np.nan)
parler_df.dropna(subset=['body'], inplace=True)

print('\n'  + 'Dimension of dataframe after filtering out null values: ' + str(parler_df.shape)) 
print(parler_df['body'])

Dimension of dataframe: (1096458, 1)
0                                                           
1          Sorry BLM, 'Stop snitching' might work in the ...
2                                                           
3                                       Parisian enrichment.
4          Replacing Mark Esper, Acting Defense Secretary...
                                 ...                        
1096453    A few have asked, why the name Copernicus4u? P...
1096454                                    #christians\nAmen
1096455                                                     
1096456                   Murdered by Obama/Biden Feb. 2016.
1096457    #parler #wwg1wga #maga #kag #newuser #news #tr...
Name: body, Length: 1096458, dtype: object

Dimension of dataframe after filtering out null values: (637153, 1)
1          Sorry BLM, 'Stop snitching' might work in the ...
3                                       Parisian enrichment.
4          Replacing Mark Esper, Acting Defense Secretary.

In [4]:
# Write the null-filtered frame (index included) to CSV; the text-cleaning
# cells below run after this snapshot is taken.
parler_df.to_csv(path_or_buf='parler_df_14_before.csv')

In [5]:
# convert to lowercase

def _lowercase_words(text):
    """Lower-case every whitespace-separated token and re-join with single spaces.

    Splitting and re-joining also collapses newlines and repeated blanks into
    one space.
    """
    return ' '.join(token.lower() for token in text.split())

print(parler_df['body'])
parler_df['body'] = parler_df['body'].apply(_lowercase_words)
print(parler_df['body'])

1          Sorry BLM, 'Stop snitching' might work in the ...
3                                       Parisian enrichment.
4          Replacing Mark Esper, Acting Defense Secretary...
5          Have you noticed how loud and shrill #FakeNews...
7          The benefits of hydroxychloroquine are incredi...
                                 ...                        
1096450                           America, respected again!!
1096453    A few have asked, why the name Copernicus4u? P...
1096454                                    #christians\nAmen
1096456                   Murdered by Obama/Biden Feb. 2016.
1096457    #parler #wwg1wga #maga #kag #newuser #news #tr...
Name: body, Length: 637153, dtype: object
1          sorry blm, 'stop snitching' might work in the ...
3                                       parisian enrichment.
4          replacing mark esper, acting defense secretary...
5          have you noticed how loud and shrill #fakenews...
7          the benefits of hydroxychloroqui

In [6]:
# remove emojis

import demoji

def _strip_emojis(text):
    """Return `text` with the emojis matched by demoji replaced by an empty string."""
    return demoji.replace(text, "")

print(parler_df['body'])
parler_df['body'] = parler_df['body'].apply(_strip_emojis)
print(parler_df['body'])

1          sorry blm, 'stop snitching' might work in the ...
3                                       parisian enrichment.
4          replacing mark esper, acting defense secretary...
5          have you noticed how loud and shrill #fakenews...
7          the benefits of hydroxychloroquine are incredi...
                                 ...                        
1096450                           america, respected again!!
1096453    a few have asked, why the name copernicus4u? p...
1096454                                     #christians amen
1096456                   murdered by obama/biden feb. 2016.
1096457    #parler #wwg1wga #maga #kag #newuser #news #tr...
Name: body, Length: 637153, dtype: object
1          sorry blm, 'stop snitching' might work in the ...
3                                       parisian enrichment.
4          replacing mark esper, acting defense secretary...
5          have you noticed how loud and shrill #fakenews...
7          the benefits of hydroxychloroqui

In [7]:
# expand contractions  

import contractions

def _expand_contractions(text):
    """Run contractions.fix on each whitespace-separated token and re-join with spaces."""
    expanded_tokens = map(contractions.fix, text.split())
    return ' '.join(expanded_tokens)

print(parler_df['body'])
parler_df['body'] = parler_df['body'].apply(_expand_contractions)
print(parler_df['body'])

1          sorry blm, 'stop snitching' might work in the ...
3                                       parisian enrichment.
4          replacing mark esper, acting defense secretary...
5          have you noticed how loud and shrill #fakenews...
7          the benefits of hydroxychloroquine are incredi...
                                 ...                        
1096450                           america, respected again!!
1096453    a few have asked, why the name copernicus4u? p...
1096454                                     #christians amen
1096456                   murdered by obama/biden feb. 2016.
1096457    #parler #wwg1wga #maga #kag #newuser #news #tr...
Name: body, Length: 637153, dtype: object
1          sorry blm, 'stop snitching' might work in the ...
3                                       parisian enrichment.
4          replacing mark esper, acting defense secretary...
5          have you noticed how loud and shrill #fakenews...
7          the benefits of hydroxychloroqui

In [8]:
# remove numbers
# NOTE: the pattern matches every run of characters that are not ASCII letters,
# so digits, punctuation and any non-ASCII characters are all replaced by a space.

import re

_NON_LETTER_RUN = re.compile("[^a-zA-Z]+")

def _keep_letters_only(text):
    """Replace each run of non-ASCII-letter characters with a space and normalise spacing."""
    spaced = _NON_LETTER_RUN.sub(" ", text)
    return ' '.join(spaced.split())

print(parler_df['body'])
parler_df['body'] = parler_df['body'].apply(_keep_letters_only)
print(parler_df['body'])

1          sorry blm, 'stop snitching' might work in the ...
3                                       parisian enrichment.
4          replacing mark esper, acting defense secretary...
5          have you noticed how loud and shrill #fakenews...
7          the benefits of hydroxychloroquine are incredi...
                                 ...                        
1096450                           america, respected again!!
1096453    a few have asked, why the name copernicus4u? p...
1096454                                     #christians amen
1096456               murdered by obama/biden february 2016.
1096457    #parler #wwg1wga #maga #kag #newuser #news #tr...
Name: body, Length: 637153, dtype: object
1          sorry blm stop snitching might work in the hoo...
3                                        parisian enrichment
4          replacing mark esper acting defense secretary ...
5          have you noticed how loud and shrill fakenews ...
7          the benefits of hydroxychloroqui

In [9]:
# remove punctuation

import string 

# Translation table that deletes every character listed in string.punctuation.
_PUNCTUATION_DELETER = str.maketrans('', '', string.punctuation)

def _strip_punctuation(text):
    """Return `text` without any of the characters in string.punctuation."""
    return text.translate(_PUNCTUATION_DELETER)

print(parler_df['body'])
parler_df['body'] = parler_df['body'].apply(_strip_punctuation)
print(parler_df['body'])

1          sorry blm stop snitching might work in the hoo...
3                                        parisian enrichment
4          replacing mark esper acting defense secretary ...
5          have you noticed how loud and shrill fakenews ...
7          the benefits of hydroxychloroquine are incredi...
                                 ...                        
1096450                              america respected again
1096453    a few have asked why the name copernicus u pri...
1096454                                      christians amen
1096456                     murdered by obama biden february
1096457    parler wwg wga maga kag newuser news trump tru...
Name: body, Length: 637153, dtype: object
1          sorry blm stop snitching might work in the hoo...
3                                        parisian enrichment
4          replacing mark esper acting defense secretary ...
5          have you noticed how loud and shrill fakenews ...
7          the benefits of hydroxychloroqui

In [10]:
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.corpus import stopwords
# Fetch the NLTK resources this notebook reads. The stopword corpus must be
# available before stopwords.words() is called; on a machine where it was never
# downloaded NLTK raises LookupError, so it is downloaded here just like wordnet.
nltk.download('stopwords')
stop_words = stopwords.words('english')
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cosmi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
# remove stopwords

print(parler_df['body'])

# Build the stopword collection once per run of this cell:
#   * 'not' and 'no' are left out of the stopword list so they stay in the text;
#   * the extra words are added to the list that is actually used for
#     filtering (they used to be appended to a list the filter never read, so
#     they had no effect);
#   * the list is rebuilt from NLTK each time, so re-running the cell does not
#     keep appending the extras.
# The result is bound to `stop_words`, which avoids rebinding the name
# `stopwords` (the nltk corpus module imported earlier) to a plain list.
negations_to_keep = {'not', 'no'}
extra_stop_words = ['from', 'subject', 're', 'edu', 'use']
stop_words = [sw for sw in nltk.corpus.stopwords.words('english') if sw not in negations_to_keep]
stop_words.extend(extra_stop_words)
stop_word_set = set(stop_words)  # set lookup instead of scanning a list for every token

parler_df['body'] = parler_df['body'].apply(lambda x: ' '.join([w for w in x.split() if w not in stop_word_set]))
print(parler_df['body'])

1          sorry blm stop snitching might work in the hoo...
3                                        parisian enrichment
4          replacing mark esper acting defense secretary ...
5          have you noticed how loud and shrill fakenews ...
7          the benefits of hydroxychloroquine are incredi...
                                 ...                        
1096450                              america respected again
1096453    a few have asked why the name copernicus u pri...
1096454                                      christians amen
1096456                     murdered by obama biden february
1096457    parler wwg wga maga kag newuser news trump tru...
Name: body, Length: 637153, dtype: object
1          sorry blm stop snitching might work hood ameri...
3                                        parisian enrichment
4          replacing mark esper acting defense secretary ...
5          noticed loud shrill fakenews become last days ...
7          benefits hydroxychloroquine incr

In [12]:
# lemmatization

# A single lemmatizer instance is shared by all rows and tokens; building a new
# WordNetLemmatizer for every word is loop-invariant work.
lemmatizer = WordNetLemmatizer()

print(parler_df['body'])
# lemmatize() is called with its default part-of-speech argument for each token.
parler_df['body'] = parler_df['body'].apply(lambda x: ' '.join([lemmatizer.lemmatize(w) for w in x.split()]))
print(parler_df['body'])

1          sorry blm stop snitching might work hood ameri...
3                                        parisian enrichment
4          replacing mark esper acting defense secretary ...
5          noticed loud shrill fakenews become last days ...
7          benefits hydroxychloroquine incredible hse ref...
                                 ...                        
1096450                                    america respected
1096453    asked name copernicus u prior nicolaus coperni...
1096454                                      christians amen
1096456                        murdered obama biden february
1096457    parler wwg wga maga kag newuser news trump tru...
Name: body, Length: 637153, dtype: object
1          sorry blm stop snitching might work hood ameri...
3                                        parisian enrichment
4          replacing mark esper acting defense secretary ...
5          noticed loud shrill fakenews become last day e...
7          benefit hydroxychloroquine incre

In [13]:
# remove short words

def _drop_short_words(text):
    """Keep only the whitespace-separated tokens that are at least 3 characters long."""
    # Tokens produced by split() carry no surrounding whitespace, so their
    # length can be checked directly.
    kept = (token for token in text.split() if len(token) >= 3)
    return ' '.join(kept)

print(parler_df['body'])
parler_df['body'] = parler_df['body'].apply(_drop_short_words)
print(parler_df['body'])

1          sorry blm stop snitching might work hood ameri...
3                                        parisian enrichment
4          replacing mark esper acting defense secretary ...
5          noticed loud shrill fakenews become last day e...
7          benefit hydroxychloroquine incredible hse refu...
                                 ...                        
1096450                                    america respected
1096453    asked name copernicus u prior nicolaus coperni...
1096454                                       christian amen
1096456                        murdered obama biden february
1096457    parler wwg wga maga kag newuser news trump tru...
Name: body, Length: 637153, dtype: object
1          sorry blm stop snitching might work hood ameri...
3                                        parisian enrichment
4          replacing mark esper acting defense secretary ...
5          noticed loud shrill fakenews become last day e...
7          benefit hydroxychloroquine incre

In [14]:
# filter out null values once more

print('Dimension of dataframe: ' + str(parler_df.shape)) 
print(parler_df['body']) 

# Rows whose text became empty during cleaning are turned into NaN and dropped.
# The replaced column is assigned back instead of calling
# replace(..., inplace=True) on parler_df['body']: that form mutates an
# intermediate Series, which pandas warns about and which leaves the frame
# unchanged when Copy-on-Write is active.
parler_df['body'] = parler_df['body'].replace("", np.nan)
parler_df.dropna(subset=['body'], inplace=True)

print('\n'  + 'Dimension of dataframe after preprocessing: ' + str(parler_df.shape)) 
print(parler_df['body'])

Dimension of dataframe: (637153, 1)
1          sorry blm stop snitching might work hood ameri...
3                                        parisian enrichment
4          replacing mark esper acting defense secretary ...
5          noticed loud shrill fakenews become last day e...
7          benefit hydroxychloroquine incredible hse refu...
                                 ...                        
1096450                                    america respected
1096453    asked name copernicus prior nicolaus copernicu...
1096454                                       christian amen
1096456                        murdered obama biden february
1096457    parler wwg wga maga kag newuser news trump tru...
Name: body, Length: 637153, dtype: object

Dimension of dataframe after preprocessing: (605628, 1)
1          sorry blm stop snitching might work hood ameri...
3                                        parisian enrichment
4          replacing mark esper acting defense secretary ...
5          

In [15]:
# Write the fully preprocessed frame (index included) to CSV.
parler_df.to_csv(path_or_buf='parler_df_14.csv')