DATA PREPROCESSING Part 1

1. Read NDJSON file as pandas dataframe
2. Remove unuseful columns = dimension reduction (columns) -> only [body] and [createdAtformatted] left
3. Filter out null values found in [body] of tweets = dimension reduction (rows)
4. Filter out tweets not in relevant time period = dimension reduction (rows) -> only Nov 2020, Dec 2020 and Jan 2021 left
5. Filter out [body] of tweets in languages except English = dimension reduction (rows)
6. Remove unuseful time info from [createdAtformatted] column -> only date info left 
7. Save pandas dataframe containing only useful info (no index) as CSV file

In [1]:
import json
import pandas as pd 
import numpy as np

In [2]:
# 1. Read NDJSON file as pandas dataframe

parler_df = pd.read_json('D:\\bachelors_thesis\Datasets\parler_data\parler_data000000000030.ndjson', lines = True) 

In [3]:
# 2. Remove unuseful columns = dimension reduction (columns) -> only [body] and [createdAtformatted] left

print(parler_df.columns)
print('Dimension of whole dataframe: ' + str(parler_df.shape) + '\n') # df.shape -> (rows, columns)

# final_df_1 = pd.DataFrame()
# final_df_1['body'] = df['body'].copy()
# final_df_1['createdAtformatted'] = df['createdAtformatted'].copy()
# parler_df.drop(parler_df.iloc[:, 2:38], inplace = True, axis = 1) # remove all columns between column index 2 to 38
# parler_df.drop(['comments'], inplace = True, axis = 1)            # remove first column
parler_df.drop(parler_df.iloc[:, 5:38], inplace = True, axis = 1)   # remove all columns between column index 5 to 38
parler_df.drop(['comments'], inplace = True, axis = 1)              # remove column nr. 0
parler_df.drop(['bodywithurls'], inplace = True, axis = 1)          # remove column nr. 2
parler_df.drop(['createdAt'], inplace = True, axis = 1)             # remove column nr. 3

print(parler_df.columns)
print('Dimension of dataframe after removing columns: ' + str(parler_df.shape) + '\n')

Index(['comments', 'body', 'bodywithurls', 'createdAt', 'createdAtformatted',
       'creator', 'datatype', 'depth', 'depthRaw', 'followers', 'following',
       'hashtags', 'id', 'lastseents', 'links', 'media', 'parent', 'posts',
       'sensitive', 'upvotes', 'urls', 'username', 'verified', 'article',
       'impressions', 'preview', 'reposts', 'state', 'shareLink', 'color',
       'commentDepth', 'controversy', 'conversation', 'downvotes', 'post',
       'replyingTo', 'score', 'isPrimary'],
      dtype='object')
Dimension of whole dataframe: (1097921, 38)

Index(['body', 'createdAtformatted'], dtype='object')
Dimension of dataframe after removing columns: (1097921, 2)



In [4]:
# 3. Filter out null values found in [body] of tweets = dimension reduction (rows)

print('Dimension of dataframe: ' + str(parler_df.shape)) 
print(parler_df) 

parler_df['body'].replace("", np.nan, inplace=True)
parler_df.dropna(subset=['body'], inplace=True)
# parler_df.reset_index(drop=True, inplace=True)
# parler_df.drop_duplicates(subset = ['body'], inplace=True, keep='first'/'last'/False)

print('\n'  + 'Dimension of dataframe after filtering out null values: ' + str(parler_df.shape)) 
print(parler_df)

Dimension of dataframe: (1097921, 2)
        body       createdAtformatted
0             2019-10-14 00:26:36 UTC
1             2020-07-30 12:02:25 UTC
2             2020-11-09 04:22:41 UTC
3             2020-07-05 06:36:11 UTC
4             2020-11-23 13:25:53 UTC
...      ...                      ...
1097916       2021-01-05 17:31:36 UTC
1097917       2020-01-21 12:41:41 UTC
1097918       2020-06-05 03:48:34 UTC
1097919       2019-11-02 14:07:03 UTC
1097920       2020-11-20 05:16:40 UTC

[1097921 rows x 2 columns]

Dimension of dataframe after filtering out null values: (638636, 2)
                                                      body  \
33       I agree, get the US outa UN, and convert that ...   
79        Professor my ass more like a pot head gone meth.   
134      Expand the electoral college so that large cit...   
170                   Need To Spread The News, Jo Ann !!..   
346                                          Demon Spawn..   
...                                   

In [5]:
# 4. Filter out tweets not in relevant time period = dimension reduction (rows) -> only Nov 2020, Dec 2020 and Jan 2021 left

def check_date(string):
    if ( (((string.split('-'))[0] == '2020') and ((string.split('-'))[1] == '11'))          # November 2020
        or (((string.split('-'))[0] == '2020') and ((string.split('-'))[1] == '12'))        # December 2020
        or (((string.split('-'))[0] == '2021') and ((string.split('-'))[1] == '01')) ):     # January  2021
            return True
    return False

print('Dimension of dataframe before: ' + str(parler_df.shape)) 
print(parler_df)

parler_df['appropiateDate'] = parler_df['createdAtformatted'].apply(check_date)
# parler_df.drop(parler_df[parler_df['appropiateDate'] == False].index, inplace=True)
parler_df = parler_df[parler_df['appropiateDate'] == True]
parler_df.drop(['appropiateDate'], inplace = True, axis = 1)          

print('Dimension of dataframe with tweets from relevant time period only: ' + str(parler_df.shape)) 
print(parler_df)

Dimension of dataframe before: (638636, 2)
                                                      body  \
33       I agree, get the US outa UN, and convert that ...   
79        Professor my ass more like a pot head gone meth.   
134      Expand the electoral college so that large cit...   
170                   Need To Spread The News, Jo Ann !!..   
346                                          Demon Spawn..   
...                                                    ...   
1097851  What NASCAR?\nWhat NFL?\nWhat NBA?\nWhat MLB?\...   
1097877                                         She’s good   
1097897  The DEMONRATS/DEMOCRATS/SOCIALIST Pigs Cheated...   
1097901                                            Love it   
1097910  THEY COMMITTED TREASON!!!!!!!\n\nTHEY MUST AND...   

              createdAtformatted  
33       2020-08-22 19:18:42 UTC  
79       2020-12-13 13:43:28 UTC  
134      2019-12-16 01:39:41 UTC  
170      2020-12-10 18:39:32 UTC  
346      2020-10-13 22:19:41 UTC  
..

In [6]:
# 5. Filter out [body] of tweets in languages except English = dimension reduction (rows)

import fasttext
model = fasttext.load_model("lid.176.ftz")

def fast_detect(msg):
    try:
        ln = model.predict(msg)[0][0].split("__")[2] 
    except Exception as e:
        ln = None
    return ln

print('Dimension of dataframe before: ' + str(parler_df.shape)) 
parler_df['language'] = parler_df['body'].apply(fast_detect)
print('Dimension of dataframe with tweets in English only: ' + str(parler_df.shape)) 

print(parler_df)
parler_df.drop(parler_df[parler_df['language'] != 'en'].index, inplace=True)
parler_df.drop(['language'], inplace = True, axis = 1)          
print(parler_df) 

Dimension of dataframe before: (353660, 2)




Dimension of dataframe with tweets in English only: (353660, 3)
                                                      body  \
79        Professor my ass more like a pot head gone meth.   
170                   Need To Spread The News, Jo Ann !!..   
362              Fuck yes fire this loser non American !!!   
409                      EVERYONE PLEASE\nECHO, ECHO, ECHO   
491                                    Hang that pedophile   
...                                                    ...   
1097616  Exactly what I think every time I see John Rob...   
1097656                            Twat, the term is twat!   
1097897  The DEMONRATS/DEMOCRATS/SOCIALIST Pigs Cheated...   
1097901                                            Love it   
1097910  THEY COMMITTED TREASON!!!!!!!\n\nTHEY MUST AND...   

              createdAtformatted language  
79       2020-12-13 13:43:28 UTC       en  
170      2020-12-10 18:39:32 UTC       en  
362      2020-11-12 02:09:25 UTC       en  
409      2020-1

In [7]:
# 6. Remove unuseful time info from [createdAtformatted] column -> only date info left 

print(parler_df['createdAtformatted'])
parler_df['createdAtformatted'] = parler_df['createdAtformatted'].str.split(n = 0, expand = False).str[0]
print(parler_df['createdAtformatted'])

79         2020-12-13 13:43:28 UTC
170        2020-12-10 18:39:32 UTC
362        2020-11-12 02:09:25 UTC
491        2020-12-25 01:18:19 UTC
648        2020-12-08 23:16:48 UTC
                    ...           
1097610    2020-12-23 01:11:04 UTC
1097613    2020-11-02 13:24:42 UTC
1097616    2020-12-25 12:00:49 UTC
1097656    2020-11-18 17:42:38 UTC
1097901    2020-12-20 22:22:55 UTC
Name: createdAtformatted, Length: 294641, dtype: object
79         2020-12-13
170        2020-12-10
362        2020-11-12
491        2020-12-25
648        2020-12-08
              ...    
1097610    2020-12-23
1097613    2020-11-02
1097616    2020-12-25
1097656    2020-11-18
1097901    2020-12-20
Name: createdAtformatted, Length: 294641, dtype: object


In [8]:
# 7. Save pandas dataframe containing only useful info (no index) as CSV file

parler_df.to_csv('parler_df_030_dates_before.csv', index=False)