In [3]:
'''
Imports
'''

from concurrent.futures import ProcessPoolExecutor, as_completed, ThreadPoolExecutor
from IPython.display import clear_output, display
import math
from multiprocessing import  Pool
import numpy as np
from numpy import asarray
import os
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from PIL import Image
from PIL import ImageFile
import shutil
import string
import time
from tqdm import tqdm

start = time.time()

In [4]:
'''
Util imports
'''

from utils.dataframeUtils import addComments, checkIfTrainCSVIsValid, cleanDataFrameFromNansandnans, countFakeNoFake, createIDLabelFile, createIDTitleCommentsTextLabelFile, createIDTitleTextLabelFile, createIDTitleFile, createIDTitleCommentsTextMetaDataLabelFile, createMetaDataLabelFile, encodeAuthors, replaceNanInScoreAndUpvote, show_pandas_n_front_columns, show_pandas_n_last_columns, writeAuthorListToCSV, writeOutCleanedDataFrameToCSV

from utils.fileAndDirUtils import calcMeanAndStdOfImage, checkIfDirExistsAndCreate, checkIfImagesAreAvailableAndValid, checkIfImageIsAvaliable, copyImageFromAToB, listdir_fullpath, writeMeansToFile

from utils.multiprocessingUtils import calculateMeansAndStdMultiprocessing, generateFileList, generateFileListForCopy, generateFileListForMeanAndStds, parallelize_dataframe, parallelize_dataframe_comments, resizeImagesMultiprocessing, resizeNormalizeImagesMultiprocessing, workerCopyAToB, workerMeanStds

from utils.otherUtils import calcZeroBaseline, convertRowToDictionary, isBlank, parseStringAsNpArray, processComment

In [3]:
def addComments1(dataframe):
    try:
        dataframe.insert(loc=dataframe.shape[1], column='comments', value=[list for i in range(dataframe.shape[0])])
        dataframe.insert(loc=dataframe.shape[1], column='up_vote_comments', value=[list for i in range(dataframe.shape[0])])
    except ValueError:
        print('Found columns, ignoring inserting')
    df_comments = pd.merge(dataframe, df_all_comments, left_on='id', right_on='submission_id', how='inner',suffixes=('_left','_right'))
    for row in dataframe.itertuples(index=True, name=None):
        row_dict = convertRowToDictionary(row, dataframe.columns, True)        
        currentCommentsSelector = df_comments['submission_id'] == row_dict['id']

        # Selecting all related comments and cleaning unnamed stuff
        selectedComments = df_comments[currentCommentsSelector]
        selectedComments = selectedComments.loc[:, ~selectedComments.columns.str.contains('^Unnamed')]

        clean_comments = []
        clean_up_vote = []

        if not selectedComments.empty:
            if (len(selectedComments)) is not int(row_dict['num_comments']):
                print(f'Checked comments and num_comments -> mismatch! len of comments found: {len(selectedComments)}, but should be {int(row_dict["num_comments"])} at id {row_dict["id"]}')
                dataframe.at[row[0], 'num_comments'] = len(selectedComments)
        
            # Iterating over all found comments, cleaning them 
            for row_comment in selectedComments.itertuples(index=True, name=None):
                row_dict_comments = convertRowToDictionary(row_comment, selectedComments.columns, True)
                clean_comments.append(processComment(row_dict_comments['body']))
                clean_up_vote.append(row_dict_comments['ups'])       
        else:
            dataframe.at[row[0], 'num_comments'] = 0
        
        # Inserting at correct position
        dataframe.at[row[0], 'comments'] = clean_comments
        dataframe.at[row[0], 'up_vote_comments'] = clean_up_vote

    return dataframe


def parallelize_dataframe_comments(df, func, n_cores=4):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df




In [4]:
'''
Settings
'''

# DecompressionBomb
#https://pillow.readthedocs.io/en/5.1.x/releasenotes/5.0.0.html
Image.MAX_IMAGE_PIXELS = None

ImageFile.LOAD_TRUNCATED_IMAGES = True


# Settings Resize
IMG_WIDTH = 256
IMG_HEIGHT = 256
IMG_SIZES_1 = (IMG_WIDTH,IMG_HEIGHT)
IMG_WIDTH = 768
IMG_HEIGHT = 768
IMG_SIZES_2 = (IMG_WIDTH,IMG_HEIGHT)

# Assign methods to all pandas dataframe calls  https://stackoverflow.com/questions/30608310/is-there-a-pandas-function-to-display-the-first-last-n-columns-as-in-head?noredirect=1&lq=1
pd.DataFrame.show_pandas_n_front_columns = show_pandas_n_front_columns
pd.DataFrame.show_pandas_n_last_columns = show_pandas_n_last_columns

verbose = False

In [5]:
path_to_fakeddit_dataset_dir_server = "/home/armin/repos/FKD-Dataset"
path_to_fakeddit_dataset_dir_home = "D:\\000_Diplomarbeit\\002_original_daten_bearbeitet\\000_Fakeddit"
path_to_all_gdrive_fakeddit_dataset_images = '/home/armin/repos/FKD-Dataset/001_fakeddit_from_website/003_gdrive_images'

isServer = True

if isServer:
    path_to_fakeddit_dataset_dir = path_to_fakeddit_dataset_dir_server
else:
      path_to_fakeddit_dataset_dir = path_to_fakeddit_dataset_dir_home

path_to_fakeddit_dataset_images_dir = os.path.join(path_to_fakeddit_dataset_dir , "002_images");


# Path to comments TSV
path_to_comments_tsv = os.path.join(path_to_fakeddit_dataset_dir , "001_fakeddit_from_website", "001_website_data", "all_comments.tsv")
print("Path to all comments is: " + path_to_comments_tsv)

# Path to train data
path_to_train_tsv = os.path.join(path_to_fakeddit_dataset_dir , "001_fakeddit_from_website", "001_website_data", "train.tsv")
print("Path to train.tsv is: " + path_to_train_tsv)

path_to_train_images_dir = os.path.join(path_to_fakeddit_dataset_images_dir , "train")
print("Path to train images is: " + path_to_train_images_dir)

# Path to test data
path_to_test_tsv = os.path.join(path_to_fakeddit_dataset_dir , "001_fakeddit_from_website", "001_website_data", "test_public.tsv")
print("Path to test.tsv is: " + path_to_test_tsv)

path_to_test_images_dir = os.path.join(path_to_fakeddit_dataset_images_dir , "test")
print("Path to test images is: " + path_to_test_images_dir)

# Path to val data
path_to_val_tsv = os.path.join(path_to_fakeddit_dataset_dir , "001_fakeddit_from_website", "001_website_data", "validate.tsv" )
print("Path to val.tsv is: " + path_to_val_tsv)

path_to_val_images_dir = os.path.join(path_to_fakeddit_dataset_images_dir , "val")
print("Path to test images is: " + path_to_val_images_dir)

Path to all comments is: /home/armin/repos/FKD-Dataset/001_fakeddit_from_website/001_website_data/all_comments.tsv
Path to train.tsv is: /home/armin/repos/FKD-Dataset/001_fakeddit_from_website/001_website_data/train.tsv
Path to train images is: /home/armin/repos/FKD-Dataset/002_images/train
Path to test.tsv is: /home/armin/repos/FKD-Dataset/001_fakeddit_from_website/001_website_data/test_public.tsv
Path to test images is: /home/armin/repos/FKD-Dataset/002_images/test
Path to val.tsv is: /home/armin/repos/FKD-Dataset/001_fakeddit_from_website/001_website_data/validate.tsv
Path to test images is: /home/armin/repos/FKD-Dataset/002_images/val


In [6]:
# Excerpt from all comments
df_all_comments = pd.read_csv(path_to_comments_tsv, header=0, sep='\t')
# df_all_comments.head()

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# Excerpt from train set
df_train_original = pd.read_csv(path_to_train_tsv, header=0, sep='\t')
df_train_original = df_train_original.loc[:, ~df_train_original.columns.str.contains('^Unnamed')]
# df_train_original.head()

In [8]:
checkIfDirExistsAndCreate(path_to_train_images_dir)  

imageListTuple = generateFileListForCopy(df_train_original, path_to_all_gdrive_fakeddit_dataset_images, path_to_train_images_dir)

with ThreadPoolExecutor(max_workers=16) as executor:
    results = list(tqdm(executor.map(workerCopyAToB, imageListTuple), total=len(imageListTuple)))

100%|██████████| 878218/878218 [00:34<00:00, 25567.19it/s]


In [9]:
# Checking image dir if all images are available and structurally intact.

temp_dataframe = df_train_original.copy()
temp_keep_indices, temp_drop_indices = checkIfImagesAreAvailableAndValid(temp_dataframe, path_to_train_images_dir)
 
temp_dataframe.describe()
print("Need to drop " + str(len(temp_drop_indices)) + " files, due to no images attached or not found images.")

Starting check directory, need to check: 639101 files... but have 878218 entries in dataframe
Processed 325901 lines.



638506 images foundnes.
595 images not found
Check total images within data set, which are not available = 0
Need to drop 239712 files, due to no images attached or not found images.


In [12]:
# Filtering nans
df_not_taken_train = df_train_original.take(temp_drop_indices)
df_taken_train = df_train_original.take(temp_keep_indices)

print('Checking for nans')
print(f'Len of dataframe before {len(df_taken_train)}')
df_taken_train = cleanDataFrameFromNansandnans(df_taken_train)
print(f'Len of dataframe after {len(df_taken_train)}')


print("original count " + str(df_train_original.count()["id"]))
print("to drop count " + str(len(temp_drop_indices)))
print("len of new df " + str(temp_dataframe.count()["id"] - len(temp_drop_indices)))
print("dropped checksum (must be 0!! = )" + str(temp_dataframe.count()["id"] - df_train_original.count()["id"]))

Checking for nans
Len of dataframe before 638506
Len of dataframe after 560622
original count 878218
to drop count 239712
len of new df 638506
dropped checksum (must be 0!! = )0


In [15]:
df_taken_train.describe()

Unnamed: 0,created_utc,num_comments,score,upvote_ratio,2_way_label,3_way_label,6_way_label
count,560622.0,393533.0,560622.0,393533.0,560622.0,560622.0,560622.0
mean,1478669000.0,20.350748,395.478972,0.855417,0.393317,1.188769,1.885372
std,69701560.0,141.389503,3051.310199,0.110086,0.488487,0.969418,1.782129
min,1212297000.0,0.0,-950.0,0.5,0.0,0.0,0.0
25%,1418107000.0,1.0,5.0,0.78,0.0,0.0,0.0
50%,1488879000.0,2.0,14.0,0.88,0.0,2.0,2.0
75%,1550452000.0,7.0,46.0,0.94,1.0,2.0,4.0
max,1573859000.0,10783.0,137179.0,1.0,1.0,2.0,5.0


In [18]:
df_taken_train.isna().sum()

author                   28535
clean_title                  0
created_utc                  0
domain                  167089
hasImage                     0
id                           0
image_url                 1518
linked_submission_id    393533
num_comments            167089
score                        0
subreddit                    0
title                        0
upvote_ratio            167089
2_way_label                  0
3_way_label                  0
6_way_label                  0
dtype: int64

In [20]:
path_to_cleaned_csv_file = os.path.join(path_to_fakeddit_dataset_dir, "003_cleaned_datasets", "train_clean_meta_test.csv")
isTrainCSVValid = checkIfTrainCSVIsValid(path_to_cleaned_csv_file, df_taken_train)

if isTrainCSVValid:
    df_taken_train = pd.read_csv(path_to_cleaned_csv_file, header=0, sep='\t')
# Add mean and std column to df
if not isTrainCSVValid:
    df_taken_train.insert(loc=df_taken_train.shape[1], column='means', value=[list for i in range(df_taken_train.shape[0])])
    df_taken_train.insert(loc=df_taken_train.shape[1], column='stds', value=[list for i in range(df_taken_train.shape[0])])

In [21]:
print(f'Using already processes csv: {isTrainCSVValid}')

Using already processes csv: False


In [13]:
if not isTrainCSVValid:
    means, stds = calculateMeansAndStdMultiprocessing(df_taken_train, path_to_train_images_dir)
    df_taken_train['means'] = means
    df_taken_train['stds'] = stds

560622sed 870001 lines.


  "Palette images with Transparency expressed in bytes should be "
100%|██████████| 560622/560622 [42:27<00:00, 220.06it/s]  


In [14]:
%%capture
df_taken_train = parallelize_dataframe(df_taken_train, addComments1, 16)

In [15]:
df_taken_train = replaceNanInScoreAndUpvote(df_taken_train)

In [16]:
# Calc z score normalization scores


score_mean = np.nanmean(df_taken_train['score'])
score_stds = np.nanstd(df_taken_train['score'])
df_taken_train['score'] = (df_taken_train['score'] - score_mean) / score_stds

num_comments_mean = np.nanmean(df_taken_train['num_comments'])
num_comments_stds = np.nanstd(df_taken_train['num_comments'])

df_taken_train['num_comments'] = (df_taken_train['num_comments'] - num_comments_mean) / num_comments_stds

print(f'scores mean: {score_mean}, scores stds: {score_stds}, num_comments mean: {num_comments_mean}, num comments stds: {num_comments_stds} ')

scores mean: 395.47897157086237, scores stds: 3051.3074771950264, num_comments mean: 9.194637384904624, num comments stds: 42.185042249523924 


In [17]:
# df_taken_train.head(50)

In [18]:
# df_taken_train = replaceNanInScoreAndUpvote(df_taken_train)

In [19]:
if not isTrainCSVValid:
    meansOfDataset = np.mean(np.array(means), axis=0)
    pathToMeansDir = os.path.join(path_to_fakeddit_dataset_dir, "010_configs")
    pathToMeansFile = os.path.join(path_to_fakeddit_dataset_dir, "010_configs", 'means_non_resized.txt')
    checkIfDirExistsAndCreate(pathToMeansDir)
    writeMeansToFile(str(meansOfDataset), pathToMeansFile)
    print(f'Means of dataset per channel is: {meansOfDataset}')

Dir not found, creating /home/armin/repos/FKD-Dataset/010_configs instead
Writing  file
Means of dataset per channel is: [119.841705 112.0786   104.98751 ]


In [20]:
if not isTrainCSVValid:
    path_to_cleaned_files = os.path.join(path_to_fakeddit_dataset_dir, "003_cleaned_datasets")
    df_taken_train = writeOutCleanedDataFrameToCSV(df_taken_train, path_to_cleaned_files, "train_clean.csv")

writing cleaned dataframe -> 
no outdir found, creating it instead!
finished writing cleaned dataframe!


In [21]:
# Excerpt from test set
df_test_original = pd.read_csv(path_to_test_tsv, header=0, sep='\t')
df_test_original = df_test_original.loc[:, ~df_test_original.columns.str.contains('^Unnamed')]

In [22]:
checkIfDirExistsAndCreate(path_to_test_images_dir)  

imageListTuple = generateFileListForCopy(df_test_original, path_to_all_gdrive_fakeddit_dataset_images, path_to_test_images_dir)

with ThreadPoolExecutor(max_workers=16) as executor:
    results = list(tqdm(executor.map(workerCopyAToB, imageListTuple), total=len(imageListTuple)))


100%|██████████| 92444/92444 [00:03<00:00, 24665.96it/s]


In [23]:
temp_dataframe = df_test_original.copy()
temp_keep_indices, temp_drop_indices = checkIfImagesAreAvailableAndValid(temp_dataframe, path_to_test_images_dir)
 
temp_dataframe.describe()
print("Need to drop " + str(len(temp_drop_indices)) + " files, due to no images attached or not found images.")

Starting check directory, need to check: 67255 files... but have 92444 entries in dataframe
67185 images foundnes.
70 images not found
Check total images within data set, which are not available = 0
Need to drop 25259 files, due to no images attached or not found images.


In [24]:
df_not_taken_test = df_test_original.take(temp_drop_indices)
df_taken_test = df_test_original.take(temp_keep_indices)


print('Checking for nans')
print(f'Len of dataframe before {len(df_taken_test)}')
df_taken_test = cleanDataFrameFromNansandnans(df_taken_test)
print(f'Len of dataframe after {len(df_taken_test)}')


print("original count " + str(df_test_original.count()["id"]))
print("to drop count " + str(len(temp_drop_indices)))
print("len of new df " + str(temp_dataframe.count()["id"] - len(temp_drop_indices)))
print("dropped checksum (must be 0!! = )" + str(temp_dataframe.count()["id"] - df_test_original.count()["id"]))



path_to_cleaned_files = os.path.join(path_to_fakeddit_dataset_dir, "003_cleaned_datasets")
df_taken_test = writeOutCleanedDataFrameToCSV(df_taken_test, path_to_cleaned_files, "test_clean.csv")

Checking for nans
Len of dataframe before 67185
Len of dataframe after 58954
original count 92444
to drop count 25259
len of new df 67185
dropped checksum (must be 0!! = )0
writing cleaned dataframe -> 
finished writing cleaned dataframe!


In [25]:
%%capture
df_taken_test = parallelize_dataframe(df_taken_test, addComments1, 16)

In [26]:
df_taken_test = replaceNanInScoreAndUpvote(df_taken_test)

In [27]:
df_taken_test['score'] = (df_taken_test['score'] - score_mean) / score_stds
df_taken_test['num_comments'] = (df_taken_test['num_comments'] - num_comments_mean) / num_comments_stds
# df_taken_test.describe()


In [28]:
# Excerpt from val set
df_val_original = pd.read_csv(path_to_val_tsv, header=0, sep='\t')
df_val_original = df_val_original.loc[:, ~df_val_original.columns.str.contains('^Unnamed')]
df_val_original['title'] = df_val_original['title'].astype(str)

In [29]:
checkIfDirExistsAndCreate(path_to_val_images_dir)  

imageListTuple = generateFileListForCopy(df_val_original, path_to_all_gdrive_fakeddit_dataset_images, path_to_val_images_dir)

with ThreadPoolExecutor(max_workers=16) as executor:
    results = list(tqdm(executor.map(workerCopyAToB, imageListTuple), total=len(imageListTuple)))


100%|██████████| 92444/92444 [00:03<00:00, 25662.26it/s]


In [30]:
temp_dataframe = df_val_original.copy()
temp_keep_indices, temp_drop_indices = checkIfImagesAreAvailableAndValid(temp_dataframe, path_to_val_images_dir)
 
temp_dataframe.describe()
print("Need to drop " + str(len(temp_drop_indices)) + " files, due to no images attached or not found images.")

Starting check directory, need to check: 67208 files... but have 92444 entries in dataframe
67140 images foundnes.
68 images not found
Check total images within data set, which are not available = 0
Need to drop 25304 files, due to no images attached or not found images.


In [31]:
# Take ist schneller als drop!
df_not_taken_val = df_val_original.take(temp_drop_indices)
df_taken_val = df_val_original.take(temp_keep_indices)

print('Checking for nans')
print(f'Len of dataframe before {len(df_taken_val)}')
df_taken_val = cleanDataFrameFromNansandnans(df_taken_val)
print(f'Len of dataframe after {len(df_taken_val)}')



print("original count " + str(df_val_original.count()["id"]))
print("to drop count " + str(len(temp_drop_indices)))
print("len of new df " + str(temp_dataframe.count()["id"] - len(temp_drop_indices)))
print("dropped checksum (must be 0!! = )" + str(temp_dataframe.count()["id"] - df_val_original.count()["id"]))

print("TODO: Chack and raise error if not 0")



path_to_cleaned_files = os.path.join(path_to_fakeddit_dataset_dir, "003_cleaned_datasets")

df_taken_val = writeOutCleanedDataFrameToCSV(df_taken_val, path_to_cleaned_files, "val_clean.csv")


Checking for nans
Len of dataframe before 67140
Len of dataframe after 58972
original count 92444
to drop count 25304
len of new df 67140
dropped checksum (must be 0!! = )0
TODO: Chack and raise error if not 0
writing cleaned dataframe -> 
finished writing cleaned dataframe!


In [32]:
%%capture
df_taken_val = parallelize_dataframe(df_taken_val, addComments1, 16)

In [33]:
df_taken_val = replaceNanInScoreAndUpvote(df_taken_val)

In [34]:
df_taken_val['score'] = (df_taken_val['score'] - score_mean) / score_stds
df_taken_val['num_comments'] = (df_taken_val['num_comments'] - num_comments_mean) / num_comments_stds


In [35]:
all_authors_train = df_taken_train.author.unique().tolist()
all_authors_test = df_taken_test.author.unique().tolist()
all_authors_val = df_taken_val.author.unique().tolist()
all_authors_noset = all_authors_train +  all_authors_test  + all_authors_val
all_authors = list(set(all_authors_noset))


all_authors.append('no_author')
pathToAuthorDir = os.path.join(path_to_fakeddit_dataset_dir, "010_configs")
if not isTrainCSVValid:   
    pathToAuthorFile_train = os.path.join(path_to_fakeddit_dataset_dir, "010_configs", 'all_authors_train.csv')
    checkIfDirExistsAndCreate(pathToAuthorDir)   
    writeAuthorListToCSV(all_authors_train, pathToAuthorFile_train)
    df_taken_train = encodeAuthors(df_taken_train, all_authors)

pathToAuthorFile_noset = os.path.join(path_to_fakeddit_dataset_dir, "010_configs", 'all_authors_noset.csv')
writeAuthorListToCSV(all_authors_noset, pathToAuthorFile_noset)
pathToAuthorFile_test = os.path.join(path_to_fakeddit_dataset_dir, "010_configs", 'all_authors_test.csv')
writeAuthorListToCSV(all_authors_test, pathToAuthorFile_test)
pathToAuthorFile_val = os.path.join(path_to_fakeddit_dataset_dir, "010_configs", 'all_authors_val.csv')   
writeAuthorListToCSV(all_authors_val, pathToAuthorFile_val)

pathToAuthorFile_all = os.path.join(path_to_fakeddit_dataset_dir, "010_configs", 'all_authors.csv')
writeAuthorListToCSV(all_authors, pathToAuthorFile_all)
df_taken_test = encodeAuthors(df_taken_test, all_authors)
df_taken_val = encodeAuthors(df_taken_val, all_authors)

In [19]:
len(all_authors)

NameError: name 'all_authors' is not defined

# Cleaning is done -> doing some statistics


In [37]:
print("By removing all comments without images and images which are not found, we have " + str(df_train_original.count()[0] - df_not_taken_train.count()[0]) + " samples for training left")
print()
print("Training set total -> " + str(df_train_original.count()['id']) + " we took " + str(df_taken_train.count()['id']) + " samples  because of missing images or only text modality available.")
print()
percentage_train = ( df_taken_train.count()['id'] * 100) /   df_train_original.count()['id']
print("This is " + str(round(percentage_train)) + " % of the whole train set")
print()
count_fake, count_not_fake = countFakeNoFake(df_taken_train)
print("We have " + str(count_not_fake) + " true labels and " + str(count_fake) + " fakes.")
calcZeroBaseline(count_fake, count_not_fake) 
print()


print("By removing all comments without images and images which are not found, we have " + str(df_test_original.count()[0] - df_not_taken_test.count()[0]) + " samples for testing left")
print()
print("Test set total -> " + str(df_test_original.count()[0]) + " we took " + str(df_taken_test.count()[0]) + " samples  because of missing images or only text modality available.")
print()
percentage_test = ( df_taken_test.count()['id'] * 100) /   df_test_original.count()['id']
print("This is " + str(round(percentage_test)) + " % of the whole test set")
print()
count_fake, count_not_fake = countFakeNoFake(df_taken_test)
print("We have " + str(count_not_fake) + " true labels and " + str(count_fake) + " fakes.")
calcZeroBaseline(count_fake, count_not_fake)
# print()

print()
print("By removing all comments without images and images which are not found, we have " + str(df_val_original.count()[0] - df_not_taken_val.count()[0]) + " samples for validating left")
print()
print("Validation set total -> " + str(df_val_original.count()[0]) + " we took " + str(df_taken_val.count()[0]) + " samples  because of missing images or only text modality available.")
print()
percentage_val = ( df_taken_val.count()[0] * 100) /   df_val_original.count()[0]
print("This is " + str(round(percentage_val)) + " % of the whole validation set")
print()
count_fake, count_not_fake = countFakeNoFake(df_taken_val)
print("We have " + str(count_not_fake) + " true labels and " + str(count_fake) + " fakes.")
calcZeroBaseline(count_fake, count_not_fake)  
print()



By removing all comments without images and images which are not found, we have 607869 samples for training left

Training set total -> 878218 we took 560622 samples  because of missing images or only text modality available.

This is 64.0 % of the whole train set

We have 340120 not fakes!
We have 220502 fakes!
We have 340120 true labels and 220502 fakes.
The zero baseline for this set is: 61%. 

By removing all comments without images and images which are not found, we have 63917 samples for testing left

Test set total -> 76752 we took 58954 samples  because of missing images or only text modality available.

This is 64.0 % of the whole test set

We have 35608 not fakes!
We have 23346 fakes!
We have 35608 true labels and 23346 fakes.
The zero baseline for this set is: 60%. 

By removing all comments without images and images which are not found, we have 63865 samples for validating left

Validation set total -> 76767 we took 58972 samples  because of missing images or only text moda

# Preprocessing for easier handling

In [38]:
# train
path_to_cleaned_files = os.path.join(path_to_fakeddit_dataset_dir, "004_images_id_label_files")
df_train_labels = createIDLabelFile(df_taken_train, path_to_cleaned_files, "train_id_label.csv", True)

# test
path_to_cleaned_files = os.path.join(path_to_fakeddit_dataset_dir, "004_images_id_label_files")
df_test_labels = createIDLabelFile(df_taken_test, path_to_cleaned_files, "test_id_label.csv")

# val
path_to_cleaned_files = os.path.join(path_to_fakeddit_dataset_dir, "004_images_id_label_files")
df_val_labels = createIDLabelFile(df_taken_val, path_to_cleaned_files, "val_id_label.csv")


no outdir found, creating it instead!


In [39]:
# Creating text id label files

#train
path_to_cleaned_files = os.path.join(path_to_fakeddit_dataset_dir, "005_text_id_label_files")
createIDTitleFile(df_taken_train, path_to_cleaned_files , 'train_id_text_label.csv')

#test
path_to_cleaned_files = os.path.join(path_to_fakeddit_dataset_dir, "005_text_id_label_files")
createIDTitleFile(df_taken_test, path_to_cleaned_files , 'test_id_text_label.csv')

#val
path_to_cleaned_files = os.path.join(path_to_fakeddit_dataset_dir, "005_text_id_label_files")
createIDTitleFile(df_taken_val, path_to_cleaned_files , 'val_id_text_label.csv')


no outdir found, creating it instead!


## Preprocessing images

    

In [42]:
pathToAllImages = os.path.join(path_to_fakeddit_dataset_dir, "002_images")
pathToAllLabels = os.path.join(path_to_fakeddit_dataset_dir, "004_images_id_label_files")
pathToAllResizedImages = os.path.join(path_to_fakeddit_dataset_dir, "006_images_resized")
pathToAllResizedImages_2 = os.path.join(path_to_fakeddit_dataset_dir, "006_images_resized_2")


# Train Set Handling
pathToSourceTrainImages = os.path.join(pathToAllImages, "train")
pathToTrainLabels = os.path.join(pathToAllLabels, "train_id_label.csv")
pathToDestTrainImages = os.path.join(pathToAllResizedImages, "train")
pathToDestTrainImages_2 = os.path.join(pathToAllResizedImages_2, "train")
checkIfDirExistsAndCreate(pathToDestTrainImages)
checkIfDirExistsAndCreate(pathToDestTrainImages_2)
imageListTuple = generateFileList(df_train_labels, pathToSourceTrainImages, pathToDestTrainImages, IMG_SIZES_1)

resizeImagesMultiprocessing(imageListTuple)

imageListTuple = generateFileList(df_train_labels, pathToSourceTrainImages, pathToDestTrainImages_2, IMG_SIZES_2)

resizeImagesMultiprocessing(imageListTuple)

Dir not found, creating /home/armin/repos/FKD-Dataset/006_images_resized_2/train instead
Processed 560601 lines.

100%|██████████| 560622/560622 [00:08<00:00, 62431.88it/s]


Processed 560601 lines.

100%|██████████| 560622/560622 [1:36:41<00:00, 96.63it/s]  


In [43]:
# df_taken_train.insert(loc=df_taken_train.shape[1], column='means_resized', value=[list for i in range(df_taken_train.shape[0])])
# df_taken_train.insert(loc=df_taken_train.shape[1], column='stds_resized', value=[list for i in range(df_taken_train.shape[0])])

means, stds = calculateMeansAndStdMultiprocessing(df_taken_train, pathToDestTrainImages)



560622sed 560001 lines.


100%|██████████| 560622/560622 [07:09<00:00, 1306.67it/s]


In [44]:
meansOfDataset = np.mean(np.array(means), axis=0)
pathToMeansDir = os.path.join(path_to_fakeddit_dataset_dir, "010_configs")
pathToMeansFile = os.path.join(path_to_fakeddit_dataset_dir, "010_configs", 'means_resized.txt')
pathToStdsFile = os.path.join(path_to_fakeddit_dataset_dir, "010_configs", 'stds_resized.txt')

checkIfDirExistsAndCreate(pathToMeansDir)

writeMeansToFile(str(meansOfDataset), pathToMeansFile)
writeMeansToFile(str(stds), pathToStdsFile)
print(f'Means of dataset per channel is: {meansOfDataset}')
print(f'Means of dataset per channel is: {stds}')

Writing  file
Writing  file
Means of dataset per channel is: [119.80977 112.05151 104.99459]


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [45]:
means, stds = calculateMeansAndStdMultiprocessing(df_taken_train, pathToDestTrainImages_2)

560622sed 560001 lines.


100%|██████████| 560622/560622 [43:54<00:00, 212.81it/s]  


In [46]:
meansOfDataset = np.mean(np.array(means), axis=0)
pathToMeansDir = os.path.join(path_to_fakeddit_dataset_dir, "010_configs")
pathToMeansFile = os.path.join(path_to_fakeddit_dataset_dir, "010_configs", 'means_resized_768.txt')
pathToStdsFile = os.path.join(path_to_fakeddit_dataset_dir, "010_configs", 'stds_resized_768.txt')

checkIfDirExistsAndCreate(pathToMeansDir)

writeMeansToFile(str(meansOfDataset), pathToMeansFile)
writeMeansToFile(str(stds), pathToStdsFile)
print(f'Means of dataset per channel is: {meansOfDataset}')
print(f'Means of dataset per channel is: {stds}')

Writing  file
Writing  file
Means of dataset per channel is: [119.80394  112.04619  105.001976]


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [48]:
# Test Set Handling
pathToSourceTestImages = os.path.join(pathToAllImages, "test")
pathToTestLabels = os.path.join(pathToAllLabels, "test_id_label.csv")
pathToDestTestImages = os.path.join(pathToAllResizedImages, "test")
checkIfDirExistsAndCreate(pathToDestTestImages)
pathToDestTestImages_2 = os.path.join(pathToAllResizedImages_2, "test")
checkIfDirExistsAndCreate(pathToDestTestImages_2)


imageListTuple = generateFileList(df_test_labels, pathToSourceTestImages, pathToDestTestImages, IMG_SIZES_1)
resizeImagesMultiprocessing(imageListTuple)
imageListTuple = generateFileList(df_test_labels, pathToSourceTestImages, pathToDestTestImages_2, IMG_SIZES_2)
resizeImagesMultiprocessing(imageListTuple)

Dir not found, creating /home/armin/repos/FKD-Dataset/006_images_resized_2/test instead
Processed 1 lines.Processed 101 lines.Processed 201 lines.Processed 301 lines.Processed 401 lines.Processed 501 lines.Processed 601 lines.Processed 701 lines.Processed 801 lines.Processed 901 lines.Processed 1001 lines.Processed 1101 lines.Processed 1201 lines.Processed 1301 lines.Processed 1401 lines.Processed 1501 lines.Processed 1601 lines.Processed 1701 lines.Processed 1801 lines.Processed 1901 lines.Processed 2001 lines.Processed 2101 lines.Processed 2201 lines.Processed 2301 lines.Processed 2401 lines.Processed 2501 lines.Processed 2601 lines.Processed 2701 lines.Processed 2801 lines.Processed 2901 lines.Processed 3001 lines.Processed 3101 lines.Processed 3201 lines.Processed 3301 lines.Processed 3401 lines.Processed 3501 lines.Processed 3601 lines.Processed 3701 lines.Processed 3801 lines.Processed 3901 lines.Processed 4001 lines.Processed 4101 lines.

100%|██████████| 58954/58954 [00:02<00:00, 27022.68it/s]


Processed 1 lines.Processed 101 lines.Processed 201 lines.Processed 301 lines.Processed 401 lines.Processed 501 lines.Processed 601 lines.Processed 701 lines.Processed 801 lines.Processed 901 lines.Processed 1001 lines.Processed 1101 lines.Processed 1201 lines.Processed 1301 lines.Processed 1401 lines.Processed 1501 lines.Processed 1601 lines.Processed 1701 lines.Processed 1801 lines.Processed 1901 lines.Processed 2001 lines.Processed 2101 lines.Processed 2201 lines.Processed 2301 lines.Processed 2401 lines.Processed 2501 lines.Processed 2601 lines.Processed 2701 lines.Processed 2801 lines.Processed 2901 lines.Processed 3001 lines.Processed 3101 lines.Processed 3201 lines.Processed 3301 lines.Processed 3401 lines.Processed 3501 lines.Processed 3601 lines.Processed 3701 lines.Processed 3801 lines.Processed 3901 lines.Processed 4001 lines.Processed 4101 lines.Processed 4201 lines.Processed 4301 lines.Processed 4401 lines.Processed 4501 lines.

Processed 52101 lines.Processed 52201 lines.Processed 52301 lines.Processed 52401 lines.Processed 52501 lines.Processed 52601 lines.Processed 52701 lines.Processed 52801 lines.Processed 52901 lines.Processed 53001 lines.Processed 53101 lines.Processed 53201 lines.Processed 53301 lines.Processed 53401 lines.Processed 53501 lines.Processed 53601 lines.Processed 53701 lines.Processed 53801 lines.Processed 53901 lines.Processed 54001 lines.Processed 54101 lines.Processed 54201 lines.Processed 54301 lines.Processed 54401 lines.Processed 54501 lines.Processed 54601 lines.Processed 54701 lines.Processed 54801 lines.Processed 54901 lines.Processed 55001 lines.Processed 55101 lines.Processed 55201 lines.Processed 55301 lines.Processed 55401 lines.Processed 55501 lines.Processed 55601 lines.Processed 55701 lines.Processed 55801 lines.Processed 55901 lines.Processed 56001 lines.Processed 56101 lines.Processed 56201 lines.Processed 56301 lines.Processed 5

100%|██████████| 58954/58954 [10:24<00:00, 94.46it/s] 


In [49]:
# df_test_labels.head(1)

# Val Set Handling
pathToSourceValImages = os.path.join(pathToAllImages, "val")
pathToValLabels = os.path.join(pathToAllLabels, "val_id_label.csv")
pathToDestValImages = os.path.join(pathToAllResizedImages, "val")
pathToDestValImages_2 = os.path.join(pathToAllResizedImages_2, "val")
checkIfDirExistsAndCreate(pathToDestValImages)
checkIfDirExistsAndCreate(pathToDestValImages_2)


imageListTuple = generateFileList(df_val_labels, pathToSourceValImages, pathToDestValImages, IMG_SIZES_1)
resizeImagesMultiprocessing(imageListTuple)
imageListTuple = generateFileList(df_val_labels, pathToSourceValImages, pathToDestValImages_2, IMG_SIZES_2)
resizeImagesMultiprocessing(imageListTuple)

Dir not found, creating /home/armin/repos/FKD-Dataset/006_images_resized_2/val instead
Processed 58901 lines.

100%|██████████| 58972/58972 [00:02<00:00, 28567.91it/s]


Processed 58901 lines.

100%|██████████| 58972/58972 [10:38<00:00, 92.39it/s] 


In [50]:

path_to_cleaned_files_text_image_label_file = os.path.join(path_to_fakeddit_dataset_dir, "007_text_image_label")    
checkIfDirExistsAndCreate(path_to_cleaned_files_text_image_label_file) 

path_to_cleaned_files_text_image_meta_label_file = os.path.join(path_to_fakeddit_dataset_dir, "008_text_image_meta_label")    
checkIfDirExistsAndCreate(path_to_cleaned_files_text_image_meta_label_file) 

path_to_cleaned_meta_label_file = os.path.join(path_to_fakeddit_dataset_dir, "009_meta_label")    
checkIfDirExistsAndCreate(path_to_cleaned_meta_label_file) 

Dir not found, creating /home/armin/repos/FKD-Dataset/007_text_image_label instead
Dir not found, creating /home/armin/repos/FKD-Dataset/008_text_image_meta_label instead
Dir not found, creating /home/armin/repos/FKD-Dataset/009_meta_label instead


In [51]:
createIDTitleTextLabelFile(df_taken_train, path_to_cleaned_files, 'train_title_image_label.csv')   
createIDTitleTextLabelFile(df_taken_test, path_to_cleaned_files, 'test_title_image_label.csv')    
createIDTitleTextLabelFile(df_taken_val, path_to_cleaned_files, 'val_title_image_label.csv')

In [52]:
# %%capture
# df_taken_train = parallelize_dataframe_comments(df_taken_train, addComments1, 16)
# df_taken_test = parallelize_dataframe(df_taken_test, addComments1, 16)
# df_taken_val = parallelize_dataframe(df_taken_val, addComments1, 16)

In [53]:
%%capture
print('starting with comments train')
createIDTitleCommentsTextLabelFile(df_taken_train, path_to_cleaned_files_text_image_label_file, 'train_text_image_label.csv', True)   

print('starting with comments test')
createIDTitleCommentsTextLabelFile(df_taken_test, path_to_cleaned_files_text_image_label_file, 'test_text_image_label.csv')    

print('starting with comments val')
createIDTitleCommentsTextLabelFile(df_taken_val, path_to_cleaned_files_text_image_label_file, 'val_text_image_label.csv')    


In [54]:
%%capture
print('starting with all meta data train')
createMetaDataLabelFile(df_taken_train, path_to_cleaned_meta_label_file, 'train_meta_label.csv')   

print('starting with all meta data test')
createMetaDataLabelFile(df_taken_test, path_to_cleaned_meta_label_file, 'test_meta_label.csv')    

print('starting with all meta data val')
createMetaDataLabelFile(df_taken_val, path_to_cleaned_meta_label_file, 'val_meta_label.csv')  

In [55]:
# def addFullPath123(fileName, pathToImages):
#     fileName = (str(fileName) + '.jpg')
#     return os.path.join(pathToImages, fileName)

In [56]:
# def createIDTitleCommentsTextMetaDataLabelFile123(dataframe, pathToDirectory, pathToImages, fileName, isTrain = False): 
#     path_to_cleaned_csv = os.path.join(pathToDirectory, fileName)
#     dataframe = dataframe.reindex(columns=['author_enc', 'clean_title', 'id', 'imagePath', 'comments', 'num_comments', 'up_vote_comments', 'score', 'hasNanScore', 'upvote_ratio', 'hasNanUpvote', '2_way_label'])
#     dataframe['imagePath'] = dataframe['imagePath'].astype(str)
#     if isTrain:
# #         df = dataframe[['author_enc', 'clean_title', 'id', 'imagePath', 'score', 'hasNanScore', 'upvote_ratio', 'hasNanUpvote', 'comments', 'num_comments', 'up_vote_comments', 'means', 'stds', '2_way_label']]
#         for tupleRaw in dataframe.itertuples(index=True, name=None):
#             row_dict = convertRowToDictionary(tupleRaw, dataframe.columns, True)
#             path = addFullPath123(row_dict['id'], pathToImages)
#             dataframe.at[tupleRaw[0], "imagePath"] = path
#     else:
#         for tupleRaw in dataframe.itertuples(index=True, name=None):
#             row_dict = convertRowToDictionary(tupleRaw, dataframe.columns, True)
# #             df = dataframe[['author_enc', 'clean_title', 'id', 'imagePath', 'comments', 'num_comments', 'up_vote_comments', 'score', 'hasNanScore', 'upvote_ratio', 'hasNanUpvote', '2_way_label']]
#             path = addFullPath123(row_dict['id'], pathToImages)
#             dataframe.at[tupleRaw[0], "imagePath"] = path
#     dataframe.to_csv(path_to_cleaned_csv, sep='\t', encoding='utf-8', index=False)
#     return dataframe

In [57]:
%%capture
print('starting with all data train')
createIDTitleCommentsTextMetaDataLabelFile(df_taken_train, path_to_cleaned_files_text_image_meta_label_file, os.path.join(pathToAllResizedImages, 'train'), 'train_text_image_meta_label.csv', True)   

print('starting with all data test')
createIDTitleCommentsTextMetaDataLabelFile(df_taken_test, path_to_cleaned_files_text_image_meta_label_file, os.path.join(pathToAllResizedImages, 'test'),'test_text_image_meta_label.csv')    

print('starting with all data val')
createIDTitleCommentsTextMetaDataLabelFile(df_taken_val, path_to_cleaned_files_text_image_meta_label_file, os.path.join(pathToAllResizedImages, 'val'),'val_text_image_meta_label.csv')    

In [58]:
print("Done!!!")

Done!!!


In [59]:
end = time.time()
print(f'It took {(end - start) / 60} minutes to process everything' )

It took 591.5060166517893 minutes to process everything
