In [1]:
# !pip install demoji

In [2]:
import datetime
from glob import glob
import time
import os
import re
import pandas as pd
import demoji
# Fetch the emoji code table that demoji.replace() relies on.
# NOTE(review): demoji >= 1.0 reportedly bundles this data and deprecates
# download_codes() -- confirm against the installed demoji version.
demoji.download_codes()

def print_full(x):
    """Print a pandas object without any row, column or width truncation.

    The display options are overridden only while printing. The settings that
    were active before the call are restored afterwards (rather than being
    reset to the pandas defaults), even if printing raises.

    Parameters
    ----------
    x : pandas.DataFrame or pandas.Series
        Object to print in full.
    """
    with pd.option_context(
        'display.max_rows', None,
        'display.max_columns', None,
        'display.width', 2000,
        'display.float_format', '{:20,.2f}'.format,
        # None means "no limit"; the legacy -1 sentinel is rejected by
        # current pandas releases.
        'display.max_colwidth', None,
    ):
        print(x)

# Notebook-wide display settings: never truncate rows, columns or cell text.
# None means "no limit"; the legacy -1 sentinel for max_colwidth is rejected
# by current pandas releases.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Number of rows per chunk when streaming large CSV files.
chunksize = 10 ** 6

# Timestamp of this run; it names the output folder so runs never collide.
date_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
print("\ndate_time = {}".format(date_time))

out_folder = "out_{}".format(date_time)

if not os.path.exists(out_folder):
    os.mkdir(out_folder)
    print("Directory " , out_folder ,  " Created ")
else:
    print("Directory " , out_folder ,  " already exists")

# df = pd.read_csv("data/out_35.csv")
# df = pd.read_csv("data/subset_1.csv", encoding='utf8', dtype=str)
# print(df.count())
# df.head()


[33mDownloading emoji data ...[0m
[92m... OK[0m (Got response in 0.86 seconds)
[33mWriting emoji data to C:\Users\USER\.demoji/codes.json ...[0m
[92m... OK[0m

date_time = 20200414_105818
Directory  out_20200414_105818  Created 




In [3]:
def filterDuplicate(df):
    """Drop rows that exactly duplicate an earlier row of ``df``.

    A one-line summary (rows processed, rows skipped, elapsed time) is printed
    to stdout.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to de-duplicate; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with every repeat after the first occurrence
        removed (original index labels are kept).
    """
    time_start = time.time()
    print("  -> filterDuplicate()", end='')

    result = df[~df.duplicated(keep='first')]

    processed = len(df.index)
    skipped = processed - len(result.index)
    # Guard the ratio: an empty input frame would otherwise divide by zero.
    skipped_percentage = skipped / processed * 100 if processed else 0.0

    # Named "elapsed" so the time module is not shadowed by a float.
    elapsed = time.time() - time_start
    print(" - Processed: {:,} | Skipped: {:,} ({:.2f}%) | Time: {:,.3f} sec".format(processed, skipped, skipped_percentage, elapsed))

    return result


In [4]:
def filterDuplicateOriginalTweetId(df):
    """Drop retweets/quotes whose ``original_tweet_id`` was already seen.

    Ids seen so far are kept in ``<out_folder>/df_original_tweet_ids.csv``
    (created on first use), so de-duplication carries over between calls.
    A row counts as "already seen" when its ``original_tweet_id`` is missing,
    repeats an earlier row of ``df``, or is present in that file. Ids that are
    new in this call are appended to the file.

    Only rows flagged as a retweet or a quote (``is_retweet``/``is_quote``
    equal to the string ``'True'``) are actually dropped; every other row is
    kept even when its id is missing or already seen.

    A one-line summary is printed to stdout.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``original_tweet_id``, ``is_retweet`` and ``is_quote``
        columns; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that survive the filter.
    """
    time_start = time.time()
    print("  -> filterDuplicateOriginalTweetId()", end='')

    file_name = "{}/df_original_tweet_ids.csv".format(out_folder)
    if not os.path.exists(file_name):
        # Start the id file with just its header row.
        pd.DataFrame(columns=['original_tweet_id'], dtype=str).to_csv(file_name, index=False, encoding='utf-8')

    # Rows that contribute no new id: missing id or a repeat inside this frame...
    remove = df.original_tweet_id.isnull()
    remove = remove | df.duplicated(subset=['original_tweet_id'], keep='first')

    # ...or an id recorded by an earlier call (streamed to bound memory use).
    for seen_ids in pd.read_csv(file_name, encoding='utf-8', dtype=str, chunksize=chunksize):
        remove = remove | df.original_tweet_id.isin(seen_ids.original_tweet_id)

    # Record the ids that are new in this call before narrowing the mask.
    df.loc[~remove, ['original_tweet_id']].to_csv(file_name, mode='a', header=False, index=False, encoding='utf-8')

    # Only retweets and quotes are dropped; other rows always pass through.
    remove = remove & (df.is_retweet.isin(['True']) | df.is_quote.isin(['True']))
    result = df[~remove]

    processed = len(df.index)
    skipped = processed - len(result.index)
    # Guard the ratio: an empty input frame would otherwise divide by zero.
    skipped_percentage = skipped / processed * 100 if processed else 0.0

    # Named "elapsed" so the time module is not shadowed by a float.
    elapsed = time.time() - time_start
    print(" - Processed: {:,} | Skipped: {:,} ({:.2f}%) | Time: {:,.3f} sec".format(processed, skipped, skipped_percentage, elapsed))

    return result

# subsets = [file for file in glob('data/subset_*.csv'.format())]
# subsets = sorted(subsets)
# for file_name in subsets:
#     print("\nProcessing '{}'".format(file_name))
#     df = pd.read_csv(file_name, encoding='utf8', dtype=str)
#     m = df[df['original_tweet_id']=='1236850730163560448']
#     print(m.head(100))
#     print()


In [5]:
def filterUrl(df):
    """Strip http/https URLs from the ``text`` and ``quoted_text`` columns.

    A one-line summary (rows processed, elapsed time) is printed to stdout.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``text`` and ``quoted_text``. It is left
        untouched; the cleaned columns are placed on a new frame.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose ``text`` and ``quoted_text`` have every URL
        removed and are cast with ``astype(str)``.
    """
    time_start = time.time()
    print("  -> filterUrl()", end='')

    # The character classes deliberately exclude whitespace so a match stops at
    # the end of the URL instead of swallowing the words that follow it.
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default and would remove nothing.
    # NOTE(review): astype(str) is kept from the original; on object columns it
    # turns missing values into the literal text 'nan' -- confirm that is wanted.
    result = df.assign(
        text=df['text'].str.replace(url_regex, '', regex=True).astype(str),
        quoted_text=df['quoted_text'].str.replace(url_regex, '', regex=True).astype(str),
    )

    processed = len(result.index)
    # Named "elapsed" so the time module is not shadowed by a float.
    elapsed = time.time() - time_start
    print(" - Processed: {:,} | Time: {:,.3f} sec".format(processed, elapsed))

    return result


In [6]:
def checkNonEngAndEmoji(row):
    """Flag a row that contains emoji and/or non-ASCII characters.

    Both ``text`` and ``quoted_text`` are inspected after the ellipsis
    character has been removed. ``has_emoji`` is set to ``'True'`` when
    stripping emoji with demoji shortens either field; ``has_non_eng`` is set
    to ``'True'`` when either emoji-free field still holds a character that
    cannot be decoded as ASCII. The row is updated in place and returned.
    """
    emoji_free = []
    for column in ('text', 'quoted_text'):
        content = re.sub(r'…', '', str(row[column]))
        stripped = demoji.replace(content, '')
        # A change in length means demoji removed at least one emoji.
        if len(stripped) != len(content):
            row['has_emoji'] = 'True'
        emoji_free.append(stripped)

    try:
        # Decoding the UTF-8 bytes as ASCII fails on any non-ASCII character.
        for stripped in emoji_free:
            stripped.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        row['has_non_eng'] = 'True'

    return row


def filterNonEngAndEmoji(df):
    """Flag emoji / non-ASCII rows and drop the non-ASCII ones.

    Two string columns, ``has_emoji`` and ``has_non_eng``, are added with the
    value ``'False'`` and then updated row by row by ``checkNonEngAndEmoji``.
    Rows whose ``has_non_eng`` is ``'True'`` are removed; rows that merely
    contain emoji are kept.

    A one-line summary is printed to stdout.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``text`` and ``quoted_text`` columns. It is left untouched;
        the flag columns are added to a new frame.

    Returns
    -------
    pandas.DataFrame
        The flagged frame without the rows marked as non-ASCII.
    """
    time_start = time.time()
    print("  -> filterNonEngAndEmoji()", end='')

    # assign() builds a new frame, so the caller's (possibly sliced) frame is
    # not written to.
    flagged = df.assign(has_emoji='False', has_non_eng='False')
    flagged = flagged.apply(checkNonEngAndEmoji, axis=1)

    result = flagged[~flagged.has_non_eng.isin(['True'])]

    processed = len(flagged.index)
    skipped = processed - len(result.index)
    # Guard the ratio: an empty input frame would otherwise divide by zero.
    skipped_percentage = skipped / processed * 100 if processed else 0.0

    # Named "elapsed" so the time module is not shadowed by a float.
    elapsed = time.time() - time_start
    print(" - Processed: {:,} | Non-Eng Skipped: {:,} ({:.2f}%) | Time: {:,.3f} sec".format(processed, skipped, skipped_percentage, elapsed))

    return result


In [7]:
# Run every raw tweet dump through the filter pipeline and append the
# survivors to a single combined CSV in the output folder.
# NOTE(review): sorted() orders file names lexicographically, so a name like
# out_100.csv would come before out_37.csv -- confirm that is acceptable,
# since the duplicate filters keep the first occurrence they see.
subsets = sorted(glob('data/tweet_files/out_*.csv'))
print("List of csv: {}\n".format(subsets))

out_file_name = "{}/combined.csv".format(out_folder)

for file_name in subsets:
    print("\nProcessing '{}'".format(file_name))
    # on_bad_lines='warn' skips malformed rows and reports them; it replaces
    # error_bad_lines=False, which was removed in pandas 2.0.
    df = pd.read_csv(file_name, encoding='utf8', dtype=str, on_bad_lines='warn')
    df = filterDuplicate(df)
    df = filterDuplicateOriginalTweetId(df)
    df = filterUrl(df)
    df = filterNonEngAndEmoji(df)

    # Write the header only when this write creates the combined file.
    write_header = not os.path.exists(out_file_name)
    df.to_csv(out_file_name, mode='a', header=write_header, index=False, encoding='utf-8')


List of csv: ['data/tweet_files\\out_37.csv', 'data/tweet_files\\out_38.csv', 'data/tweet_files\\out_39.csv', 'data/tweet_files\\out_40.csv', 'data/tweet_files\\out_41.csv', 'data/tweet_files\\out_42.csv', 'data/tweet_files\\out_43.csv', 'data/tweet_files\\out_44.csv', 'data/tweet_files\\out_45.csv', 'data/tweet_files\\out_46.csv', 'data/tweet_files\\out_47.csv', 'data/tweet_files\\out_48.csv', 'data/tweet_files\\out_49.csv', 'data/tweet_files\\out_50.csv', 'data/tweet_files\\out_51.csv', 'data/tweet_files\\out_52.csv', 'data/tweet_files\\out_53.csv', 'data/tweet_files\\out_54.csv', 'data/tweet_files\\out_55.csv', 'data/tweet_files\\out_56.csv', 'data/tweet_files\\out_57.csv', 'data/tweet_files\\out_58.csv', 'data/tweet_files\\out_59.csv', 'data/tweet_files\\out_60.csv', 'data/tweet_files\\out_61.csv', 'data/tweet_files\\out_62.csv', 'data/tweet_files\\out_63.csv', 'data/tweet_files\\out_64.csv', 'data/tweet_files\\out_65.csv', 'data/tweet_files\\out_66.csv', 'data/tweet_files\\out_67.

b'Skipping line 521165: expected 16 fields, saw 17\n'
b'Skipping line 806035: expected 16 fields, saw 17\n'


  -> filterDuplicate() - Processed: 999,981 | Skipped: 197 (0.02%) | Time: 2.586 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,784 | Skipped: 769,799 (77.00%) | Time: 0.738 sec
  -> filterUrl() - Processed: 229,985 | Time: 0.594 sec
  -> filterNonEngAndEmoji() - Processed: 229,985 | Non-Eng Skipped: 100,547 (43.72%) | Time: 169.204 sec

Processing 'data/tweet_files\out_39.csv'


b'Skipping line 459654: expected 16 fields, saw 17\n'


  -> filterDuplicate() - Processed: 999,990 | Skipped: 139 (0.01%) | Time: 2.805 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,851 | Skipped: 827,807 (82.79%) | Time: 0.782 sec
  -> filterUrl() - Processed: 172,044 | Time: 0.471 sec
  -> filterNonEngAndEmoji() - Processed: 172,044 | Non-Eng Skipped: 74,150 (43.10%) | Time: 126.338 sec

Processing 'data/tweet_files\out_40.csv'


b'Skipping line 675645: expected 16 fields, saw 17\n'


  -> filterDuplicate() - Processed: 815,270 | Skipped: 265 (0.03%) | Time: 2.155 sec
  -> filterDuplicateOriginalTweetId() - Processed: 815,005 | Skipped: 677,700 (83.15%) | Time: 0.759 sec
  -> filterUrl() - Processed: 137,305 | Time: 0.381 sec
  -> filterNonEngAndEmoji() - Processed: 137,305 | Non-Eng Skipped: 57,623 (41.97%) | Time: 106.193 sec

Processing 'data/tweet_files\out_41.csv'


b'Skipping line 533128: expected 16 fields, saw 17\n'


  -> filterDuplicate() - Processed: 999,990 | Skipped: 193 (0.02%) | Time: 2.702 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,797 | Skipped: 778,685 (77.88%) | Time: 1.037 sec
  -> filterUrl() - Processed: 221,112 | Time: 0.624 sec
  -> filterNonEngAndEmoji() - Processed: 221,112 | Non-Eng Skipped: 108,937 (49.27%) | Time: 167.107 sec

Processing 'data/tweet_files\out_42.csv'
  -> filterDuplicate() - Processed: 999,990 | Skipped: 154 (0.02%) | Time: 2.559 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,836 | Skipped: 813,995 (81.41%) | Time: 1.150 sec
  -> filterUrl() - Processed: 185,841 | Time: 0.507 sec
  -> filterNonEngAndEmoji() - Processed: 185,841 | Non-Eng Skipped: 88,946 (47.86%) | Time: 141.719 sec

Processing 'data/tweet_files\out_43.csv'
  -> filterDuplicate() - Processed: 999,994 | Skipped: 156 (0.02%) | Time: 2.753 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,838 | Skipped: 843,306 (84.34%) | Time: 1.188 sec
  -> filterUrl() - Proces

b'Skipping line 310888: expected 16 fields, saw 17\n'
b'Skipping line 403718: expected 16 fields, saw 17\n'


  -> filterDuplicate() - Processed: 999,979 | Skipped: 191 (0.02%) | Time: 2.671 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,788 | Skipped: 755,874 (75.60%) | Time: 1.944 sec
  -> filterUrl() - Processed: 243,914 | Time: 0.660 sec
  -> filterNonEngAndEmoji() - Processed: 243,914 | Non-Eng Skipped: 109,225 (44.78%) | Time: 182.757 sec

Processing 'data/tweet_files\out_51.csv'
  -> filterDuplicate() - Processed: 596,077 | Skipped: 92 (0.02%) | Time: 1.388 sec
  -> filterDuplicateOriginalTweetId() - Processed: 595,985 | Skipped: 478,472 (80.28%) | Time: 1.768 sec
  -> filterUrl() - Processed: 117,513 | Time: 0.306 sec
  -> filterNonEngAndEmoji() - Processed: 117,513 | Non-Eng Skipped: 53,072 (45.16%) | Time: 88.747 sec

Processing 'data/tweet_files\out_52.csv'
  -> filterDuplicate() - Processed: 999,988 | Skipped: 133 (0.01%) | Time: 2.710 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,855 | Skipped: 759,171 (75.93%) | Time: 2.258 sec
  -> filterUrl() - Processe

b'Skipping line 326786: expected 16 fields, saw 17\n'


  -> filterDuplicate() - Processed: 999,982 | Skipped: 193 (0.02%) | Time: 2.894 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,789 | Skipped: 750,585 (75.07%) | Time: 2.971 sec
  -> filterUrl() - Processed: 249,204 | Time: 0.783 sec
  -> filterNonEngAndEmoji() - Processed: 249,204 | Non-Eng Skipped: 111,294 (44.66%) | Time: 192.153 sec

Processing 'data/tweet_files\out_59.csv'


b'Skipping line 218476: expected 16 fields, saw 17\n'
b'Skipping line 296068: expected 16 fields, saw 17\nSkipping line 296951: expected 16 fields, saw 17\nSkipping line 297652: expected 16 fields, saw 17\nSkipping line 300147: expected 16 fields, saw 17\nSkipping line 305183: expected 16 fields, saw 17\nSkipping line 305264: expected 16 fields, saw 17\nSkipping line 308948: expected 16 fields, saw 17\nSkipping line 321130: expected 16 fields, saw 17\n'
b'Skipping line 344157: expected 16 fields, saw 17\n'
b'Skipping line 361357: expected 16 fields, saw 17\nSkipping line 371622: expected 16 fields, saw 17\n'
b'Skipping line 395355: expected 16 fields, saw 17\nSkipping line 396517: expected 16 fields, saw 17\n'
b'Skipping line 443081: expected 16 fields, saw 17\n'
b'Skipping line 707540: expected 16 fields, saw 17\n'
b'Skipping line 987795: expected 16 fields, saw 17\n'


  -> filterDuplicate() - Processed: 999,955 | Skipped: 218 (0.02%) | Time: 2.871 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,737 | Skipped: 803,466 (80.37%) | Time: 3.027 sec
  -> filterUrl() - Processed: 196,271 | Time: 0.549 sec
  -> filterNonEngAndEmoji() - Processed: 196,271 | Non-Eng Skipped: 86,813 (44.23%) | Time: 150.683 sec

Processing 'data/tweet_files\out_60.csv'


b'Skipping line 522985: expected 16 fields, saw 17\n'
b'Skipping line 911908: expected 16 fields, saw 17\n'


  -> filterDuplicate() - Processed: 999,977 | Skipped: 138 (0.01%) | Time: 3.078 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,839 | Skipped: 806,701 (80.68%) | Time: 3.102 sec
  -> filterUrl() - Processed: 193,138 | Time: 0.587 sec
  -> filterNonEngAndEmoji() - Processed: 193,138 | Non-Eng Skipped: 81,132 (42.01%) | Time: 148.500 sec

Processing 'data/tweet_files\out_61.csv'


b'Skipping line 12249: expected 16 fields, saw 17\n'
b'Skipping line 771826: expected 16 fields, saw 17\n'
b'Skipping line 811383: expected 16 fields, saw 17\n'


  -> filterDuplicate() - Processed: 999,977 | Skipped: 136 (0.01%) | Time: 2.755 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,841 | Skipped: 765,451 (76.56%) | Time: 3.140 sec
  -> filterUrl() - Processed: 234,390 | Time: 0.665 sec
  -> filterNonEngAndEmoji() - Processed: 234,390 | Non-Eng Skipped: 106,085 (45.26%) | Time: 178.893 sec

Processing 'data/tweet_files\out_62.csv'
  -> filterDuplicate() - Processed: 141,000 | Skipped: 27 (0.02%) | Time: 0.305 sec
  -> filterDuplicateOriginalTweetId() - Processed: 140,973 | Skipped: 106,145 (75.29%) | Time: 2.486 sec
  -> filterUrl() - Processed: 34,828 | Time: 0.090 sec
  -> filterNonEngAndEmoji() - Processed: 34,828 | Non-Eng Skipped: 15,309 (43.96%) | Time: 26.450 sec

Processing 'data/tweet_files\out_63.csv'


b'Skipping line 52982: expected 16 fields, saw 17\n'
b'Skipping line 296024: expected 16 fields, saw 17\n'
b'Skipping line 539972: expected 16 fields, saw 17\n'


  -> filterDuplicate() - Processed: 1,026,073 | Skipped: 29,895 (2.91%) | Time: 2.615 sec
  -> filterDuplicateOriginalTweetId() - Processed: 996,178 | Skipped: 745,941 (74.88%) | Time: 3.331 sec
  -> filterUrl() - Processed: 250,237 | Time: 0.659 sec
  -> filterNonEngAndEmoji() - Processed: 250,237 | Non-Eng Skipped: 113,361 (45.30%) | Time: 189.376 sec

Processing 'data/tweet_files\out_64.csv'
  -> filterDuplicate() - Processed: 1,016,243 | Skipped: 16,471 (1.62%) | Time: 2.687 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,772 | Skipped: 812,521 (81.27%) | Time: 3.205 sec
  -> filterUrl() - Processed: 187,251 | Time: 0.507 sec
  -> filterNonEngAndEmoji() - Processed: 187,251 | Non-Eng Skipped: 80,696 (43.10%) | Time: 138.613 sec

Processing 'data/tweet_files\out_65.csv'
  -> filterDuplicate() - Processed: 999,992 | Skipped: 165 (0.02%) | Time: 2.678 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,827 | Skipped: 797,691 (79.78%) | Time: 3.331 sec
  -> filterUrl(

b'Skipping line 146678: expected 16 fields, saw 17\n'


  -> filterDuplicate() - Processed: 999,986 | Skipped: 309 (0.03%) | Time: 2.634 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,677 | Skipped: 769,574 (76.98%) | Time: 3.566 sec
  -> filterUrl() - Processed: 230,103 | Time: 0.647 sec
  -> filterNonEngAndEmoji() - Processed: 230,103 | Non-Eng Skipped: 102,619 (44.60%) | Time: 176.372 sec

Processing 'data/tweet_files\out_67.csv'
  -> filterDuplicate() - Processed: 1,005,652 | Skipped: 5,899 (0.59%) | Time: 2.741 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,753 | Skipped: 742,839 (74.30%) | Time: 3.877 sec
  -> filterUrl() - Processed: 256,914 | Time: 0.744 sec
  -> filterNonEngAndEmoji() - Processed: 256,914 | Non-Eng Skipped: 113,757 (44.28%) | Time: 199.522 sec

Processing 'data/tweet_files\out_68.csv'
  -> filterDuplicate() - Processed: 999,986 | Skipped: 183 (0.02%) | Time: 2.665 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,803 | Skipped: 781,854 (78.20%) | Time: 3.766 sec
  -> filterUrl() - P

b'Skipping line 236098: expected 16 fields, saw 17\n'
b'Skipping line 779376: expected 16 fields, saw 17\n'
b'Skipping line 802995: expected 16 fields, saw 17\nSkipping line 818897: expected 16 fields, saw 17\n'


  -> filterDuplicate() - Processed: 999,970 | Skipped: 306 (0.03%) | Time: 2.660 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,664 | Skipped: 772,747 (77.30%) | Time: 4.466 sec
  -> filterUrl() - Processed: 226,917 | Time: 0.629 sec
  -> filterNonEngAndEmoji() - Processed: 226,917 | Non-Eng Skipped: 103,098 (45.43%) | Time: 169.933 sec

Processing 'data/tweet_files\out_77.csv'


b'Skipping line 65580: expected 16 fields, saw 17\nSkipping line 70560: expected 16 fields, saw 17\n'
b'Skipping line 256591: expected 16 fields, saw 17\nSkipping line 256761: expected 16 fields, saw 17\n'
b'Skipping line 456185: expected 16 fields, saw 17\n'
b'Skipping line 462344: expected 16 fields, saw 17\n'


  -> filterDuplicate() - Processed: 999,973 | Skipped: 194 (0.02%) | Time: 2.557 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,779 | Skipped: 815,420 (81.56%) | Time: 4.438 sec
  -> filterUrl() - Processed: 184,359 | Time: 0.554 sec
  -> filterNonEngAndEmoji() - Processed: 184,359 | Non-Eng Skipped: 79,499 (43.12%) | Time: 137.504 sec

Processing 'data/tweet_files\out_78.csv'
  -> filterDuplicate() - Processed: 999,983 | Skipped: 131 (0.01%) | Time: 2.551 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,852 | Skipped: 802,249 (80.24%) | Time: 4.568 sec
  -> filterUrl() - Processed: 197,603 | Time: 0.548 sec
  -> filterNonEngAndEmoji() - Processed: 197,603 | Non-Eng Skipped: 83,153 (42.08%) | Time: 149.525 sec

Processing 'data/tweet_files\out_79.csv'
  -> filterDuplicate() - Processed: 105,451 | Skipped: 21 (0.02%) | Time: 0.234 sec
  -> filterDuplicateOriginalTweetId() - Processed: 105,430 | Skipped: 80,156 (76.03%) | Time: 3.762 sec
  -> filterUrl() - Processed

b'Skipping line 196820: expected 16 fields, saw 17\n'


  -> filterDuplicate() - Processed: 999,973 | Skipped: 208 (0.02%) | Time: 2.729 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,765 | Skipped: 733,738 (73.39%) | Time: 4.989 sec
  -> filterUrl() - Processed: 266,027 | Time: 0.720 sec
  -> filterNonEngAndEmoji() - Processed: 266,027 | Non-Eng Skipped: 117,145 (44.04%) | Time: 200.743 sec

Processing 'data/tweet_files\out_82.csv'
  -> filterDuplicate() - Processed: 529,150 | Skipped: 215 (0.04%) | Time: 1.257 sec
  -> filterDuplicateOriginalTweetId() - Processed: 528,935 | Skipped: 382,824 (72.38%) | Time: 4.545 sec
  -> filterUrl() - Processed: 146,111 | Time: 0.402 sec
  -> filterNonEngAndEmoji() - Processed: 146,111 | Non-Eng Skipped: 62,904 (43.05%) | Time: 111.152 sec

Processing 'data/tweet_files\out_83.csv'
  -> filterDuplicate() - Processed: 1,058,958 | Skipped: 59,170 (5.59%) | Time: 2.267 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,788 | Skipped: 743,236 (74.34%) | Time: 5.187 sec
  -> filterUrl() - P

b'Skipping line 807233: expected 16 fields, saw 17\n'
b'Skipping line 870731: expected 16 fields, saw 17\n'


  -> filterDuplicate() - Processed: 999,953 | Skipped: 131 (0.01%) | Time: 2.642 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,822 | Skipped: 739,193 (73.93%) | Time: 5.327 sec
  -> filterUrl() - Processed: 260,629 | Time: 0.726 sec
  -> filterNonEngAndEmoji() - Processed: 260,629 | Non-Eng Skipped: 109,692 (42.09%) | Time: 195.329 sec

Processing 'data/tweet_files\out_87.csv'
  -> filterDuplicate() - Processed: 1,014,419 | Skipped: 14,899 (1.47%) | Time: 2.687 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,520 | Skipped: 727,115 (72.75%) | Time: 5.545 sec
  -> filterUrl() - Processed: 272,405 | Time: 0.747 sec
  -> filterNonEngAndEmoji() - Processed: 272,405 | Non-Eng Skipped: 116,742 (42.86%) | Time: 207.198 sec

Processing 'data/tweet_files\out_88.csv'
  -> filterDuplicate() - Processed: 696,680 | Skipped: 138 (0.02%) | Time: 1.682 sec
  -> filterDuplicateOriginalTweetId() - Processed: 696,542 | Skipped: 509,356 (73.13%) | Time: 5.266 sec
  -> filterUrl() - 

b'Skipping line 270233: expected 16 fields, saw 17\nSkipping line 284868: expected 16 fields, saw 17\n'
b'Skipping line 705253: expected 16 fields, saw 17\n'


  -> filterDuplicate() - Processed: 894,682 | Skipped: 192 (0.02%) | Time: 2.428 sec
  -> filterDuplicateOriginalTweetId() - Processed: 894,490 | Skipped: 651,761 (72.86%) | Time: 5.666 sec
  -> filterUrl() - Processed: 242,729 | Time: 0.662 sec
  -> filterNonEngAndEmoji() - Processed: 242,729 | Non-Eng Skipped: 107,367 (44.23%) | Time: 185.724 sec

Processing 'data/tweet_files\out_92.csv'
  -> filterDuplicate() - Processed: 1,418 | Skipped: 0 (0.00%) | Time: 0.006 sec
  -> filterDuplicateOriginalTweetId() - Processed: 1,418 | Skipped: 858 (60.51%) | Time: 4.686 sec
  -> filterUrl() - Processed: 560 | Time: 0.002 sec
  -> filterNonEngAndEmoji() - Processed: 560 | Non-Eng Skipped: 250 (44.64%) | Time: 0.438 sec

Processing 'data/tweet_files\out_93.csv'
  -> filterDuplicate() - Processed: 999,975 | Skipped: 168 (0.02%) | Time: 2.569 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,807 | Skipped: 707,459 (70.76%) | Time: 5.879 sec
  -> filterUrl() - Processed: 292,348 | Time: 0.

b'Skipping line 320910: expected 16 fields, saw 17\n'


  -> filterDuplicate() - Processed: 349,367 | Skipped: 59 (0.02%) | Time: 0.816 sec
  -> filterDuplicateOriginalTweetId() - Processed: 349,308 | Skipped: 256,004 (73.29%) | Time: 5.537 sec
  -> filterUrl() - Processed: 93,304 | Time: 0.264 sec
  -> filterNonEngAndEmoji() - Processed: 93,304 | Non-Eng Skipped: 39,691 (42.54%) | Time: 71.283 sec

Processing 'data/tweet_files\out_96.csv'
  -> filterDuplicate() - Processed: 108,102 | Skipped: 43 (0.04%) | Time: 0.249 sec
  -> filterDuplicateOriginalTweetId() - Processed: 108,059 | Skipped: 72,697 (67.28%) | Time: 5.105 sec
  -> filterUrl() - Processed: 35,362 | Time: 0.102 sec
  -> filterNonEngAndEmoji() - Processed: 35,362 | Non-Eng Skipped: 15,342 (43.39%) | Time: 27.148 sec

Processing 'data/tweet_files\out_97.csv'
  -> filterDuplicate() - Processed: 1,002,877 | Skipped: 3,232 (0.32%) | Time: 2.701 sec
  -> filterDuplicateOriginalTweetId() - Processed: 999,645 | Skipped: 712,799 (71.31%) | Time: 6.316 sec
  -> filterUrl() - Processed: 2

In [8]:
# Sanity check: reload the combined output and look for rows that exactly
# repeat an earlier row.
out_file_name = f"{out_folder}/combined.csv"
combined = pd.read_csv(out_file_name, dtype=str, encoding='utf8')

# Keep only the repeats (the first occurrence of each row is not flagged).
dup = combined.loc[combined.duplicated(keep='first')]

# Non-null count per column among the repeated rows, then a preview of them.
print(dup.count())
dup.head()


Unnamed: 0           3
date                 0
user                 0
is_retweet           0
is_quote             0
text                 0
quoted_text          0
lat                  0
long                 0
hts                  0
mentions             0
tweet_id             0
likes                0
retweets             0
replies              0
quote_count          0
original_tweet_id    0
has_emoji            3
has_non_eng          3
dtype: int64


Unnamed: 0.1,Unnamed: 0,date,user,is_retweet,is_quote,text,quoted_text,lat,long,hts,mentions,tweet_id,likes,retweets,replies,quote_count,original_tweet_id,has_emoji,has_non_eng
432264,As a result of the outbreak of new coronavirus CN chaelisa fans did something,,,,,,,,,,,,,,,,,False,False
432265,A total of 1926.52 yuan was crowdfunded for this time,,,,,,,,,,,,,,,,,False,False
432266,Donated two disinfection vehicles to Hankou hospital Wuhan totaling 504.85 yuan,,,,,,,,,,,,,,,,,False,False
