# Manage Large .csv Data Files
Use this notebook to import and refine the large .csv files generated by the tweet-scraping process.  
It lets you export them as new, properly cleaned-up .csv files, so that the processor-intensive steps are already completed and do not have to be repeated in case of a system failure.

In [4]:
# Import the required libraries
import pandas as pd
from pathlib import Path
import csv
from datetime import datetime

## Read In .csv Data Files

In [16]:
# Load the 2021 tweet data into a DataFrame

# Column labels to apply, based upon the attributes selected for scraping
column_list = ['Datetime', 'TweetID', 'Replies', 'Likes', 'Retweets', 'Hashtags', 'Tweets']

# Path of the .csv file to import; change this to load a different file
csvfile = 'data/btcHashData-2021_Jan-Dec.csv'

# Read the file, labelling its columns with column_list
btc_2021_df = pd.read_csv(Path(csvfile), names=column_list)

# Display the dataframe
btc_2021_df

Unnamed: 0,Datetime,TweetID,Replies,Likes,Retweets,Hashtags,Tweets
0,2021-09-30 23:59:56+00:00,1443727342644314125,0,0,0,"['Bitcoin', 'BTC']",1
1,2021-09-30 23:59:56+00:00,1443727341973381121,0,0,0,"['SingleIssueVoter', 'Bitcoin']",1
2,2021-09-30 23:59:55+00:00,1443727338169114633,0,0,0,"['chinafud', 'bitcoin', 'cryptocurrency', 'cry...",1
3,2021-09-30 23:59:48+00:00,1443727308448272384,0,0,0,"['SingleIssueVoter', 'Bitcoin']",1
4,2021-09-30 23:59:43+00:00,1443727289737523202,0,3,0,"['btc', 'bitcoin', 'crypto']",1
...,...,...,...,...,...,...,...
9748223,2021-01-01 00:00:02+00:00,1344795481247944705,0,0,0,"['Crypto', 'Bitcoin']",1
9748224,2021-01-01 00:00:02+00:00,1344795481185136643,0,0,0,"['Bitcoin', 'BTC', 'Crypto', 'Cryptocurrency']",1
9748225,2021-01-01 00:00:02+00:00,1344795480430149633,0,0,0,"['bitcoin', 'bunnies']",1
9748226,2021-01-01 00:00:01+00:00,1344795479071215617,0,0,0,"['criptomoedas', 'bitcoin', 'investimento', 'd...",1


In [14]:
# Load the 2020 tweet data into a DataFrame

# Column labels to apply when reading the file
column_list = [
    'Datetime',
    'TweetID',
    'Replies',
    'Likes',
    'Retweets',
    'Hashtags',
    'Tweets',
]

# Read the file, labelling its columns with column_list
btc_2020_df = pd.read_csv(Path('data/btcHashData-2020_May-Dec.csv'), names=column_list)

# Display the dataframe
btc_2020_df

Unnamed: 0,Datetime,TweetID,Replies,Likes,Retweets,Hashtags,Tweets
0,2020-12-31 23:59:59+00:00,1344795468522516481,0,0,0,"['bitcoin', 'ethereum', 'polkadot']",1
1,2020-12-31 23:59:55+00:00,1344795453402075136,0,4,0,['bitcoin'],1
2,2020-12-31 23:59:55+00:00,1344795450000486401,0,1,0,"['Investing', 'crypto', 'cash', 'Bitcoin', 'ri...",1
3,2020-12-31 23:59:49+00:00,1344795425300119553,0,0,0,['Bitcoin'],1
4,2020-12-31 23:59:41+00:00,1344795392353812480,1,3,0,['Bitcoin'],1
...,...,...,...,...,...,...,...
3854791,2020-05-01 00:00:02+00:00,1256010453303648258,0,0,0,"['satparity', 'bitcoin', 'stackingsats', 'stac...",1
3854792,2020-05-01 00:00:02+00:00,1256010452854804480,0,0,0,"['Bitcoin', 'Ethereum', 'Ripple']",1
3854793,2020-05-01 00:00:01+00:00,1256010450522771458,1,0,0,"['HourlyCryptoStatus', 'btc', 'bitcoin', 'eth'...",1
3854794,2020-05-01 00:00:01+00:00,1256010449029681153,0,0,0,"['Bitcoin', 'BINANCE', 'BTC', 'Cryptocurrency']",1


## Reformat Data

In [17]:
# Reduce the 'Datetime' column to calendar dates (the time of day is dropped)

# Apply the same conversion to both years: the 2021 data first, then the 2020 data
for yearly_df in (btc_2021_df, btc_2020_df):
    yearly_df['Datetime'] = pd.to_datetime(yearly_df['Datetime']).dt.date

# Display the first rows of each resulting DataFrame
display(btc_2021_df.head())
display(btc_2020_df.head())

Unnamed: 0,Datetime,TweetID,Replies,Likes,Retweets,Hashtags,Tweets
0,2021-09-30,1443727342644314125,0,0,0,"['Bitcoin', 'BTC']",1
1,2021-09-30,1443727341973381121,0,0,0,"['SingleIssueVoter', 'Bitcoin']",1
2,2021-09-30,1443727338169114633,0,0,0,"['chinafud', 'bitcoin', 'cryptocurrency', 'cry...",1
3,2021-09-30,1443727308448272384,0,0,0,"['SingleIssueVoter', 'Bitcoin']",1
4,2021-09-30,1443727289737523202,0,3,0,"['btc', 'bitcoin', 'crypto']",1


Unnamed: 0,Datetime,TweetID,Replies,Likes,Retweets,Hashtags,Tweets
0,2020-12-31,1344795468522516481,0,0,0,"['bitcoin', 'ethereum', 'polkadot']",1
1,2020-12-31,1344795453402075136,0,4,0,['bitcoin'],1
2,2020-12-31,1344795450000486401,0,1,0,"['Investing', 'crypto', 'cash', 'Bitcoin', 'ri...",1
3,2020-12-31,1344795425300119553,0,0,0,['Bitcoin'],1
4,2020-12-31,1344795392353812480,1,3,0,['Bitcoin'],1


In [18]:
# Drop the columns this notebook treats as unnecessary

# Labels of the columns to discard from both DataFrames
unneeded_columns = ['TweetID', 'Hashtags']

# For 2021 data
btc_2021_df = btc_2021_df.drop(columns=unneeded_columns)

# For 2020 data
btc_2020_df = btc_2020_df.drop(columns=unneeded_columns)

# Display the first rows of each output
display(btc_2021_df.head())
display(btc_2020_df.head())

Unnamed: 0,Datetime,Replies,Likes,Retweets,Tweets
0,2021-09-30,0,0,0,1
1,2021-09-30,0,0,0,1
2,2021-09-30,0,0,0,1
3,2021-09-30,0,0,0,1
4,2021-09-30,0,3,0,1


Unnamed: 0,Datetime,Replies,Likes,Retweets,Tweets
0,2020-12-31,0,0,0,1
1,2020-12-31,0,4,0,1
2,2020-12-31,0,1,0,1
3,2020-12-31,0,0,0,1
4,2020-12-31,1,3,0,1


In [22]:
# Aggregate the 2020 & 2021 data per 'Datetime' value

# Sum every remaining column within each 'Datetime' group,
# then cast the resulting sums to float64
btc_2021_daily_tweets_df = btc_2021_df.groupby('Datetime').sum().astype('float64')
btc_2020_daily_tweets_df = btc_2020_df.groupby('Datetime').sum().astype('float64')

# Display the first rows for 2021 and the last rows for 2020
display(btc_2021_daily_tweets_df.head())
display(btc_2020_daily_tweets_df.tail())

Unnamed: 0_level_0,Replies,Likes,Retweets,Tweets
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-01,18214.0,163938.0,28579.0,17349.0
2021-01-02,61369.0,561118.0,84483.0,42647.0
2021-01-03,35459.0,427732.0,72324.0,38372.0
2021-01-04,27822.0,225248.0,33977.0,29263.0
2021-01-05,24759.0,208049.0,32240.0,25251.0


Unnamed: 0_level_0,Replies,Likes,Retweets,Tweets
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-12-27,29850.0,310330.0,41269.0,29343.0
2020-12-28,16418.0,138625.0,21797.0,21097.0
2020-12-29,29043.0,175723.0,31068.0,21432.0
2020-12-30,47959.0,252423.0,49265.0,25472.0
2020-12-31,26025.0,231346.0,32447.0,22770.0
