In [1]:
import numpy as np 
import pandas as pd 
import re
import json

import os
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

!pip install emoji --upgrade
import emoji

!pip install tweet-preprocessor
import preprocessor as p

!pip install transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 6.7 MB/s eta 0:00:01
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 55.9 MB/s ta 0:00:011
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-macosx_10_11_x86_64.whl (3.6 MB)
[K     |████████████████████████████████| 3.6 MB 43.3 MB/s eta 0:00:01
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.8.1 tokenizers-0.12.1 transformers-4.21.1


RuntimeError: Failed to import transformers.models.bert.modeling_tf_bert because of the following error (look up to see its traceback):
module 'tensorflow_core.keras.activations' has no attribute 'swish'

In [None]:
# Root folder holding the downloaded tweet archives.
# Set the UKRAINE_WAR_INPUT_DIR environment variable to point somewhere else;
# when it is unset, the original hard-coded location is used.
INPUT_DIR = os.environ.get(
    "UKRAINE_WAR_INPUT_DIR",
    '/Users/olgapodolska/Desktop/Ukraine_War/input/',
)

# Walk INPUT_DIR recursively and record the full path of every file found.
# os.walk yields nothing for a missing directory, so all_files is then empty.
all_files = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk(INPUT_DIR)
    for filename in filenames
]

In [None]:
# Put the collected paths in ascending (lexicographic) order, then display them
all_files = sorted(all_files)
all_files

In [None]:
# Select the August files: keep every path containing "/08", i.e. a path
# component that starts with "08".
# NOTE(review): presumably the archives are named with the month first ("08...");
# confirm against the actual filenames in the input folder.
aug_files = [path for path in all_files if "/08" in path]
aug_files

In [None]:
# Read every gzip-compressed August CSV and stack the results into one DataFrame
august_frames = []
for archive_path in aug_files:
    print(f"Reading in {archive_path}")
    # first row is the header, first column becomes the index
    august_frames.append(
        pd.read_csv(archive_path, compression="gzip", header=0, index_col=0)
    )

print("Concatenating the DataFrames")
# stack the per-file frames on top of each other (row-wise)
# NOTE(review): each frame keeps the index it was read with, so index labels may
# repeat across files — confirm before relying on label-based lookups.
aug_df = pd.concat(august_frames, axis=0)
print("Concatenation complete!")

In [None]:
# Preview the concatenated August DataFrame: head() shows the first 5 rows by default
aug_df.head()

In [None]:
# Report the dimensions of the concatenated DataFrame
row_count, column_count = aug_df.shape
print(f"{row_count} rows and {column_count} columns")

In [None]:
# Check data types: info() prints each column's dtype and non-null count
aug_df.info()

In [None]:
# Parse the three timestamp columns into datetime64 so that date arithmetic
# and the .dt accessor can be used on them later
for timestamp_column in ("usercreatedts", "tweetcreatedts", "extractedts"):
    aug_df[timestamp_column] = pd.to_datetime(aug_df[timestamp_column])

# confirm the new dtypes
aug_df.info()

In [None]:
# Time span covered by the dataset: first and last tweet creation timestamps
tweet_times = aug_df["tweetcreatedts"]
earliest_tweet, latest_tweet = tweet_times.min(), tweet_times.max()

print(f"The earliest tweet was at {earliest_tweet}, and the latest was at {latest_tweet}")

In [None]:
# Visualize how many tweets were created on each day
# day-of-month of every tweet's creation timestamp
dates = aug_df["tweetcreatedts"].dt.day
# number of tweets per day-of-month
tweetcount_by_date = aug_df["tweetcreatedts"].groupby(dates).size()

# bar chart of the per-day counts, with horizontal tick labels
ax = tweetcount_by_date.plot.bar(rot=0)
ax.set_title("August Tweet Count by Date")
ax.set_xlabel("Tweet Date")
ax.set_ylabel("Tweet Count")
plt.show()

In [None]:
# Count the distinct language codes, then list them
language_codes = aug_df["language"]
print(f"There are {language_codes.nunique()} unique languages in this DataFrame.")
language_codes.unique()

In [None]:
# Share of tweets whose language code is "en", as a percentage with 2 decimals
english_tweet_count = int((aug_df["language"] == "en").sum())
english_pct = round(english_tweet_count / aug_df.shape[0] * 100, 2)
print(f"{english_pct}% of the tweets are in English.")

In [None]:
# Plot the 20 most frequent languages
# tweets per language code, largest first, limited to the top 20
tweets_per_language = aug_df.groupby("language").size()
top_languages = tweets_per_language.sort_values(ascending=False).iloc[:20]

# Despite its name, language_counts holds the Axes returned by plot.bar
# (kept as in the original cell). Tick labels are drawn horizontally.
language_counts = top_languages.plot.bar(
    figsize=(12, 6),
    title="Top 20 Languages by Frequency",
    xlabel="Language Code",
    ylabel="Number of Tweets",
    rot=0,
)
plt.show()

We can see that English (en) was by far the most prevalent language in this dataset, nearing 1.2 million tweets out of 1.96 million. The second and third most prevalent languages were French and Thai, respectively.

Note that the fourth most prevalent language was "und", which is used to indicate that Twitter could not detect a language. We can safely restrict our inspection to English-language tweets only.

In [None]:
# Inspect the shortest and longest tweets (length in characters)
text_lengths = aug_df["text"].str.len()
min_len, max_len = text_lengths.min(), text_lengths.max()

print(f"Shortest tweet has {min_len} chars.")
print(f"Longest tweet has {max_len} chars.")

Hold on, a tweet can have 280 characters max. How could one have more than the limit?

In [None]:
# Character length of every tweet (NaN where the text is missing)
tweet_lengths = aug_df["text"].str.len()
# index label of the longest tweet (kept for any later use)
max_len_index = tweet_lengths.idxmax()
# Row position of the longest tweet. The lookup is positional because aug_df is
# a concatenation of frames that each keep their own index: if labels repeat
# across files, aug_df.loc[max_len_index, "text"] would return several rows
# instead of the single longest tweet. nanargmax ignores missing lengths and,
# like idxmax, picks the first maximum.
longest_tweet_pos = int(np.nanargmax(tweet_lengths.to_numpy(dtype=float)))
# pull out the text at that position
aug_df["text"].iloc[longest_tweet_pos]

Upon research, mentions supposedly do not count toward the character limit when the tweet is a reply.

In [None]:
# Distribution of tweet lengths, with summary statistics computed once
tweet_len_series = aug_df["text"].str.len()
mean_len = tweet_len_series.mean()
median_len = tweet_len_series.median()
std_len = tweet_len_series.std()

ax = tweet_len_series.plot.hist()
ax.set_title("Distribution of Tweet Length")
ax.set_xlabel("Tweet Length (Characters)")
ax.set_ylabel("Frequency")
# red vertical line marks the mean, yellow marks the median
ax.axvline(x=mean_len, color="red")
ax.axvline(x=median_len, color="yellow")
plt.show()

print(f"Mean: {mean_len} chars")
print(f"Median: {median_len} chars")
print(f"Standard deviation: {std_len} chars")

The distribution is right-skewed. Most tweets appear to be below 300 characters in length. But because a few outlying tweets are anomalously long, as investigated above, the histogram has an elongated x-axis.

## Data Cleaning

In [None]:
# Keep only the rows whose language code is English ("en")
is_english = aug_df["language"] == "en"
eng_df = aug_df.loc[is_english]
eng_df.head()

In [None]:
# Drop the tweets longer than 280 characters.
# The limit is inclusive: a tweet of exactly 280 characters is valid, so the
# comparison is <= (the previous "< 280" also discarded full-length tweets).
# Rows with missing text have a NaN length, compare False, and are dropped.
# .copy() makes short_df an independent DataFrame, so later modifications do
# not raise SettingWithCopyWarning.
MAX_TWEET_LEN = 280
short_df = eng_df.loc[eng_df["text"].str.len() <= MAX_TWEET_LEN].copy()

short_df.head()

In [None]:
# Check which columns have missing values: True marks a column with at least one NaN
short_df.isna().any()

The following columns have missing values:
* acctdesc (account description), 
* location, 
* coordinates, 
* original_tweet_username, 
* in_reply_to_screen_name,
* quoted_status_username 

The acctdesc, original_tweet_username, in_reply_to_screen_name, and quoted_status_username columns contain information that we are not concerned with at the moment. For now, we will rely on the tweets themselves to learn which words are frequently used and what sentiments users express about the war in Ukraine. Therefore, we will drop the acctdesc column.

In [None]:
# Drop the column that is not needed for the analysis.
# drop() returns a new DataFrame that is rebound to short_df instead of using
# inplace=True, which triggers SettingWithCopyWarning when short_df is a slice.
# errors="ignore" lets the cell be re-run after acctdesc is already gone.
# The other sparse columns (original_tweet_username, in_reply_to_screen_name,
# quoted_status_username) are kept for now.
short_df = short_df.drop(columns=["acctdesc"], errors="ignore")
# confirm it has been dropped
short_df.info()

In [None]:
# AWS SDK for pandas: the package is named "awswrangler" (it was misspelled "aswrangler")
import awswrangler as wr

In [None]:
# S3 bucket and key prefix to read from
raw_s3_bucket = 'aiscience22'
raw_path_dir = 'aug_Ukraine_War/'

# Full S3 URI. The closing quote was missing, leaving the f-string
# unterminated (a SyntaxError).
raw_path = f"s3://{raw_s3_bucket}/{raw_path_dir}"

# NOTE(review): this rebinds aug_df, replacing the DataFrame built from the
# local files (including its datetime conversions) — confirm that is intended.
aug_df = wr.s3.read_csv(path=raw_path)

aug_df.head()