# Parsing and Cleaning PHEME RNR Dataset Events

In [53]:
# Load dependencies for this Jupyter Notebook
import pandas as pd
import numpy as np

## Parsing and Cleaning Data
This step takes the raw PHEME rumor dataset and saves it in tabular format as a CSV file. The original PHEME dataset consists of JSON files organized into directories by event and category (rumor or non-rumor). The three functions below parse the data, save it as a CSV file (if necessary), and load it into this notebook as a Pandas DataFrame from the "cached" CSV file.

### The Ottawa Shooting

In [152]:
# Load the cached tweet-level table for the Ottawa shooting event.
# The builtin `bool` is used for the dtypes: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
ottawashooting = pd.read_csv("data/pheme-rnr-dataset/ottawashooting.csv",
                             dtype={"user.verified": bool,
                                    "user.default_profile": bool},
                             engine="python")  # NOTE(review): python parser kept from the original cell; it is slower than the default C parser

# Drop the rows whose hashtags_count holds the string "False" instead of a
# number, then store the column as 32-bit integers.
ottawashooting = ottawashooting.drop(ottawashooting[ottawashooting["hashtags_count"] == "False"].index)
ottawashooting["hashtags_count"] = ottawashooting["hashtags_count"].astype(np.int32)

# Drop the rows with a missing retweet_count so the column can be cast to int.
ottawashooting = ottawashooting.drop(ottawashooting[ottawashooting["retweet_count"].isnull()].index)
ottawashooting["retweet_count"] = ottawashooting["retweet_count"].astype(np.int32)

ottawashooting.head()

Unnamed: 0,coordinates,created,favorite_count,has_url,hashtags_count,id,in_reply_id,in_reply_user,is_rumor,is_source_tweet,...,user.followers_count,user.friends_count,user.geo_enabled,user.listed_count,user.location,user.profile_users_background_image,user.time_zone,user.tweets_count,user.utf_offset,user.verified
0,,Wed Oct 22 13:55:50 +0000 2014,21,False,2,524922078638903296,,,False,False,...,66724.0,296.0,True,1661.0,Ottawa,True,Eastern Time (US & Canada),38424.0,-14400.0,True
1,,Wed Oct 22 13:57:05 +0000 2014,0,False,0,524922392582586368,5.249220786389033e+17,,False,False,...,278.0,636.0,False,3.0,Tallon IV,True,Eastern Time (US & Canada),4116.0,-14400.0,True
2,,Wed Oct 22 13:57:52 +0000 2014,0,False,0,524922589861658624,5.249223925825864e+17,,False,False,...,4904.0,2026.0,False,209.0,MontrÃ©al,True,Eastern Time (US & Canada),19304.0,-14400.0,True
3,,Wed Oct 22 13:58:14 +0000 2014,0,False,0,524922681595269120,5.249220786389033e+17,,False,False,...,3646.0,2203.0,False,62.0,analystepolice@yahoo.ca,True,Central Time (US & Canada),16771.0,-18000.0,True
4,,Wed Oct 22 13:59:47 +0000 2014,0,False,0,524923071090917376,5.249226815952691e+17,,False,False,...,178.0,130.0,False,5.0,"MontrÃ©al,Lachine",True,Quito,8047.0,-18000.0,True


## Aggregating Thread-level Data

**Bold features** represent high performing features identified in C. Buntain and J. Golbeck, ["Automatically Identifying Fake News in Popular Twitter Threads"](http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8118443&isnumber=8118402)

* `favorite_total`: The sum of all favorites from all tweets in this thread.
* `retweet_total`: The sum of all retweets of all tweets in this thread.
* `is_rumor`: Was this thread classified as rumor? Either `True` or `False`.
* `url_proportion`: The fraction of tweets in a thread with a URL in the tweet text. Ranges from $[0,1]$.
* `thread_length`: The number of tweets in this thread.
* **`hashtag_proportion`**: The fraction of tweets in a thread with a hashtag in the tweet text. Ranges from $[0,1]$.
* **`smile_emoji_proportion`**: The fraction of tweets in a thread with a smile emoji (😊) in the tweet text. Ranges from $[0,1]$.
* **`src_followers_count`**: The number of followers of the original poster of the thread.
* *`src_listed_count`*: The number of public lists that the original poster of the thread is a member of (Twitter's `listed_count` user field).
* `src_user_verified`: Is the source user's Twitter account verified? Either `True` or `False`.
* `symbol_total`: The total number of $cashtags, a.k.a [symbols](https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/entities-object.html#symbols), in a tweet thread.

In [160]:
def agg_event_data(df, limit=0):
    """ Aggregate tabular tweet data from a PHEME event into aggregated thread-level data

    Params:
        - df: the DataFrame with tabular tweet data. It must contain the columns
          "thread", "id", "favorite_count", "retweet_count", "is_rumor",
          "has_url", "symbols_count", "user.default_profile", "hashtags_count",
          "text", "user.followers_count", "user.listed_count" and
          "user.verified". The frame is not modified.
        - limit: when greater than 0, only the first `limit` rows of `df` are
          aggregated; otherwise every row is used.

    Return: A DataFrame with thread-level data for this event, one row per
        thread. Threads without a source tweet (a row whose "id" equals its
        "thread") among the aggregated rows are left out, because the
        source-user columns are attached with an inner join.
    """
    smile_emoji = "😊"
    url_flags = {"True": True, "False": False}

    data = df.head(limit) if limit > 0 else df
    # Turn the string flags "True"/"False" into real booleans; any other value
    # is passed through unchanged. `assign` returns a new frame.
    data = data.assign(has_url=data["has_url"].map(lambda flag: url_flags.get(flag, flag)))

    def true_fraction(col):
        # Sum of the flags divided by the number of rows in the group.
        return np.sum(col) / len(col)

    def positive_fraction(col):
        # Share of rows in the group with a count above zero.
        return (col > 0).sum() / len(col)

    def smile_fraction(col):
        # Share of rows whose text contains the smile emoji. Non-string values
        # (e.g. NaN for a missing text) count as "no emoji" instead of raising.
        return sum(isinstance(txt, str) and smile_emoji in txt for txt in col) / len(col)

    # String aggregation names are used instead of the builtins sum/max, which
    # newer pandas versions warn about; "size" counts the rows of each group.
    agg = (
        data.groupby("thread")
        .agg({"favorite_count": "sum",
              "retweet_count": "sum",
              "is_rumor": "max",
              "has_url": true_fraction,
              "id": "size",
              "symbols_count": "sum",
              "user.default_profile": true_fraction,
              "hashtags_count": positive_fraction,
              "text": smile_fraction})
        .rename(columns={"favorite_count": "favorite_total",
                         "retweet_count": "retweet_total",
                         "id": "thread_length",
                         "has_url": "url_proportion",
                         "hashtags_count": "hashtag_proportion",
                         "text": "smile_emoji_proportion",
                         "user.default_profile": "default_profile_proportion",
                         "symbols_count": "symbol_total"})
        .reset_index()  # expose "thread" as a regular column for the merge
    )

    # source tweets will have equal thread id and tweet id
    is_source = data["thread"] == data["id"]
    src = data.loc[is_source, ["thread",
                               "user.followers_count",
                               "user.listed_count",
                               "user.verified"]] \
        .rename(columns={"user.followers_count": "src_followers_count",
                         "user.listed_count": "src_listed_count",
                         "user.verified": "src_user_verified"})

    thrd_data = pd.merge(agg, src, on="thread")

    return thrd_data

In [163]:
# Collapse the tweet-level Ottawa shooting table into one row per thread
ottawa_threads = agg_event_data(df=ottawashooting)

# Persist the thread-level table as a CSV file in the /data directory
ottawa_threads.to_csv(path_or_buf="data/pheme-rnr-dataset/ottawashooting_threads.csv")

# Preview the first five threads
ottawa_threads.head(5)

Unnamed: 0,thread,favorite_total,retweet_total,is_rumor,url_proportion,thread_length,symbol_total,default_profile_proportion,hashtag_proportion,smile_emoji_proportion,src_followers_count,src_listed_count,src_user_verified
0,524922078638903296,21,400,False,0.0,51,0.0,True,0.078431,0.0,66724.0,1661.0,True
1,524922499466022913,18,165,False,0.0,16,0.0,True,0.1875,0.0,60089.0,1594.0,True
2,524922507380670464,24,251,False,0.0,7,0.0,True,0.714286,0.0,357541.0,5251.0,True
3,524922729485848576,21,108,False,0.0,37,0.0,True,0.054054,0.0,1209.0,73.0,True
4,524923148576518144,44,313,False,0.043478,23,0.0,True,0.043478,0.0,21413.0,695.0,True
