In [1]:
import pandas as pd
import numpy as np
import re

# Year of the tweets data frame being processed. The value is concatenated
# into the CSV file names that the load/save cells of this notebook build.
year = '2019'

### Load tweets data frame and do initial cleaning

In [None]:
# Read the raw tweet details for the selected year.
tweet_details_filename = f'CES_tweet_details_{year}.csv'
tweet_details = pd.read_csv(filepath_or_buffer=tweet_details_filename)

In [None]:
# Activity words searched for in the tweet text.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python ('mountaineering' 'napping' used to become
# the never-matching 'mountaineeringnapping').
activity_list = [
    'acting', 'adventuring', 'ambling', 'ascending', 'backpacking', 'baitcasting', 'balancing', 'ballooning',
    'bathing', 'bikepacking', 'biking', 'birding', 'boating', 'bouldering', 'boxing', 'brewing', 'bushwalking',
    'camping', 'canoeing', 'canyoneering', 'canyoning', 'casting', 'caving', 'chatting', 'chilling', 'cleaning',
    'climbing', 'contemplating', 'cooking', 'crabbing', 'crawling', 'creating', 'cruising', 'cycling', 'dancing',
    'daydreaming', 'descending', 'designing', 'discovering', 'diving', 'documenting', 'dogsledding', 'drawing',
    'dreaming', 'drifting', 'drinking', 'driving', 'eating', 'exercising', 'exploring', 'farming', 'feasting',
    'filming', 'fishing', 'flexing', 'floating', 'flyfishing', 'flying', 'foraging', 'fording', 'freediving',
    'galloping', 'gardening', 'geocaching', 'glamping', 'glissading', 'golfing', 'grilling', 'guiding',
    'hammocking', 'hauling', 'highlining', 'hiking', 'hitchhiking', 'humming', 'hunting', 'interacting',
    'jigging', 'jogging', 'kayaking', 'kiteboarding', 'kitesurfing', 'kiting', 'landscaping', 'laying',
    'learning', 'littering', 'logging', 'longboarding', 'looming', 'lounging', 'magnetfishing', 'mapping',
    'marching', 'meditating', 'monitoring', 'mountaineering', 'napping', 'navigating', 'netting', 'observing',
    'offroading', 'outdooring', 'paddleboarding', 'paddling', 'paintballing', 'painting', 'paragliding',
    'parasailing', 'performing', 'photographing', 'pioneering', 'pitching', 'planting', 'playing', 'portaging',
    'postholing', 'praying', 'pruning', 'racing', 'rafting', 'railbiking', 'rambling', 'rappelling', 'reading',
    'recording', 'recycling', 'reflecting', 'relaxing', 'remembering', 'reminiscing', 'resting', 'riding',
    'roadtripping', 'roaming', 'rockhounding', 'rollerblading', 'rowing', 'running', 'rving', 'sailing',
    'sandboarding', 'scaling', 'scalloping', 'scootering', 'scootertuning', 'scoping', 'scouting', 'scrambling',
    'searching', 'shoeboarding', 'shooting', 'shrimping', 'sightseeing', 'singing', 'siting',
    'sitting', 'skateboarding', 'skating', 'sketching', 'skiing', 'skimboarding', 'skimming', 'skitouring',
    'skydiving', 'slacklining', 'sledding', 'sleeping', 'snorkeling', 'snorkelling', 'snowboarding',
    'snowmobiling', 'snowshoeing', 'soaking', 'solohiking', 'sparring', 'spearfishing', 'spelunking',
    'splitboarding', 'sporting', 'standing', 'stargazing', 'stretching', 'strolling', 'studying', 'sunbathing',
    'sunning', 'surfing', 'surveying', 'swimming', 'talking', 'teaching', 'tenting', 'thinking', 'thruhiking',
    'touring', 'tracking', 'training', 'traveling', 'travelling', 'traversing', 'treking', 'trekking',
    'trudging', 'tubing', 'tumbling', 'unicycling', 'vacationing', 'vandwelling', 'vibing', 'vlogging',
    'volunteering', 'wading', 'walking', 'wandering', 'watering', 'weaving', 'whistling', 'wildcamping',
    'working', 'writing', 'ziplining', 'zorbing',
]

# Discard any 'action' column that came with the CSV so it can be rebuilt from
# the text. errors='ignore' keeps the cell runnable when the column is absent.
tweet_details = tweet_details.drop(columns=['action'], errors='ignore')

# Extract the first activity word found in each tweet (case-insensitive) and
# store it lower-cased; tweets without a match get NaN.
# expand=False returns a Series instead of a one-column DataFrame.
# NOTE(review): the pattern has no word boundaries, so e.g. 'resting' also
# matches inside 'interesting' and 'siting' inside 'visiting' - confirm
# whether substring matches are intended.
activity_pattern = f"({'|'.join(activity_list)})"
tweet_details['action'] = (
    tweet_details['text']
    .str.extract(activity_pattern, flags=re.IGNORECASE, expand=False)
    .str.lower()
)

In [None]:
# Optional while developing: sample a fraction of tweet_details first,
# e.g. tweet_details.sample(frac=0.01).
# Work on a copy of the loaded frame and produce one row per feature by
# splitting the '|'-separated feature string.
tweets_copy = tweet_details.copy()
feature_lists = tweets_copy['feature'].str.split('|')
tweets = tweets_copy.assign(feature=feature_lists).explode('feature')

In [None]:
# Keep only English tweets that have both a feature and an extracted action.
is_english = tweets['lang'] == 'en'
has_feature = tweets['feature'].notna()
has_action = tweets['action'].notna()
tweets = tweets[is_english & has_feature & has_action]

# Remove rows whose feature does not describe a place / nature type.
non_place_features = [
    'fall', 'flower', 'foliage', 'grass', 'lava', 'moss', 'mushroom',
    'rock', 'sand', 'tide', 'tree', 'wave', 'wildflower',
]
tweets = tweets[~tweets['feature'].isin(non_place_features)]

# 'acting' does not describe a nature-activity relation, so those rows go.
is_acting = tweets['action'] == 'acting'
tweets = tweets[~is_acting]

# Remove 'stream' rows that are about acting or about media streams (video,
# music, gaming ...) rather than water streams.
# The match is case-insensitive, like the other text filters in this
# notebook: the tweet text is never lower-cased, so a case-sensitive search
# would miss e.g. 'YouTube' or 'Netflix'. na=False treats missing text as
# "no match" instead of propagating NaN into the boolean mask.
media_stream_terms = 'spotify|apple|soundcloud|tune in|online|movie|discord|video|drawing stream|internet|mainstream|twitch|live stream|live-stream|minecraft|streaming|raid|youtube|netflix|live'
is_stream = tweets['feature'] == 'stream'
is_acting_stream = is_stream & (tweets['action'] == 'acting')
is_media_stream = is_stream & tweets['text'].str.contains(media_stream_terms, case=False, na=False)
tweets = tweets.loc[~(is_acting_stream | is_media_stream), :]

# Phrase-based exclusions, applied in order. Each rule is
# (column, required value, case-insensitive text pattern); a rule with
# column None applies to every row regardless of action/feature.
exclusion_rules = [
    # bathing suit
    ('action', 'bathing', 'bathing suit'),
    # soaking wet
    ('action', 'soaking', 'soaking wet'),
    # boxing day
    ('action', 'boxing', 'boxing day'),
    # casting in reference to voting or casting actors
    ('action', 'casting', 'voting|vote|actor|movie|ballot'),
    # park misclassifications
    ('feature', 'park', 'jurassic park|parking|amusement park'),
    # horoscope quotes
    (None, None, 'more for aries|more for taurus|more for gemini|more for cancer|more for leo|more for virgo|more for libra|more for scorpio|more for sagittarius|more for capricorn|more for aquarius|more for pisces'),
    # news tweets about mass shooting
    ('action', 'shooting', 'active shooting|murder|suspect|reported shooting|gunman|gun man|victims|mass shooting|school shooting|park shooting|police|fbi|crime|news|shooting star'),
]

for rule_column, rule_value, rule_pattern in exclusion_rules:
    text_hit = tweets['text'].str.contains(rule_pattern, case = False)
    if rule_column is None:
        to_drop = text_hit
    else:
        to_drop = (tweets[rule_column] == rule_value) & text_hit
    tweets = tweets.loc[~to_drop, :]

#### Save tweets after initial clean and a reduced data frame to annotate

In [None]:
# Persist the cleaned tweets for the selected year (all columns, no index).
tweets_filename = f'CES_tweets_initial_clean{year}.csv'
tweets.to_csv(tweets_filename, header=True, index=False, columns=tweets.columns.tolist())

# Reduced frame holding only the text and id columns, used for annotation.
tweets_annotate = tweets.loc[:, ['text', 'id']].copy()

tweets_to_annotate_filename = f'CES_tweets_to_annotate{year}.csv'
tweets_annotate.to_csv(tweets_to_annotate_filename, header=True, index=False, columns=tweets_annotate.columns.tolist())

### Add Twitter context annotations

#### Load annotations csv and create dictionary

In [2]:
# Context-entity list; get the csv from
# https://github.com/twitterdev/twitter-context-annotations/tree/main/files
context_entities_filename = 'evergreen-context-entities-20220601.csv'
context_annotate = pd.read_csv(context_entities_filename)

# Lookup table from context-annotation domain id (as a string) to the
# human-readable domain label. Labels must not carry surrounding whitespace,
# because later cells compare them with exact string equality.
annotate_dict = {
    '3'   : 'TV Shows',
    '4'   : 'TV Episodes',
    '6'   : 'Sports Events',
    '10'  : 'Person',
    '11'  : 'Sport',
    '12'  : 'Sports Team',
    '13'  : 'Place',
    '22'  : 'TV Genres',
    '23'  : 'TV Channels',
    '26'  : 'Sports League',
    '27'  : 'American Football Game',
    '28'  : 'NFL Football Game',
    '29'  : 'Events',
    '31'  : 'Community',
    '35'  : 'Politicians',
    '38'  : 'Political Race',
    '39'  : 'Basketball Game',
    '40'  : 'Sports Series',
    '43'  : 'Soccer Match',
    '44'  : 'Baseball Game',
    '45'  : 'Brand Vertical',
    '46'  : 'Brand Category',
    '47'  : 'Brand',
    '48'  : 'Product',
    '54'  : 'Musician',
    '55'  : 'Music Genre',
    '56'  : 'Actor',
    '58'  : 'Entertainment Personality',
    '60'  : 'Athlete',
    '65'  : 'Interests and Hobbies Vertical', #
    '66'  : 'Interests and Hobbies Category', #
    '67'  : 'Interests and Hobbies', #
    '68'  : 'Hockey Game',
    '71'  : 'Video Game',
    '78'  : 'Video Game Publisher',
    '79'  : 'Video Game Hardware',
    '83'  : 'Cricket Match',
    '84'  : 'Book',
    '85'  : 'Book Genre',
    '86'  : 'Movie',
    '87'  : 'Movie Genre',
    '88'  : 'Political Body',
    '89'  : 'Music Album',
    '90'  : 'Radio Station',
    '91'  : 'Podcast',
    '92'  : 'Sports Personality',
    '93'  : 'Coach',
    '94'  : 'Journalist',
    '95'  : 'TV Channel [Entity Service]',
    '109' : 'Reoccurring Trends', #
    '110' : 'Viral Accounts',
    '114' : 'Concert',
    '115' : 'Video Game Conference',
    '116' : 'Video Game Tournament',
    '117' : 'Movie Festival',
    '118' : 'Award Show',
    '119' : 'Holiday',
    '120' : 'Digital Creator',
    '122' : 'Fictional Character',
    '130' : 'Multimedia Franchise',
    '131' : 'Unified Twitter Taxonomy',
    '136' : 'Video Game Personality',
    '137' : 'eSports Team',
    '138' : 'eSports Player',
    '139' : 'Fan Community',
    '149' : 'Esports League',
    '152' : 'Food', #
    '155' : 'Weather', #
    '156' : 'Cities', #
    '157' : 'Universities',
    '158' : 'Points of Interest',
    '159' : 'States', #
    '160' : 'Countries', #
    '162' : 'Exercise & fitness',
    '163' : 'Travel', #
    '164' : 'Fields of study',
    '165' : 'Technology',
    '166' : 'Stocks',
    '167' : 'Animals', # -> do not remove --> bernese mountain dog
    '171' : 'Local News',
    '172' : 'Global TV Show',
    '173' : 'Google Product Taxonomy',
    '174' : 'Digital Assets & Crypto', #
    '175' : 'Emergency Events'
}

def _domain_labels(domain_ids):
    """Translate a list of domain-id strings into their labels.

    Ids missing from annotate_dict map to None. A non-list value (a missing
    'domains' entry) yields an empty list instead of raising.
    """
    if not isinstance(domain_ids, list):
        return []
    return [annotate_dict.get(domain_id.strip()) for domain_id in domain_ids]


# Split the comma-separated domain ids and translate each id to its label.
context_annotate['domains_list'] = context_annotate['domains'].str.split(',')
context_annotate['context_annotation'] = context_annotate['domains_list'].apply(_domain_labels)

# Strip tab characters from the entity names. The result is assigned back:
# an in-place replace on a column selection is not guaranteed to modify the
# frame (it does not under pandas copy-on-write).
context_annotate['entity_name'] = context_annotate['entity_name'].replace(to_replace=r'\t+', value='', regex=True)

# Rows without an entity name cannot be searched for in the tweets.
context_annotate = context_annotate.dropna(subset=['entity_name'])

# remove all rows with non-ASCII entity names --> removes approx. 6500 rows
# na=False keeps the masks strictly boolean so that `~` cannot fail on NaN.
has_non_ascii = context_annotate['entity_name'].str.contains(r'[^\x00-\x7F]+', na=False)
is_community = context_annotate['entity_name'].str.contains('community-', na=False)
context_annotate = context_annotate.loc[~(has_non_ascii | is_community), :]

context_annotate_dict_filename = 'context_annotations_dict.csv'
context_annotate.to_csv(context_annotate_dict_filename, header=True, index=False, columns=list(context_annotate.axes[1]))

#### Add context annotations

In [None]:
# File name must match the one written by the save cell after the initial
# clean ('CES_tweets_to_annotate' + year + '.csv', no underscore before the year).
tweets_to_annotate_filename = 'CES_tweets_to_annotate' + year + '.csv'
tweets = pd.read_csv(tweets_to_annotate_filename)

context_annotate_dict_filename = 'context_annotations_dict.csv'
context_annotate = pd.read_csv(context_annotate_dict_filename)

tweets['context_match'] = ''

# read_csv turns empty names (and names such as "NA" or "null") into NaN;
# those cannot be used as search strings, so they are skipped.
searchfor = context_annotate['entity_name'].dropna().astype(str).tolist()

# Append every entity name that occurs (case-insensitive, literal substring)
# in the tweet text to 'context_match', separated by '|'.
for name in searchfor:
    text_has_entity = tweets['text'].str.contains(name, case = False, na = False, regex = False)
    tweets.loc[text_has_entity, 'context_match'] += '|' + name

# The join cell of this notebook reads an 'entities' column holding a list of
# entity names; provide it alongside 'context_match'.
# NOTE(review): confirm this list format is what the downstream cell expects.
tweets['entities'] = tweets['context_match'].str.strip('|').str.split('|')

annotated_tweets_filename = 'CES_annotated_tweets_' + year + '.csv'
tweets.to_csv(annotated_tweets_filename, header=True, index=False, columns=list(tweets.axes[1]))

### Join outputs from context annotations with full tweet details

In [None]:
tweets_filename = 'CES_tweets_initial_clean' + year + '.csv'
tweets = pd.read_csv(tweets_filename)

annotated_tweets_filename = 'CES_annotated_tweets_' + year + '.csv'
annotated_tweets = pd.read_csv(annotated_tweets_filename)

annotated_tweets = annotated_tweets.dropna(subset=['id'])

# The entities are attached by row position, so both files must still line up
# row for row. Fail with an explicit message instead of pandas' generic
# length-mismatch error.
if len(annotated_tweets) != len(tweets):
    raise ValueError(
        f'{annotated_tweets_filename} has {len(annotated_tweets)} rows with an id but '
        f'{tweets_filename} has {len(tweets)} rows; the files no longer align.'
    )

# Turn the stringified list ("['a', 'b']") into a real list of entity names:
# drop brackets and quotes, then split on commas.
entity_strings = pd.Series(annotated_tweets['entities'].tolist(), index=tweets.index)
for unwanted_char in ('[', ']', '"', "'"):
    entity_strings = entity_strings.str.replace(unwanted_char, '', regex = False)
tweets['context_entities'] = entity_strings.str.split(',')

# One row per (tweet, entity). Splitting "a, b" on ',' leaves a leading space
# on every entity after the first, which would make exact-name lookups miss,
# so surrounding whitespace is stripped.
tweets_explode = tweets.explode('context_entities')
tweets_explode['context_entities'] = tweets_explode['context_entities'].str.strip()

In [None]:
context_annotate_dict_filename = 'context_annotations_dict.csv'
context_annotate = pd.read_csv(context_annotate_dict_filename)

# Flatten the stringified label list: remove brackets and quotes, then use
# '|' instead of ', ' between labels.
annotation_labels = context_annotate['context_annotation']
for unwanted_char in ('[', ']', '"', "'"):
    annotation_labels = annotation_labels.str.replace(unwanted_char, '', regex = False)
context_annotate['context_annotation'] = annotation_labels.str.replace(', ', '|', regex = False)

# Entity name -> flattened label string (a later duplicate name overrides an
# earlier one).
context_dict = {
    entity_name: labels
    for entity_name, labels in zip(context_annotate['entity_name'], context_annotate['context_annotation'])
}

In [None]:
# Look up the annotation labels of each row's entity.
# After explode() 'context_entities' holds one entity string per row, so the
# lookup must use the whole string; iterating over it would look up single
# characters and practically never match. Names are stripped so that a
# leading space left over from splitting does not defeat the lookup.
entity_labels = tweets_explode['context_entities'].str.strip().map(context_dict)

# Entities without a dictionary entry get an empty annotation. The 'None'
# and ',' clean-up is kept from the previous post-processing.
tweets_explode['context_annotation'] = (
    entity_labels
    .fillna('')
    .astype(str)
    .str.replace('None', '', regex = False)
    .str.replace(',', '', regex = False)
)

In [None]:
# Lower-case the entity names for comparison; a missing entity becomes '' so
# the substring tests below cannot fail on NaN.
tweets_explode['context_entities'] = tweets_explode['context_entities'].fillna('').str.lower()

# Flag rows whose matched entity contains the feature / the action word.
tweets_explode['ent_is_feat'] = [
    'Yes' if feature in entity else 'No'
    for feature, entity in zip(tweets_explode['feature'], tweets_explode['context_entities'])
]
tweets_explode['ent_is_act'] = [
    'Yes' if action in entity else 'No'
    for action, entity in zip(tweets_explode['action'], tweets_explode['context_entities'])
]

# A feature hit marks the row for removal unless the entity is annotated
# exactly as 'Place' (or has no annotation); an action hit marks it whenever
# the entity has any annotation.
condition_feat = ((tweets_explode['ent_is_feat'] == 'Yes') & (tweets_explode['context_annotation'] != 'Place') & (tweets_explode['context_annotation'] != ''))
condition_act = ((tweets_explode['ent_is_act'] == 'Yes') & (tweets_explode['context_annotation'] != ''))

tweets_explode['remove_feat'] = np.where(condition_feat, 'Yes', '')
tweets_explode['remove_act'] = np.where(condition_act, 'Yes', '')

# Collapse back to one row per (action, id, feature), concatenating the
# per-entity columns and removal flags.
tweets_annotated = tweets_explode.groupby(['action', 'id', 'feature'], as_index = False).agg({'author_id' : 'first', 'conversation_id' : 'first', 'datetime' : 'first', 'text' : 'first', 'annotation_type' : 'first', 'annotation_normalized_text' : 'first', 'context_entities' : ','.join, 'context_annotation' : ','.join, 'remove_feat' : ' '.join, 'remove_act' : ' '.join})

# Keep the rows for which no entity raised a removal flag. The copy makes the
# result an independent frame that later cells can assign columns on.
flagged = (tweets_annotated['remove_feat'].str.contains('Yes', case = False)) | (tweets_annotated['remove_act'].str.contains('Yes', case = False))
tweets_remove = tweets_annotated.loc[~flagged, :].copy()

In [None]:
# Release the intermediate frames and masks of the first removal pass.
del tweets_explode
del tweets_annotated
del condition_feat
del condition_act

In [None]:
# Work on an explicit copy: tweets_remove was produced by a .loc filter, and
# assigning columns on such a slice triggers SettingWithCopyWarning.
tweets_remove = tweets_remove.copy()

tweets_remove['annotation_normalized_text'] = tweets_remove['annotation_normalized_text'].fillna('')
tweets_remove['annotation_type'] = tweets_remove['annotation_type'].fillna('')

# Flag rows whose tweet annotation text contains the feature / action word.
# NOTE(review): this comparison is case-sensitive, whereas the context-entity
# pass lower-cases before comparing - confirm annotation_normalized_text is
# already lower-case.
tweets_remove['ent_is_feat'] = tweets_remove.apply(lambda x: 'Yes' if x['feature'] in x['annotation_normalized_text'] else 'No', axis=1)
tweets_remove['ent_is_act'] = tweets_remove.apply(lambda x: 'Yes' if x['action'] in x['annotation_normalized_text'] else 'No', axis=1)

# Same removal rule as for the context entities, now on the annotation type:
# a feature hit is tolerated only for type 'Place' (or no type), an action
# hit is never tolerated when a type is present.
condition_feat = ((tweets_remove['ent_is_feat'] == 'Yes') & (tweets_remove['annotation_type'] != 'Place') & (tweets_remove['annotation_type'] != ''))
condition_act = ((tweets_remove['ent_is_act'] == 'Yes') & (tweets_remove['annotation_type'] != ''))

tweets_remove['remove_feat'] = np.where(condition_feat, 'Yes', '')
tweets_remove['remove_act'] = np.where(condition_act, 'Yes', '')

tweets_remove = tweets_remove.loc[~((tweets_remove['remove_feat'].str.contains('Yes', case = False)) | (tweets_remove['remove_act'].str.contains('Yes', case = False))),:]

tweets_annotated = tweets_remove.copy()

In [None]:
# Release the intermediate frame and masks of the second removal pass.
del tweets_remove
del condition_feat
del condition_act

In [None]:
# The flag columns were only needed to decide which rows to remove.
helper_columns = ['remove_act', 'remove_feat', 'ent_is_feat', 'ent_is_act']
tweets_annotated = tweets_annotated.drop(columns=helper_columns)

# Persist the annotation-cleaned tweets for the selected year.
annotated_tweets_filename = f'CES_tweets_annotations_cleaned{year}.csv'
tweets_annotated.to_csv(annotated_tweets_filename, header=True, index=False, columns=tweets_annotated.columns.tolist())

#### Create summarized data frame

In [None]:
# Count tweets per action, feature and calendar day.
# The aggregation that builds tweets_annotated keeps 'datetime' but no 'date'
# column, so grouping by 'date' raised a KeyError. Derive the day from
# 'datetime' when 'date' is absent (utc=True gives one consistent time zone
# even if the timestamps carry offsets).
if 'date' in tweets_annotated.columns:
    tweets_dated = tweets_annotated
else:
    tweets_dated = tweets_annotated.assign(
        date=pd.to_datetime(tweets_annotated['datetime'], utc=True).dt.date
    )

tweets_sum = tweets_dated.groupby(['action', 'feature', 'date'])['text'].count().reset_index(name='tweets')

tweets_sum['date'] = pd.to_datetime(tweets_sum['date'])

In [None]:
# Persist the per-day summary for the selected year (all columns, no index).
tweets_sum_filename = f'tweets_sum{year}.csv'
tweets_sum.to_csv(tweets_sum_filename, header=True, index=False, columns=tweets_sum.columns.tolist())

### Get tweet counts for cleaning process

In [None]:
# Unique tweets (one row per id) at each stage of the cleaning process.
# The file names must match the ones used when the files were loaded/written
# earlier in this notebook; the previous '...' + '.' + year + 'csv' form
# produced names such as 'CES_tweets_details.2019csv'.

# Stage 1: raw tweet details (same file as loaded at the top of the notebook).
tweets_details_filename = 'CES_tweet_details_' + year + '.csv'
tweets_details = pd.read_csv(tweets_details_filename)
tweets_original = tweets_details.groupby(['id'], as_index = False).agg({'text' : 'first'})

# Stage 2: tweets left after the initial clean.
annotation_tweets_filename = 'CES_tweets_to_annotate' + year + '.csv'
annotation_tweets = pd.read_csv(annotation_tweets_filename)
tweets_initial_clean = annotation_tweets.groupby(['id'], as_index = False).agg({'text' : 'first'})

# Stage 3: tweets left after the annotation-based clean.
tweets_cleaned_filename = 'CES_tweets_annotations_cleaned' + year + '.csv'
tweets_cleaned = pd.read_csv(tweets_cleaned_filename)
tweets_annotations_cleaned = tweets_cleaned.groupby(['id'], as_index = False).agg({'text' : 'first'})