# Data

In [3]:
import json
import ijson
import re
from collections import defaultdict
from itertools import islice

import numpy as np
import pandas as pd

In [4]:
# Open the JSON file
with open("sal.json") as f:
    sal_data = json.load(f)

print(type(sal_data))

<class 'dict'>


In [5]:
with open("smallTwitter.json", encoding="utf-8") as f:
    twi_data = json.load(f)

print(type(twi_data))

<class 'list'>


In [6]:
print("Number of sal keys:", len(sal_data.keys()))
print("Number of twitters:", len(twi_data))

Number of sal keys: 15340
Number of twitters: 119160


In [7]:
sal_data["abbotsbury"]

{'ste': '1', 'gcc': '1gsyd', 'sal': '10002'}

In [7]:
sal_data["richmond (vic.)"]

{'ste': '2', 'gcc': '2gmel', 'sal': '22170'}

In [8]:
sal_data["aarons pass"]  # rurual location

{'ste': '1', 'gcc': '1rnsw', 'sal': '10001'}

In [9]:
len(sal_data.keys())

15340

In [10]:
lst = []
for k in sal_data.keys():
    # number - r - loc
    if sal_data[k]["gcc"][1] != "r":
        lst.append(sal_data[k]["gcc"])


print(sorted(list(set(lst))))

['1gsyd', '2gmel', '3gbri', '4gade', '5gper', '6ghob', '7gdar', '8acte', '9oter']


In [11]:
id_set = set()
for tweet in twi_data:
    if "_id" in tweet:
        id_set.add(tweet["_id"])

if len(twi_data) == len(id_set):
    print("All _id values are unique")
else:
    print("There are duplicate _id values")

All _id values are unique


In [12]:
twi_data[0]  # Australia (can be ignored)

{'_id': '1412193387575316482',
 '_rev': '2-0fa70896c4b97c5fa391af1b9ea8e0d1',
 'data': {'author_id': '836119507173154816',
  'conversation_id': '1412193387575316482',
  'created_at': '2021-07-05T23:35:15.000Z',
  'entities': {'urls': [{'start': 83,
     'end': 106,
     'url': 'https://t.co/q8etqvtYTz',
     'expanded_url': 'https://twitter.com/PiperJackson_xx/status/1412193387575316482/photo/1',
     'display_url': 'pic.twitter.com/q8etqvtYTz'}]},
  'geo': {'place_id': '3f14ce28dc7c4566'},
  'lang': 'en',
  'public_metrics': {'retweet_count': 0,
   'reply_count': 0,
   'like_count': 0,
   'quote_count': 0},
  'text': '🌼❤ Kindness is the ability and desire to have a positive impact upon others. xx💕xx https://t.co/q8etqvtYTz',
  'sentiment': 0.3684210526315789},
 'includes': {'places': [{'full_name': 'Australia',
    'geo': {'type': 'Feature',
     'bbox': [112.921114, -43.740482, 159.109219, -9.142176],
     'properties': {}},
    'id': '3f14ce28dc7c4566'}]},
 'matching_rules': [{'id':

In [14]:
twi_data[100]  # Sydney, NSW

{'_id': '1412192485812555778',
 '_rev': '2-76667587bc37ce06a551b61a9ec36656',
 'data': {'author_id': '156677140',
  'conversation_id': '1412178746828681221',
  'created_at': '2021-07-05T23:31:40.000Z',
  'entities': {'annotations': [{'start': 60,
     'end': 63,
     'probability': 0.4782,
     'type': 'Person',
     'normalized_text': 'Albo'}],
   'mentions': [{'start': 0,
     'end': 8,
     'username': 'z_p1ngu',
     'id': '441036228'},
    {'start': 9,
     'end': 19,
     'username': 'OtherAudi',
     'id': '1341641077674057728'},
    {'start': 20, 'end': 27, 'username': 'AlboMP', 'id': '254515782'}]},
  'geo': {'place_id': '0073b76548e5984f'},
  'lang': 'en',
  'public_metrics': {'retweet_count': 0,
   'reply_count': 0,
   'like_count': 0,
   'quote_count': 0},
  'text': '@z_p1ngu @OtherAudi @AlboMP Even this is more detailed than Albo',
  'sentiment': 0},
 'includes': {'places': [{'full_name': 'Sydney, New South Wales',
    'geo': {'type': 'Feature',
     'bbox': [150.520928608

In [15]:
twi_data[299]  # Shepparton - Mooroopna, VIC

{'_id': '1412189836245495808',
 '_rev': '2-327e3959a6d7c9c272aa077af18a354c',
 'data': {'author_id': '2780377073',
  'conversation_id': '1412189776837550086',
  'created_at': '2021-07-05T23:21:08.000Z',
  'entities': {},
  'geo': {'place_id': '0050d04d64e25ba9'},
  'lang': 'en',
  'public_metrics': {'retweet_count': 0,
   'reply_count': 0,
   'like_count': 0,
   'quote_count': 0},
  'text': 'Pretty big!',
  'sentiment': 1},
 'includes': {'places': [{'full_name': 'Shepparton - Mooroopna, Victoria',
    'geo': {'type': 'Feature',
     'bbox': [145.338536704, -36.45701376, 145.461960992, -36.3452964595],
     'properties': {}},
    'id': '0050d04d64e25ba9'}]},
 'matching_rules': [{'id': 1412189062442586000,
   'tag': 'Australia-based users or Australia-located tweets, but no re-tweets'}]}

In [13]:
# Check all unique place full name in twitter data
place_full_name_lst = []
for tweet in twi_data:
    # Get full_name of the place
    place_full_name = tweet["includes"]["places"][0]["full_name"]
    place_full_name_lst.append(place_full_name)
    # HERE consider cutting the name

print(list(set(place_full_name_lst)))

['Canberra, Australian Capital Territory', 'Queensland, Australia', 'Coffs Harbour, New South Wales', 'Victoria, Australia', 'Anglesea, Victoria', 'Margaret River, Western Australia', 'Sunshine, Melbourne', 'Warrnambool, Victoria', 'Townsville, Queensland', 'Mansfield, Victoria', 'Cairns, Queensland', 'Geelong, Victoria', 'Ayr, Queensland', 'Wollongong, New South Wales', 'Serpentine, Western Australia', 'Wangaratta, Victoria', 'Tasmania, Australia', 'Perth, Western Australia', 'Hobart, Tasmania', 'Queanbeyan, New South Wales', 'New South Wales, Australia', 'South Australia, Australia', 'Central Coast, New South Wales', 'Gisborne, Victoria', 'Mossman Gorge, Mossman', 'Helensburgh, New South Wales', 'Scoresby, Melbourne', 'Torquay - Jan Juc, Victoria', 'Byron Bay, New South Wales', 'Brisbane, Queensland', 'Sunbury, Victoria', 'Toowoomba, Queensland', 'Auburn, South Australia', 'Australia', 'Whitton, New South Wales', 'Windsor, Melbourne', 'Eveleigh, Sydney', 'Sydney, New South Wales', 'P

In [14]:
print(len(list(set(place_full_name_lst))))

77


In [15]:
print(len(list(place_full_name_lst)))

715


# Q1 (count the number of tweets in the various capital cities)

In [8]:
def get_capital_cities(sal_data):
    capital_city_lst = []

    for k in sal_data.keys():
        gcc = sal_data[k]["gcc"]
        # get rid of rural locations
        if gcc[1] != "r" and gcc not in capital_city_lst:
            capital_city_lst.append(gcc)

    return capital_city_lst

In [9]:
get_capital_cities(sal_data)

['1gsyd',
 '2gmel',
 '3gbri',
 '4gade',
 '5gper',
 '6ghob',
 '7gdar',
 '8acte',
 '9oter']

In [10]:
def get_sal_city_match(sal_data):
    sal_city_match = {}
    capital_cities = get_capital_cities(sal_data)

    for region in sal_data.keys():
        gcc = sal_data[region]["gcc"]

        if gcc in capital_cities:
            sal_city_match[region] = gcc

    return sal_city_match

In [11]:
len(get_sal_city_match(sal_data))

3394

In [12]:
def get_sal_name(full_name):
    sal_names = full_name.split(",")[0:]  # split by comma and take the first part
    sal_names = [name.strip().lower() for name in sal_names]
    
    if len(sal_names) > 1:
        sub_names = []
        for name in sal_names[1:]:
            sal_names.extend(re.split(r"\W+", name))
        sal_names.extend([name for name in sub_names if name])
        
    return sal_names

In [13]:
def get_city_tweet_counts(sal_data, twitter_data):
    capital_city_lst = get_capital_cities(sal_data)
    sal_city_match = get_sal_city_match(sal_data)

    city_tweet_counts = {city: 0 for city in capital_city_lst}
    author_lst = []

    for tweet in twitter_data:
        if tweet["includes"] and "places" in tweet["includes"]:
            full_name = tweet["includes"]["places"][0]["full_name"]
            author_id = tweet['data']["author_id"]
            
            if full_name in sal_city_match.keys():
                greater_city = sal_city_match[sub_name]
            else:  
                sal_name = get_sal_name(full_name)

                for sub_name in sal_name:
                    if sub_name in sal_city_match.keys():
                        if sal_city_match[sub_name] == '5gper':
                            author_lst.append(author_id)
                        greater_city = sal_city_match[sub_name]
                        city_tweet_counts[greater_city] += 1
                        break
    print(len(author_lst))
    city_tweets_df = pd.DataFrame.from_dict(
        city_tweet_counts, orient="index", columns=["Number of Tweets Made"]
    ).rename_axis("Greater Capital City")

    return city_tweets_df

In [14]:
get_city_tweet_counts(sal_data, twi_data)

90348


Unnamed: 0_level_0,Number of Tweets Made
Greater Capital City,Unnamed: 1_level_1
1gsyd,29126
2gmel,32846
3gbri,10066
4gade,7262
5gper,7160
6ghob,727
7gdar,504
8acte,2656
9oter,1


# Q2 (count the number of tweets made by the same individual and return the top 10 tweeters)

In [None]:
def get_top_tweeters(twitter_data):
    author_tweet_counts = {}
    for tweet in twitter_data:
        author_id = tweet['data']["author_id"]
        if author_id in author_tweet_counts:
            author_tweet_counts[author_id] += 1
        else:
            author_tweet_counts[author_id] = 1

    top_ten = sorted(author_tweet_counts.items(), key=lambda x: x[1], reverse=True)[:10]

    top_ten_df = pd.DataFrame(
        top_ten, columns=["Author Id", "Number of Tweets Made"]
    ).rename_axis("Rank")
    top_ten_df.index += 1

    return top_ten_df

In [None]:
get_top_tweeters(twi_data)

Unnamed: 0_level_0,Author Id,Number of Tweets Made
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,51378153,32
2,384233102,23
3,156677140,17
4,1244795045934280704,13
5,279323894,11
6,213903403,11
7,4648031797,9
8,99367063,9
9,986277960,8
10,7050962,7


# Q3 (tweeters that have tweeted in the most Greater Capital cities and the number of times they have tweeted from those locations)

In [55]:
def get_author_city_counts(sal_data, twitter_data):
    author_city_counts = {}
    sal_city_match = get_sal_city_match(sal_data)

    
    for tweet in twitter_data:
        if tweet["includes"] and "places" in tweet["includes"]:
            full_name = tweet["includes"]["places"][0]["full_name"]
            
            if full_name in sal_city_match.keys():
                greater_city = sal_city_match[sub_name]
                author_id = tweet['data']["author_id"]
            else:            
                sal_name = get_sal_name(full_name)

                for sub_name in sal_name:
                    if sub_name in sal_city_match.keys():
                        greater_city = sal_city_match[sub_name]
                        author_id = tweet['data']["author_id"]
                        if author_id in author_city_counts:
                            if greater_city in author_city_counts[author_id]:
                                author_city_counts[author_id][greater_city] += 1
                            else:
                                author_city_counts[author_id][greater_city] = 1
                        else:
                            author_city_counts[author_id] = {greater_city: 1}
                        break 
                        
    # to make a different sample for testing
    author_city_counts['940868397528698880']['2gmel'] = 5
    author_city_counts['940868397528698880']['3gbri'] = 2
    author_city_counts['7050962']['3gbri'] = 10
                    
    # sort the dict by the unique number of cities
    author_city_counts = sorted(author_city_counts.items(), key=lambda x: len(x[1]), reverse=True)

    rows = []
    
    for i, (author_id, city_tweet_counts) in enumerate(author_city_counts):
        num_unique_cities = len(city_tweet_counts)
        num_tweets = sum(city_tweet_counts.values())
        row = {
            'Author Id': author_id,
            'Number of Unique City Locations': num_unique_cities,
            '#Tweets': f"#{num_tweets} tweets - {', '.join(f'{count}{city[1:]}' for city, count in city_tweet_counts.items())}"
        }
        rows.append(row)
        
    author_city_counts_df = pd.DataFrame(rows).rename_axis("Rank")
    author_city_counts_df.index += 1
                
    return author_city_counts_df

In [56]:
get_author_city_counts(sal_data, twi_data)

Unnamed: 0_level_0,Author Id,Number of Unique City Locations,#Tweets
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,940868397528698880,3,"#11 tweets - 4gsyd, 5gmel, 2gbri"
2,7050962,2,"#17 tweets - 7gsyd, 10gbri"
3,14450834,1,#1 tweets - 1gsyd
4,167824089,1,#1 tweets - 1gsyd
5,384233102,1,#23 tweets - 23gsyd
...,...,...,...
280,1351649162,1,#1 tweets - 1acte
281,1027167886148689920,1,#1 tweets - 1acte
282,137315172,1,#1 tweets - 1acte
283,7598552,1,#1 tweets - 1acte
