In [122]:
import csv
import json
import os
from collections import defaultdict

import couchdb

In [123]:
# Dict to store all data, keyed by SA4 code. Each value is itself a dict of
# feature name -> value for that SA4; defaultdict(dict) creates an empty
# record the first time a new key is indexed.
data_dict = defaultdict(dict)

In [124]:
# data_dict

#### Get tweet counts and sentiment scores for each SA4 (requires a tunnel to the CouchDB instance)

In [125]:
# Connection settings are read from the environment so that real credentials
# never have to be saved inside the notebook. The fallbacks reproduce the
# original development defaults, so the cell still works with no variables set.
user = os.environ.get('COUCH_USER', 'admin')
password = os.environ.get('COUCH_PASSWORD', 'password')
COUCH_ADDRESS = os.environ.get('COUCH_ADDRESS', 'localhost')
# 15984 was the port used by this notebook; an earlier, commented-out variant
# connected on 5984 instead -- set COUCH_PORT=5984 to get that behaviour.
# NOTE(review): presumably 15984 is the local end of the tunnel -- confirm.
COUCH_PORT = os.environ.get('COUCH_PORT', '15984')

# Connect to Couch DB Server
server = couchdb.Server(
    "http://{}:{}@{}:{}/".format(user, password, COUCH_ADDRESS, COUCH_PORT)
)
db = server['tweets']

In [126]:
# Read both grouped views into plain dicts of view key -> reduced value.
# The keys are later used as the keys of data_dict.

# Store Tweet Counts
tweet_counts = {
    row.key: row.value
    for row in db.view('Results/TweetCount', group='true')
}

# Store Sentiment Scores
sent_sum = {
    row.key: row.value
    for row in db.view('Results/SentimentSum', group='true')
}

In [127]:
# For every key with a tweet count, derive the mean sentiment per tweet and
# record it in data_dict together with the two raw aggregates.
for key, count in tweet_counts.items():
    sent = sent_sum[key]
    score = sent / count

    # store in data dict
    data_dict[key].update(
        sentiment_score=score,
        tweet_counts=count,
        sent_sum=sent,
    )
    

#### Process data from AURIN and the SA4 GeoJSON

In [128]:
# Import sa4 geojson file and aurin data files
def _load_json(path):
    """Parse the JSON file at `path` and return the decoded object.

    The file is opened in a `with` block so its handle is closed as soon as
    parsing finishes instead of being left for the garbage collector.
    """
    with open(path) as handle:
        return json.load(handle)


sa4_geo_file = _load_json('SA4_geojson.json')
sa4_centroids = _load_json('sa4_geojson_centroid.json')


# crime data needs lga_sa4 conversion
crime_data = _load_json('crimedata.json')
income_data = _load_json('equivalisedincomedata.json')
family_data = _load_json('familycommunitydata.json')
unemployment_data = _load_json('SA4_unemployment.json')
population_data = _load_json('populationdata.json')

# Read the LGA -> SA4 lookup eagerly into a list of lines. Iterating the list
# yields the same newline-terminated strings as iterating the file did, but
# the handle is closed here and the lines can be iterated more than once
# (an open file would be exhausted after the first pass).
with open('lga_sa4.csv', 'r') as handle:
    lga_sa4 = handle.readlines()

In [129]:
# Process sa4 lga conversion
# Build a plain LGA code -> SA4 code lookup from the two-column CSV rows.
# A regular dict is used because every value is a single SA4 code string; the
# previous defaultdict(list) would have answered an unknown LGA with [].
lga_sa4_dict = {}
# csv.reader accepts any iterable of lines (an open file or a list of strings)
# and takes care of quoting and line terminators.
for row in csv.reader(lga_sa4):
    if not row:
        # Blank lines (e.g. at the end of the file) carry no mapping.
        continue
    # Rows must have exactly two columns; anything else raises ValueError,
    # as the original tuple-unpacking did.
    lga, sa4 = row
    lga_sa4_dict[lga.strip()] = sa4.strip()
    
# Create DF for data analysis


In [130]:
def process_aurin(data_set, features):
    """Copy selected AURIN properties into the module-level ``data_dict``.

    Every entry of ``data_set['features']`` whose
    ``properties['sa4_code_2016']`` is already a key of ``data_dict`` has each
    property named in ``features`` stored, under the same name, in that SA4's
    record. Entries with no ``sa4_code_2016`` property, or with a code that is
    not in ``data_dict``, are skipped.

    Values are assigned, not accumulated: when several entries share one SA4
    code, the last entry in the list wins.

    Args:
        data_set: Mapping with a ``'features'`` list whose items each have a
            ``'properties'`` dict.
        features: Iterable of property names to copy.

    Raises:
        KeyError: If a matched entry lacks one of the requested properties.
    """
    # Materialise once so a one-shot iterable can be reused for every entry.
    feature_names = list(features)
    for item in data_set['features']:
        props = item['properties']
        # .get(): entries without an SA4 code (e.g. crime records whose LGA
        # could not be mapped) are skipped instead of raising KeyError.
        sa4_code = props.get('sa4_code_2016')
        if sa4_code not in data_dict:
            continue
        record = data_dict[sa4_code]
        for feature in feature_names:
            record[feature] = props[feature]

In [131]:
def add_to_dict(key, d, val):
    """Accumulate `val` into `d[key]`.

    If `key` is not yet present it is created with `val`; otherwise the
    stored value is replaced by `d[key] + val`. Nothing is returned; `d` is
    updated in place.
    """
    if key not in d:
        d[key] = val
        return
    d[key] = d[key] + val

In [132]:
# Process crime sa4_lga conversion
# Every crime record whose LGA code appears in the LGA -> SA4 lookup is tagged
# with that SA4 code and with the total of its six offence divisions.
# Records with an unknown LGA code are left untouched.
division_keys = (
    'total_division_a_offences',
    'total_division_b_offences',
    'total_division_c_offences',
    'total_division_d_offences',
    'total_division_e_offences',
    'total_division_f_offences',
)
for item in crime_data['features']:
    props = item['properties']
    lga_code = props['lga_code']
    if lga_code not in lga_sa4_dict:
        continue

    # Read all six counts first, then add them left to right.
    counts = [props[name] for name in division_keys]
    sum_crimes = counts[0]
    for value in counts[1:]:
        sum_crimes = sum_crimes + value

    props['sa4_code_2016'] = lga_sa4_dict[lga_code]
    props['sum_crimes'] = sum_crimes

In [133]:
# Process income and family features
# Property names to copy from each AURIN data set into data_dict.
income_features = ['equivalised_total_household_income_census_median_weekly']
family_features = ['rent_mortgage_payments_census_average_monthly_household_payment']
crime_features = [
    'total_division_a_offences',
    'total_division_b_offences',
    'total_division_c_offences',
    'total_division_d_offences',
    'total_division_e_offences',
    'total_division_f_offences',
    'sum_crimes',
]

# NOTE(review): process_aurin assigns values rather than adding them up, so if
# several crime records (LGAs) map to the same SA4 only the last one is kept.
# Confirm whether per-SA4 crime totals were meant to be summed.
for dataset, feature_names in (
    (income_data, income_features),
    (family_data, family_features),
    (crime_data, crime_features),
):
    process_aurin(dataset, feature_names)

In [134]:
# Process for Unemployment job search weeks data
for item in unemployment_data['features']:
    props = item['properties']
    # Converted with str() before the lookup in data_dict.
    sa4_code = str(props['sa4_code'])
    if sa4_code not in data_dict:
        continue

    record = data_dict[sa4_code]
    # NOTE(review): the 'unemployed_rate' key is filled from
    # 'unemployed_tot_000', which by its name looks like a total rather than
    # a rate -- confirm the intended label before relying on it.
    record['unemployed_rate'] = props['unemployed_tot_000']
    record['avg_duration_job_search_wks'] = props['avg_duration_job_search_wks']

In [138]:
# Add population data
for item in population_data['features']:
    props = item['properties']
    sa4_code = props['sa4_code16']
    if sa4_code in data_dict:
        # Store the population inside the SA4's own record. The previous code
        # used the population count itself as a top-level data_dict key, so
        # no SA4 record ever received 'persons_total' and data_dict gained
        # stray numeric keys.
        data_dict[sa4_code]['persons_total'] = props['persons_total']


In [139]:
# Put centroid in
# Attach the centroid geometry's coordinates to every SA4 already in data_dict.
for item in sa4_centroids['features']:
    sa4_code = item['properties']['SA4_CODE16']
    if sa4_code not in data_dict:
        continue
    data_dict[sa4_code]['centroid'] = item['geometry']['coordinates']
    

In [141]:
# population_data['features']

In [142]:
# Spot-check the merged record stored under SA4 code '102'
data_dict['102']

{'sentiment_score': 0.008370183196462414,
 'tweet_counts': 18996,
 'sent_sum': 159,
 'equivalised_total_household_income_census_median_weekly': 774,
 'rent_mortgage_payments_census_average_monthly_household_payment': 1890,
 'unemployed_rate': 9.20597806,
 'avg_duration_job_search_wks': 13.63987744,
 'centroid': [151.29, -33.31]}

### Analysis of tweet data

In [None]:
# sentiment score vs household income

In [119]:
# Display the raw unemployment data to inspect its structure (long output)
unemployment_data

{'type': 'FeatureCollection',
 'bbox': [0.0, 0.0, -1.0, -1.0],
 'features': [{'type': 'Feature',
   'properties': {'unemployed_tot_000': 9.20597806,
    'month': 'Apr',
    'sa4_name': 'Central Coast',
    'years': 2020,
    'sa4_code': 102,
    'avg_duration_job_search_wks': 13.63987744},
   'id': 'abs_rm3_unemp_persons_duration_job_search_sa4_1991_2020_vw.fid--4832820e_1795caad0ca_-47ba'},
  {'type': 'Feature',
   'properties': {'unemployed_tot_000': 6.76042518,
    'month': 'Apr',
    'sa4_name': 'Sydney - Baulkham Hills and Hawkesbury',
    'years': 2020,
    'sa4_code': 115,
    'avg_duration_job_search_wks': 12.38739883},
   'id': 'abs_rm3_unemp_persons_duration_job_search_sa4_1991_2020_vw.fid--4832820e_1795caad0ca_-47b9'},
  {'type': 'Feature',
   'properties': {'unemployed_tot_000': 14.3306232,
    'month': 'Apr',
    'sa4_name': 'Sydney - Blacktown',
    'years': 2020,
    'sa4_code': 116,
    'avg_duration_job_search_wks': 18.03434379},
   'id': 'abs_rm3_unemp_persons_duratio

#### Write the output GeoJSON

In [143]:
# Output processed geojson
def merge_data(data, sa4_geo, out_path='output.json'):
    """Merge per-SA4 records into a GeoJSON collection and write it to disk.

    For each entry of ``sa4_geo['features']`` the record found in ``data``
    under ``properties['SA4_CODE16']`` is copied into the entry's
    ``properties``. Entries with no (or an empty) record instead get the
    string ``'No Record'`` for every feature name that occurs in any record
    of ``data``, so all regions expose the same property names.

    ``sa4_geo`` is modified in place and then serialised as JSON. ``data`` is
    only read: lookups use ``.get`` so a ``defaultdict`` does not gain empty
    entries for regions it has no data for.

    Args:
        data: Mapping of SA4 code -> dict of feature name -> value.
        sa4_geo: GeoJSON-like dict with a ``'features'`` list.
        out_path: File the merged collection is written to.
    """
    # All feature names seen in any record, in first-seen order. Non-dict
    # values are ignored so a stray entry cannot break the merge.
    feature_names = list(dict.fromkeys(
        name
        for record in data.values() if isinstance(record, dict)
        for name in record
    ))

    for row in sa4_geo['features']:
        props = row['properties']
        record = data.get(props['SA4_CODE16'])
        if isinstance(record, dict) and record:
            props.update(record)
        else:
            for name in feature_names:
                props[name] = 'No Record'

    with open(out_path, 'w') as outfile:
        json.dump(sa4_geo, outfile)

In [145]:
# merge_data(data_dict, sa4_geo_file)
# Read back the previously written output file for inspection; the `with`
# block closes the file handle once parsing is done.
with open('output.json') as outfile:
    check = json.load(outfile)

In [146]:
# Print every feature of the reloaded output whose SA4 code is '117'.
matching = (
    feature for feature in check['features']
    if feature['properties']['SA4_CODE16'] == '117'
)
for feature in matching:
    print(feature)

{'type': 'Feature', 'geometry': {'type': 'Polygon', 'coordinates': [[[151.17, -33.95], [151.18, -33.95], [151.18, -33.96], [151.18, -33.97], [151.19, -33.97], [151.19, -33.96], [151.18, -33.96], [151.18, -33.95], [151.19, -33.95], [151.19, -33.96], [151.19, -33.97], [151.2, -33.97], [151.19, -33.97], [151.19, -33.96], [151.19, -33.95], [151.19, -33.96], [151.2, -33.96], [151.21, -33.96], [151.21, -33.97], [151.22, -33.97], [151.21, -33.97], [151.21, -33.98], [151.22, -33.98], [151.23, -33.98], [151.23, -33.97], [151.24, -33.98], [151.24, -33.97], [151.23, -33.97], [151.23, -33.96], [151.22, -33.96], [151.23, -33.96], [151.22, -33.96], [151.23, -33.96], [151.23, -33.95], [151.23, -33.94], [151.23, -33.93], [151.23, -33.92], [151.23, -33.93], [151.22, -33.93], [151.22, -33.92], [151.21, -33.92], [151.21, -33.91], [151.21, -33.9], [151.22, -33.9], [151.22, -33.89], [151.22, -33.88], [151.23, -33.88], [151.23, -33.87], [151.23, -33.86], [151.22, -33.87], [151.22, -33.86], [151.21, -33.86],

In [120]:
# Number of keys currently held in data_dict
len(data_dict)

107