In [61]:
import json
import csv
from collections import defaultdict
import numpy as np
import pandas as pd
import couchdb

In [62]:
# Dict to store all data. sa4 as key.
data_dict = defaultdict(dict)

In [63]:
# data_dict

#### Get Tweet Counts and Sentiment scores for each SA4 (Must be Tunneled into Couchdb instance)

In [64]:
user = 'admin'
password = 'password'
COUCH_ADDRESS = 'localhost'

# Connect to Couch DB Server
# server = couchdb.Server("http://{}:{}@{}:5984/".format(user, password, COUCH_ADDRESS))
server = couchdb.Server("http://{}:{}@{}:15984/".format(user, password, COUCH_ADDRESS))
db = server['tweets']

In [65]:
tweet_counts = {}
sent_sum = {}
# Store Tweet Counts
for code in db.view('Results/TweetCount', group='true'):
    tweet_counts[code.key] = code.value

# Store Sentiment Scores
for code in db.view('Results/SentimentSum', group='true'):
    sent_sum[code.key] = code.value

In [66]:
for key in tweet_counts.keys():
    count = tweet_counts[key]
    sent = sent_sum[key]
    score = sent/count
    
    # store in data dict
    data_dict[key]['sentiment_score'] = score
    data_dict[key]['tweet_counts'] = count
    data_dict[key]['sent_sum'] = sent
    

#### Process data from Aurin & SA4 Geojson 

In [67]:
# Import sa4 geojson file and aurin data files
sa4_geo_file = json.load(open('SA4_geojson.json'))
sa4_centroids = json.load(open('sa4_geojson_centroid.json'))


# crime data needs lga_sa4 conversion
crime_data = json.load(open('crimedata.json'))
income_data = json.load(open('equivalisedincomedata.json'))
family_data = json.load(open('familycommunitydata.json'))
unemployment_data = json.load(open('SA4_unemployment.json'))
population_data = json.load(open('populationdata.json'))
industry_data = json.load(open('industry.json'))

lga_sa4 = open('lga_sa4.csv', 'r')

In [68]:
# Process sa4 lga conversion
lga_sa4_dict = defaultdict(list)
for line in lga_sa4:
    line = line.strip('\n')
    (lga, sa4) = line.split(",")
    lga_sa4_dict[lga] = sa4
    
# Create DF for data analysis


In [69]:
def process_aurin(data_set, features):
    for feature in features:
        for item in data_set['features']:
            if item['properties']['sa4_code_2016'] in data_dict.keys():
                data_dict[item['properties']['sa4_code_2016']][feature] = item['properties'][feature]

In [70]:
def add_to_dict(key, d, val):
    if key in d:
        d[key] = d[key] + val
    else:
        d[key] = val

In [71]:
# Process crime sa4_lga conversion
# Add sa4 to each LGA in crime_data
for item in crime_data['features']:
    if item['properties']['lga_code'] in lga_sa4_dict.keys():
        
        a = item['properties']['total_division_a_offences']
        b = item['properties']['total_division_b_offences']
        c = item['properties']['total_division_c_offences']
        d = item['properties']['total_division_d_offences']
        e = item['properties']['total_division_e_offences']
        f = item['properties']['total_division_f_offences']
        sum_crimes = a+b+c+d+e+f
        
        item['properties']['sa4_code_2016'] = lga_sa4_dict[item['properties']['lga_code']]
        item['properties']['sum_crimes'] = sum_crimes

In [72]:
# Process income and family features
income_features = ['equivalised_total_household_income_census_median_weekly']
family_features = ['rent_mortgage_payments_census_average_monthly_household_payment']
crime_features = ['total_division_a_offences', 'total_division_b_offences', 'total_division_c_offences', 
                  'total_division_d_offences', 'total_division_e_offences', 'total_division_f_offences', 
                  'sum_crimes']

process_aurin(income_data, income_features)
process_aurin(family_data, family_features)
process_aurin(crime_data, crime_features)

In [73]:
# Process for Unemployment job search weeks data
for item in unemployment_data['features']:
    if str(item['properties']['sa4_code']) in data_dict.keys():
        data_dict[str(item['properties']['sa4_code'])]['unemployed_rate'] = item['properties']['unemployed_tot_000']
        data_dict[str(item['properties']['sa4_code'])]['avg_duration_job_search_wks'] = item['properties']['avg_duration_job_search_wks']

In [74]:
# Add population data
for item in population_data['features']:
    if item['properties']['sa4_code16'] in data_dict.keys():
        data_dict[item['properties']['sa4_code16']]['persons_total'] = item['properties']['persons_total']


In [75]:
# Put centroid in
for item in sa4_centroids['features']:
    if item['properties']['SA4_CODE16'] in data_dict.keys():
        data_dict[item['properties']['SA4_CODE16']]['centroid'] = item['geometry']['coordinates']
    

In [76]:
for item in industry_data['features']:
    if item['properties']['sa4_code_2016'] in data_dict.keys():
        data_dict[item['properties']['sa4_code_2016']]['num_recreation_busi'] = item['properties']['number_businesses_industry_30_june_arts_recreation_services_num']
        data_dict[item['properties']['sa4_code_2016']]['num_scientific_busi'] = item['properties']['nmbr_bsnsss_indstry_30_jne_prfssnl_scntfc_tchncl_srvcs_nm']
        data_dict[item['properties']['sa4_code_2016']]['num_mining_busi'] = item['properties']['number_of_businesses_by_industry_as_at_30_june_mining_num']
        data_dict[item['properties']['sa4_code_2016']]['num_finance_busi'] = item['properties']['nmbr_bsnsss_indstry_30_jne_fnncl_insrnce_srvcs_nm']
        data_dict[item['properties']['sa4_code_2016']]['num_agri_busi'] = item['properties']['nmbr_bsnsss_indstry_30_jne_agrcltre_frstry_fshng_nm']

### Analysis of tweet data

In [77]:
# sentiment score vs household income
sent_scores = []
income = []
mortgage = []
unemployment = []
duration_jobsearch = []
num_recreation_busi = []
num_scientific_busi = []
num_mining_busi = []
num_finance_busi = []
num_agri_busi = []

for item in data_dict:
    try:
        sent_scores.append(data_dict[item]['sentiment_score'])
        income.append(data_dict[item]['equivalised_total_household_income_census_median_weekly']) 
        mortgage.append(data_dict[item]['rent_mortgage_payments_census_average_monthly_household_payment'])
        unemployment.append(data_dict[item]['unemployed_rate'])
        duration_jobsearch.append(data_dict[item]['avg_duration_job_search_wks'])
    except:
        print(item)

801


In [81]:
data_dict

defaultdict(dict,
            {'101': {'sentiment_score': 0.2846828662370934,
              'tweet_counts': 38643,
              'sent_sum': 11001,
              'equivalised_total_household_income_census_median_weekly': 795,
              'rent_mortgage_payments_census_average_monthly_household_payment': 1805,
              'unemployed_rate': 5.76200061,
              'avg_duration_job_search_wks': 21.29717775,
              'persons_total': 225666,
              'centroid': [149.24, -35.56],
              'num_recreation_busi': 264,
              'num_scientific_busi': 1795,
              'num_mining_busi': 59,
              'num_finance_busi': 1048,
              'num_agri_busi': 5059},
             '102': {'sentiment_score': 0.008370183196462414,
              'tweet_counts': 18996,
              'sent_sum': 159,
              'equivalised_total_household_income_census_median_weekly': 774,
              'rent_mortgage_payments_census_average_monthly_household_payment': 1890,
      

In [78]:
# sent_scores

In [79]:
data_df=pd.DataFrame.from_dict(data_dict, orient='index')

In [80]:
data_df.corr()

Unnamed: 0,sentiment_score,tweet_counts,sent_sum,equivalised_total_household_income_census_median_weekly,rent_mortgage_payments_census_average_monthly_household_payment,unemployed_rate,avg_duration_job_search_wks,persons_total,num_recreation_busi,num_scientific_busi,num_mining_busi,num_finance_busi,num_agri_busi,total_division_a_offences,total_division_b_offences,total_division_c_offences,total_division_d_offences,total_division_e_offences,total_division_f_offences,sum_crimes
sentiment_score,1.0,-0.040376,0.025819,-0.058381,-0.087398,-0.072462,0.060013,-0.042855,-0.062942,-0.022107,0.003448,-0.010999,0.055682,0.343562,0.382714,0.332001,0.146144,0.250979,0.188998,0.352827
tweet_counts,-0.040376,1.0,0.989362,0.303421,0.221303,0.28858,-0.143239,0.258604,0.662931,0.630515,0.413188,0.67056,-0.11312,0.022374,0.2198,0.196972,0.184892,-0.053644,-0.050953,0.157614
sent_sum,0.025819,0.989362,1.0,0.291482,0.22061,0.294079,-0.141949,0.258684,0.664456,0.631479,0.425523,0.667835,-0.113343,0.024104,0.222653,0.199405,0.18513,-0.054669,-0.047092,0.159677
equivalised_total_household_income_census_median_weekly,-0.058381,0.303421,0.291482,1.0,0.904217,0.195451,-0.463919,0.226448,0.536205,0.656746,0.376348,0.593276,-0.50242,0.193747,0.440954,0.317484,0.228467,0.053529,0.396397,0.346146
rent_mortgage_payments_census_average_monthly_household_payment,-0.087398,0.221303,0.22061,0.904217,1.0,0.311829,-0.498076,0.35177,0.54641,0.646868,0.287662,0.569307,-0.610097,0.330655,0.540447,0.428065,0.330079,0.185802,0.522932,0.459986
unemployed_rate,-0.072462,0.28858,0.294079,0.195451,0.311829,1.0,-0.111898,0.910267,0.610682,0.565576,0.057184,0.50073,-0.352382,0.574023,0.567434,0.661857,0.560054,0.403944,0.604531,0.572063
avg_duration_job_search_wks,0.060013,-0.143239,-0.141949,-0.463919,-0.498076,-0.111898,1.0,-0.178032,-0.294929,-0.323298,-0.10357,-0.293062,0.250221,-0.124142,-0.133021,-0.177694,-0.223935,-0.161674,-0.01973,-0.149221
persons_total,-0.042855,0.258604,0.258684,0.226448,0.35177,0.910267,-0.178032,1.0,0.645307,0.604895,0.034336,0.523038,-0.369973,0.667731,0.664901,0.719857,0.592793,0.485678,0.695104,0.663436
num_recreation_busi,-0.062942,0.662931,0.664456,0.536205,0.54641,0.610682,-0.294929,0.645307,1.0,0.9532,0.241742,0.912836,-0.326404,0.259135,0.44515,0.429596,0.375984,0.127455,0.271864,0.385062
num_scientific_busi,-0.022107,0.630515,0.631479,0.656746,0.646868,0.565576,-0.323298,0.604895,0.9532,1.0,0.353589,0.969027,-0.368311,0.234784,0.433107,0.408195,0.334566,0.08587,0.295902,0.364259


In [55]:
data_df

Unnamed: 0,sentiment_score,tweet_counts,sent_sum,equivalised_total_household_income_census_median_weekly,rent_mortgage_payments_census_average_monthly_household_payment,unemployed_rate,avg_duration_job_search_wks,persons_total,centroid,num_recreation_busi,...,num_mining_busi,num_finance_busi,num_agri_busi,total_division_a_offences,total_division_b_offences,total_division_c_offences,total_division_d_offences,total_division_e_offences,total_division_f_offences,sum_crimes
101,0.284683,38643,11001,795,1805,5.762001,21.297178,225666,"[149.24, -35.56]",264,...,59,1048,5059,,,,,,,
102,0.008370,18996,159,774,1890,9.205978,13.639877,339236,"[151.29, -33.31]",410,...,52,1916,512,,,,,,,
103,0.361404,855,309,742,1625,5.622973,28.566150,211224,"[148.36, -33.22]",192,...,109,1006,6709,,,,,,,
104,0.246459,353,87,648,1568,2.280920,138.112214,140070,"[152.77, -29.82]",150,...,33,749,1881,,,,,,,
106,0.827586,812,672,786,1801,7.075872,25.035527,272452,"[150.98, -32.35]",250,...,85,2248,2914,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603,0.358566,251,90,625,1277,0.854961,57.138309,38053,"[146.82, -42.61]",41,...,11,134,958,,,,,,,
604,0.482456,114,55,644,1271,3.025266,92.978487,111259,"[145.56, -41.73]",71,...,43,511,1870,,,,,,,
701,0.106405,7697,819,1316,2383,5.404876,16.707763,148884,"[131.11, -12.53]",148,...,54,706,437,,,,,,,
702,0.862069,29,25,954,1945,3.117814,36.041044,98607,"[133.37, -19.49]",53,...,19,180,449,,,,,,,


#### Process Output json

In [57]:
# Output processed geojson
def merge_data(data, sa4_geo):
    output={"type": "FeatureCollection", "features":[]}
    for row in sa4_geo['features']:
        key = row['properties']['SA4_CODE16']
        if key in data.keys():
            for feature in data[key].keys():
                row['properties'][feature]= data[key][feature]
        else:
            for feature in data[key].keys():
                row['properties'][feature]='No Record'
   
    with open('output.json', 'w') as outfile:
        json.dump(sa4_geo, outfile)

In [59]:
# merge_data(data_dict, sa4_geo_file)
# check = json.load(open('output.json'))

In [None]:
# for item in check['features']:
#     if item['properties']['SA4_CODE16'] == '117':
#         print(item)

In [19]:
len(data_dict)

81

In [18]:
data_dict.keys()

dict_keys(['101', '102', '103', '104', '106', '107', '108', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '201', '202', '203', '204', '205', '206', '207', '208', '209', '210', '211', '212', '213', '214', '215', '216', '217', '301', '302', '303', '304', '305', '306', '307', '309', '310', '311', '312', '313', '314', '316', '317', '318', '319', '401', '402', '403', '404', '405', '406', '407', '501', '502', '503', '504', '505', '506', '509', '601', '602', '603', '604', '701', '702', '801'])