In [1]:
import csv
import json
import os
from collections import defaultdict

import couchdb
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [2]:
# Dict to store all data, keyed by SA4 code. Each value is itself a dict of
# feature name -> value; defaultdict(dict) creates that inner dict on first access.
data_dict = defaultdict(dict)

In [3]:
# data_dict

#### Get tweet counts and sentiment scores for each SA4 (requires a tunnel to the CouchDB instance)

In [12]:
# Connection settings. Every value can be overridden with an environment
# variable so real credentials do not have to be saved inside the notebook;
# the fallbacks are the values this notebook used before.
user = os.environ.get('COUCHDB_USER', 'admin')
password = os.environ.get('COUCHDB_PASSWORD', 'password')
COUCH_ADDRESS = os.environ.get('COUCHDB_HOST', 'localhost')
# NOTE(review): 15984 is presumably the locally forwarded (tunnelled) port;
# set COUCHDB_PORT=5984 to talk to a CouchDB server directly.
COUCH_PORT = os.environ.get('COUCHDB_PORT', '15984')

# Connect to Couch DB Server
server = couchdb.Server("http://{}:{}@{}:{}/".format(user, password, COUCH_ADDRESS, COUCH_PORT))
db = server['tweets']

front_end_db = server['front_end']

In [5]:
tweet_counts = {}
sent_sum = {}

# Tweet count per view key (grouped reduce over the 'Results/TweetCount' view)
tweet_counts.update(
    (row.key, row.value) for row in db.view('Results/TweetCount', group='true')
)

# Summed sentiment per view key (grouped reduce over 'Results/SentimentSum')
sent_sum.update(
    (row.key, row.value) for row in db.view('Results/SentimentSum', group='true')
)

In [6]:
for sa4_code, n_tweets in tweet_counts.items():
    total_sentiment = sent_sum[sa4_code]
    # Average sentiment per tweet; computed before touching data_dict
    mean_sentiment = total_sentiment / n_tweets

    # store in data dict
    data_dict[sa4_code].update({
        'sentiment_score': mean_sentiment,
        'tweet_counts': n_tweets,
        'sent_sum': total_sentiment,
    })
    

#### Process data from AURIN & SA4 GeoJSON

In [7]:
# Import sa4 geojson file and aurin data files
def _load_json(path):
    """Parse the JSON file at ``path``, closing the file handle afterwards."""
    with open(path) as handle:
        return json.load(handle)


sa4_geo_file = _load_json('SA4_geojson.json')
sa4_centroids = _load_json('sa4_geojson_centroid.json')


# crime data needs lga_sa4 conversion
crime_data = _load_json('crimedata.json')
income_data = _load_json('equivalisedincomedata.json')
family_data = _load_json('familycommunitydata.json')
unemployment_data = _load_json('SA4_unemployment.json')
population_data = _load_json('populationdata.json')
industry_data = _load_json('industry.json')
socio_advantage_data = _load_json('socioirsaddata.json')
personal_income = _load_json('personalincomedata.json')

# Read the LGA -> SA4 table eagerly into a list of raw lines (newlines kept).
# Iterating the list yields the same lines as iterating the open file did,
# but the handle is closed immediately and the conversion cell can be re-run
# without hitting an exhausted file object.
with open('lga_sa4.csv', 'r') as handle:
    lga_sa4 = handle.readlines()

In [8]:
# Process sa4 lga conversion: map each LGA code to its SA4 code.
# Each input line is expected to be "<lga>,<sa4>".
# NOTE(review): the values stored are plain strings, so the ``list`` default
# factory only matters for keys that are looked up but were never assigned.
lga_sa4_dict = defaultdict(list)
for line in lga_sa4:
    line = line.strip('\n')
    if not line:
        # A blank line (e.g. at the end of the file) cannot be unpacked
        # into two fields, so skip it instead of raising ValueError.
        continue
    (lga, sa4) = line.split(",")
    lga_sa4_dict[lga] = sa4
    
# Create DF for data analysis


In [9]:
def process_aurin(data_set, features, target=None):
    """Copy selected AURIN properties into the per-SA4 data dictionary.

    For every item in ``data_set['features']`` whose
    ``properties['sa4_code_2016']`` is already a key of ``target``, each
    requested property that the item carries is stored as
    ``target[sa4_code][feature]``. SA4 codes that are not yet in ``target``
    are never added.

    Args:
        data_set: mapping with a ``'features'`` list; every item is expected
            to hold a ``'properties'`` mapping.
        features: iterable of property names to copy.
        target: mapping of SA4 code -> feature dict, updated in place.
            Defaults to the notebook-level ``data_dict``.

    Returns:
        None.

    The function stays best-effort: malformed input is skipped rather than
    raised. Unlike a blanket ``except``, only missing keys / wrongly shaped
    items are tolerated, and they are skipped one item at a time, so a single
    incomplete item no longer discards the items that follow it.
    """
    if target is None:
        target = data_dict

    # Materialise once: the names are reused for every item below, which
    # would exhaust a generator after the first item.
    features = list(features)

    try:
        items = data_set['features']
    except (KeyError, TypeError):
        # A data set without a feature list contributes nothing.
        return

    for item in items:
        try:
            properties = item['properties']
            sa4_code = properties['sa4_code_2016']
        except (KeyError, TypeError):
            # Item without properties or without an SA4 code: nothing to copy.
            continue

        if sa4_code not in target:
            continue

        for feature in features:
            if feature in properties:
                target[sa4_code][feature] = properties[feature]

In [10]:
def add_to_dict(key, d, val):
    """Accumulate ``val`` under ``key`` in ``d``.

    A key that is not present yet is initialised to ``val``; otherwise the
    stored value is replaced by ``stored + val``.
    """
    d[key] = (d[key] + val) if key in d else val

In [11]:
# Process crime sa4_lga conversion
# Tag every crime record whose LGA has a known SA4 with that SA4 code and
# with the total of its six offence divisions (A-F).
offence_fields = (
    'total_division_a_offences',
    'total_division_b_offences',
    'total_division_c_offences',
    'total_division_d_offences',
    'total_division_e_offences',
    'total_division_f_offences',
)
for feature in crime_data['features']:
    props = feature['properties']
    if props['lga_code'] not in lga_sa4_dict:
        continue

    # Add the divisions left to right, starting from division A
    division_counts = [props[field] for field in offence_fields]
    sum_crimes = division_counts[0]
    for extra in division_counts[1:]:
        sum_crimes = sum_crimes + extra

    props['sa4_code_2016'] = lga_sa4_dict[props['lga_code']]
    props['sum_crimes'] = sum_crimes

In [None]:
# Add sa4 to each LGA in soci_advantage_data
# for item in socio_advantage_data['features']:
#     if item['properties']['lga_code_2006_'] in lga_sa4_dict.keys():
#         item['properties']['sa4_code_2016'] = lga_sa4_dict[item['properties']['lga_code_2006_']]

In [None]:
# Process income and family features
# Property names to copy from each AURIN data set into data_dict
income_features = ['equivalised_total_household_income_census_median_weekly']
family_features = ['rent_mortgage_payments_census_average_monthly_household_payment', 'sa4_name_2016']
# crime_features = ['total_division_a_offences', 'total_division_b_offences', 'total_division_c_offences', 
#                   'total_division_d_offences', 'total_division_e_offences', 'total_division_f_offences', 
#                   'sum_crimes']
# socio_features = ['irsad_score']
personal_income_features = ['mean_aud', 'median_aud']

# Same order as before: income, then family, then personal income
for aurin_data, wanted in (
    (income_data, income_features),
    (family_data, family_features),
    (personal_income, personal_income_features),
):
    process_aurin(aurin_data, wanted)
# process_aurin(crime_data, crime_features)
# process_aurin(socio_advantage_data, socio_features)

In [None]:
# # Process for Unemployment job search weeks data
# for item in unemployment_data['features']:
#     if str(item['properties']['sa4_code']) in data_dict.keys():
#         data_dict[str(item['properties']['sa4_code'])]['unemployed_rate'] = item['properties']['unemployed_tot_000']
#         data_dict[str(item['properties']['sa4_code'])]['avg_duration_job_search_wks'] = item['properties']['avg_duration_job_search_wks']

In [None]:
# # Add population data
# for item in population_data['features']:
#     if item['properties']['sa4_code16'] in data_dict.keys():
#         data_dict[item['properties']['sa4_code16']]['persons_total'] = item['properties']['persons_total']


In [None]:
# Put centroid in
# Only SA4 codes that already have an entry in data_dict receive a centroid
for feature in sa4_centroids['features']:
    sa4_code = feature['properties']['SA4_CODE16']
    if sa4_code in data_dict:
        data_dict[sa4_code]['centroid'] = feature['geometry']['coordinates']
    

In [None]:
# for item in industry_data['features']:
#     if item['properties']['sa4_code_2016'] in data_dict.keys():
#         data_dict[item['properties']['sa4_code_2016']]['num_recreation_busi'] = item['properties']['number_businesses_industry_30_june_arts_recreation_services_num']
#         data_dict[item['properties']['sa4_code_2016']]['num_scientific_busi'] = item['properties']['nmbr_bsnsss_indstry_30_jne_prfssnl_scntfc_tchncl_srvcs_nm']
#         data_dict[item['properties']['sa4_code_2016']]['num_mining_busi'] = item['properties']['number_of_businesses_by_industry_as_at_30_june_mining_num']
#         data_dict[item['properties']['sa4_code_2016']]['num_finance_busi'] = item['properties']['nmbr_bsnsss_indstry_30_jne_fnncl_insrnce_srvcs_nm']
#         data_dict[item['properties']['sa4_code_2016']]['num_agri_busi'] = item['properties']['nmbr_bsnsss_indstry_30_jne_agrcltre_frstry_fshng_nm']

In [None]:
# personal_income

In [36]:
# out = []
# for key, value in data_dict.items():
#     curr_d = value
#     curr_d['sa4_code'] = key
#     out.append(curr_d)

### Update Sentiment Scores periodically

In [38]:

#get file from DB
in_file = front_end_db.get_attachment('test_3', 'output_123.json').read()
in_text = in_file.decode('utf8')
try:
    # Parse the attachment as-is first: blindly swapping quotes breaks valid
    # JSON whose string values contain an apostrophe.
    in_json = json.loads(in_text)
except ValueError:
    # Fallback for single-quoted payloads, handled as before by swapping
    # every single quote for a double quote. json.JSONDecodeError is a
    # subclass of ValueError.
    in_json = json.loads(in_text.replace("'", '"'))


In [69]:
# in_json[0]

In [44]:
# Mean sentiment per SA4: summed sentiment divided by the number of tweets
sentiment_score = {
    sa4_code: sent_sum[sa4_code] / n_tweets
    for sa4_code, n_tweets in tweet_counts.items()
}


In [46]:
# sentiment_score

In [51]:
# Update file
# Refresh the three tweet-derived fields on every record with a known SA4 code
for record in in_json:
    sa4_code = record['sa4_code']
    if sa4_code not in sentiment_score:
        continue
    record['sentiment_score'] = sentiment_score[sa4_code]
    record['sent_sum'] = sent_sum[sa4_code]
    record['tweet_counts'] = tweet_counts[sa4_code]

In [53]:
# Put attachment to DB
# Attach the updated records to document 'test_3' under the name 'out_data.json'
doc = front_end_db['test_3']
front_end_db.put_attachment(
    doc,
    in_json,
    filename='out_data.json',
    content_type="application/json",
)

### Analysis of tweet data

In [None]:
# sent_scores

In [None]:
# Build a DataFrame from data_dict: orient='index' turns each top-level key
# (SA4 code) into a row and each inner feature name into a column.
data_df=pd.DataFrame.from_dict(data_dict, orient='index')

In [None]:
# data_df

In [None]:
# Pairwise correlation of the numeric features only. data_df can also hold
# non-numeric columns (the SA4 name, the centroid coordinate list); since
# pandas 2.0 corr() no longer drops those silently and raises instead, so
# select the numeric/bool columns explicitly.
data_df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Min-max scale the sentiment score and store it as a new column
min_max_scaler = preprocessing.MinMaxScaler()
# Double brackets select a one-column frame, i.e. an (n_rows, 1) array,
# which is the 2-D layout the scaler is fed
scaler_data = data_df[['sentiment_score']].values
sentiment_scaled = min_max_scaler.fit_transform(scaler_data)
data_df['sentiment_scaled'] = sentiment_scaled


In [None]:
# Display the full feature table (one row per SA4 code).
data_df

In [None]:
# Convert the frame back to a nested dict: {row index: {column name: value}}.
dict_out = data_df.to_dict('index')

In [None]:
# Inspect the converted dictionary.
dict_out

In [None]:
import seaborn as sns

sns.set_theme(color_codes=True)

# Scatter plot with a fitted regression line: median weekly household income
# on the x axis against sentiment score on the y axis
ax = sns.regplot(
    data=data_df,
    x="equivalised_total_household_income_census_median_weekly",
    y="sentiment_score",
)

#### Process Output json

In [None]:
# Output processed geojson
def merge_data(data, sa4_geo, out_path='output.json'):
    """Merge per-SA4 features into a GeoJSON collection and write it to disk.

    Every entry of ``sa4_geo['features']`` is matched on
    ``properties['SA4_CODE16']``:

    * if the code is a key of ``data``, all of its features are copied into
      the entry's ``properties``;
    * otherwise every feature name that occurs anywhere in ``data`` is set
      to the string ``'No Record'``.

    Args:
        data: mapping of SA4 code -> {feature name: value}. It is only read;
            no keys are added, even when it is a ``defaultdict``.
        sa4_geo: GeoJSON-like dict with a ``'features'`` list. It is modified
            in place.
        out_path: file the merged collection is written to as JSON.

    Returns:
        None.
    """
    # Feature names used across all SA4 records, in first-seen order. They
    # are needed to fill in the regions that have no record at all.
    all_features = list(dict.fromkeys(
        name for record in data.values() for name in record
    ))

    for row in sa4_geo['features']:
        properties = row['properties']
        key = properties['SA4_CODE16']
        if key in data:
            properties.update(data[key])
        else:
            # Never index data[key] here: the key is known to be absent, so a
            # plain dict would raise KeyError and a defaultdict would quietly
            # gain an empty entry.
            for feature in all_features:
                properties[feature] = 'No Record'

    with open(out_path, 'w') as outfile:
        json.dump(sa4_geo, outfile)

In [None]:
# merge_data(data_dict, sa4_geo_file)


In [None]:
# check = json.load(open('output.json'))
# for item in check['features']:
#     if item['properties']['SA4_CODE16'] == '117':
#         print(item)

In [None]:
# Number of SA4 entries currently held in data_dict.
len(data_dict)

In [None]:
# Keys (SA4 codes) currently held in data_dict.
data_dict.keys()

In [None]:
# Inspect the third feature of the SA4 GeoJSON to check its structure.
sa4_geo_file['features'][2]