In [1]:
import numpy as np
import pandas as pd
import folium
import json
import os

In [2]:
# Move up to the parent directory; the later cells read their inputs through
# paths relative to it (e.g. "data/...").
# NOTE: this step is not idempotent -- every re-run of this cell moves one more
# level up, so execute it exactly once per kernel session.
os.chdir("../")
print(os.getcwd())

/Users/charliezhang/Desktop/4dataviz


In [3]:
# Load the three inputs; every path is relative to the current working directory.
geodata = pd.read_json("data/cn_province.json")        # one row per GeoJSON feature
cn_province = pd.read_csv("data/province_cn.csv")      # per-province lookup (name, GB code, lat/lng)
survey = pd.read_stata("data/zuobiao_raw_frame.dta")   # survey responses

In [4]:
# Pull each feature's province name (its "NAME_1" property) out of the nested
# feature dict and store it as a plain column next to the raw features.
province = [
    feature["properties"]["NAME_1"] for feature in geodata["features"]
]
geodata["province"] = province

geodata.head(5)

Unnamed: 0,type,features,province
0,FeatureCollection,"{'type': 'Feature', 'geometry': {'type': 'Poly...",Anhui
1,FeatureCollection,"{'type': 'Feature', 'geometry': {'type': 'Poly...",Beijing
2,FeatureCollection,"{'type': 'Feature', 'geometry': {'type': 'Poly...",Chongqing
3,FeatureCollection,"{'type': 'Feature', 'geometry': {'type': 'Poly...",Fujian
4,FeatureCollection,"{'type': 'Feature', 'geometry': {'type': 'Poly...",Gansu


In [5]:
# Preview the province lookup table (head() shows the first 5 rows by default).
cn_province.head()

Unnamed: 0,city,lat,lng,country,admin_name,GB
0,Hefei,31.8639,117.2808,China,Anhui,31
1,Beijing,39.905,116.3914,China,Beijing,11
2,Chongqing,29.5628,106.5528,China,Chongqing,50
3,Fuzhou,26.0769,119.2917,China,Fujian,35
4,Lanzhou,36.0617,103.8318,China,Gansu,62


## Data Wrangling

In [6]:
# Drop overseas respondents (keep overseas == 0) and renumber the rows.
# reset_index(drop=True) discards the old index directly instead of first
# materialising it as an "index" column and then dropping that column.
survey = survey[survey.overseas == 0].reset_index(drop=True)

# Attach the province record to every respondent via the GB province code.
# This is pandas' default inner join: respondents whose `provgb` has no match
# in `cn_province` are dropped from `data`.
data = pd.merge(survey, cn_province, left_on="provgb", right_on="GB")

In [7]:
# Preview the merged respondent/province frame (head() shows the first 5 rows by default).
data.head()

Unnamed: 0,id1,id2,ipwgt,year,month,day,provgb,overseas,gender,birthyear,age,educ,income,city,lat,lng,country,admin_name,GB
0,1,2012_1,1.0,2012,2.0,12.0,37.0,0.0,1.0,1989.0,23.0,3.0,1.5,Jinan,36.6667,116.9833,China,Shandong,37
1,2,2012_10,0.5,2012,2.0,12.0,37.0,0.0,0.0,1982.0,30.0,3.0,3.5,Jinan,36.6667,116.9833,China,Shandong,37
2,18,2012_100010,1.0,2012,7.0,24.0,37.0,0.0,1.0,1996.0,16.0,2.0,1.5,Jinan,36.6667,116.9833,China,Shandong,37
3,37,2012_100028,1.0,2012,7.0,24.0,37.0,0.0,0.0,1996.0,16.0,2.0,1.5,Jinan,36.6667,116.9833,China,Shandong,37
4,44,2012_100034,1.0,2012,7.0,24.0,37.0,0.0,1.0,1994.0,18.0,2.0,3.5,Jinan,36.6667,116.9833,China,Shandong,37


In [8]:
# Create a dummy variable for having a college degree (educ >= 3).
# Respondents with a missing `educ` stay NaN so that the mean below skips them,
# exactly as it skips missing age/gender values, instead of silently counting
# them as "no college degree".
has_educ = data["educ"].notna()
data["college"] = data["educ"].ge(3).astype(float).where(has_educ)

# Aggregate per province (GB code) in a single pass:
#   education -> share of respondents with a college degree
#   gender    -> mean of the gender indicator
#   age       -> mean age
#   count     -> number of responses (non-null id2)
d = (
    data.groupby("GB", as_index=False)
    .agg(
        education=("college", "mean"),
        gender=("gender", "mean"),
        age=("age", "mean"),
        count=("id2", "count"),
    )
    # Attach coordinates and the province name. Columns are selected and
    # renamed by name, so the result does not depend on merge column order.
    .merge(cn_province[["GB", "lat", "lng", "admin_name"]], on="GB")
    .rename(columns={"admin_name": "province"})
)

# Display the aggregated data
d.head(5)

Unnamed: 0,GB,education,gender,age,count,lat,lng,province
0,11,0.878846,0.69907,24.867495,97677,39.905,116.3914,Beijing
1,12,0.849381,0.727331,25.359186,10988,39.1467,117.2056,Tianjin
2,13,0.802932,0.776699,26.148926,9687,38.0422,114.5086,Hebei
3,14,0.82917,0.752101,25.916531,9167,37.8733,112.5425,Shanxi
4,15,0.797017,0.790396,25.988947,1542,40.8151,111.6629,Nei Mongolia


## Drawing Map

In [10]:
def _popup_html(row):
    """Build the popup HTML for one province from a row of ``d``.

    ``row`` must provide ``province``, ``count``, ``age``, ``gender`` and
    ``education``. ``gender`` and ``education`` are per-province means that
    are multiplied by 100 and rounded to one decimal for display.
    """
    # NOTE(review): the "Male" label assumes gender is coded 1 = male -- confirm
    # against the survey codebook.
    return (
        "<p style='color:blue;font-family:verdana;font-weight:bold'>"
        + str(row["province"]) + " Respondent Statistics" + "</p>"
        + "<ul style='font-family:verdana'>"
        + "<li> Number of Responses: "
        + str(row["count"]) + "</li>"
        + "<li> Average Age of Respondents: "
        + str(np.round(row["age"], 1)) + "</li>"
        + "<li> Percentage of Male Respondents: "
        + str(np.round(row["gender"] * 100, 1)) + "</li>"
        + "<li> Percentage of College Degree or Above: "
        + str(np.round(row["education"] * 100, 1)) + "</li> </ul>"
    )


def _boundary_style(_feature):
    """Style for province outlines: thin black line, no fill."""
    return {"color": "#000000", "weight": 1, "fillOpacity": 0}


# Initiate the map, centred on the mean of the province coordinates
sv_map = folium.Map(
    location=[d.lat.mean(), d.lng.mean()], tiles="cartodbpositron", zoom_start=4)

# Create a feature group that holds both the outlines and the circles, so the
# layer control can toggle them together
sv = folium.map.FeatureGroup(name="Survey Statistics")

# Draw boundaries: one GeoJson outline per feature geometry
for segment in geodata['features']:
    folium.GeoJson(data=segment['geometry'],
                   style_function=_boundary_style).add_to(sv)

# Draw one circle per province representing the respondent sizes
for _, row in d.iterrows():

    # Set up popup window
    # NOTE(review): the iframe is 440px wide but the popup max_width is 400 --
    # confirm the popup content is not clipped.
    iframe = folium.IFrame(_popup_html(row), width=440, height=160)
    popup = folium.Popup(iframe, max_width=400)

    folium.CircleMarker(
        location=[row['lat'], row['lng']],
        popup=popup,
        # Square root keeps the circle *area* proportional to the response count
        radius=np.sqrt(row["count"]) / 5,
        color='green',
        # `opacity` is the stroke-transparency option for vector layers;
        # the previous `alpha` keyword is not one and had no effect.
        opacity=0.4,
        fill=True,
        fill_color="green",
        fill_opacity=0.4).add_to(sv)

# Attach the feature group to the map once, after it has been populated
sv_map.add_child(sv)

folium.LayerControl().add_to(sv_map)

<folium.map.LayerControl at 0x7fdb837b5850>

In [12]:
# Write the map to an HTML file; the path is relative to the current working directory.
sv_map.save(outfile="survey_statistics.html")