## Community detection for all metropolitan areas

In [26]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [27]:
import pandas as pd
import numpy as np
import geopandas as gpd
import seaborn as sns
import networkx as nx

import scipy
import csv

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import json
import community as community_louvain
from copy import deepcopy
# from modularity_maximization.utils import get_modularity

from itertools import product
import networkx.algorithms.community as nx_comm
from scipy.spatial.distance import pdist, squareform

import math
from time import time

%matplotlib inline

In [28]:
from oct2py import octave
#octave.addpath('/home/ubuntu/GenLouvain/')
#octave.addpath('/home/ubuntu/GenLouvain/private/')
_ = octave.addpath('/home/barcsab/projects/urban_communities/scripts')
_ = octave.addpath('/home/ubuntu/GenLouvain/')
_ = octave.addpath('/home/ubuntu/GenLouvain/private/')



### data

In [29]:
# three networks - data IN
mobility = pd.read_csv("../data/usageousers_city_mobility_CT_networks.rpt.gz") ## basis of position and node importance calculations
follow_hh = pd.read_csv("../data/usageousers_city_follower_CT_HH_networks.rpt.gz")
follow_hh = follow_hh.rename(columns={"tract_home.1": "tract_home_1"})

# census tract name -> cbsacode
cbsacode = pd.read_csv("../data/cbsacode_shortname_tracts.csv",sep=";", index_col=0)
cbsacode['clean_name'] = cbsacode["short_name"].map(lambda s: s.split("/")[0].replace(' ','_').replace('.','').lower())

# census data
census = pd.read_csv("../data/censusdata_top50_2012.csv")
census_2 = pd.read_csv("../data/censusdata_top50_2017.csv")

# reading geojson data, converting it to geopandas dataframe
tract_geoms = gpd.GeoDataFrame.from_features(
    [json.loads(e.strip('\n')) for e in open('../data/censustract_geoms_top50.geojson').readlines()]
)

# Cartesian coordinate projection of tract centroids
tract_geoms['centroid'] = tract_geoms['geometry'].centroid
tract_center_dict = tract_geoms\
    .set_geometry('centroid',crs={'init':'epsg:4326'})\
    .to_crs({'init':'epsg:3785'})\
    .set_index('full_geoid')['centroid'].map(lambda p: p.coords[0]).to_dict()

  return _prepare_from_string(" ".join(pjargs))


In [40]:
def create_graphs(city, g_type):
    """
    For a given city name, it generates a mobility and follower (home-home) graph.
    
    e.g. g_mob, g_fol_hh = create_graphs("Boston")
    
    It uses the previously loaded `mobility` and `follow_hh` pandas.DataFrames, in which
    the edges are listed for every city.
    
    Parameters:
    -----------
    city : str
        name of the city, see cbsacode dataframe -> clean_name
        
    g_type : str
        either "mob" as mobility or "fol_hh" as follow_hh 
        selects the type of graph to return
        
    Returns:
    --------
    
    g : networkx.Graph
        weighted undirected graph based on city name and g_type (e.g. follow_hh graph of Boston)
        
    """
    # city cbsacode based on name
    city_code = cbsacode[cbsacode.clean_name == city].iloc[0].cbsacode
    
    # select graph type
    if g_type == "mob":
        # filtering large dataframes for the given city code
        mob_df = mobility[(mobility["cbsacode"] == city_code)&(mobility["tract_home"]!=mobility["tract_work"])]

        # create graphs
        # create empty graphs
        g_mob = nx.DiGraph() # mobility graph - weights are counts

        # fill in the networks with data
        mob_df['w_edges'] = list(zip(mob_df.tract_home,mob_df.tract_work,mob_df.cnt))
        g_mob.add_weighted_edges_from(mob_df["w_edges"], weight='cnt')

        # ineffective and slow!
        for e in g_mob.edges():
            r = (e[1],e[0])

            if r in g_mob.edges():
                c1 = g_mob.edges[e]['cnt']
                c2 = g_mob.edges[r]['cnt']

                g_mob.edges[e]['cnt'] = c1 + c2
                g_mob.edges[r]['cnt'] = c1 + c2

        # then let's convert the mobility graph to udirected
        g_mob = g_mob.to_undirected()

        g = g_mob
        
    elif g_type == "fol_hh":            
        # filtering large dataframes for the given city code
        fol_hh_df = follow_hh[(follow_hh["cbsacode"] == city_code)&(follow_hh["tract_home"]!=follow_hh["tract_home_1"])]

        # create graphs
        # create empty graphs
        g_fol_hh = nx.Graph() # follow home-home graph - weights are counts

        # this is an undirected graph already in the dataframe
        fol_hh_df['w_edges'] = list(zip(fol_hh_df.tract_home,fol_hh_df.tract_home_1,fol_hh_df.cnt))
        g_fol_hh.add_weighted_edges_from(fol_hh_df["w_edges"], weight='cnt')
        
        g = g_fol_hh
        
    # TODO --> DONE
    # check data - if all nodes of the graph are in the tract_geom dataframe
    # e.g. in create_graphs()
    # if someone's not there, that is data error, print the tract_id, and leave the node out of the graph G
    # only after this should we calculate the Expert input data    
    while not set(g.nodes).issubset(set(tract_geoms.full_geoid)): # KERDES: ezt hogyan ellenőrizzem le?
        print('DATA ERROR. Node do(es) not have corresponding geodata, so dropped.')
        print('Dropped node(s):')
        nodes_to_drop = set(g.nodes).difference(set(tract_geoms.full_geoid))
        g.remove_nodes_from(nodes_to_drop)
    return g

In [31]:
def SpaMod(A,D,N,binnumber): # binnumber instead of b = binsize
    """
    Function that calculates the matrix for the clustering 
    based on spatial null model a la Expert.
    
    Parameters:
    -----------
    
    A : scipy.sparse.csr.csr_matrix
        adjacency matrix
    D : numpy.ndarray
        Distance matrix between the nodes
    N : numpy.matrix
        a measure of the importance of a node
        the number of users living(home-location) in the given tract
    binnumber : int
        number of distance bins (used in the estimation of the deterrence function)
    Returns:
    --------
    
    KERDES - ellenorizni
    ModularitySpa : 
    ModularityGN :
    """
    
    tic = time()
    
    print("Beginning of modularity function...");   
    # felesleges?? KERDES -- symmetrised matrix (doesn't change the outcome of community detection (arXiv:0812.1770))
    A = A + A.T ### KERDES KELL-e?? TODO ATGONDOLNI? ILLETVE LE KELL-e osztani 2-vel   / 2     
    b = D.max()/(binnumber-1) # MODIFIED
    
    # deterrence function
    det, detbins = np.histogram(
        D.flatten(),
        range = (0, np.ceil(D.max()/b)*b), # JAVITAS
        weights = np.array(A.todense()).flatten(), 
        bins=int(np.ceil(D.max()/b))
    )
    normadet, _ = np.histogram(
        D.flatten(), 
        range = (0, np.ceil(D.max()/b)*b),
        weights = np.array(N*N.T).flatten(), 
        bins=int(np.ceil(D.max()/b))
    )
    det = det / normadet
    det[np.isnan(det)] = 0
    
    toc = time()
    print("Done.","%.2f" % (toc-tic))
    
    #tic = toc
    
    print("Null modell...")
    
    # copmutation of the randomised correlations (preserving space), spatial
    # null-model
    nullmodelSpa = det[np.digitize(D,detbins,right=True)-1]
    
    toc = time()
    print("Done.","%.2f" % (toc-tic))
    
    #tic = toc
    
    print("Modularity calc...")
    
    # the modularity matrix for the spatial null-model
    ModularitySpa=A-np.multiply(N*N.T, nullmodelSpa*A.sum())/(np.multiply(N*N.T,nullmodelSpa).sum())
    szamlalo = np.multiply(N*N.T, nullmodelSpa*A.sum())
    nevezo = np.multiply(N*N.T,nullmodelSpa).sum()
    
    # the modularity matrix for the GN null-model
    degree = degree = A.sum(axis=0) # JAVITVA np.squeeze(np.asarray(A.sum(axis=0))) # degree or strength of nodes || asarry for further usage
    nullmodelGN = degree.T*degree/degree.sum() # Newman-Girvan null-model
    ModularityGN = A - nullmodelGN
    
    toc = time()
    print("Done.","%.2f" % (toc-tic))
    
    return ModularitySpa, ModularityGN

In [32]:
# CONSENSUS CLUSTERING
def consen(city, algorithm_type, g_type):
    """
    Function that does the consensus clustering based on the results
    of multiple runs of previous algorithms.
    
    Parameters:
    -----------
    
    city : str
        cityname to runt he consensus clustering for (see cbsacode.clean_name)
    algorithm_type: str
        either "ms" or "mgn" 
        selects the clustering algoritm type: spatail (a la Expert) or ordinary Louvain clustering with Girvan-Newman
    g_type : str
        either "mobility" or "follow_hh"
        selects the type of graph
        
    Returns:
    --------
    
    s_louv : dict
        tract_geoid -> partition label (int)
    """
    
    tic = time()

    print("Reading in necessary data...")
    csv = '../data/consensus_' + city + '_' + algorithm_type + '_' + g_type + '.csv'

    # results of multiple iterations from previous runs
    iters = pd.read_csv(csv)
    iters = iters.set_index('geoid')
    iters['clusts'] = [np.array(l) for l in iters.values.tolist()]

    toc = time()
    print("Done.","%.2f" % (toc-tic))
    tic = toc

    print("Creating all possible node pairs...")
    # create all possible node pairs
    geoid_pairs = list(product(list(iters.index), list(iters.index)))
    consen_df = pd.DataFrame(geoid_pairs, columns=['geoid_1','geoid_2'])

    # remove selfloops
    consen_df = consen_df[consen_df.geoid_1!=consen_df.geoid_2]
    toc = time()
    print("Done.","%.2f" % (toc-tic))
    tic = toc

    print("Joining interation results to node pairs...")
    # joining iteration results as lists to both elements of the tract pair
    consen_df = pd.merge(consen_df, iters['clusts'], left_on = 'geoid_1', right_on = 'geoid')
    consen_df = pd.merge(consen_df, iters['clusts'], left_on = 'geoid_2', right_on = 'geoid')
    consen_df = consen_df.rename(columns = {'clusts_x': 'clusts_1', 'clusts_y': 'clusts_2'})
    toc = time()
    print("Done.","%.2f" % (toc-tic))
    tic = toc

    print("Counting same partitioning for node pairs...")
    # how many times are the two tracts (geoid_1 and geoid_2) clustered to the same community?
    # --> weights of a graph on which clustering gives the consensus clustering
    print("Calculating difference...")
    diff = np.array(consen_df['clusts_2'].tolist()) == np.array(consen_df['clusts_1'].tolist())
    del consen_df['clusts_1'], consen_df['clusts_2']
    toc = time()
    print("Done.","%.2f" % (toc-tic))
    tic = toc

    print("Getting number of zero differences...") 
    consen_df['w'] = diff.sum(axis=1)
    del diff
    consen_df = deepcopy(consen_df[consen_df['w']!=0])
    toc = time()
    print("Done.","%.2f" % (toc-tic))
    tic = toc

    print("Last Louvain...")
    # graph for consensus clustering
    print("Creating graph...")
    g_cons = nx.Graph() 
    g_cons.add_weighted_edges_from(consen_df[['geoid_1','geoid_2','w']].values, weight='w')
    toc = time()
    print("Done.","%.2f" % (toc-tic))
    tic = toc

    del consen_df, iters

    print("Running Louvain...")
    # Louvain community detection 
    s_louv = community_louvain.best_partition(g_cons, weight='w')
    toc = time()
    print("Done.","%.2f" % (toc-tic))
    tic = toc

    return s_louv

In [33]:
city_l = cbsacode.clean_name.unique()

In [None]:
# ITT TARTOK 0325
# unique lists for city names
city_l = cbsacode.clean_name.unique()

tract_outdeg_mob = mobility.groupby('tract_home')[['cnt']].sum()

for city in city_l:
    for g_type in ['mob','fol_hh']:
        G = create_graphs(city, g_type) # corresponding weighted undirected graph
        
        
        # TODO --> DONE
        # index conversion dicts
        # for i,node_id in enumerate(G.nodes()):
        # for elem in enumerate(["alma","korte"]):
        #     print(elem)
        # geoid -> integer 0-... N-1
        # az elozo dict megforditottja
        # int -> geoid
        index_geoid_dict = dict(list(enumerate(G.nodes)))
        geoid_index_dict = dict(zip(list(index_geoid_dict.values()), list(index_geoid_dict.keys())))

        
        # Dataprep for Expert algorithm
        A = nx.adjacency_matrix(G)
        coords = np.array([tract_center_dict[n] for n in G.nodes()])
        d = pdist(coords)
        D = squareform(pdist(coords))
        
        # importance - number of user home in each tract
        # TODO we should check if all nodes in the follow_hh graph have an importance!
        # otherwise, the N... line is going to throw an error
        if not set(G.nodes).issubset(set(tract_outdeg_mob.reset_index().tract_home)): # test if the node is in any city KERDES : adott városra teszteljem?
            print('Error. Node(s) without importance value(s) They are dropped.') ## --> DONE
            missing_nodes = list(set(G.nodes)-set(tract_outdeg_mob.reset_index().tract_home))
            for node in missing_nodes:
                ## KERDES - ezt ki is dobjam??
                G.remove_node(node)    
        N = np.matrix([tract_outdeg_mob.loc[k].iloc[0] for k in G.nodes()]).T
        
        
        # Calculate clusterings for the given graph and write the outcome of runs to csvs
        S_ms_df = pd.DataFrame()
        S_mgn_df = pd.DataFrame()
        for _ in range(10):
            # TODO Eszter!!!! sometimes it gives an error in the first line
            # new thing: nout = 3
            Ms,Mgn = SpaMod(A,D,N,200) ### KERDES what should be the number of bins? 100?
            S_ms,Q_ms,n_it_ms = octave.iterated_genlouvain(Ms, nout=3)
            S_ms_df[len(S_ms_df.columns)] = S_ms.T[0]
            S_mgn,Q_mgn,n_it_mgn = octave.iterated_genlouvain(Mgn, nout=3)
            S_mgn_df[len(S_mgn_df.columns)] = S_mgn.T[0]

            # TODO itt egy lepesben meg lehet cisnalni mindket algorithm_type-ot!!! --> DONE
            for (algorithm_type, df) in [('ms',S_ms_df),('mgn',S_mgn_df)]:
                df['geoid'] = df.index.map(index_geoid_dict)
                # TODO --> DONE  S_df["geoid"] = S_df.index.map(a_masodik_dicted) -- kérdés: ez mit tud, amit az alatta levő sor nem?
                #S_df['geoid'] = list(G.nodes()) ## KERDES JO??? - szerintem igen (Eszter)
                df = df.set_index('geoid')
                csv_name = 'consensus_' + city + '_' + algorithm_type + '_' + g_type + '.csv'
                df.to_csv('../data/'+ csv_name)
                # TODO --> DONE mindket algorithm type-ra kimenteni a csv-t

In [34]:
from itertools import product

In [35]:
# unique lists for city names
city_l = cbsacode.clean_name.unique()

In [77]:
all_combs = product(city_l, ['mob','fol_hh'], ['ms','mgn'])
city, g_type, algorithm_type = list(all_combs)[0]
city, g_type, algorithm_type

('new_york', 'mob', 'ms')

In [240]:
len(city_l)
c_l = city_l[:2]

In [241]:
c_l

array(['new_york', 'los_angeles'], dtype=object)

In [244]:
S_cons = consen('new_york', 'ms', 'mob')

Reading in necessary data...
Done. 0.03
Creating all possible node pairs...
Done. 9.29
Joining interation results to node pairs...
Done. 18.46
Counting same partitioning for node pairs...
Calculating difference...
Done. 19.98
Getting number of zero differences...
Done. 1.77
Last Louvain...
Creating graph...
Done. 25.86
Running Louvain...
Done. 93.89


In [285]:
city_l

array(['new_york', 'los_angeles', 'riverside', 'san_diego', 'chicago',
       'dallas', 'philadelphia', 'houston', 'washington', 'miami',
       'atlanta', 'boston', 'providence', 'san_francisco', 'san_jose',
       'detroit', 'phoenix', 'seattle', 'minneapolis', 'st_louis',
       'tampa', 'baltimore', 'denver', 'pittsburgh', 'portland',
       'charlotte', 'sacramento', 'san_antonio', 'orlando', 'cincinnati',
       'cleveland', 'kansas_city', 'las_vegas', 'columbus',
       'indianapolis', 'austin', 'virginia_beach', 'nashville',
       'milwaukee', 'jacksonville', 'memphis', 'oklahoma_city',
       'louisville', 'hartford', 'richmond', 'new_orleans', 'buffalo',
       'raleigh', 'birmingham', 'salt_lake_city'], dtype=object)

In [286]:
city = 'jacksonville'
csv = '../data/consensus_' + city + '_' + algorithm_type + '_' + g_type + '.csv'
iters = pd.read_csv(csv)

In [287]:
iters.geoid[11]

'14000US36119006801'

In [288]:
cbsacode[cbsacode['geoid'] == iters.geoid[11]]

Unnamed: 0,geoid,cbsacode,short_name,clean_name
4449,14000US36119006801,35620.0,New York,new_york


In [274]:
city

'new_york'

In [248]:
S_cons_la = consen('los_angeles', 'ms', 'mob')

Reading in necessary data...
Done. 0.03
Creating all possible node pairs...
Done. 3.32
Joining interation results to node pairs...
Done. 6.81
Counting same partitioning for node pairs...
Calculating difference...
Done. 7.64
Getting number of zero differences...
Done. 0.98
Last Louvain...
Creating graph...
Done. 16.51
Running Louvain...
Done. 84.22


In [250]:
S_cons_la

{'14000US36005021800': 0,
 '14000US36005026500': 0,
 '14000US36047016000': 0,
 '14000US34035050500': 0,
 '14000US36103158512': 0,
 '14000US36081126700': 0,
 '14000US36047039100': 0,
 '14000US36061004100': 1,
 '14000US36119002202': 2,
 '14000US36119006801': 2,
 '14000US34013021802': 2,
 '14000US34027040805': 2,
 '14000US36061021000': 2,
 '14000US36061013100': 2,
 '14000US34003024200': 1,
 '14000US34003014000': 1,
 '14000US34029720203': 0,
 '14000US34025808701': 0,
 '14000US36059520402': 0,
 '14000US36103123201': 0,
 '14000US36103134904': 1,
 '14000US36103110501': 1,
 '14000US36059405900': 1,
 '14000US36081155102': 1,
 '14000US34025802400': 2,
 '14000US34013019200': 2,
 '14000US34025810003': 0,
 '14000US36061011300': 0,
 '14000US36085022600': 0,
 '14000US36085002001': 0,
 '14000US36081137700': 0,
 '14000US36061003800': 0,
 '14000US36061013500': 0,
 '14000US36061009900': 0,
 '14000US36103158007': 1,
 '14000US36047051001': 0,
 '14000US34035050903': 0,
 '14000US34035050802': 0,
 '14000US340

In [290]:
cbsacode[cbsacode['geoid'] == '14000US36119014607']

Unnamed: 0,geoid,cbsacode,short_name,clean_name
4572,14000US36119014607,35620.0,New York,new_york


['14000US36005021800',
 '14000US36005026500',
 '14000US36047054400',
 '14000US36119002202',
 '14000US36119006801',
 '14000US36005036100',
 '14000US36061021000',
 '14000US36061013100',
 '14000US36061011300',
 '14000US36061013500',
 '14000US36005007800',
 '14000US36061002100',
 '14000US36047051001',
 '14000US36119009701',
 '14000US36119008800',
 '14000US36061024700',
 '14000US36061005200',
 '14000US36061010200',
 '14000US36071015100',
 '14000US36047087800',
 '14000US36047061002',
 '14000US36061012500',
 '14000US36081000100',
 '14000US36119012102',
 '14000US36027060400',
 '14000US36027220300',
 '14000US36061009300',
 '14000US36061018200',
 '14000US36061007200',
 '14000US36061001502',
 '14000US36005009800',
 '14000US36061017200',
 '14000US36071013500',
 '14000US36119009400',
 '14000US36061001300',
 '14000US36061011600',
 '14000US36061009100',
 '14000US36061009600',
 '14000US36061020800',
 '14000US36005004100',
 '14000US36061007600',
 '14000US34013005000',
 '14000US34013003500',
 '14000US36

In [243]:
cbsacode[cbsacode['geoid'] == '14000US36081011200']

Unnamed: 0,geoid,cbsacode,short_name,clean_name
3316,14000US36081011200,35620.0,New York,new_york


In [242]:
# ==============================================================
# calculating consensus and putting together the overall dataset
all_consensus_df = pd.DataFrame()
for city in c_l:
    
    for g_type in ['mob','fol_hh']:
        
        for algorithm_type in ['ms','mgn']:
            
            # storing iteration results, empty dataframe for nodes
            consensus_df = pd.DataFrame()
            # TODO atirni az uj fuggveny szerint
            # de az uj fuggveny meg nem eleg gyors
            ## eddig ment HIBA, KÉRDÉS
            S_cons = consen(city, algorithm_type, g_type)
            consensus_df['S_cons'] = S_cons.values()
            consensus_df['city'] = city
            consensus_df['algorithm_type'] = algorithm_type
            consensus_df['g_type'] = g_type
            consensus_df['geoid'] = S_cons.keys()
            consensus_df = consensus_df.set_index('geoid')
            
            # calculating modularity
            G = create_graphs(city, g_type)
            mod = community.modularity(S_cons,G)
            
            #print('city')

            #csv_name = 'consensus_2_' + city + '_' + algorithm_type + '_' + g_type + '.csv'
            #consensus_df.to_csv('../data/'+ csv_name)
            all_consensus_df = pd.concat([all_consensus_df, consensus_df])

#all_consensus_df.to_csv('../data/all_cons.csv')

Reading in necessary data...
Done. 0.03
Creating all possible node pairs...
Done. 9.18
Joining interation results to node pairs...
Done. 18.36
Counting same partitioning for node pairs...
Calculating difference...
Done. 19.58
Getting number of zero differences...
Done. 1.71
Last Louvain...
Creating graph...
Done. 26.42
Running Louvain...
Done. 94.03


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Reading in necessary data...
Done. 0.02
Creating all possible node pairs...
Done. 8.45
Joining interation results to node pairs...
Done. 17.34
Counting same partitioning for node pairs...
Calculating difference...
Done. 18.65
Getting number of zero differences...
Done. 1.56
Last Louvain...
Creating graph...
Done. 18.81
Running Louvain...
Done. 61.75
Reading in necessary data...
Done. 0.02
Creating all possible node pairs...
Done. 7.72
Joining interation results to node pairs...
Done. 16.52
Counting same partitioning for node pairs...
Calculating difference...
Done. 17.51
Getting number of zero differences...
Done. 1.34
Last Louvain...
Creating graph...
Done. 15.66
Running Louvain...
Done. 81.68


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


KeyError: '14000US36081011200'

In [None]:
def modularity(row):
    city = row['city']
    algorithm_type = row['algorithm_type']
    g_tpye = row['g_type']
    G = create_graphs(city, g_type)
    df = deepcopy(con_df[(con_df['city'] == city) & (con_df['algorithm_type'] == algorithm_type) & (con_df['g_type'] == g_type)])
    part = pd.Series(df.S_cons.values,index=df.geoid).to_dict()
    mod = community.modularity(part,G)
    return mod

# Adding data to overall table

In [37]:
tract_sum = cbsacode.groupby('clean_name')[['clean_name']].count() # number of tracts per city
tract_sum = tract_sum.rename(columns={'clean_name': 'sum_tracts'})

In [68]:
all_graphs_df = pd.DataFrame(columns=['clean_name','g_type','tracts_in_network'])
all_graphs_df.head()

Unnamed: 0,clean_name,g_type,tracts_in_network


In [69]:
graph_combs = product(city_l, ['mob','fol_hh'])

for city, g_type in graph_combs:
    G = create_graphs(city, g_type)
    all_graphs_df = all_graphs_df.append({'clean_name': city, 'g_type' : g_type, 'tracts_in_network' : len(G.nodes)}, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [71]:
tract_no_df = pd.merge(all_graphs_df, tract_sum, how = 'left', left_on = 'clean_name', right_index = True)

In [94]:
tract_no_df.head()

Unnamed: 0,clean_name,g_type,tracts_in_network,sum_tracts
0,new_york,mob,4603,4609
1,new_york,fol_hh,4389,4609
2,los_angeles,mob,2895,2890
3,los_angeles,fol_hh,2842,2890
4,riverside,mob,817,819


In [112]:
type(all_combs)

itertools.product

In [111]:
pd.DataFrame(all_combs)

In [120]:
all_combs = product(city_l, ['mob','fol_hh'], ['ms','mgn'])
city, g_type, algorithm_type = list(all_combs)[0]
city, g_type, algorithm_type

('new_york', 'mob', 'ms')

### Modularity calculation

In [132]:
import community

In [234]:
con_df = pd.read_csv('../data/all_cons.csv')

In [177]:
def modularity(row):
    city = row['city']
    algorithm_type = row['algorithm_type']
    g_tpye = row['g_type']
    G = create_graphs(city, g_type)
    df = deepcopy(con_df[(con_df['city'] == city) & (con_df['algorithm_type'] == algorithm_type) & (con_df['g_type'] == g_type)])
    part = pd.Series(df.S_cons.values,index=df.geoid).to_dict()
    mod = community.modularity(part,G)
    return mod

In [184]:
all_combs = product(city_l, ['ms','mgn'], ['mob','fol_hh']) ####KERDES MIERT KELL EGY CELLÁBA RAKNI AZ ALL_COMBS LEGYÁRTÁST, A MOD_DF LÉTREHOZÁSSAL??
mod_df = pd.DataFrame(list(all_combs))
mod_df = mod_df.rename(columns = {0 : 'city', 1 : 'algorithm_type', 2 : 'g_type'})

In [194]:
G = create_graphs('los_angeles', 'mob')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [195]:
df = deepcopy(con_df[(con_df['city'] == 'los_angeles') & (con_df['algorithm_type'] == 'ms') & (con_df['g_type'] == 'mob')])

In [196]:
part = pd.Series(df.S_cons.values,index=df.geoid).to_dict()

In [197]:
mod = community.modularity(part,G)

KeyError: '14000US06037107020'

In [235]:
a=set(con_df['geoid']) - set(cbsacode[cbsacode['clean_name']=='new_york'].geoid)

In [237]:
con_df[con_df['geoid'] == '14000US36005016300']

Unnamed: 0,geoid,S_cons,city,algorithm_type,g_type
914,14000US36005016300,1,new_york,ms,mob
8687,14000US36005016300,1,new_york,mgn,mob
12227,14000US36005016300,3,new_york,ms,fol_hh
17889,14000US36005016300,5,new_york,mgn,fol_hh


In [236]:
con_df[con_df['geoid'] == '14000US36005016300']

{'14000US36005016300',
 '14000US36005017100',
 '14000US36081038302',
 '14000US36081064102',
 '14000US36085022800'}

In [233]:
cbsacode[cbsacode.geoid=='14000US36005017100']

Unnamed: 0,geoid,cbsacode,short_name,clean_name


In [212]:
set(con_df.geoid)-

{'14000US36005032600',
 '14000US36059405100',
 '14000US36081041500',
 '14000US36059409200',
 '14000US34035053803',
 '14000US36047008800',
 '14000US34025808402',
 '14000US36119002600',
 '14000US36119001403',
 '14000US36081007900',
 '14000US36059413700',
 '14000US34023005800',
 '14000US36047029400',
 '14000US36005039800',
 '14000US36103159508',
 '14000US36119000600',
 '14000US36047112800',
 '14000US36061013100',
 '14000US36081157901',
 '14000US36085015601',
 '14000US36027070201',
 '14000US36081053900',
 '14000US36047039800',
 '14000US36081094203',
 '14000US36005011900',
 '14000US36047016000',
 '14000US36059517600',
 '14000US36081035300',
 '14000US36081018000',
 '14000US34017006700',
 '14000US36005026500',
 '14000US34031203600',
 '14000US34029723000',
 '14000US36047066200',
 '14000US36061027900',
 '14000US34013016400',
 '14000US36005012300',
 '14000US34029731101',
 '14000US36059416402',
 '14000US36081012400',
 '14000US34031163500',
 '14000US36047119800',
 '14000US36103110501',
 '14000US34

In [192]:
mod_df.loc[4]

city              los_angeles
algorithm_type             ms
g_type                    mob
Name: 4, dtype: object

In [193]:
modularity(mod_df.loc[4])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


KeyError: '14000US06037107020'

In [186]:
mod_df['modularity'] = mod_df.apply(lambda row: modularity(row), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


KeyError: ('14000US06037107020', 'occurred at index 4')

In [159]:
modularity(mod_df.loc[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


KeyError: '14000US36005021800'

# depreciated

In [128]:
con_df

Unnamed: 0,geoid,S_cons,city,algorithm_type,g_type
0,14000US36005021800,0,new_york,ms,mob
1,14000US36005026500,0,new_york,ms,mob
2,14000US36047054400,1,new_york,ms,mob
3,14000US36119002202,0,new_york,ms,mob
4,14000US36119006801,0,new_york,ms,mob
5,14000US36005036100,2,new_york,ms,mob
6,14000US36061021000,0,new_york,ms,mob
7,14000US36061013100,1,new_york,ms,mob
8,14000US36061011300,1,new_york,ms,mob
9,14000US36061013500,0,new_york,ms,mob


In [None]:
for city in city_l:
    g_mob, g_fol_hh = create_graphs(city, mobility, follow_hh)
    for (G, g_type) in [(g_mob, 'mob'), (g_fol_hh, 'fol_hh')]:
        consen_df = pd.DataFrame()
        consen_df['geoid'] = G.nodes()
        S_df = pd.DataFrame()
        A = nx.adjacency_matrix(G)
        coords = np.array([tract_center_dict[n] for n in G.nodes()]) ## KERDES ez jó így? tract_center_dict
        d = pdist(coords)
        D = squareform(pdist(coords))
        #### KERDES -  jo, ha az importance mindig a mobilityből van számolva??????
        # importance - number of users
        tract_outdeg_mob = mobility.groupby('tract_home')[['cnt']].sum()
        N = np.matrix([tract_outdeg_mob.loc[k].iloc[0] for k in G.nodes()]).T
        # binsize
        max_dist = np.amax(D)
        b = max_dist/99 # number of bins = 100      
        for _ in range(3):
            print(_)
            Ms,Mgn,Dfn = octave.ModularitySpaGN(A,D,N,b, nout = 3)
            S,Q,n_it = octave.iterated_genlouvain(Ms, nout=3)
            S_df[len(S_df.columns)] = S.T[0]
        S_df['geoid'] = list(G.nodes()) ## KERDES JO???
        S_df = S_df.set_index('geoid')
        csv_name = 'consensus' + city + '_' + g_type + '.csv'
        S_df.to_csv('../data/'+ csv_name)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


0


In [144]:
mobility.loc[0]

cbsacode                   14460
tract_home    14000US25017353300
tract_work    14000US25025070200
cnt                            2
Name: 0, dtype: object

In [141]:
mod_df[0]

KeyError: 0

In [None]:
        S_cons = consen(csv,G) ## eddig ment HIBA, KÉRDÉS
        consen_df['S_cons'] = S_cons
        consen_df['city'] = city
        consen_df['type'] = g_type
        consen_df = consen_df.set_index('geoid')
        pd.merge(all_consensus_df, consen_df, left_index = True, right_index = True)
all_consensus_df.to_csv('all_cons.csv')

In [None]:
csv_city = str(G) + '_rawclust.csv'+ city
        with open(csv_city, mode='w') as clust_file:
            clust_writer = csv.writer(clust_file, delimiter=',', quotechar='"') ### ez kerdeses!!!!

   

    employee_writer.writerow(['John Smith', 'Accounting', 'November'])

In [102]:
# unique values in the network counted by side of edge

mob_u_h = mobility.tract_home.tolist()
mob_u_w = mobility.tract_work.tolist()
u_geoid_mob = set([*mob_u_h,*mob_u_w])
cbsacode['in_mob'] = cbsacode.geoid.isin(u_geoid_mob).astype(int)

fol_u_h = follow_hh.tract_home.tolist()
fol_u_h1 = follow_hh.tract_home_1.tolist()
u_geoid_fol = set([*fol_u_h,*fol_u_h1])
cbsacode['in_fol'] = cbsacode.geoid.isin(u_geoid_fol).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


KeyboardInterrupt: 

In [113]:
netw_count = cbsacode.groupby('cbsacode')[['in_mob','in_fol']].sum()

In [114]:
city_df = pd.merge(tract_sum, netw_count, left_index=True, right_index=True)

In [None]:
nx_comm.modularity()

G = nx.barbell_graph(3, 0)

nx_comm.modularity(G, [{0, 1, 2}, {3, 4, 5}])
0.35714285714285715

nx_comm.modularity(G, nx_comm.label_propagation_communities(G))
0.35714285714285715


In [114]:
con_df

Unnamed: 0,geoid,S_cons,city,algorithm_type,g_type
0,14000US36005021800,0,new_york,ms,mob
1,14000US36005026500,0,new_york,ms,mob
2,14000US36047054400,1,new_york,ms,mob
3,14000US36119002202,0,new_york,ms,mob
4,14000US36119006801,0,new_york,ms,mob
5,14000US36005036100,2,new_york,ms,mob
6,14000US36061021000,0,new_york,ms,mob
7,14000US36061013100,1,new_york,ms,mob
8,14000US36061011300,1,new_york,ms,mob
9,14000US36061013500,0,new_york,ms,mob


In [12]:
tract = mobility.groupby('tract_home')[['cnt']].sum()

In [13]:
tract

Unnamed: 0_level_0,cnt
tract_home,Unnamed: 1_level_1
14000US01007010001,9
14000US01007010002,14
14000US01007010003,10
14000US01007010004,22
14000US01009050101,21
14000US01009050102,29
14000US01009050200,20
14000US01009050300,12
14000US01009050400,1
14000US01009050500,17


In [92]:
mod

0.42897442877240455

In [109]:
c_df = con_df[:10000]

In [None]:
follow_hh = follow_hh.merge(tract_outdeg_mob, left_on='tract_home', right_index=True)

N = np.matrix([tract_outdeg_mob.loc[k].iloc[0] for k in g_fol_hh.nodes()]).T
# modify N

In [106]:
c_df

Unnamed: 0,geoid,S_cons,city,algorithm_type,g_type,modularity
0,14000US36005021800,0,new_york,ms,mob,0.428974
1,14000US36005026500,0,new_york,ms,mob,0.428974
2,14000US36047054400,1,new_york,ms,mob,0.428974
3,14000US36119002202,0,new_york,ms,mob,0.428974
4,14000US36119006801,0,new_york,ms,mob,0.428974
5,14000US36005036100,2,new_york,ms,mob,0.428974
6,14000US36061021000,0,new_york,ms,mob,0.428974
7,14000US36061013100,1,new_york,ms,mob,0.428974
8,14000US36061011300,1,new_york,ms,mob,0.428974
9,14000US36061013500,0,new_york,ms,mob,0.428974


-----------------------

In [25]:
octave.ModularitySpaGN(A,D,N,b, nout = 3)

Beginning of modularity function...
Preparation done.
Deterrence function...
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151
 152
 153
 154
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164
 165
 166
 167
 168
 169
 170
 171
 172
 173
 174
 175
 176
 177
 178
 179
 180
 181
 182
 183
 184
 185
 186
 187
 188
 189
 190
 191
 192
 193
 194
 195
 196
 197
 198
 199
 200
 201
 202
 203
 204
 205
 206
 

 1539
 1540
 1541
 1542
 1543
 1544
 1545
 1546
 1547
 1548
 1549
 1550
 1551
 1552
 1553
 1554
 1555
 1556
 1557
 1558
 1559
 1560
 1561
 1562
 1563
 1564
 1565
 1566
 1567
 1568
 1569
 1570
 1571
 1572
 1573
 1574
 1575
 1576
 1577
 1578
 1579
 1580
 1581
 1582
 1583
 1584
 1585
 1586
 1587
 1588
 1589
 1590
 1591
 1592
 1593
 1594
 1595
 1596
 1597
 1598
 1599
 1600
 1601
 1602
 1603
 1604
 1605
 1606
 1607
 1608
 1609
 1610
 1611
 1612
 1613
 1614
 1615
 1616
 1617
 1618
 1619
 1620
 1621
 1622
 1623
 1624
 1625
 1626
 1627
 1628
 1629
 1630
 1631
 1632
 1633
 1634
 1635
 1636
 1637
 1638
 1639
 1640
 1641
 1642
 1643
 1644
 1645
 1646
 1647
 1648
 1649
 1650
 1651
 1652
 1653
 1654
 1655
 1656
 1657
 1658
 1659
 1660
 1661
 1662
 1663
 1664
 1665
 1666
 1667
 1668
 1669
 1670
 1671
 1672
 1673
 1674
 1675
 1676
 1677
 1678
 1679
 1680
 1681
 1682
 1683
 1684
 1685
 1686
 1687
 1688
 1689
 1690
 1691
 1692
 1693
 1694
 1695
 1696
 1697
 1698
 1699
 1700
 1701
 1702
 1703
 1704
 170

 2905
 2906
 2907
 2908
 2909
 2910
 2911
 2912
 2913
 2914
 2915
 2916
 2917
 2918
 2919
 2920
 2921
 2922
 2923
 2924
 2925
 2926
 2927
 2928
 2929
 2930
 2931
 2932
 2933
 2934
 2935
 2936
 2937
 2938
 2939
 2940
 2941
 2942
 2943
 2944
 2945
 2946
 2947
 2948
 2949
 2950
 2951
 2952
 2953
 2954
 2955
 2956
 2957
 2958
 2959
 2960
 2961
 2962
 2963
 2964
 2965
 2966
 2967
 2968
 2969
 2970
 2971
 2972
 2973
 2974
 2975
 2976
 2977
 2978
 2979
 2980
 2981
 2982
 2983
 2984
 2985
 2986
 2987
 2988
 2989
 2990
 2991
 2992
 2993
 2994
 2995
 2996
 2997
 2998
 2999
 3000
 3001
 3002
 3003
 3004
 3005
 3006
 3007
 3008
 3009
 3010
 3011
 3012
 3013
 3014
 3015
 3016
 3017
 3018
 3019
 3020
 3021
 3022
 3023
 3024
 3025
 3026
 3027
 3028
 3029
 3030
 3031
 3032
 3033
 3034
 3035
 3036
 3037
 3038
 3039
 3040
 3041
 3042
 3043
 3044
 3045
 3046
 3047
 3048
 3049
 3050
 3051
 3052
 3053
 3054
 3055
 3056
 3057
 3058
 3059
 3060
 3061
 3062
 3063
 3064
 3065
 3066
 3067
 3068
 3069
 3070
 307

 4271
 4272
 4273
 4274
 4275
 4276
 4277
 4278
 4279
 4280
 4281
 4282
 4283
 4284
 4285
 4286
 4287
 4288
 4289
 4290
 4291
 4292
 4293
 4294
 4295
 4296
 4297
 4298
 4299
 4300
 4301
 4302
 4303
 4304
 4305
 4306
 4307
 4308
 4309
 4310
 4311
 4312
 4313
 4314
 4315
 4316
 4317
 4318
 4319
 4320
 4321
 4322
 4323
 4324
 4325
 4326
 4327
 4328
 4329
 4330
 4331
 4332
 4333
 4334
 4335
 4336
 4337
 4338
 4339
 4340
 4341
 4342
 4343
 4344
 4345
 4346
 4347
 4348
 4349
 4350
 4351
 4352
 4353
 4354
 4355
 4356
 4357
 4358
 4359
 4360
 4361
 4362
 4363
 4364
 4365
 4366
 4367
 4368
 4369
 4370
 4371
 4372
 4373
 4374
 4375
 4376
 4377
 4378
 4379
 4380
 4381
 4382
 4383
 4384
 4385
 4386
 4387
 4388
 4389
Done.
Normalization...
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68


 1429
 1430
 1431
 1432
 1433
 1434
 1435
 1436
 1437
 1438
 1439
 1440
 1441
 1442
 1443
 1444
 1445
 1446
 1447
 1448
 1449
 1450
 1451
 1452
 1453
 1454
 1455
 1456
 1457
 1458
 1459
 1460
 1461
 1462
 1463
 1464
 1465
 1466
 1467
 1468
 1469
 1470
 1471
 1472
 1473
 1474
 1475
 1476
 1477
 1478
 1479
 1480
 1481
 1482
 1483
 1484
 1485
 1486
 1487
 1488
 1489
 1490
 1491
 1492
 1493
 1494
 1495
 1496
 1497
 1498
 1499
 1500
 1501
 1502
 1503
 1504
 1505
 1506
 1507
 1508
 1509
 1510
 1511
 1512
 1513
 1514
 1515
 1516
 1517
 1518
 1519
 1520
 1521
 1522
 1523
 1524
 1525
 1526
 1527
 1528
 1529
 1530
 1531
 1532
 1533
 1534
 1535
 1536
 1537
 1538
 1539
 1540
 1541
 1542
 1543
 1544
 1545
 1546
 1547
 1548
 1549
 1550
 1551
 1552
 1553
 1554
 1555
 1556
 1557
 1558
 1559
 1560
 1561
 1562
 1563
 1564
 1565
 1566
 1567
 1568
 1569
 1570
 1571
 1572
 1573
 1574
 1575
 1576
 1577
 1578
 1579
 1580
 1581
 1582
 1583
 1584
 1585
 1586
 1587
 1588
 1589
 1590
 1591
 1592
 1593
 1594
 159

[array([[ 0.00000000e+00,  1.97217045e+00, -6.05491944e-03, ...,
         -3.16164664e-03, -4.87741810e-03, -2.69398819e-03],
        [ 1.97217045e+00,  0.00000000e+00, -4.61327196e-03, ...,
         -1.89958257e-03, -1.73467958e-03, -1.30100968e-03],
        [-6.05491944e-03, -4.61327196e-03,  0.00000000e+00, ...,
         -1.55512676e-03, -1.03662421e-03, -7.63535100e-04],
        ...,
        [-3.16164664e-03, -1.89958257e-03, -1.55512676e-03, ...,
          0.00000000e+00, -1.19732809e-03, -8.97996064e-04],
        [-4.87741810e-03, -1.73467958e-03, -1.03662421e-03, ...,
         -1.19732809e-03,  0.00000000e+00, -3.61430071e-03],
        [-2.69398819e-03, -1.30100968e-03, -7.63535100e-04, ...,
         -8.97996064e-04, -3.61430071e-03,  0.00000000e+00]]),
 <4389x4389 sparse matrix of type '<class 'numpy.float64'>'
 	with 19263321 stored elements in Compressed Sparse Column format>,
 array([[0.        ],
        [0.00030119],
        [0.00019874],
        ...,
        [0.        ],

In [None]:
%%writefile toto.m

function [a, b] = toto(c);
    a = c;
    b = c + 1;
end

In [None]:
a,b = octave.toto(3, nout=2)