## This notebook prepares raw data as network data for feature extraction

In [8]:
import pandas as pd
import networkx as nx
import numpy as np
from datetime import date

## Creating US air networks from source data

In [2]:
import re
import unicodedata

def strip_accents(text):
    """
    Remove diacritical marks from a string, keeping only ASCII characters.

    :param text: The input string.
    :type text: String.

    :returns: The input with accents stripped and any remaining non-ASCII
        characters dropped.
    :rtype: String.
    """
    try:
        # Python 2 only: promote a UTF-8 byte string to a unicode object.
        text = unicode(text, 'utf-8')
    except (TypeError, NameError):
        # NameError: Python 3 has no `unicode` builtin (str is already unicode).
        # TypeError: the Python 2 input could not be decoded this way
        # (e.g. it was already a unicode object).
        pass
    # NFD separates each accented letter into base letter + combining mark;
    # encoding to ASCII with 'ignore' then discards everything non-ASCII.
    decomposed = unicodedata.normalize('NFD', text)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return str(ascii_only.decode("utf-8"))

def text_to_id(text):
    """
    Turn free text into an identifier made of ASCII letters, '_' and '-'.

    The text is lower-cased and accent-stripped, then cleaned by a fixed
    sequence of regex substitutions.

    :param text: The input string.
    :type text: String.

    :returns: The identifier form of the input.
    :rtype: String.
    """
    identifier = strip_accents(text.lower())
    # Applied in order: (pattern, replacement, flags).
    substitutions = (
        (r"\d", "", 0),               # drop digits
        (r"^\s+", "", 0),             # trim leading whitespace
        (r"\s+$", "", 0),             # trim trailing whitespace
        (r"\s+", "_", re.I),          # collapse whitespace runs to one underscore
        ('[^a-zA-Z_-]', '', 0),       # drop anything else (commas, dots, ...)
    )
    for pattern, replacement, flags in substitutions:
        identifier = re.sub(pattern, replacement, identifier, flags=flags)
    return identifier

In [4]:
# Build one long edge list from the 29 numbered raw CSV extracts.
# NOTE: this is an absolute, machine-specific location; adjust it when the
# notebook is run elsewhere.
AIR_RAW_PATTERN = '/home/weihua/Research/Link_Dynamics/data/us_air_raw_data/770841524_T_T100D_MARKET_ALL_CARRIER-%s.csv'

dfs = []
for i in range(1, 30):  # extracts are numbered 1..29
    df = pd.read_csv(AIR_RAW_PATTERN % i)
    df = df[['YEAR', 'MONTH', 'ORIGIN_CITY_NAME', 'DEST_CITY_NAME', 'PASSENGERS']]
    df = df.rename(columns={"ORIGIN_CITY_NAME": "source", "DEST_CITY_NAME": "target", 'PASSENGERS': 'weight'})
    # Normalise city names to ids column-wise (Series.map) rather than with a
    # row-wise DataFrame.apply, which builds a Series per row and is far slower.
    for endpoint in ('source', 'target'):
        df[endpoint] = df[endpoint].map(lambda name: text_to_id(str(name)))
    # NOTE(review): count() makes `weight` the number of raw rows with a
    # non-null PASSENGERS value per (YEAR, MONTH, source, target), not the
    # passenger total. Confirm a row count (rather than sum()) is intended.
    df = df.groupby(['YEAR', 'MONTH', 'source', 'target']).count().reset_index()
    # A zero count can only arise when every PASSENGERS value in a group is null.
    dfs.append(df[df.weight != 0])

# ignore_index=True already yields a fresh RangeIndex, so no further
# reset_index is needed before moving YEAR/MONTH into the index.
data = (
    pd.concat(dfs, ignore_index=True)
    .set_index(['YEAR', 'MONTH'])
    .sort_index()
)

In [6]:
# Persist the (YEAR, MONTH)-indexed air edge list for the later cells.
us_air_pickle_path = '/home/weihua/Research/Link_Dynamics/data/networks/US_air_1990_2018.pkl'
data.to_pickle(us_air_pickle_path)
# Transposed preview of the first five rows (displayed as the cell output).
data.head().T

YEAR,1990,1990,1990,1990,1990
MONTH,1,1.1,1.2,1.3,1.4
source,adak_island_ak,adak_island_ak,akron_oh,akron_oh,akron_oh
target,anchorage_ak,shemya_ak,atlanta_ga,birmingham_al,chicago_il
weight,1,1,1,1,2


In [9]:
# Build one graph per (year, month) from the pickled US air edge list.
# NOTE: read_pickle can execute arbitrary code; only load files you trust.
df_air = pd.read_pickle('../data/networks/US_air_1990_2018.pkl')
df_air = df_air[df_air.source != df_air.target]  # drop self-loops

graphs_air = []  # one graph per month, in chronological order
date_air = []    # first day of the month for the graph at the same position

# Iterate over the (year, month) pairs that actually occur in the index.
# Looping over the product of all years and all months raises KeyError on
# `.loc[y, m]` as soon as one combination is missing (e.g. a partial year).
for (y, m), df in df_air.groupby(level=[0, 1], sort=True):
    date_air.append(date(int(y), int(m), 1))
    # edge_attr=True attaches every non-endpoint column (here `weight`) to the
    # edge. NOTE(review): the default graph type is undirected, so if both
    # A->B and B->A rows exist the later row's attributes win; confirm intended.
    graphs_air.append(nx.from_pandas_edgelist(df, edge_attr=True))

## Create Brazil bus networks from source data

In [10]:
def match_datasets(data):
    """
    Keep only rows whose origin and destination both appear in the city list.

    The reference list is read from ``../data/buses_list_of_cities.csv``. Its
    ``CityUF`` values are upper-cased, trimmed, have " , " rewritten as ", "
    and are reduced to ASCII before being compared with the ``ORIGEM`` and
    ``DESTINO`` columns of ``data``.

    :param data: Frame with ``ORIGEM`` and ``DESTINO`` columns.
    :returns: The rows of ``data`` whose two endpoints are both in the list.
    """
    cities = pd.read_csv("../data/buses_list_of_cities.csv", index_col=0, names=["CityUF"], encoding="utf-8")
    names = cities["CityUF"].str.upper().str.strip().str.replace(" , ", ", ")
    # NFKD + ASCII encode with errors='ignore' strips accents from the names.
    names = names.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
    known_cities = list(set(names.astype(str)))
    endpoints_known = data['ORIGEM'].isin(known_cities) & data['DESTINO'].isin(known_cities)
    return data[endpoints_known]

def bus_network(year=2010,month=12):
    """
    Build the weighted bus edge list for one year, optionally for one month.

    Reads ``../data/cleaned_buses_data/<year>.csv``, keeps rows with
    ``NUMEROLUGAROFERTADOIDA > 0`` whose endpoints pass ``match_datasets``,
    and counts the remaining rows per (``ORIGEM``, ``DESTINO``) pair.

    :param year: Year used to pick the CSV file.
    :param month: Value matched against the ``MES`` column. Pass ``False``
        (or ``None``) to keep every month of the year.
    :returns: Frame with columns ``source``, ``target`` and ``weight``, where
        ``weight`` is the number of matching rows for that pair.
    """
    data = pd.read_csv("../data/cleaned_buses_data/{}.csv".format(year), index_col=None)
    # `False` is the original "no month filter" sentinel; `None` is accepted
    # too, since comparing MES with None would silently select nothing.
    if month is not False and month is not None:
        data = data[data.MES == month]
    data = data[data.NUMEROLUGAROFERTADOIDA > 0]
    data = match_datasets(data)
    # size() counts rows per pair; reset_index(name=...) names that count
    # directly, so no rename of the anonymous `0` column is needed.
    edges = (
        data.groupby(["ORIGEM", "DESTINO"])
        .size()
        .reset_index(name='weight')
        .rename(columns={'ORIGEM': 'source', 'DESTINO': 'target'})
    )
    return edges

In [12]:
# Build one graph per (year, month) for the Brazilian bus network.
# NOTE: read_pickle can execute arbitrary code; only load files you trust.
df_bus = pd.read_pickle('../data/networks/bus_2005_2014.pkl')
df_bus = df_bus[df_bus.source != df_bus.target]
# The pickled frame is only used to discover which years and months to build;
# the edges themselves are recomputed from the yearly CSVs by bus_network().
year = list(df_bus.index.get_level_values(0).unique())
month = list(df_bus.index.get_level_values(1).unique())

graphs_bus = []  # one graph per (year, month), in loop order
date_bus = []    # first day of the month for the graph at the same position
for y in year:
    for m in month:
        df = bus_network(y, m)
        # Drop self-loops on the frame that is actually turned into a graph.
        # Filtering only df_bus has no effect on the graphs, because their
        # edges come from bus_network().
        df = df[df.source != df.target]
        date_bus.append(date(y, m, 1))
        # edge_attr=True attaches every non-endpoint column (`weight`) to the edge.
        graphs_bus.append(nx.from_pandas_edgelist(df, edge_attr=True))