In [202]:
import os
import sys
import csv
import time
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as sk

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
%precision 4

'%.4f'

In [185]:
_ROOT_DIR = os.getcwd()
_DATA_DIR = os.path.join(_ROOT_DIR, 'data')
_OUTPUT_DIR = os.path.join(_ROOT_DIR, 'output')

_GPLUS_PATH = os.path.join(_DATA_DIR, 'gplus')
_FB_PATH = os.path.join(_DATA_DIR, 'facebook')

In [189]:
_OUTPUT_DIR

'C:\\Users\\A605739\\Dropbox\\phd\\phd_experiment\\sna_data_processing\\output'

A package of SNAP social network data is consisted of a several sets of data files, each of which reprsents a circle:
* circles
* edges
* egofeat
* feat
* featnames
* followers (only for Google+)

The objective of the output is to consolidate the information scattered across different files into a single data table.

In [15]:
gp_file_group_ids = pd.Series(os.listdir(_GPLUS_PATH)).apply(lambda x: x.split('.')[0]).unique().tolist()
print( "the total of circles in GPlus: {}".format(len(gp_file_group_ids)) )

fb_file_group_ids = pd.Series(os.listdir(_FB_PATH)).apply(lambda x: x.split('.')[0]).unique().tolist()
print( "the total of circles in Facebook: {}".format(len(fb_file_group_ids)) )

the total of circles in GPlus: 132
the total of circles in Facebook: 10


In [20]:
for ii, file_group in enumerate(fb_file_group_ids):
    # display the number features in *.feat
    file_path = os.path.join(_FB_PATH, file_group+'.featnames')
    table = pd.read_csv(file_path, header=None)
    print("The number of feature of {} file group: {}".format(file_group, table.shape[0]))

The number of feature of 0 file group: 224
The number of feature of 107 file group: 576
The number of feature of 1684 file group: 319
The number of feature of 1912 file group: 480
The number of feature of 3437 file group: 262
The number of feature of 348 file group: 161
The number of feature of 3980 file group: 42
The number of feature of 414 file group: 105
The number of feature of 686 file group: 63
The number of feature of 698 file group: 48


In [30]:
file_group = fb_file_group_ids[0]

feat_fpath = os.path.join(_FB_PATH, file_group+'.feat')
featnames_fpath = os.path.join(_FB_PATH, file_group+'.featnames')

In [26]:
# get featnames
feat_df = pd.read_csv(feat_fpath, header=None, sep=" ")
feat_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,215,216,217,218,219,220,221,222,223,224
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [182]:
def featnames_df_proc(file_path):
    """ process .featnames file to clean the data and keep
        the processed data in dataframe(index, feat_name, value)
    """
    import re

    items = []
    with open(file_path, mode='r', encoding='utf-8') as f:
        for line in f.readlines():
            line = line.replace("anonymized feature", "").strip()
            comps = re.split(" ", line)
            idx, featname, val = comps[0], comps[1], comps[2]
            idx = int(idx)
            featname = featname[:-1].replace(";", "_")
            val = int(val.split(" ")[-1])
            item = {'index': idx, 'feat_name': featname, 'value': val}
            items.append(item)
        
    featname_df = pd.DataFrame(items)
    return featname_df

def feat_list_proc(file_path, featname_df, prefix=""):
    """ convert dummy variable form dataframe 
        into list of user's profile, stored as dictionary of featname:value
        
        Arguments:
        ==========
        * file_path: <string>
        * featname_df: <pandas.DataFrame>
        * prefix: <string>
    """
    
    def pair_key_value(x):
        """create key:value pair string"""
        x = [str(ii) for ii in x]
        return ":".join(x)

    featname_columns = ['feat_name', 'value']

    users = []
    with open(feat_fpath, mode='r', encoding='utf-8') as f:
        for line in f.readlines():
            line = line.strip()
            comps = line.split(' ')
            uid, feat_codes = comps[0], comps[1:]
        
            nzero_idx = [ii for ii, val in enumerate(feat_codes) if val != '0']
            kv_pairs = featname_df.loc[nzero_idx, featname_columns].apply(pair_key_value, axis=1).tolist()
            profile = {kv.split(":")[0]:kv.split(":")[1] for kv in kv_pairs}
            
            if prefix != "": 
                uid = prefix + '_' + uid
   
            profile['uid'] = uid
            users.append(profile)

    return users

In [208]:
# create a single table to keep profiles of all users appearing in
# the network 
start_at = time.time()

users = []
for file_group in fb_file_group_ids:
    feat_fpath = os.path.join(_FB_PATH, file_group+'.feat')
    featnames_fpath = os.path.join(_FB_PATH, file_group+'.featnames')
    
    featname_df = featnames_df_proc(featnames_fpath)
    user_profile_list = feat_list_proc(feat_fpath, featname_df)
    users.extend(user_profile_list)

end_at = time.time()
print("--- total of users: {:,} ---".format(len(users)))
print("--- time cost: {:.2f}s ---".format(end_at - start_at))

users_df = pd.DataFrame(users)
col_names = users_df.columns.tolist()
col_names = ['uid'] + [colname for colname in col_names if col_names != 'uid']
users_df = users_df[col_names]
users_df.to_csv(os.path.join(_OUTPUT_DIR, 'facebook_users.csv'), sep=',', header=True, index=False)

--- total of users: 4,167 ---
--- time cost: 5.42s ---


In [209]:
# compile all edge files together and output a single consolidated
# .csv file
start_at = time.time()

edge_pairs = []
for file_group in fb_file_group_ids:
    edge_fpath = os.path.join(_FB_PATH, file_group+'.edges')
    with open(edge_fpath, mode='r', encoding='utf-8') as f:
        for line in f.readlines():
            pair = line.strip().split(" ")
            pair_dict = {"user_a":pair[0], "user_b":pair[1]}
            edge_pairs.append(pair_dict)

end_at = time.time()
print("--- total of edges: {:,} ---".format(len(edge_pairs)))
print("--- time cost: {:.2f}s ---".format(end_at - start_at))

edges_df = pd.DataFrame(edge_pairs)
edges_df.to_csv(os.path.join(_OUTPUT_DIR, 'facebook_edges.csv'), sep=',', header=True, index=False)

--- total of edges: 170,174 ---
--- time cost: 0.21s ---


In [210]:
users_df.describe()

Unnamed: 0,uid,birthday,education_classes_id,education_concentration_id,education_degree_id,education_school_id,education_type,education_with_id,education_year_id,first_name,...,religion,uid.1,work_employer_id,work_end_date,work_from_id,work_location_id,work_position_id,work_projects_id,work_start_date,work_with_id
count,4167,1597,65,1220,483,2781,3103,33,2471,334,...,2,4167,646,929,4,617,390,20,1091,8
unique,4035,40,23,97,24,249,3,12,34,71,...,1,4035,139,40,2,48,60,10,61,4
top,428,5,336,14,22,538,55,350,66,1065,...,1154,428,140,157,683,84,193,712,157,728
freq,4,376,9,313,211,640,2620,10,358,18,...,2,4,49,373,2,89,82,3,314,2


In [None]:
"253 job_title:economics,

In [213]:
def gplus_featnames_df_proc(file_path):
    """ process .featnames file to clean the data and keep
        the processed data in dataframe(index, feat_name, value)
    """
    import re

    items = []
    with open(file_path, mode='r', encoding='utf-8') as f:
        for line in f.readlines():
            line = line.replace(":", ' ').replace(",", "")
            comps = line.split(" ")
            idx, featname, val = comps[0], comps[1], comps[2]
            idx = int(idx)
            featname = featname[:-1].replace(";", "_")
            val = int(val.split(" ")[-1])
            item = {'index': idx, 'feat_name': featname, 'value': val}
            items.append(item)
        
    featname_df = pd.DataFrame(items)
    return featname_df

'C:\\Users\\A605739\\Dropbox\\phd\\phd_experiment\\sna_data_processing\\data\\gplus\\100129275726588145876.featnames'

In [212]:
##
## process gplus data
##
start_at = time.time()

users = []
for file_group in gp_file_group_ids:
    feat_fpath = os.path.join(_GPLUS_PATH, file_group+'.feat')
    featnames_fpath = os.path.join(_GPLUS_PATH, file_group+'.featnames')
    
    featname_df = featnames_df_proc(featnames_fpath)
    user_profile_list = feat_list_proc(feat_fpath, featname_df)
    users.extend(user_profile_list)

end_at = time.time()
print("--- total of users: {:,} ---".format(len(users)))
print("--- time cost: {:.2f}s ---".format(end_at - start_at))

users_df = pd.DataFrame(users)
col_names = users_df.columns.tolist()
col_names = ['uid'] + [colname for colname in col_names if col_names != 'uid']
users_df = users_df[col_names]
users_df.to_csv(os.path.join(_OUTPUT_DIR, 'gplus_users.csv'), sep=',', header=True, index=False)


# compile all edge files together and output a single consolidated
# .csv file
start_at = time.time()

edge_pairs = []
for file_group in gp_file_group_ids:
    edge_fpath = os.path.join(_GPLUS_PATH, file_group+'.edges')
    with open(edge_fpath, mode='r', encoding='utf-8') as f:
        for line in f.readlines():
            pair = line.strip().split(" ")
            pair_dict = {"user_a":pair[0], "user_b":pair[1]}
            edge_pairs.append(pair_dict)

end_at = time.time()
print("--- total of edges: {:,} ---".format(len(edge_pairs)))
print("--- time cost: {:.2f}s ---".format(end_at - start_at))

edges_df = pd.DataFrame(edge_pairs)
edges_df.to_csv(os.path.join(_OUTPUT_DIR, 'gplus_edges.csv'), sep=',', header=True, index=False)

IndexError: list index out of range