In [1]:
import os
import sys
import csv
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as sk

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
%precision 4

'%.4f'

In [2]:
_ROOT_DIR = os.getcwd()
_DATA_DIR = os.path.join(_ROOT_DIR, 'data')
_GPLUS_PATH = os.path.join(_DATA_DIR, 'gplus')
_FB_PATH = os.path.join(_DATA_DIR, 'facebook')

In [12]:
!ls C:\Users\A605739\Dropbox\phd\phd_experiment\sna_data_processing\data\facebook

facebook
readme-Ego.txt


A package of SNAP social network data is consisted of a several sets of data files, each of which reprsents a circle:
* circles
* edges
* egofeat
* feat
* featnames
* followers (only for Google+)

The objective of the output is to consolidate the information scattered across different files into a single data table.

In [15]:
gp_file_group_ids = pd.Series(os.listdir(_GPLUS_PATH)).apply(lambda x: x.split('.')[0]).unique().tolist()
print( "the total of circles in GPlus: {}".format(len(gp_file_group_ids)) )

fb_file_group_ids = pd.Series(os.listdir(_FB_PATH)).apply(lambda x: x.split('.')[0]).unique().tolist()
print( "the total of circles in Facebook: {}".format(len(fb_file_group_ids)) )

the total of circles in GPlus: 132
the total of circles in Facebook: 10


In [None]:
def feat_extractor(line, group_id):
    items = line.replace("anonymized feature", "").split(" ")
    position = int(items[0])
    feature = items[1][:-1].replace(";", "_")
    value = int(items[2]) 
    return {"group":int(group_id), "position":int(position), \
            "feature":feature, "value":int(value)}

def profile_extracttor(line, circle_id, feat_dict):
    """Convert text line (from .feat data) to dictionary"""
    if isinstance(circle_id, str):
        circle_id = int(circle_id)
    items = line.split(" ")
    user_idx, feature = items[0], list(map(int, items[1:]))
    nonzero_feat_idx = [i for i, val in enumerate(feature) if val == 1]
    feat_dict = feat_dict.ix[feat_dict["group"]==circle_id,\
                             ["feature", "position", "value"]]
    feat_profile =  {row[0]: row[1] \
           for i, row in feat_dict.iterrows() \
           if row[1] in nonzero_feat_idx}
    user_id = "-".join((str(circle_id), str(user_idx)))
    return {"user_id": user_id, "num_filled_feature": len(feat_profile), "profile":feat_profile}

In [20]:
for ii, file_group in enumerate(fb_file_group_ids):
    # display the number features in *.feat
    file_path = os.path.join(_FB_PATH, file_group+'.featnames')
    table = pd.read_csv(file_path, header=None)
    print("The number of feature of {} file group: {}".format(file_group, table.shape[0]))

The number of feature of 0 file group: 224
The number of feature of 107 file group: 576
The number of feature of 1684 file group: 319
The number of feature of 1912 file group: 480
The number of feature of 3437 file group: 262
The number of feature of 348 file group: 161
The number of feature of 3980 file group: 42
The number of feature of 414 file group: 105
The number of feature of 686 file group: 63
The number of feature of 698 file group: 48


In [30]:
file_group = fb_file_group_ids[0]

feat_fpath = os.path.join(_FB_PATH, file_group+'.feat')
featnames_fpath = os.path.join(_FB_PATH, file_group+'.featnames')

In [26]:
# get featnames
feat_df = pd.read_csv(feat_fpath, header=None, sep=" ")
feat_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,215,216,217,218,219,220,221,222,223,224
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [89]:
def featnames_df_proc(file_path):
    """ process .featnames file to clean the data and keep
        the processed data in dataframe(index, feat_name, value)
    """
    import re

    items = []
    with open(file_path, mode='r', encoding='utf-8') as f:
        for line in f.readlines():
            line = line.replace("anonymized feature", "").strip()
            comps = re.split(" ", line)
            idx, featname, val = comps[0], comps[1], comps[2]
            idx = int(idx)
            featname = featname[:-1].replace(";", "_")
            val = int(val.split(" ")[-1])
            item = {'index': idx, 'feat_name': featname, 'value': val}
            items.append(item)
        
    featname_df = pd.DataFrame(items)
    return featname_df

def feat_list_proc(file_path, featname_df, prefix=""):
    """ convert dummy variable form dataframe 
        into list of user's profile, stored as dictionary of featname:value
        
        Arguments:
        ==========
        * file_path: <string>
        * featname_df: <pandas.DataFrame>
        * prefix: <string>
    """
    def pair_key_value(x):
        """create key:value pair string"""
        x = [str(ii) for ii in x]
        return ":".join(x)

    featname_columns = ['feat_name', 'value']

    users = []
    with open(feat_fpath, mode='r', encoding='utf-8') as f:
        for line in f.readlines():
            line = line.strip()
            comps = line.split(' ')
            uid, feat_codes = comps[0], comps[1:]
        
            nzero_idx = [ii for ii, val in enumerate(feat_codes) if val != '0']
            kv_pairs = featname_df.loc[nzero_idx, featname_columns].apply(pair_key_value, axis=1).tolist()
            profile = {kv for kv in kv_pairs}
        
            if prefix != "": 
                uid = prefix + '_' + uid
                
            user = {'uid':uid, 'profile':profile}
            users.append(user)
        
    return users

In [121]:
file_group = fb_file_group_ids[0]

feat_fpath = os.path.join(_FB_PATH, file_group+'.feat')
featnames_fpath = os.path.join(_FB_PATH, file_group+'.featnames')

featname_df = featnames_df_proc(featnames_fpath)
user_profile_list = feat_list_proc(feat_fpath, featname_df, prefix=file_group)