# CKM/sem-Yawen Zhang

## Step 1: get `data_dict`


In [6]:
# -*- coding: utf-8 -*-

file_path = "iConference_edited/iConf2015.txt"

def parse_string(string):
    string = " ".join(string.split()) # remove duplicate spaces
    no_whitespace = "".join(string.split())
    index_right_a = no_whitespace.find(">") # find the index of right left angle braket
    string_key = no_whitespace[1:index_right_a].lower()
    string_value = string[index_right_a+2:]
    return string_key, string_value

# data format:
# [
# {"author": "Mathiesen, Kay",
# "title": "Indigenous Peoples' Rights to Culture and Individual Rights to Access",
# "abstract": "xxx xxx xxx", 
# "ID": "iConf2015.txt-1"},
# {...}
# ]
def get_data_dict(file_path):
    with open(file_path, "r") as f:
        file = list(f)
    n = -1
    data_dict = []
    for line in file:
        line = line.strip()
        if line == "":
            n += 1
            data_dict.append({})
            data_dict[n]["ID"] = "%s-%s" % (file_path, n)
        else:
            string_key, string_value = parse_string(line)
            data_dict[n][string_key] = string_value
    return data_dict

## Step 2: get `cleaned_data`

There are some wrong data recordes in the 2015 file, for example:
- `{'organization': 'North_America', 'type': 'Building upon efforts from other fields to conceptualize, we develop a typology of crowdsourcing for information science. Through a number of dimensions within the scope of motivation, centrality, beneficiary, aggregation, type of work, and type of crowd, our typology provides a way to understand crowdsourcing."', 'ID': 'iConference_edited/iConf2015.txt-20', 'title': 'Accepted', 'author': 'United States of America'}`

To analyse data better, I keep the original `data_dict` but make other two dicts: `cleaned_data` and `wrong_data`.


In [7]:
def cleaned_and_wrong_data(data_dict):
    cleaned_data = []
    wrong_data = []
    for item in data_dict:
        if len(item['type'].split()) > 10 or item['title'] == "Accepted":
            wrong_data.append(item)
        else:
            cleaned_data.append(item)
    return cleaned_data, wrong_data

In [8]:
data_dict = get_data_dict(file_path)
cleaned_data, wrong_data = cleaned_and_wrong_data(data_dict)

print(cleaned_data[:2])
print(len(cleaned_data))

[{'acceptance': 'Accepted', 'abstract': 'A new artisanship crop moored to information technology has emerged, and it is being heavily embraced in Nigeria. Guided by Maslow’s hierarchy of needs, this study examined job satisfaction among the information technology artisans in Nigeria. Job satisfaction was measured by whether the artisans wish to stay on the job, perceive their income as high or low, or consider the profession as having a good prospect; and on a binary scale of yes or no. The study also investigated how the socio-demographic characteristics of the respondents relate to their needs. Data was collected from 950 artisans randomly selected from two major locations in Nigeria with the aid of a questionnaire and an interview schedule. A breakdown of the Maslow’s variables predicted different job satisfaction differently just as the demographic and social characteristics of the respondents predicted the artisans’ needs differently.', 'topics': 'information technology and work',

In [9]:
print(wrong_data[:2])

print("\n[ATTENTION!] There are at least %s wrong data recodes needed to be checked manually." % len(wrong_data))

[{'type': 'Building upon efforts from other fields to conceptualize, we develop a typology of crowdsourcing for information science. Through a number of dimensions within the scope of motivation, centrality, beneficiary, aggregation, type of work, and type of crowd, our typology provides a way to understand crowdsourcing."', 'title': 'Accepted', 'author': 'United States of America', 'organization': 'North_America', 'ID': 'iConference_edited/iConf2015.txt-20'}, {'type': 'This study adopts an interpretivist worldview within a critical paradigm to understand the world of Lanna culture and to explore storage, collection, access management, preservation and promotion of it in various libraries. The main approach is case studies underpinned by ethnography and field study."', 'title': 'Accepted', 'author': 'United Kingdom', 'organization': 'Western_Europe', 'ID': 'iConference_edited/iConf2015.txt-54'}]

[ATTENTION!] There are at least 19 wrong data recodes needed to be checked manually.


## Step 3: Count `type`

In [10]:
def get_specific_dict(data, key):
    result = {}
    for item in data:
        if key in item:
            value = item[key]
            if value in result:
                result[value] += 1
            else:
                result[value] = 1
    return result

print(get_specific_dict(cleaned_data, "type"))

{'Posters': 91, 'Sessions for Interaction and Engagement': 11, 'Completed Research Papers': 49, 'Workshops': 32, 'Preliminary Results Papers': 39}


In [32]:
type_dict = get_specific_dict(cleaned_data, "type")
with open("type_all_frequency.csv", "w") as f:
    for item in type_dict:
        f.write("%s,%s\n" % (item, type_dict[item]))

http://stackoverflow.com/questions/34771191/matplotlib-taking-time-when-being-imported

```
import matplotlib.pyplot as plt

D = {u'Label1':26, u'Label2': 17, u'Label3':30}

plt.bar(range(len(D)), D.values(), align='center')
plt.xticks(range(len(D)), list(D.keys()))

plt.show()
```

In [11]:
# import matplotlib.pyplot as plt

# D = {u'Label1':26, u'Label2': 17, u'Label3':30}

# plt.bar(range(len(D)), D.values(), align='center')
# plt.xticks(range(len(D)), list(D.keys()))

# plt.show()


# something's wrong with matplotlib

In [12]:
# import matplotlib as mpl

# import matplotlib.pyplot as plt
#
# D = {u'Label1':26, u'Label2': 17, u'Label3':30}
#
# plt.bar(range(len(D)), D.values(), align='center')
# plt.xticks(range(len(D)), list(D.keys()))
#
# plt.show()

# print(mpl.get_cachedir())

## Step 4: Count `region` & `country`


In [13]:
print(get_specific_dict(cleaned_data, "region"))

{'Asia': 9, 'Middle_East': 1, 'Western_Europe': 15, 'North_America': 176, 'Sub-Saharan_Africa': 1, 'South_America': 1}


**save as csv file**

In [29]:
region_dict = get_specific_dict(cleaned_data, "region")
with open("region_all_frequency.csv", "w") as f:
    for item in region_dict:
        f.write("%s,%s\n" % (item, region_dict[item]))

In [14]:
print(get_specific_dict(cleaned_data, "country"))

{'United Kingdom': 4, 'South Africa': 1, 'Germany': 4, 'Sweden': 2, 'Finland': 2, 'Denmark': 2, 'Chile': 1, "China, People's Republic of": 4, 'Korea, Republic of (South Korea)': 4, 'Canada': 15, 'Israel': 1, 'Japan': 1, 'United States of America': 161, 'Ireland': 1}


**save as csv file**

In [30]:
country_dict = get_specific_dict(cleaned_data, "country")
with open("country_all_frequency.csv", "w") as f:
    for item in country_dict:
        f.write("%s,%s\n" % (item, country_dict[item]))

## Step 5: Count `author`

In [15]:
def get_author_dict(data):
    result = {}
    for item in data:
        string = item["author"]
        list_author = string.split("; ")
        for item in list_author:
            if item in result:
                result[item] += 1
            else:
                result[item] = 1
    return result

print(get_author_dict(cleaned_data))

{'Gasson, Susan': 1, 'Chen, Feng': 1, 'Cottam, Joseph': 2, 'Gernandt, André (1)': 1, 'Zhao, Yuxiang': 1, 'Willis, Matt': 1, 'Shah, Chirag (2)': 1, 'Liu, Fang (1)': 1, 'Shin, Grace': 1, 'Morris, Rebecca Jane': 1, 'Wang, Jun': 1, 'Zavalina, Oksana L.': 1, 'Joshi, Ritu Virendra (2)': 1, 'Widdersheim, Michael Majewski': 1, 'Qin, Jian': 1, 'Costa, Mark': 1, 'Srinivasan, Janaki (1)': 1, 'Kusumakaulika, Nafiri': 1, 'Sabanovic, Selma (1)': 1, 'White, Kelvin': 1, 'Fu, Hengyi (1)': 1, 'Pan, Youneng (1,2)': 1, 'Hagen, Loni': 1, 'Sormunen, Eero (1)': 1, 'Zhu, Qinghua': 1, 'Honor, Leah': 1, 'Horseman, Rachel Lynn': 1, 'Edelblute, Trevor': 1, 'Brännback, Malin': 1, 'Morales, Miraida (1)': 1, 'Ennis-Cole, Demetria': 1, 'Kurup, Nikhil Gopinath': 1, 'Foster, Jonathan': 1, 'Lopatovska, Irene': 4, 'Bailey, Diane': 1, 'Gwizdka, Jacek': 2, 'Lipinsk, Mario (1)': 1, 'Shelton, Martin': 1, 'Ryu, Hohyon (4)': 1, 'Brooks, Catherine Francis': 1, 'Stock, Wolfgang G.': 1, 'Park, Hyoungjoo': 1, 'Yeh, Tom (1)': 1, 'K

In [16]:
author_dict = get_author_dict(cleaned_data)

for item in author_dict:
    if author_dict[item] >= 3:
        print("%s : %s" % (item, author_dict[item]))

Lopatovska, Irene : 4
Erickson, Ingrid : 6
Richardson, Debra : 4
Nathan, Lisa : 4
Sawyer, Steven : 4
Haimson, Oliver L. : 5
Eschler, Jordan : 4
Allen, Warren : 4
Liu, Xiaozhong : 3
Snyder, Jaime : 3
Tomlinson, Bill : 4
Todd, Ross : 4
Geiger, R. Staurt : 4
Penzenstadler, Birgit : 4
Voida, Stephen : 4
Juncker, Beth : 4
Menking, Amanda : 4
Fisher, Karen : 4
Shankar, Kalpana : 5
Burton, Matt : 4
Balling, Gitte : 4
Acker, Amelia : 4
Martens, Marianne : 4
Fleischmann, Kenneth : 4
Lin, Yu-Ru : 3
Ribes, David : 4
Centivany, Alissa Lorraine : 3
Meyers, Eric : 4
Shilton, Katie : 3
Hayes, Gillian R. : 5
O'Brien, Heather L. : 3


**save as csv file**

In [28]:
with open("author_all_frequency.csv", "w") as f:
    for item in author_dict:
        f.write("%s,%s\n" % (item, author_dict[item]))

## Step 6: Count `keywords`

In [17]:
def get_keyword_dict(data):
    result = {}
    for item in data:
        if "keywords" in item:
            string = item["keywords"]
            list_keyword = string.split(", ")
            for item in list_keyword:
                if item in result:
                    result[item] += 1
                else:
                    result[item] = 1
    return result

keyword_dict = get_keyword_dict(cleaned_data)

for item in keyword_dict:
    if keyword_dict[item] >= 3:
        print("%s : %s" % (item, keyword_dict[item]))

information policy : 6
gender : 5
ethics : 3
collaboration : 4
research agenda : 4
crowdsourcing : 5
Sociotechnical : 4
online communities : 4
values in design : 4
sexuality : 4
design : 8
information ethics : 3
Social Media : 4
digital youth : 5
metadata : 3
social network analysis : 4
research methods : 4
trace data : 4
education : 6
CSST : 4
data analysis : 4
sustainability : 7
social media : 7
trace ethnography : 4
Facebook : 4
privacy : 4
learning : 4
Twitter : 5
race : 4


**save as csv file**

In [27]:
with open("keywords_all_frequency.csv", "w") as f:
    for item in keyword_dict:
        f.write("%s,%s\n" % (item, keyword_dict[item]))


## Step 7: Count `abstract`

In [18]:
from nltk.tokenize import word_tokenize

def get_abstract_tokens(data):
    tokens = []
    stopwords=[word.strip().lower() for word in open("english_stopwords.txt")]
    for item in data:
        if "abstract" in item:
            list_token = word_tokenize(item["abstract"])
            for item in list_token:
                item = item.strip().lower()
                if item not in stopwords and len(item) > 1:
                    tokens.append(item)
    return tokens

test = (cleaned_data)[:5]
a = get_abstract_tokens(test)
print(a)

['new', 'artisanship', 'crop', 'moored', 'information', 'technology', 'emerged', 'heavily', 'embraced', 'nigeria', 'guided', 'maslow’s', 'hierarchy', 'needs', 'study', 'examined', 'job', 'satisfaction', 'among', 'information', 'technology', 'artisans', 'nigeria', 'job', 'satisfaction', 'measured', 'whether', 'artisans', 'wish', 'stay', 'job', 'perceive', 'income', 'high', 'low', 'consider', 'profession', 'good', 'prospect', 'binary', 'scale', 'yes', 'study', 'also', 'investigated', 'socio-demographic', 'characteristics', 'respondents', 'relate', 'needs', 'data', 'collected', '950', 'artisans', 'randomly', 'selected', 'two', 'major', 'locations', 'nigeria', 'aid', 'questionnaire', 'interview', 'schedule', 'breakdown', 'maslow’s', 'variables', 'predicted', 'different', 'job', 'satisfaction', 'differently', 'demographic', 'social', 'characteristics', 'respondents', 'predicted', 'artisans’', 'needs', 'differently', '2013', 'elections', 'israel', 'one', 'major', 'methods', 'interaction', 'p

In [20]:
import nltk

tokens = get_abstract_tokens(cleaned_data)
Freq_dist_nltk=nltk.FreqDist(tokens)
print(Freq_dist_nltk.most_common(150))

[('information', 232), ('data', 229), ('research', 139), ('study', 135), ('social', 124), ('use', 86), ('users', 79), ('analysis', 77), ('design', 77), ('paper', 72), ('using', 70), ('work', 66), ('workshop', 63), ('digital', 62), ('community', 62), ('also', 59), ('results', 58), ('media', 56), ('user', 56), ('participants', 56), ('different', 52), ('systems', 51), ('online', 50), ('new', 47), ('findings', 45), ('two', 45), ('researchers', 45), ('public', 44), ('within', 44), ('students', 43), ('knowledge', 42), ('learning', 41), ('behavior', 40), ('library', 40), ('used', 40), ('based', 39), ('development', 38), ('understanding', 37), ('support', 37), ('questions', 36), ('may', 36), ('methods', 35), ('identify', 35), ('process', 35), ('many', 35), ('science', 34), ('challenges', 34), ('present', 34), ('search', 34), ('practices', 34), ('system', 33), ('technology', 32), ('experience', 32), ('trace', 32), ('web', 32), ('one', 31), ('implications', 31), ('approach', 31), ('network', 30)

In [23]:
f150 = Freq_dist_nltk.most_common(150)
print(type(f150[0]))
# for item in f150:
    

<class 'tuple'>


**save as csv file**

In [25]:
open("abstract_token_most_common_150.csv", "w").write('\n'.join('%s,%s' % x for x in f150))

1688