In [71]:
import glob
from io import open
import pandas as pd
from pandas import DataFrame as df
from os import path
import re

In [72]:
import matplotlib.pyplot as plt
%matplotlib inline

## Merge CSV databases

### Reading the column names

In [73]:
# Read the acronym key with explicit column names. header=None tells pandas not
# to take column names from the file; the legacy spelling header=-1 is rejected
# by current pandas ("Passing negative integer to header is invalid").
labels = pd.read_csv("data/PsycInfo/csv/Acronym Key.csv", header=None, names=["Acronym", "Name", "Keep"])

In [74]:
# Lookup from column acronym to its full column name.
column_to_name = dict(zip(labels["Acronym"], labels["Name"]))

In [75]:
# Acronyms of the columns whose "Keep" flag is exactly "keep".
column_to_keep = labels.loc[labels["Keep"] == "keep", "Acronym"].values

In [76]:
# Load every export in the csv folder and tag its rows with the search term
# taken from the file name (e.g. "multicultural_3.csv" -> "multicultural").
dfs = []
# sorted(): glob returns paths in arbitrary, filesystem-dependent order; sorting
# keeps the row order of the combined frame reproducible across machines.
for file in sorted(glob.glob("data/PsycInfo/csv/*csv")):
  word = path.basename(file).split('.')[0].split('_')[0]
  if word == "Acronym Key": continue  # the column key is not a data export
  # header=1: column names are read from the second line of each file.
  df_ = pd.read_csv(file, encoding="iso-8859-1", header=1)
  df_.insert(0, "Term", [word]*len(df_))
  print(file, len(df_))
  dfs.append(df_)

data/PsycInfo/csv/bicultural.csv 1784
data/PsycInfo/csv/biethnic.csv 42
data/PsycInfo/csv/biracial.csv 882
data/PsycInfo/csv/cultural pluralism.csv 212
data/PsycInfo/csv/interracial_1.csv 1500
data/PsycInfo/csv/interracial_2.csv 1246
data/PsycInfo/csv/mixed ethnicity.csv 99
data/PsycInfo/csv/mixed race.csv 551
data/PsycInfo/csv/monocultural.csv 225
data/PsycInfo/csv/monoracial.csv 145
data/PsycInfo/csv/multicultural_1.csv 1000
data/PsycInfo/csv/multicultural_10.csv 1000
data/PsycInfo/csv/multicultural_11.csv 1437
data/PsycInfo/csv/multicultural_2.csv 1000
data/PsycInfo/csv/multicultural_3.csv 1000
data/PsycInfo/csv/multicultural_4.csv 1000
data/PsycInfo/csv/multicultural_5.csv 1000
data/PsycInfo/csv/multicultural_6.csv 1000
data/PsycInfo/csv/multicultural_7.csv 1000
data/PsycInfo/csv/multicultural_8.csv 1000
data/PsycInfo/csv/multicultural_9.csv 1000
data/PsycInfo/csv/multiculturalism_1.csv 1000
data/PsycInfo/csv/multiculturalism_2.csv 1000
data/PsycInfo/csv/multiculturalism_3.csv 1000

In [77]:
# Stack the per-file frames and keep only the flagged columns plus the search term.
selected_columns = list(column_to_keep) + ["Term"]
words_df = pd.concat(dfs).loc[:, selected_columns]
# Sanity check: the biracial rows must match the count printed when the file was loaded.
assert (words_df.Term == 'biracial').sum() == 882
# Swap the column acronyms for their full names.
words_df = words_df.rename(columns=column_to_name)
words_df.head()

Unnamed: 0,Abstract,Accession Number,Author(s),Type of Book,PsycINFO Classification Code,Conference,Document Type,Grant/Sponsorship,Key Concepts,Institution,...,Population Group,Publication Status,Publication Type,Publisher,Cited References,Title,Tests & Measures,Volume,Date,Term
0,PURPOSE: Rates of alcohol use may be increasin...,Peer Reviewed Journal: 2015-52719-001.,"Kane, Jeremy C\n\nJohnson, Renee M\n\nRobinson...",,Health & Mental Health Treatment & Prevention ...,,,,"Acculturation, Intergenerational cultural diss...",,...,,First Posting,Journal\n\nPeer Reviewed Journal,Elsevier Science; Netherlands,,The impact of intergenerational cultural disso...,,,2015,bicultural
1,Given the negative developmental risks associa...,Peer Reviewed Journal: 2015-52548-001.,"Killoren, Sarah E\n\nZeiders, Katharine H\n\nU...",,Developmental Psychology [2800].,,,,"Adolescence, Cultural context, Mexican-America...","Killoren, Sarah E.: Department of Human Develo...",...,,First Posting,Journal\n\nPeer Reviewed Journal,Springer; Germany,,The sociocultural context of mexican-origin pr...,,,2015,bicultural
2,(from the chapter) Assessment science is an es...,Book: 2013-02670-011.,"Dana, Richard H",Handbook/Manual,Personality Scales & Inventories [2223].,,Chapter,,"personality tests, psychology, assessment, cul...",,...,Human,,Book\n\nEdited Book,American Psychological Association; US,"Aiken, L. S., West, S. G., & Millsap, R. E. (2...",Personality tests and psychological science: I...,California Brief Multicultural Competency Scal...,,2014,bicultural
3,Objective: The aim of the study was to explore...,Peer Reviewed Journal: 2015-46649-006.,"Goutaudier, N\n\nChauchard, E\n\nMelioli, T\n\...",,Psychosocial & Personality Development [2840].,,Journal Article,,"Acculturation, Adolescence, Cluster analysis, ...","Goutaudier, N.: Laboratoire CERPP-OCTOGONE, UF...",...,Human. Male. Female. Adolescence (13-17 yrs),,Journal\n\nPeer Reviewed Journal,Elsevier Masson SAS; France,"Aubry, B., & Tribalat, M. (2009). Les jeunes d...",Acculturation orientations and psychosocial ad...,Immigrant Acculturation Scale\nRosenberg Self-...,41.0,2015,bicultural
4,"(from the chapter) In Germany, the visit of th...",Book: 2014-27297-015.,"Leyendecker, Birgit\n\nWillard, Jessica\n\nAga...",,Cognitive & Perceptual Development [2820].,,Chapter,<b>Sponsor: </b>NORFACE. ERA-NET\n<b>Grant: </...,"children's bilingual development, parents, imm...","Leyendecker, Birgit: Ruhr University Bochum, B...",...,Human. Childhood (birth-12 yrs),,Book\n\nEdited Book,Ashgate Publishing Co; US,"Adesope, O. O., Lavin, T., Thompson, T., & Ung...",Learning a host country: A plea to strengthen ...,,,2014,bicultural


In [78]:
#words_df.to_csv("data/PsycInfo/processed/psychinfo_combined.csv.bz2", encoding='utf-8',compression='bz2')

## Load PsycINFO unified database

In [79]:
#psychinfo = pd.read_csv("data/PsycInfo/processed/psychinfo_combined.csv.bz2", encoding='utf-8', compression='bz2')
# NOTE: this binds a second name to the same DataFrame object as words_df (no
# copy is made), so columns added to psychinfo below also appear on words_df.
psychinfo = words_df

### Term appearance in abstract and title

In [80]:
# 1 when the record's search term occurs (case-insensitively) in its abstract,
# else 0. Missing abstracts are treated as empty strings.
term_abstract_pairs = psychinfo[["Term", "Abstract"]].fillna("").values
abstract_occurrence = [
  1 if term.lower() in abstract.lower() else 0
  for term, abstract in term_abstract_pairs
]
psychinfo["term_in_abstract"] = abstract_occurrence

In [81]:
# 1 when the record's search term occurs (case-insensitively) in its title,
# else 0. Missing titles are treated as empty strings.
term_title_pairs = psychinfo[["Term", "Title"]].fillna("").values
title_occurrence = [
  1 if term.lower() in title.lower() else 0
  for term, title in term_title_pairs
]
psychinfo["term_in_title"] = title_occurrence

In [82]:
# Build the coding frame without the two free-text columns.
# columns= replaces the positional axis argument (drop('Abstract', 1)), which
# pandas deprecated and no longer accepts in 2.x.
psychinfo_search = psychinfo.drop(columns=['Abstract', 'Title'])

In [83]:
# Numeric ID for each search term; IDs count up from 1 in the order listed.
term_ID = {
  term: code
  for code, term in enumerate(
    [
      "multiculturalism", "polyculturalism", "cultural pluralism",
      "monocultural", "monoracial", "bicultural",
      "biracial", "biethnic", "interracial",
      "multicultural", "multiracial", "polycultural",
      "polyracial", "polyethnic", "mixed race",
      "mixed ethnicity", "other race", "other ethnicity",
    ],
    start=1,
  )
}

In [84]:
# Attach the numeric term ID; terms missing from term_ID map to NaN.
psychinfo_search["term_ID"] = psychinfo_search["Term"].map(term_ID)

In [85]:
# List the distinct "Type of Book" strings and their frequencies; these are the
# keys of the type_of_book recode defined in the next cell.
psychinfo_search["Type of Book"].value_counts()

Handbook/Manual                                   1395
Textbook/Study Guide                               533
Conference Proceedings                              53
Reference Book                                      45
Classic Book                                        25
Handbook/Manual\n\nTextbook/Study Guide             16
Reference Book\n\nTextbook/Study Guide               6
Classic Book\n\nTextbook/Study Guide                 2
Reference Book\r\rTextbook/Study Guide               1
Conference Proceedings\n\nTextbook/Study Guide       1
Handbook/Manual\n\nReference Book                    1
Conference Proceedings\r\rTextbook/Study Guide       1
Name: Type of Book, dtype: int64

In [86]:
# Integer recode of the "Type of Book" strings.
type_of_book = {
  # single book types
  'Handbook/Manual': 1,
  'Textbook/Study Guide': 2,
  # NOTE(review): 'Reference Book' shares code 2 with 'Textbook/Study Guide' --
  # confirm this grouping is intended.
  'Reference Book': 2,
  'Conference Proceedings': 3,
  'Classic Book': 4,
  # entries listing two types ("\n\n"- or "\r\r"-separated) all share code 5
  'Handbook/Manual\n\nTextbook/Study Guide': 5,
  'Reference Book\n\nTextbook/Study Guide': 5,
  'Classic Book\n\nTextbook/Study Guide': 5,
  'Handbook/Manual\n\nReference Book': 5,
  'Conference Proceedings\n\nTextbook/Study Guide': 5,
  'Reference Book\r\rTextbook/Study Guide': 5,
  'Conference Proceedings\r\rTextbook/Study Guide': 5,
}

In [87]:
# Recode "Type of Book" through the type_of_book dict; values that are not keys
# of the dict (including missing entries) become NaN.
psychinfo_search["type_of_book"] = psychinfo_search["Type of Book"].map(type_of_book)

In [88]:
# Count the "\n"-separated pieces of each "Cited References" entry; missing
# entries are skipped (na_action="ignore") and stay NaN.
# NOTE(review): other columns in this data contain "\r\r" separators; if that
# also occurs here those entries are undercounted -- confirm against the raw data.
psychinfo_search["cited_references"] = psychinfo_search["Cited References"].map(
  lambda refs: len(refs.strip().split("\n")),
  na_action="ignore",
)

In [89]:
# List the distinct "Document Type" strings and their frequencies; these are the
# keys of the document_type recode defined in the next cell.
psychinfo_search['Document Type'].value_counts()

Journal Article                          14369
Dissertation                              4919
Chapter                                   4558
Review-Book                               1444
Comment/Reply                              548
Editorial                                  228
Chapter\n\nReprint                          78
Erratum/Correction                          66
Review-Media                                35
Abstract Collection                         29
Letter                                      18
Obituary                                    13
Chapter\n\nComment/Reply                    10
Reprint                                      9
Column/Opinion                               9
Bibliography                                 8
Journal Article\n\nReprint                   7
Chapter\r\rReprint                           6
Chapter\n\nJournal Article\n\nReprint        5
Encyclopedia Entry                           5
Bibliography\n\nChapter                      5
Chapter\r\rJo

In [95]:
# Integer recode of the "Document Type" strings, grouped here by target code.
# Multi-valued entries appear with both "\n\n" and "\r\r" separators.
document_type = {
  # 1: journal articles
  'Journal Article': 1,
  'Journal Article\n\nReprint': 1,
  'Journal Article\r\rReprint': 1,
  # 2: dissertations
  'Dissertation': 2,
  # 3: chapters, alone or combined with another type
  'Chapter': 3,
  'Chapter\n\nReprint': 3,
  'Chapter\r\rReprint': 3,
  'Chapter\n\nComment/Reply': 3,
  'Chapter\n\nJournal Article\n\nReprint': 3,
  'Chapter\r\rJournal Article\r\rReprint': 3,
  'Bibliography\n\nChapter': 3,
  # 4: book reviews
  'Review-Book': 4,
  'Reprint\n\nReview-Book': 4,
  # 5: reprints, bibliographies, encyclopedia entries
  'Reprint': 5,
  'Bibliography': 5,
  'Encyclopedia Entry': 5,
  # 6: everything else
  'Comment/Reply': 6,
  'Editorial': 6,
  'Erratum/Correction': 6,
  'Review-Media': 6,
  'Abstract Collection': 6,
  'Letter': 6,
  'Obituary': 6,
  'Column/Opinion': 6,
  'Review-Software & Other': 6,
  'Publication Information': 6,
}

In [96]:
# Recode "Document Type" through the document_type dict; values that are not
# keys of the dict (including missing entries) become NaN.
psychinfo_search['document_type'] = psychinfo_search['Document Type'].map(document_type)

In [97]:
# Dichotomize "Conference": 1 when the field holds any text, 0 when it is
# missing or empty.
psychinfo_search["conference_dich"] = (
  psychinfo_search["Conference"]
  .fillna("")
  .map(lambda entry: 1 if len(entry) > 0 else 0)
)


In [98]:
# List the distinct "Publication Type" strings and their frequencies; these are
# the keys of the publication_type recode defined in the next cell.
psychinfo_search['Publication Type'].value_counts()

Journal\n\nPeer Reviewed Journal           15714
Book\n\nEdited Book                         5402
Dissertation Abstract                       4919
Book\n\nAuthored Book                        890
Journal\r\rPeer Reviewed Journal             468
Electronic Collection                        454
Journal\n\nPeer-Reviewed Status-Unknown      234
Book\r\rEdited Book                          155
Book                                          30
Journal\r\rPeer-Reviewed Status-Unknown       14
Book\r\rAuthored Book                         13
Encyclopedia                                  11
Name: Publication Type, dtype: int64

In [99]:
# Integer recode of the "Publication Type" strings, grouped here by target code.
# Multi-valued entries appear with both "\n\n" and "\r\r" separators.
publication_type = {
  # 1: journals ('Electronic Collection' is coded with them)
  'Journal\n\nPeer Reviewed Journal': 1,
  'Journal\r\rPeer Reviewed Journal': 1,
  'Journal\n\nPeer-Reviewed Status-Unknown': 1,
  'Journal\r\rPeer-Reviewed Status-Unknown': 1,
  'Electronic Collection': 1,
  # 2: dissertations
  'Dissertation Abstract': 2,
  # 3: books
  'Book': 3,
  'Book\n\nEdited Book': 3,
  'Book\r\rEdited Book': 3,
  'Book\n\nAuthored Book': 3,
  'Book\r\rAuthored Book': 3,
  # 4: encyclopedias
  'Encyclopedia': 4,
}

In [100]:
# Recode "Publication Type" through the publication_type dict; values that are
# not keys of the dict (including missing entries) become NaN.
psychinfo_search['publication_type'] = psychinfo_search['Publication Type'].map(publication_type)

In [107]:
#(psychinfo["publication_type"] * psychinfo["conference_dich"]).value_counts()

In [25]:
# List the distinct "Language" values and their frequencies; these are the keys
# of the language recode defined in the next cell.
psychinfo_search['Language'].value_counts()

English           27823
French               83
Spanish              78
Italian              42
German               41
Portuguese           31
Dutch                29
Chinese              22
Greek                10
Hebrew                7
Turkish               6
Serbo-Croatian        5
Russian               5
Slovak                4
Japanese              3
Hungarian             3
Czech                 2
Polish                2
Danish                2
Norwegian             2
Romanian              2
Afrikaans             1
NonEnglish            1
Swedish               1
Finnish               1
Arabic                1
Name: Language, dtype: int64

In [38]:
# Integer recode of "Language": the nine languages below keep their own code.
language = {
  'English': 1,
  'French': 2,
  'Spanish': 3,
  'Italian': 4,
  'German': 5,
  'Portuguese': 6,
  'Dutch': 7,
  'Chinese': 8,
  'Greek': 9,
}
# Every remaining language is pooled into a single "other" code, 10.
language.update(dict.fromkeys(
  [
    'Hebrew', 'Turkish', 'Russian', 'Serbo-Croatian', 'Slovak', 'Japanese',
    'Hungarian', 'Czech', 'Danish', 'Romanian', 'Polish', 'Norwegian',
    'Swedish', 'Finnish', 'NonEnglish', 'Arabic', 'Afrikaans',
  ],
  10,
))

In [39]:
# Recode "Language" through the language dict; values that are not keys of the
# dict (including missing entries) become NaN.
psychinfo_search['language'] = psychinfo_search['Language'].map(language)

In [40]:
#psychinfo_search["PsycINFO Classification Code"].value_counts().to_csv("data/PsycInfo/processed/PsycINFO_Classification_Code.csv")

In [41]:
#psychinfo_search["Tests & Measures"].value_counts().to_csv("data/PsycInfo/processed/Tests_&_Measures.csv")

In [42]:
#psychinfo_search["Key Concepts"].value_counts().to_csv("data/PsycInfo/processed/Key_Concepts.csv")

In [43]:
#psychinfo_search["Location"].value_counts().to_csv("data/PsycInfo/processed/Location.csv")

In [44]:
#psychinfo_search["MeSH Subject Headings"].value_counts().to_csv("data/PsycInfo/processed/MeSH_Subject_Headings.csv")

In [45]:
#psychinfo_search["Journal Name"].value_counts().to_csv("data/PsycInfo/processed/Journal_Name.csv")

In [46]:
#psychinfo_search["Institution"].value_counts().to_csv("data/PsycInfo/processed/Institution.csv")

In [110]:
# Inspect the raw "Population Group" strings and their frequencies. The task
# list at the end of this notebook still lists this column as uncategorized.
psychinfo_search["Population Group"].value_counts()

Human                                                                                                                                                                                                                                                                                                                                                              11916
Human. Adulthood (18 yrs & older)                                                                                                                                                                                                                                                                                                                                   3491
Human.  Male.  Female. Adulthood (18 yrs & older)                                                                                                                                                                                                                                     

In [47]:
# Inspect the raw "Methodology" strings and their frequencies. The task list at
# the end of this notebook still lists this column as uncategorized.
psychinfo_search["Methodology"].value_counts()

Empirical Study; Quantitative Study                                                                             5448
Empirical Study                                                                                                 3435
Empirical Study; Qualitative Study                                                                              1156
Empirical Study; Interview; Qualitative Study                                                                    681
Literature Review                                                                                                522
Empirical Study; Interview; Quantitative Study                                                                   508
Empirical Study; Qualitative Study; Quantitative Study                                                           294
Empirical Study; Longitudinal Study; Quantitative Study                                                          250
Empirical Study; Interview; Focus Group; Qualitative Study      

In [48]:
def GetCats(text):
  """Collapse a PsycINFO classification string to one top-level code.

  Every run of digits in ``text`` is read as a classification code and rounded
  down to its hundreds bucket (e.g. 2223 -> 2200).

  Args:
    text: classification string, e.g. "Developmental Psychology [2800]."

  Returns:
    The bucket shared by all codes in ``text``; 4300 when the codes fall into
    more than one bucket; None when ``text`` contains no digits.
  """
  buckets = {100 * (int(code) // 100) for code in re.findall(r"[0-9]+", text)}
  if not buckets:
    # No code present: the original indexed results[0] here and raised IndexError.
    return None
  if len(buckets) > 1:
    # Codes from several top-level categories share the catch-all code.
    return 4300
  return buckets.pop()

In [49]:
# Reduce each classification string to its top-level code with GetCats; missing
# entries are skipped (na_action="ignore") and stay NaN.
psychinfo_search["PsycINFO_Classification_Code"] = (
  psychinfo_search["PsycINFO Classification Code"].map(GetCats, na_action="ignore")
)

In [52]:
# Number of unique categories (distinct non-missing codes produced by GetCats).
lists = psychinfo["PsycINFO Classification Code"].map(GetCats, na_action="ignore")
len(set(lists.dropna()))

23

In [64]:
# Dichotomize "Grant/Sponsorship": 1 when the field holds any text, 0 when it is
# missing or empty.
psychinfo_search["grants_sponsorship"] = (
  psychinfo_search["Grant/Sponsorship"]
  .fillna("")
  .map(lambda entry: 1 if len(entry) > 0 else 0)
)

In [41]:
#psychinfo_search.to_csv("data/PsycInfo/processed/psychinfo_term_search.csv.bz2", encoding='utf-8', compression='bz2')

In [42]:
#psychinfo_search = psychinfo_search.drop('Title', 1)

# PsycINFO Tasks

Keep the current spreadsheet and add the following: 
1. ~~Add Term in Abstract to spreadsheet~~ (control for the length of the abstract) **Do this for the NSF/NIH data as well.**
1. ~~Add Term in Title to spreadsheet~~
1. ~~Copy the word data into a new column (title it 'terms')--> code them as the following: 1 = multiculturalism, 2 = polyculturalism, 3 = cultural pluralism, 4 = monocultural, 5 = monoracial, 6 = bicultural, 7 = biracial, 8 = biethnic, 9 = interracial, 10 = multicultural, 11 = multiracial, 12 = polycultural, 13 = polyracial, 14 = polyethnic, 15 = mixed race, 16 = mixed ethnicity, 17 = other race, 18 = other ethnicity~~
1. Search all options in set for the following categories: -- I will manually categorize them once you give all options in each set
    1. ~~"Type of Book"~~
    1. ~~"PsycINFO Classification Code"~~
        1. ~~(used the classification codes [recoded to the most basic category level] -- the 22 subcategories created by PsycINFO; multiple categories = 4300)~~
    1. ~~"Document Type"~~
    1. ~~"Grant/Sponsorship"~~
        1. ~~(create a dichotomized variable 0/1)~~
    1. ~~"Tests & Measures"--> csv (no longer necessary)~~
        1. ~~(Too many categories---needs to be reviewed manually/carefully in excel)~~
    1. ~~"Publication Type"~~
    1. ~~"Publication Status"~~
    1. "Population Group" 
        1. (populations are grouped together--can we cluster them? scan for how often humans 
        are mentioned? then, men? etc. Afterwards, we would do the mapping)
        1. We need: gender, age (abstract, years)
    1. "Methodology"
        1. (methods are grouped together--can we cluster them? scan for how often empirical 
        study is mentioned? then, field? etc. Afterwards, we would do the mapping)
    1. "Conference" 
        1. ~~Right now, this is text (~699 entries)--> dichotomize variable.~~ 
           ~~If it is a conference ie there is a text = 1, if there is NaN = 0.~~
        1. Then, I will incorporate this as a new category in "Publication Type" and remove this column.
    1. "Key Concepts"--> csv 
        1. (word co-occurrence or MDS)
    1. "Location"-->csv--> sent to Barbara
        1. (categorized by region--multiple regions)
    1. ~~"Language"~~
        1. ~~I am not sure about my "other" language (10) category -- I put everything with fewer than 10 entries into one category.~~
    1. "MeSH Subject Headings"--> csv (may no longer be necessary?)
        1. (word co-occurrence or MDS)
    1. "Journal Name"-->csv--> sent to Jian Xin
        1. (categorized by psychology area)
    1. "Institution"-->csv --> sent to Barbara
        1. (categorized by state, region & country)
1. ~~Count the number of cited references for each entry~~

**Note:** Once we extract the CSV files for these columns, I will categorize them.

Once all of these corrections have been made, make a new spreadsheet and delete the following information: 
1. Volume
1. Publisher
1. Accession Number
1. Author(s) 
1. Issue
1. Cited References
1. Publication Status (had no variance)--only first posting
