In [36]:
%matplotlib notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import scipy as sp
from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

import pickle
import string
import time
import memory_profiler
%load_ext memory_profiler

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


# Import data

In [15]:
%%memit
# data
clean_df=pd.read_pickle("/gpfs01/berens/user/rgonzalesmarquez/variables/clean_df")

peak memory: 43791.96 MiB, increment: 43360.40 MiB


# Create colors

## Colors by journals

In [6]:
def automatic_coloring(journals,words_may, words_min,list_colors):
    """
    Function that creates an array with colors, assigning a color to each paper depending on which words the name 
    of its journal contains. The colors that will be assigned are introduced in the function by list_colors.
    
    IMPORTANT REMARK: if the journal name contains two words belonging to the word list, the color of the word
    located the latest in the list will be assigned to it (first, the first word's color is assigned and then 
    the second overwrites the first).
    
    Input:
    - journals (clean_df['Journal'] in our case) - it has to be a dataframe with the journal names of the papers.
    - words_may - list of the words starting with capital letter.
    - words_min - list of the words with small letters.
    - list_colors - list of all the different colors for assigning to papers.
    
    Output:
    - word_colors - dict with legend of word-colors (which color has each journal-word)
    - journal_colors - array of colors for each paper.
    
    """
    
    
    N=len(words_may)
    
    dict_colors={}
    word_colors={}
    for i in range(N):
        # I create a dictionary with the legend word-color for informative purpose
        word_colors[words_may[i]]=list_colors[i]
        
        #sub1 is a string with the word in small letters and a space before it
        sub1=words_min[i]
        #sub2 is a string with the word starting with capital letter
        sub2=words_may[i]
        
        # .find returns a -1 in the case that it didn't find the str it was looking for
        indexes1= journals.str.find(sub1) 
        indexes2= journals.str.find(sub2)
        
        #containing_journals are the journals (the whole name) containing either the word in small letter or starting 
        #with capital letter
        containing_journals=journals[(indexes1!=-1) | (indexes2!=-1)]
        containing_journals=containing_journals.to_numpy()
        
        #unique_containing_j are the unique journal names from containing_journals
        unique_containing_j=np.unique(containing_journals)
        
        #here we assign one color (the same to all) to each unique journal name containing the chosen word
        for elem in unique_containing_j:
            dict_colors[elem]=list_colors[i]
    
    #create colors
    journal_colors=np.vectorize(dict_colors.get)(journals)
    
    #add grey to the rest of papers
    journal_colors=np.where(journal_colors==None,'lightgrey', journal_colors)
    journal_colors=np.where(journal_colors=='None','lightgrey', journal_colors)
    
    return word_colors, journal_colors

In [7]:
"""
Description of colors and words below.

Colors:
-chosen_colors: Scanpy colors selection (originally from http://godsnotwheregodsnot.blogspot.com/2013/11/kmeans-color-quantization-seeding.html)

Words:
-words_capital: Words selected manually from the 100 most relevant words using threshold 0.1.
                They start with capital letter.

-words_small: Words selected manually from the 100 most relevant words using threshold 0.1.
                They are all in small letters.
"""

# COLORS
chosen_colors = ['black', '#FFFF00', '#1CE6FF', '#FF34FF', '#FF4A46', '#008941', '#006FA6', '#A30059', '#7A4900', '#0000A6', 
                 '#63FFAC', '#B79762', '#004D43', '#8FB0FF', '#D16100', '#5A0007', '#BA0900', '#1B4400', '#4FC601', '#3B5DFF', '#00C2A0']


# WORDS
words_capital=['Cancer', 'Neuroscience', 'Cardiology', 'Ecology', 'Bioinformatics','Chemistry', 'Surgery',
               'Biology', 'Environmental', 'Material', 
               'Cell', 'Microbiology', 'Pediatric', 'Immunology', 'Food',
               'Psychology','Psychiatry', 'Genetics', 'Nutrition', 'Veterinary',
               'Engineering'] 

words_small=[' cancer', ' neuroscience', ' cardiology', ' ecology', ' bioinformatics',' chemistry', ' surgery',
               ' biology', ' environmental', ' material', 
               ' cell', ' microbiology', ' pediatric', ' immunology', ' food',
               ' psychology',' psychiatry', ' genetics', ' nutrition', ' veterinary',
               ' engineering']

In [20]:
%%time
#create the colorings
automatic_legend, automatic_colors = automatic_coloring(clean_df['Journal'],words_capital, words_small, chosen_colors)

CPU times: user 5min 53s, sys: 8.27 s, total: 6min 2s
Wall time: 6min 4s


In [None]:
#save results
np.save("variables/automatic_colors", automatic_colors)

#save results
f = open("variables/automatic_legend.pkl","wb")
pickle.dump(automatic_legend,f)
f.close()

## Colors by year

In [34]:
# Date
all_dates=clean_df['Date']
unique_dates=np.unique(all_dates)

In [37]:
%%time

# We do this to get the vocabulary, that divides the date strings in their different words
vectorizer = TfidfVectorizer(sublinear_tf=True)
tfidf_dates = vectorizer.fit_transform(unique_dates)
vocabulary_dates=vectorizer.vocabulary_

CPU times: user 115 ms, sys: 5.85 ms, total: 121 ms
Wall time: 118 ms


In [21]:
dates_list=list(vocabulary_dates.keys())
len_dates_list=map(len, dates_list)
len_dates=np.fromiter(len_dates_list, dtype=np.int64,count=len(dates_list))

dates_list=np.array(dates_list)
years=dates_list[len_dates==4]
print(years)

['1808' '1881' '1891' '1896' '1897' '1898' '1899' '1900' '1901' '1902'
 '1903' '1905' '1906' '1907' '1908' '1909' '1910' '1911' '1912' '1913'
 '1914' '1915' '1916' '1917' '1918' '1919' '1920' '1921' '1922' '1923'
 '1924' '1925' '1926' '1927' '1928' '1929' '1930' '1931' '1932' '1933'
 '1934' '1935' '1936' '1937' '1938' '1939' '1940' '1941' '1942' '1943'
 '1944' '1945' '1946' '1947' '1948' '1949' '1950' '1951' '1952' '1953'
 '1954' '1955' '1956' '1957' '1958' '1959' '1960' '1961' '1962' '1963'
 '1964' '1965' '1966' '1967' '1968' 'fall' '1969' '1970' '1971' '1972'
 '1973' '1974' '1975' '1976' '1977' '1978' '1979' '1980' '1981' '1982'
 '1983' '1984' '1985' '1986' '1987' '1988' '1989' '1990' '1991' '1992'
 '1993' '1994' '1995' 'june' 'july' '1996' '1997' '1998' '1999' '2000'
 '2004' '2001' '2002' '2003' '2005' '2006' '2007' '2008' '2009' '2010'
 '2011' '2012' '2013' '2014' '2015' 'sept' '2016' 'post' '2017' 'mary'
 'spec' '2018' '2019' 'bima' '2020' 'jukt' '2021']


In [50]:
# eliminate manually the words that are not years

years=['1808', '1881', '1891', '1896', '1897', '1898', '1899', '1900', '1901', '1902',
 '1903', '1905', '1906', '1907', '1908', '1909', '1910', '1911', '1912', '1913',
 '1914', '1915', '1916', '1917', '1918', '1919', '1920', '1921', '1922', '1923',
 '1924', '1925', '1926', '1927', '1928', '1929', '1930', '1931', '1932', '1933',
 '1934', '1935', '1936', '1937', '1938', '1939', '1940', '1941', '1942', '1943',
 '1944', '1945', '1946', '1947', '1948', '1949', '1950', '1951', '1952', '1953',
 '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961', '1962', '1963',
 '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972',
 '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982',
 '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992',
 '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000',
 '2004', '2001', '2002', '2003', '2005', '2006', '2007', '2008', '2009', '2010',
 '2011', '2012', '2013', '2014', '2015',  '2016', '2017', 
  '2018', '2019',  '2020',  '2021']

#save
np.save("variables/years",years)

In [51]:
# dicctionary years(number):color value

length_interval=2021-1970
cmap_values=np.linspace(0,1,length_interval+1)
year_numbers_list=np.arange(1970,2022).tolist()

dicc_years = dict(zip(year_numbers_list, cmap_values))

MEDLINE started its record in 1966 and later included almost 60 thousand noteworthy papers previously published. Therefore, the majority of the papers from PubMed are post 1970, so we used a color map going from blue (1970) to yellow (2021) and all of the papers dated before 1970 were also colored in the darkest hue of blue.

In [52]:
# We manually set papers dated from before 1970 to 0 (equivalent to the darkest hue).

years_out=[1808, 1881, 1891, 1896, 1897, 1898, 1899,1900, 1901, 1902, 1903, 1905, 1906, 1907, 1908, 
           1909, 1910, 1911, 1912, 1913, 1914, 1915, 1916, 1917, 1918, 1919, 1920, 1921, 1922, 1923, 
           1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 
           1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 
           1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969] 

for elem in years_out:
    dicc_years[elem]=0

In [55]:
def years_coloring(publication_date, years, color_dict):
    """
    Input:
    - publication_date: the dataframe column with the publication date of the paper.
    - years: a list of all the years as strings.
    - color_dict: it is a dictionary where you have for each year a value in between 0 and 1 for the colormap.
    
    Output:
    - year_colors - array of colors for each paper.
    
    """
    
    # date_year: the year contained in the publication date for every paper is stored here
    date_year=np.zeros(len(publication_date))
    
    for i in range(len(years)):
        # .find returns a -1 in the case that it didn't find the str it was looking for
        indexes1= publication_date.str.find(years[i]) 
        
        date_year[indexes1!=-1]=int(years[i])
    
    #create colors
    year_colors=np.vectorize(color_dict.get)(date_year)
    
    return year_colors

In [56]:
%%time

colors_per_year= years_coloring(clean_df['Date'], years, dicc_years)

CPU times: user 16min 26s, sys: 29.3 s, total: 16min 56s
Wall time: 16min 56s


In [57]:
# save
np.save("variables/colors_per_year", colors_per_year)