# Цель работы:
 Исследовать сходство и различия в толковании понятий "machine learning" и "data science" через анализ публикаций в Arxive с использованием метода обобщения в таксономиях

# Шаг 1
Выделить подмножества публикаций в Arxive, ML и DS, отвечающие соответствующим запросам

In [2]:
import pandas as pd
import numpy as np

df = pd.read_json('arxiv-metadata-oai-snapshot.json', lines=True)

Filter ML and AI from past 10 years

In [3]:
df_LG = df[df['categories'].str.contains('cs.LG')]

In [4]:
df_AI = df[df['categories'].str.contains('cs.AI')]

In [5]:
df_ML = df[df['categories'].str.contains('stat.ML')]

In [6]:
df_LG = df_LG[df_LG['update_date'] > '2022']

In [7]:
df_AI = df_AI[df_AI['update_date'] > '2022']

In [8]:
df_ML = df_ML[df_ML['update_date'] > '2022']

In [9]:
print(f'ML articles: {len(df_LG)}')
print(f'AI articles: {len(df_AI)}')
print(f'ML (statistics) articles: {len(df_ML)}')

ML articles: 42929
AI articles: 20841
ML (statistics) articles: 9336


In [10]:
abstracts_LG = []
for a in df_LG['abstract']:
    abstracts_LG.append(a)

In [11]:
abstracts_AI = []

for a in df_AI['abstract']:
    abstracts_AI.append(a)

In [12]:
abstracts_ML = []

for a in df_ML['abstract']:
    abstracts_ML.append(a)

## Шаг 2
Выделить разумные нечеткие кластеры понятий таксономии согласно ML и согласно DS

In [13]:
import re

import numpy as np

from east.asts import base


def clear_text(text, lowerize=True):

    pat = re.compile(r'[^A-Za-z0-9 \-\n\r.,;!?А-Яа-я]+')
    cleared_text = re.sub(pat, ' ', text)

    if lowerize:
        cleared_text = cleared_text.lower()

    tokens = cleared_text.split()
    return tokens


def make_substrings(tokens, k=4):

    for i in range(max(len(tokens) - k + 1, 1)):
        yield ' '.join(tokens[i:i + k])


def get_corelevance_matrix(texts):

    matrix = np.empty((0, len(texts)), float)
    prepared_text_tokens = [clear_text(t) for t in texts]
    prepared_texts = [' '.join(t) for t in prepared_text_tokens]

    for text_tokens in prepared_text_tokens:
        ast = base.AST.get_ast(list(make_substrings(text_tokens)))
        row = np.array([ast.score(t) for t in prepared_texts])
        matrix = np.append(matrix, [row], axis=0)

    return matrix


def get_relevance_matrix(texts, strings):

    matrix = np.empty((0, len(strings)), float)
    prepared_text_tokens = [clear_text(t) for t in texts]

    prepared_string_tokens = [clear_text(s) for s in strings]
    prepared_strings = [' '.join(t) for t in prepared_string_tokens]
    
    #for print
    print(f'total len: {len(prepared_text_tokens)}')
    c = 0

    for text_tokens in prepared_text_tokens:
        ast = base.AST.get_ast(list(make_substrings(text_tokens)))
        row = np.array([ast.score(s) for s in prepared_strings])
        matrix = np.append(matrix, [row], axis=0)
        
        c += 1
        print(f'processed: {c}')
    return matrix


def save_matrix(matrix, filename="filename"):
    np.savetxt(filename, matrix)

Get strings of terms

In [1]:
with open("taxonomies/taxonomy_leaves.txt") as f:
    strings = [l.strip() for l in f.readlines()]

In [None]:
# Relevance between the texts and strings:

relevance_matrix_LG = get_relevance_matrix(abstracts_LG, strings)
print("Relevance mairix LG:")
print(relevance_matrix_LG)

save_matrix(relevance_matrix_LG, "matrixes/relevmtx_LG")

In [None]:
# Relevance between the texts and strings:

relevance_matrix_AI = get_relevance_matrix(abstracts_AI, strings)
print("Relevance mairix AI:")
print(relevance_matrix_AI)

save_matrix(relevance_matrix_AI, "matrixes/relevmtx_AI")

In [None]:
# Relevance between the texts and strings:

relevance_matrix_ML = get_relevance_matrix(abstracts_ML, strings)
print("Relevance mairix ML:")
print(relevance_matrix_ML)

save_matrix(relevance_matrix_ML, "matrixes/relevmtx_ML")

Obtain fuzzy clusters

In [2]:
import numpy as np
from pyfaddis.lapin import lapin
from pyfaddis.faddis import faddis

from operator import itemgetter

In [3]:
def normalize_topics(relevance_matrix):
    for i, row in enumerate(relevance_matrix):
        n_i = 0
        for j, el in enumerate(row):
            if el > 0.3:
                n_i += 1
        if n_i == 0:
            for j, el in enumerate(row):
                relevance_matrix[i][j] = 0
            continue
            
        for j, el in enumerate(row):
            relevance_matrix[i][j] = el/n_i

In [4]:
relevance_matrix_LG = np.loadtxt("matrixes/relevmtx_LG")
print(relevance_matrix_LG.shape)

normalize_topics(relevance_matrix_LG)

tc = relevance_matrix_LG.T.dot(relevance_matrix_LG)
print(tc.shape)

tc_transformed = lapin(tc)
B, member, contrib, intensity, lat, tt = faddis(tc_transformed)
np.savetxt("clusters/clusters_LG.dat", member)
np.save("clusters/LG_contrib", contrib)
np.save("clusters/LG_intensity", intensity)
np.save("clusters/LG_lat", lat)
np.save("clusters/LG_tt", tt)

(125815, 379)
(379, 379)
Cluster contribution is too small


In [24]:
relevance_matrix_AI = np.loadtxt("matrixes/relevmtx_AI")
print(relevance_matrix_AI.shape)
normalize_topics(relevance_matrix_AI)
tc = relevance_matrix_AI.T.dot(relevance_matrix_AI)
print(tc.shape)

tc_transformed = lapin(tc)
B, member, contrib, intensity, lat, tt = faddis(tc_transformed)
np.savetxt("clusters/clusters_AI.dat", member)
np.save("clusters/AI_contrib", contrib)
np.save("clusters/AI_intensity", intensity)
np.save("clusters/AI_lat", lat)
np.save("clusters/AI_tt", tt)

(51707, 379)
(379, 379)
Cluster contribution is too small


In [6]:
relevance_matrix_ML = np.loadtxt("matrixes/relevmtx_ML")
print(relevance_matrix_ML.shape)
normalize_topics(relevance_matrix_ML)
tc = relevance_matrix_ML.T.dot(relevance_matrix_ML)
print(tc.shape)

tc_transformed = lapin(tc)
B, member, contrib, intensity, lat, tt = faddis(tc_transformed)
np.savetxt("clusters/clusters_ML.dat", member)
np.save("clusters/ML_contrib", contrib)
np.save("clusters/ML_intensity", intensity)
np.save("clusters/ML_lat", lat)
np.save("clusters/ML_tt", tt)

(9336, 379)
(379, 379)
Cluster contribution is too small


# Analysis

In [32]:
print(f'ML articles: {relevance_matrix_LG.shape[0]}')
print(f'AI articles: {relevance_matrix_AI.shape[0]}')
print(f'ML (statistics) articles: {relevance_matrix_ML.shape[0]}')

ML articles: 125815
AI articles: 51707
ML (statistics) articles: 9336


In [33]:
member_LG = np.loadtxt('clusters/clusters_LG.dat')
member_AI = np.loadtxt('clusters/clusters_AI.dat')
member_ML = np.loadtxt('clusters/clusters_ML.dat')

In [34]:
print(member_LG.shape, member_AI.shape, member_ML.shape)

(379, 3) (379, 5) (379, 5)


In [35]:
terms_LG = []
for i, el in enumerate(strings):
    terms_LG.append((el, member_LG[i, 0], member_LG[i, 1], member_LG[i, 2]))

terms_AI = []
for i, el in enumerate(strings):
    terms_AI.append((el, member_AI[i, 0], member_AI[i, 1], member_AI[i, 2], member_AI[i, 3], member_AI[i, 4]))
    
terms_ML = []
for i, el in enumerate(strings):
    terms_ML.append((el, member_ML[i, 0], member_ML[i, 1], member_ML[i, 2], member_ML[i, 3], member_ML[i, 4]))

#### Machine Leaning (Computer Science)

In [36]:
contrib_LG = np.load('clusters/LG_contrib.npy')
print(f'Cluster contributions: {contrib_LG}')



Cluster contributions: [0.01692739 0.01692526 0.0033456 ]


In [37]:
print('Machine Learning, Cluster 1:')
terms_LG.sort(reverse=True, key=lambda x: x[1])
for el in terms_LG:
    if el[1] <= 0.08:
        break
    print(f'{el[0]}:\t\t{el[1]}')


Machine Learning, Cluster 1:
cluster analysis:		0.5549454007909905
multi-agent reinforcement learning:		0.33595380636547506
information extraction:		0.31555297513611746
learning to rank:		0.3152939036861742
data provenance:		0.3142129065888806
support vector machines:		0.2409914608686449
sequential decision making:		0.22789513135379083
data exchange:		0.20023478900054256
adversarial learning:		0.17243221747316048
markov decision processes:		0.15745770818135157
boosting:		0.1392249405577955
gaussian processes:		0.13505155210644487
data cleaning:		0.11781030962701397
recommender systems:		0.11212580293005102
apprenticeship learning:		0.08475824331458605


In [38]:
print('Machine Learning, Cluster 2:')
terms_LG.sort(reverse=True, key=lambda x: x[2])
for el in terms_LG:
    if el[2] <= 0.08:
        break
    print(f'{el[0]}:\t\t{el[2]}')

Machine Learning, Cluster 2:
cluster analysis:		0.5566761530423947
multi-agent reinforcement learning:		0.3361993303439965
learning to rank:		0.31552600514061074
information extraction:		0.3153827119858832
data provenance:		0.31447526229845785
support vector machines:		0.2408379732389745
sequential decision making:		0.22741081324050672
data exchange:		0.1998670718090123
adversarial learning:		0.17173058527541818
markov decision processes:		0.15687932436481677
boosting:		0.13865705233086464
gaussian processes:		0.13445885364091953
data cleaning:		0.11695798690403883
recommender systems:		0.11126430456500676
apprenticeship learning:		0.08372480181330283


In [39]:
print('Machine Learning, Cluster 3:')
terms_LG.sort(reverse=True, key=lambda x: x[3])
for el in terms_LG:
    if el[3] == 0.08:
        break
    print(f'{el[0]}:\t\t{el[3]}')

Machine Learning, Cluster 3:
sequential decision making:		0.4703981456475031
recommender systems:		0.37242375630583835
apprenticeship learning:		0.35255031914600954
data exchange:		0.32761947257467605
gaussian processes:		0.3276051237163546
information extraction:		0.2540758565002146
data cleaning:		0.20119786547199028
support vector machines:		0.20070214672177353
inverse reinforcement learning:		0.19701440057890002
distributed database recovery:		0.18381177408093108
data provenance:		0.15951872057866157
anomaly detection:		0.13926926384862984
2d pca:		0.11137371315397769
learning to rank:		0.0960608595038539
cluster analysis:		0.08485455752076987
data locking:		0.06633823139061323
multi-agent reinforcement learning:		0.052720726283058496
markov decision processes:		0.032647452068887674
boosting:		0.02858489104192856
adversarial learning:		0.020475690646848482
markov network models:		0.0003085880954526126
bayesian networks:		0.0002627319479288097
equational models:		5.274559166689788e-

#### AI

In [40]:
contrib_AI = np.load('clusters/AI_contrib.npy')
print(f'Cluster contributions: {contrib_AI}')


Cluster contributions: [0.01614585 0.01604364 0.00642044 0.00704061 0.00232388]


In [41]:
print('AI, Cluster 1:')
terms_AI.sort(reverse=True, key=lambda x: x[1])
for el in terms_AI:
    if el[1] <= 0.08:
        break
    print(f'{el[0]}:\t\t{el[1]}')

AI, Cluster 1:
cluster analysis:		0.49264524389990094
information extraction:		0.39266251614431313
adversarial learning:		0.3090368808866511
apprenticeship learning:		0.2969755641514707
data cleaning:		0.2963742390935148
data provenance:		0.2793264059491484
multi-agent reinforcement learning:		0.2597533222965544
inverse reinforcement learning:		0.2240701717121673
markov decision processes:		0.22171862801159653
distributed database recovery:		0.1653633818782248
data exchange:		0.1331433150899438
learning to rank:		0.13309972939696305
2d pca:		0.09264024816501125
anomaly detection:		0.08397207985560615


In [42]:
print('AI, Cluster 2:')
terms_AI.sort(reverse=True, key=lambda x: x[2])
for el in terms_AI:
    if el[2] <= 0.08:
        break
    print(f'{el[0]}:\t\t{el[2]}')

AI, Cluster 2:
cluster analysis:		0.49472716378995524
information extraction:		0.3944092553549417
adversarial learning:		0.3085993610369105
apprenticeship learning:		0.2985269155349466
data cleaning:		0.29721981927620983
data provenance:		0.27759689777721874
multi-agent reinforcement learning:		0.26022315544781094
inverse reinforcement learning:		0.22574027748212083
markov decision processes:		0.2237988441847634
data exchange:		0.13446319732925904
learning to rank:		0.13280145223728063
distributed data locking:		0.12534873726392287
database recovery:		0.0997711872927268
2d pca:		0.09464273479478982
anomaly detection:		0.08121060697278178


In [44]:
print('AI, Cluster 3:')
terms_AI.sort(reverse=True, key=lambda x: x[3])
for el in terms_AI:
    if el[3] <= 0.08:
        break
    print(f'{el[0]}:\t\t{el[3]}')

AI, Cluster 3:
2d pca:		0.3989504987373712
data provenance:		0.376971977167854
support vector machines:		0.3401616027423555
markov decision processes:		0.3081722296515502
anomaly detection:		0.3052088939637672
data exchange:		0.3029558713809817
gaussian processes:		0.3005734884284608
inverse reinforcement learning:		0.28534984211874015
adversarial learning:		0.20906480653673837
boosting:		0.16621456740684418
apprenticeship learning:		0.12891434461729293
cluster analysis:		0.1117960906562706
distributed data locking:		0.09923692134770266
sequential decision making:		0.09329489353500527


In [45]:
print('AI, Cluster 4:')
terms_AI.sort(reverse=True, key=lambda x: x[4])
for el in terms_AI:
    if el[4] <= 0.08:
        break
    print(f'{el[0]}:\t\t{el[4]}')

AI, Cluster 4:
2d pca:		0.41248969475401087
data provenance:		0.3691581957477952
support vector machines:		0.35624208117958267
anomaly detection:		0.3135740628271627
gaussian processes:		0.3120982978225472
data exchange:		0.3061260905807043
markov decision processes:		0.3022209102690163
inverse reinforcement learning:		0.27748720625730194
adversarial learning:		0.18795795516606512
boosting:		0.17510018390079676
distributed database recovery:		0.11922249146431198
apprenticeship learning:		0.1040210815727557
sequential decision making:		0.09725632599410741


In [46]:
print('AI, Cluster 5:')
terms_AI.sort(reverse=True, key=lambda x: x[5])
for el in terms_AI:
    if el[5] <= 0.08:
        break
    print(f'{el[0]}:\t\t{el[5]}')

AI, Cluster 5:
active learning settings:		0.7638199485207781
online learning theory:		0.5188313814228773
inconsistent:		0.21671652627093083
mixture modeling:		0.15583155426025344
document topic models:		0.14401678794741118
test collections:		0.08217422983710546


#### Machine Learning (Statistics)

In [47]:

contrib_ML = np.load('clusters/ML_contrib.npy')
print(f'Cluster contributions: {contrib_ML}')

Cluster contributions: [0.0109593  0.0109379  0.00591683 0.00649718 0.00315163]


In [48]:
print('Machine Learning (Statistics), Cluster 1:')
terms_ML.sort(reverse=True, key=lambda x: x[1])
for el in terms_ML:
    if el[1] <= 0.08:
        break
    print(f'{el[0]}:\t\t{el[1]}')

Machine Learning (Statistics), Cluster 1:
support vector machines:		0.5240686712757624
gaussian processes:		0.4845274720898923
data provenance:		0.3372976940499275
2d pca:		0.33560707262928197
sequential decision making:		0.2737629062115469
anomaly detection:		0.26707278524550965
bayesian network models:		0.21292658107295484
markov networks:		0.1359470326982739
recommender systems:		0.12096745088355292
inverse reinforcement learning:		0.10122986349601061
boosting:		0.09930585707490182


In [49]:
print('Machine Learning (Statistics), Cluster 2:')
terms_ML.sort(reverse=True, key=lambda x: x[2])
for el in terms_ML:
    if el[2] <= 0.08:
        break
    print(f'{el[0]}:\t\t{el[2]}')

Machine Learning (Statistics), Cluster 2:
support vector machines:		0.5256705700902765
gaussian processes:		0.48677073141400606
data provenance:		0.33772196449783487
2d pca:		0.3369103645223354
sequential decision making:		0.27310282881552517
anomaly detection:		0.2663413983791674
markov network models:		0.1861799234834077
bayesian networks:		0.16120439704871395
recommender systems:		0.12053424152562262
inverse reinforcement learning:		0.10085378597078183
boosting:		0.09864370067629626


In [51]:
print('Machine Learning (Statistics), Cluster 3:')
terms_ML.sort(reverse=True, key=lambda x: x[3])
for el in terms_ML:
    if el[3] <= 0.08:
        break
    print(f'{el[0]}:\t\t{el[3]}')

Machine Learning (Statistics), Cluster 3:
sequential decision making:		0.5155500146435154
inverse reinforcement learning:		0.4968479867227759
apprenticeship learning:		0.3231377455354561
data cleaning:		0.28405802108265066
learning to rank:		0.2410100686268029
recommender systems:		0.213489635933461
gaussian processes:		0.2046272929827413
boosting:		0.17887829732442265
distributed database recovery:		0.1554593518502896
cluster analysis:		0.15402236881563303
support vector machines:		0.14601005895274918
multi-agent reinforcement learning:		0.12073942310501187
markov decision processes:		0.11591681319838627
2d pca:		0.09262167933317465
