# Цель работы:
Исследовать сходство и различия в толковании понятий "machine learning" и "data science" через анализ публикаций в arXiv с использованием метода обобщения в таксономиях.

# Шаг 1
Выделить подмножества публикаций в arXiv, ML и DS, отвечающие соответствующим запросам.

In [2]:
import pandas as pd
import numpy as np

# Load the arXiv metadata snapshot; lines=True reads the file as JSON Lines
# (one record per line).
# NOTE(review): the whole snapshot is held in memory as one DataFrame —
# confirm the machine has enough RAM for the file being used.
df = pd.read_json('arxiv-metadata-oai-snapshot.json', lines=True)

Filter the cs.LG, cs.AI and stat.ML papers, keeping those updated since the start of 2022

In [3]:
# Papers whose category list mentions cs.LG. regex=False makes the match
# literal; in the default regex mode the '.' would match any character.
df_LG = df[df['categories'].str.contains('cs.LG', regex=False)]

In [4]:
# Papers whose category list mentions cs.AI. regex=False makes the match
# literal; in the default regex mode the '.' would match any character.
df_AI = df[df['categories'].str.contains('cs.AI', regex=False)]

In [5]:
# Papers whose category list mentions stat.ML. regex=False makes the match
# literal; in the default regex mode the '.' would match any character.
df_ML = df[df['categories'].str.contains('stat.ML', regex=False)]

In [6]:
# Keep only the cs.LG rows whose update_date compares greater than '2022'.
df_LG = df_LG.loc[df_LG['update_date'].gt('2022')]

In [7]:
# Keep only the cs.AI rows whose update_date compares greater than '2022'.
df_AI = df_AI.loc[df_AI['update_date'].gt('2022')]

In [8]:
# Keep only the stat.ML rows whose update_date compares greater than '2022'.
df_ML = df_ML.loc[df_ML['update_date'].gt('2022')]

In [9]:
# Report how many papers remain in each of the three subsets.
print(
    f'ML articles: {len(df_LG)}',
    f'AI articles: {len(df_AI)}',
    f'ML (statistics) articles: {len(df_ML)}',
    sep='\n',
)

ML articles: 42929
AI articles: 20841
ML (statistics) articles: 9336


In [10]:
# Plain Python list of the cs.LG abstracts, in DataFrame order.
abstracts_LG = list(df_LG['abstract'])

In [11]:
# Plain Python list of the cs.AI abstracts, in DataFrame order.
abstracts_AI = list(df_AI['abstract'])

In [12]:
# Plain Python list of the stat.ML abstracts, in DataFrame order.
abstracts_ML = list(df_ML['abstract'])

## Шаг 2
Выделить разумные нечеткие кластеры понятий таксономии согласно ML и согласно DS

In [13]:
import re

import numpy as np

from east.asts import base


def clear_text(text, lowerize=True):
    """Strip unsupported characters from *text* and split it into tokens.

    Every run of characters other than Latin/Cyrillic letters, digits,
    space, hyphen, newline, carriage return and ``. , ; ! ?`` is replaced
    by a single space. The result is optionally lower-cased and then split
    on whitespace.

    Args:
        text: The raw string to clean.
        lowerize: When true (the default), lower-case the text before
            splitting.

    Returns:
        The list of whitespace-separated tokens.
    """
    disallowed_run = re.compile(r'[^A-Za-z0-9 \-\n\r.,;!?А-Яа-я]+')
    normalized = disallowed_run.sub(' ', text)
    return (normalized.lower() if lowerize else normalized).split()


def make_substrings(tokens, k=4):
    """Yield space-joined sliding windows of *k* consecutive tokens.

    At least one window is always produced: when there are fewer than *k*
    tokens, the single window holds all of them (an empty string for an
    empty token list).

    Args:
        tokens: Sequence of string tokens.
        k: Window length in tokens.

    Yields:
        Each window joined with single spaces, in order of position.
    """
    window_count = max(len(tokens) - k + 1, 1)
    yield from (
        ' '.join(tokens[start:start + k]) for start in range(window_count)
    )


def get_corelevance_matrix(texts):
    """Score every text against every text and return the square matrix.

    Each text is cleaned with ``clear_text`` and cut into token windows by
    ``make_substrings``; ``base.AST.get_ast`` builds a structure from those
    windows (presumably an annotated suffix tree from the ``east``
    package), whose ``score`` method is then applied to every cleaned text.

    Args:
        texts: Sized collection of raw text strings.

    Returns:
        A float array of shape ``(len(texts), len(texts))``; row ``i``
        holds the scores of all texts against the structure built from
        text ``i``.
    """
    prepared_text_tokens = [clear_text(t) for t in texts]
    prepared_texts = [' '.join(tokens) for tokens in prepared_text_tokens]

    # Allocate the result once. Growing it with np.append copied the whole
    # matrix on every iteration, which is quadratic in the number of texts.
    matrix = np.empty((len(prepared_text_tokens), len(texts)), float)

    for i, text_tokens in enumerate(prepared_text_tokens):
        ast = base.AST.get_ast(list(make_substrings(text_tokens)))
        matrix[i] = [ast.score(t) for t in prepared_texts]

    return matrix


def get_relevance_matrix(texts, strings):
    """Score every string against every text and return the matrix.

    Each text is cleaned with ``clear_text`` and cut into token windows by
    ``make_substrings``; ``base.AST.get_ast`` builds a structure from those
    windows (presumably an annotated suffix tree from the ``east``
    package), whose ``score`` method is then applied to every cleaned
    string. Progress is printed after each processed text.

    Args:
        texts: Collection of raw text strings, one matrix row each.
        strings: Sized collection of raw key strings, one matrix column
            each.

    Returns:
        A float array of shape ``(number of texts, len(strings))``.
    """
    prepared_text_tokens = [clear_text(t) for t in texts]

    prepared_string_tokens = [clear_text(s) for s in strings]
    prepared_strings = [' '.join(tokens) for tokens in prepared_string_tokens]

    # Allocate the result once. Growing it with np.append copied the whole
    # matrix on every iteration, which is quadratic in the number of texts.
    matrix = np.empty((len(prepared_text_tokens), len(strings)), float)

    # Progress output
    print(f'total len: {len(prepared_text_tokens)}')

    for c, text_tokens in enumerate(prepared_text_tokens, start=1):
        ast = base.AST.get_ast(list(make_substrings(text_tokens)))
        matrix[c - 1] = [ast.score(s) for s in prepared_strings]
        print(f'processed: {c}')

    return matrix


def save_matrix(matrix, filename="filename"):
    """Write *matrix* to *filename* as text via ``np.savetxt``.

    Args:
        matrix: Array-like data to store.
        filename: Destination path; defaults to ``"filename"``.
    """
    np.savetxt(fname=filename, X=matrix)

Get strings of terms

In [14]:
# Read the taxonomy leaf terms, one per line, without surrounding whitespace.
with open("taxonomies/taxonomy_leaves.txt") as f:
    strings = [line.strip() for line in f]

In [None]:
# Relevance between the cs.LG abstracts and the taxonomy strings:

relevance_matrix_LG = get_relevance_matrix(abstracts_LG, strings)
print("Relevance matrix LG:")
print(relevance_matrix_LG)

save_matrix(relevance_matrix_LG, "matrixes/relevmtx_LG")

In [None]:
# Relevance between the cs.AI abstracts and the taxonomy strings:

relevance_matrix_AI = get_relevance_matrix(abstracts_AI, strings)
print("Relevance matrix AI:")
print(relevance_matrix_AI)

save_matrix(relevance_matrix_AI, "matrixes/relevmtx_AI")

In [None]:
# Relevance between the stat.ML abstracts and the taxonomy strings:

relevance_matrix_ML = get_relevance_matrix(abstracts_ML, strings)
print("Relevance matrix ML:")
print(relevance_matrix_ML)

save_matrix(relevance_matrix_ML, "matrixes/relevmtx_ML")

Obtain fuzzy clusters

In [18]:
import numpy as np
from pyfaddis.lapin import lapin
from pyfaddis.faddis import faddis

from operator import itemgetter

In [None]:
# Reload the cs.LG relevance matrix written by save_matrix
# (one row per abstract, one column per taxonomy string).
relevance_matrix_LG = np.loadtxt("matrixes/relevmtx_LG")
print(relevance_matrix_LG.shape)
# String-by-string matrix: inner products of the relevance columns.
tc = relevance_matrix_LG.T.dot(relevance_matrix_LG)
print(tc.shape)

# NOTE(review): lapin presumably applies a Laplacian-based transformation and
# faddis extracts the fuzzy clusters — confirm against the pyfaddis package.
tc_transformed = lapin(tc)
B, member, contrib, intensity, lat, tt = faddis(tc_transformed)
# Persist the results: `member` as text, the others as .npy files
# (np.save appends the extension).
np.savetxt("clusters/clusters_LG.dat", member)
np.save("clusters/LG_contrib", contrib)
np.save("clusters/LG_intensity", intensity)
np.save("clusters/LG_lat", lat)
np.save("clusters/LG_tt", tt)

In [21]:
# Reload the cs.AI relevance matrix written by save_matrix
# (one row per abstract, one column per taxonomy string).
relevance_matrix_AI = np.loadtxt("matrixes/relevmtx_AI")
print(relevance_matrix_AI.shape)
# String-by-string matrix: inner products of the relevance columns.
tc = relevance_matrix_AI.T.dot(relevance_matrix_AI)
print(tc.shape)

# NOTE(review): lapin presumably applies a Laplacian-based transformation and
# faddis extracts the fuzzy clusters — confirm against the pyfaddis package.
tc_transformed = lapin(tc)
B, member, contrib, intensity, lat, tt = faddis(tc_transformed)
# Persist the results: `member` as text, the others as .npy files
# (np.save appends the extension).
np.savetxt("clusters/clusters_AI.dat", member)
np.save("clusters/AI_contrib", contrib)
np.save("clusters/AI_intensity", intensity)
np.save("clusters/AI_lat", lat)
np.save("clusters/AI_tt", tt)

(20841, 379)
(379, 379)
Cluster contribution is too small


In [22]:
# Reload the stat.ML relevance matrix written by save_matrix
# (one row per abstract, one column per taxonomy string).
relevance_matrix_ML = np.loadtxt("matrixes/relevmtx_ML")
print(relevance_matrix_ML.shape)
# String-by-string matrix: inner products of the relevance columns.
tc = relevance_matrix_ML.T.dot(relevance_matrix_ML)
print(tc.shape)

# NOTE(review): lapin presumably applies a Laplacian-based transformation and
# faddis extracts the fuzzy clusters — confirm against the pyfaddis package.
tc_transformed = lapin(tc)
B, member, contrib, intensity, lat, tt = faddis(tc_transformed)
# Persist the results: `member` as text, the others as .npy files
# (np.save appends the extension).
np.savetxt("clusters/clusters_ML.dat", member)
np.save("clusters/ML_contrib", contrib)
np.save("clusters/ML_intensity", intensity)
np.save("clusters/ML_lat", lat)
np.save("clusters/ML_tt", tt)

(9336, 379)
(379, 379)
Cluster contribution is too small


# Analysis

In [110]:
# Report how many papers are in each of the three subsets.
print(
    f'ML articles: {len(df_LG)}',
    f'AI articles: {len(df_AI)}',
    f'ML (statistics) articles: {len(df_ML)}',
    sep='\n',
)

ML articles: 42929
AI articles: 20841
ML (statistics) articles: 9336


In [118]:
# Reload the `member` arrays that the clustering cells wrote with np.savetxt,
# one file per corpus.
member_LG = np.loadtxt('clusters/clusters_LG.dat')
member_AI = np.loadtxt('clusters/clusters_AI.dat')
member_ML = np.loadtxt('clusters/clusters_ML.dat')

In [120]:
# Pair every taxonomy string with its row of the membership array:
# (string, value in column 0, value in column 1, ...).
# Unpacking the whole row removes the hard-coded column count, so the tuples
# follow however many columns each membership file actually contains.
terms_LG = [(el, *member_LG[i]) for i, el in enumerate(strings)]

terms_AI = [(el, *member_AI[i]) for i, el in enumerate(strings)]

terms_ML = [(el, *member_ML[i]) for i, el in enumerate(strings)]

#### Machine Learning (Computer Science)

In [121]:
# `contrib` array stored by the cs.LG clustering cell via np.save.
contrib_LG = np.load('clusters/LG_contrib.npy')
print(f'Cluster contributions: {contrib_LG}')



Cluster contributions: [0.01593554 0.01557557 0.00655408 0.00712074 0.00241163]


In [122]:
print('Machine Learning, Cluster 1:')
# Order the terms by the first membership value, largest first.
terms_LG.sort(key=itemgetter(1), reverse=True)
for el in terms_LG:
    # Descending order: stop at the first value that equals zero.
    if el[1] == 0:
        break
    print(f'{el[0]}:\t\t{el[1]}')


Machine Learning, Cluster 1:
data provenance:		0.4337646436749266
2d pca:		0.4200553802871822
bayesian network models:		0.37343141255300594
adversarial learning:		0.33608056630589883
cluster analysis:		0.29552641681032543
markov networks:		0.2428307758986762
distributed database recovery:		0.2425845115367029
data exchange:		0.1974674127407011
data cleaning:		0.19617240053455479
information extraction:		0.1337805882877761
sequential decision making:		0.13071013153836322
anomaly detection:		0.11189830511379048
markov decision processes:		0.10469428507884355
recommender systems:		0.10017294688175708
inverse reinforcement learning:		0.08648519759893851
data locking:		0.08620640658775858
learning to rank:		0.08417138190948344
multi-agent reinforcement learning:		0.06871282152177126
apprenticeship learning:		0.059628387249230516
support vector machines:		0.038684693115777054
boosting:		0.00789701305904788
gaussian processes:		0.006527853448585735
fuzzy representation:		3.200534048444169e-10


In [123]:
print('Machine Learning, Cluster 2:')
# Order the terms by the second membership value, largest first.
terms_LG.sort(key=itemgetter(2), reverse=True)
for el in terms_LG:
    # Descending order: stop at the first value that equals zero.
    if el[2] == 0:
        break
    print(f'{el[0]}:\t\t{el[2]}')

Machine Learning, Cluster 2:
data provenance:		0.4396953457030884
2d pca:		0.4261918057130437
adversarial learning:		0.33991644481846445
markov network models:		0.337185005712885
cluster analysis:		0.29965509845437854
bayesian networks:		0.2838477447980121
data cleaning:		0.20021050762244857
data exchange:		0.19748113508204618
distributed data locking:		0.18739884707557178
database recovery:		0.1427764126134109
information extraction:		0.1355731243340465
sequential decision making:		0.1273557894597782
anomaly detection:		0.10996522792090706
markov decision processes:		0.10240506566044434
recommender systems:		0.0993942122933422
inverse reinforcement learning:		0.08781355067614915
learning to rank:		0.07964307184603972
multi-agent reinforcement learning:		0.06636223012671934
apprenticeship learning:		0.06031408968942941
support vector machines:		0.04020082421555917
boosting:		0.007277178418447111
gaussian processes:		0.004412184746553468


In [124]:
print('Machine Learning, Cluster 3:')
# Order the terms by the third membership value, largest first.
terms_LG.sort(key=itemgetter(3), reverse=True)
for el in terms_LG:
    # Descending order: stop at the first value that equals zero.
    if el[3] == 0:
        break
    print(f'{el[0]}:\t\t{el[3]}')

Machine Learning, Cluster 3:
learning to rank:		0.415242407867412
sequential decision making:		0.394426726457116
support vector machines:		0.3606143248419246
markov decision processes:		0.3142283090006086
data cleaning:		0.2785466464726828
information extraction:		0.24724563515801362
anomaly detection:		0.22278892218598814
inverse reinforcement learning:		0.22127732072230655
apprenticeship learning:		0.2127356688332793
multi-agent reinforcement learning:		0.18151191850018808
data exchange:		0.16828878348945764
cluster analysis:		0.15635097039667567
recommender systems:		0.13433996479355195
distributed data locking:		0.09872628766826216
2d pca:		0.09811096193279259
data provenance:		0.0966267289547563
gaussian processes:		0.08875219552958068
boosting:		0.08825170205309776
database recovery:		0.07544625232640033
markov network models:		0.06283010452172838
bayesian networks:		0.052930468198218075
adversarial learning:		0.008300701757279474


In [125]:
print('Machine Learning, Cluster 4:')
# Order the terms by the fourth membership value, largest first.
terms_LG.sort(key=itemgetter(4), reverse=True)
for el in terms_LG:
    # Descending order: stop at the first value that equals zero.
    if el[4] == 0:
        break
    print(f'{el[0]}:\t\t{el[4]}')

Machine Learning, Cluster 4:
learning to rank:		0.428250270584852
sequential decision making:		0.40188377577237333
support vector machines:		0.37609595455774997
markov decision processes:		0.3202858950697762
data cleaning:		0.2726319991639786
information extraction:		0.24559003539328272
inverse reinforcement learning:		0.2236187067978763
anomaly detection:		0.2216679317871739
apprenticeship learning:		0.2170748235851293
multi-agent reinforcement learning:		0.182571231095051
data exchange:		0.15565322824366057
cluster analysis:		0.13328470956239333
recommender systems:		0.1302826306169525
distributed database recovery:		0.1081334582627704
gaussian processes:		0.09153375397976467
boosting:		0.09099344956053369
2d pca:		0.059541528210488706
data provenance:		0.055173096029132465
data locking:		0.037500644647982466
bayesian network models:		0.03283750788195399
markov networks:		0.020742726760064008


In [126]:
print('Machine Learning, Cluster 5:')
# Order the terms by the fifth membership value, largest first.
terms_LG.sort(key=itemgetter(5), reverse=True)
for el in terms_LG:
    # Descending order: stop at the first value that equals zero.
    if el[5] == 0:
        break
    print(f'{el[0]}:\t\t{el[5]}')

Machine Learning, Cluster 5:
query representation:		0.9263917911984688
fuzzy clustering:		0.22333130825872058
mixture modeling:		0.19808927760468295
hierarchical data models:		0.1403275886325566
inconsistent data:		0.10157868558958608
document topic models:		0.08242752357094182
test collections:		0.05483188201885137
probabilistic retrieval models:		0.05083477963391369
temporal reasoning:		0.04414367642360795
vagueness and fuzzy logic:		0.030620996307408
neuro-fuzzy approach:		0.029732505973361467
kernel independent components:		0.02704075883930022
search results deduplication:		0.02613326096397946
shape representations:		0.025940410527286246
database query processing and optimization:		0.019880117068685553
active learning:		0.019522033866878466
sparse tensor:		0.019032182537057133
canonical correlation analysis:		0.01846109330491038
online learning settings:		0.015424708539636922
object recognition:		0.012936945940312033
models of learning:		0.01269886074416544
structured query languag

#### AI

In [117]:
# `contrib` array stored by the cs.AI clustering cell via np.save.
contrib_AI = np.load('clusters/AI_contrib.npy')
print(f'Cluster contributions: {contrib_AI}')


Cluster contributions: [0.02048352 0.01977237 0.0055853  0.00547745 0.00286389]


In [104]:
print('AI, Cluster 1:')
# Order the terms by the first membership value, largest first.
terms_AI.sort(key=itemgetter(1), reverse=True)
for el in terms_AI:
    # Descending order: stop at the first value that equals zero.
    if el[1] == 0:
        break
    print(f'{el[0]}:\t\t{el[1]}')

AI, Cluster 1:
support vector machines:		0.3819018050052615
anomaly detection:		0.35480000486855523
distributed database recovery:		0.3428976977378799
2d pca:		0.326197441647439
gaussian processes:		0.27492639083327725
boosting:		0.26870030745010426
markov decision processes:		0.25729433468901614
cluster analysis:		0.2526371877594142
bayesian network models:		0.24137518484187223
apprenticeship learning:		0.17163801632548897
markov networks:		0.1565495007637852
recommender systems:		0.14674827846309288
information extraction:		0.13031835919783943
data locking:		0.12476473857591884
learning to rank:		0.12182756680249088
data cleaning:		0.11299497117537075
adversarial learning:		0.09461650350790746
sequential decision making:		0.0917504874887954
data exchange:		0.09019242142565233
multi-agent reinforcement learning:		0.05835104483678031
data provenance:		0.05028247248097171
inverse reinforcement learning:		0.03335358561056711
inconsistent data:		2.428009045602563e-10
probabilistic retriev

In [105]:
print('AI, Cluster 2:')
# Order the terms by the second membership value, largest first.
terms_AI.sort(key=itemgetter(2), reverse=True)
for el in terms_AI:
    # Descending order: stop at the first value that equals zero.
    if el[2] == 0:
        break
    print(f'{el[0]}:\t\t{el[2]}')

AI, Cluster 2:
support vector machines:		0.4097246523865225
2d pca:		0.34313786294874077
anomaly detection:		0.3420705733991426
gaussian processes:		0.27984167662897663
distributed data locking:		0.2755050924666692
boosting:		0.26740165675834565
markov decision processes:		0.25369554511481973
cluster analysis:		0.236124752887918
markov network models:		0.2244326824453322
database recovery:		0.21509584815670188
bayesian networks:		0.1851059051914503
apprenticeship learning:		0.15936089161891304
recommender systems:		0.14526914399850377
information extraction:		0.12886503095103707
learning to rank:		0.11825584537426756
data cleaning:		0.1143902958207284
adversarial learning:		0.09121618218204301
data exchange:		0.08954881371387764
sequential decision making:		0.08139716720414208
data provenance:		0.06151437796089847
multi-agent reinforcement learning:		0.04813867495370546
inverse reinforcement learning:		0.03733584259163887


In [106]:
print('AI, Cluster 3:')
# Order the terms by the third membership value, largest first.
terms_AI.sort(key=itemgetter(3), reverse=True)
for el in terms_AI:
    # Descending order: stop at the first value that equals zero.
    if el[3] == 0:
        break
    print(f'{el[0]}:\t\t{el[3]}')

AI, Cluster 3:
support vector machines:		0.467073855287433
cluster analysis:		0.42633169593482256
anomaly detection:		0.38263054438290045
apprenticeship learning:		0.3035107462080092
data provenance:		0.25017700624479583
2d pca:		0.24893826470473518
distributed database recovery:		0.2389726853396938
sequential decision making:		0.2305779247640998
multi-agent reinforcement learning:		0.2050782724160649
markov decision processes:		0.16010643555240034
inverse reinforcement learning:		0.10375659113238551
boosting:		0.10301467661479788
learning to rank:		0.10221596415369721
data locking:		0.08693069392538968
bayesian network models:		0.0846975436514543
adversarial learning:		0.06514882724722768
markov networks:		0.054920410755000225
recommender systems:		0.053040476032016746
information extraction:		0.029322720497506245
gaussian processes:		0.02858323065359169
data exchange:		0.02563977377424034
data cleaning:		0.008366298954496835


In [128]:
print('AI, Cluster 4:')
# Order the terms by the fourth membership value, largest first.
terms_AI.sort(key=itemgetter(4), reverse=True)
for el in terms_AI:
    # Descending order: stop at the first value that equals zero.
    if el[4] == 0:
        break
    print(f'{el[0]}:\t\t{el[4]}')

AI, Cluster 4:
support vector machines:		0.4684796728498491
cluster analysis:		0.44121959117922027
anomaly detection:		0.3802025495308368
apprenticeship learning:		0.31479379045164757
data provenance:		0.2693193600689445
sequential decision making:		0.24376337047301355
2d pca:		0.2346307776315456
multi-agent reinforcement learning:		0.21953744136277295
distributed data locking:		0.16965400300612637
markov decision processes:		0.14557678366360427
database recovery:		0.13255615429656314
inverse reinforcement learning:		0.1098390911904022
learning to rank:		0.09785642703149249
boosting:		0.08077603625554645
adversarial learning:		0.06002314167790333
markov network models:		0.057330509891136874
bayesian networks:		0.04725557762851908
recommender systems:		0.040380823879650356
data exchange:		0.016948052271871076
information extraction:		0.015785433031568433


In [127]:
print('AI, Cluster 5:')
# Order the terms by the fifth membership value, largest first.
terms_AI.sort(key=itemgetter(5), reverse=True)
for el in terms_AI:
    # Descending order: stop at the first value that equals zero.
    if el[5] == 0:
        break
    print(f'{el[0]}:\t\t{el[5]}')

AI, Cluster 5:
learning to rank:		0.44415922476065994
information extraction:		0.4226382878998822
markov decision processes:		0.3395194849457521
data cleaning:		0.3063693800407274
data provenance:		0.267561113900364
recommender systems:		0.2583785315554916
boosting:		0.2485458993362883
2d pca:		0.23175675960452188
sequential decision making:		0.18900259067647765
adversarial learning:		0.18560273865739266
data exchange:		0.1601073546022096
apprenticeship learning:		0.12527786273345767
support vector machines:		0.1216555585320845
anomaly detection:		0.10568865269111494
cluster analysis:		0.09891908835740307
inverse reinforcement learning:		0.07839098120327641
gaussian processes:		0.05031403288789675
multi-agent reinforcement learning:		0.0436258966021172
distributed data locking:		0.043213988352151414
database recovery:		0.03379741917409715
markov network models:		0.0130211208083743
bayesian networks:		0.010749948130869429


#### Machine Learning (Statistics)

In [116]:

# `contrib` array stored by the stat.ML clustering cell via np.save.
contrib_ML = np.load('clusters/ML_contrib.npy')
print(f'Cluster contributions: {contrib_ML}')

Cluster contributions: [0.01943845 0.01867466 0.00483639]


In [107]:
print('Machine Learning (Statistics), Cluster 1:')
# Order the terms by the first membership value, largest first.
terms_ML.sort(key=itemgetter(1), reverse=True)
for el in terms_ML:
    # Descending order: stop at the first value that equals zero.
    if el[1] == 0:
        break
    print(f'{el[0]}:\t\t{el[1]}')

Machine Learning (Statistics), Cluster 1:
adversarial learning:		0.3736923549301199
learning to rank:		0.3685328483004863
distributed database recovery:		0.3527455968860433
boosting:		0.3469433442790289
multi-agent reinforcement learning:		0.3064425099811277
bayesian network models:		0.3027894284894846
support vector machines:		0.28473314716948805
data cleaning:		0.1949433135574657
markov networks:		0.19021091936548537
data exchange:		0.14816048088094896
sequential decision making:		0.13707836249628666
recommender systems:		0.13492380986079155
markov decision processes:		0.13441304175391872
apprenticeship learning:		0.1276275507342564
data locking:		0.11943487621544069
gaussian processes:		0.11792717160453721
data provenance:		0.09803808666851291
cluster analysis:		0.07524444295746532
information extraction:		0.03250740709333087
inverse reinforcement learning:		0.02919529880692395
anomaly detection:		0.015234436604552813
2d pca:		0.002436450163272978
fuzzy representation:		5.3740788134

In [108]:
print('Machine Learning (Statistics), Cluster 2:')
# Order the terms by the second membership value, largest first.
terms_ML.sort(key=itemgetter(2), reverse=True)
for el in terms_ML:
    # Descending order: stop at the first value that equals zero.
    if el[2] == 0:
        break
    print(f'{el[0]}:\t\t{el[2]}')

Machine Learning (Statistics), Cluster 2:
adversarial learning:		0.37946689111894366
learning to rank:		0.37509053164650547
boosting:		0.3526648123941726
multi-agent reinforcement learning:		0.31009805055848577
support vector machines:		0.2899971081974449
distributed data locking:		0.27854805037219893
markov network models:		0.26706665540265795
bayesian networks:		0.23314226913140526
database recovery:		0.20352167259633477
data cleaning:		0.1976597840373411
data exchange:		0.14995875920557247
sequential decision making:		0.1388511217453459
recommender systems:		0.13612519935719503
markov decision processes:		0.13590285676820169
apprenticeship learning:		0.12878592908564462
gaussian processes:		0.11897898622660151
data provenance:		0.09841994630344361
cluster analysis:		0.0758032826760344
information extraction:		0.03104780596555398
inverse reinforcement learning:		0.028446852603110584
anomaly detection:		0.01431916015369153
2d pca:		0.0011823843825989517


In [109]:
print('Machine Learning (Statistics), Cluster 3:')
# Order the terms by the third membership value, largest first.
terms_ML.sort(key=itemgetter(3), reverse=True)
for el in terms_ML:
    # Descending order: stop at the first value that equals zero.
    if el[3] == 0:
        break
    print(f'{el[0]}:\t\t{el[3]}')

Machine Learning (Statistics), Cluster 3:
data provenance:		0.49455791368616586
apprenticeship learning:		0.3928307404541719
sequential decision making:		0.33246769677579496
bayesian network models:		0.2827993911906054
multi-agent reinforcement learning:		0.24760203499107813
gaussian processes:		0.24455092106236578
adversarial learning:		0.2124162795962565
inverse reinforcement learning:		0.20807031598371856
cluster analysis:		0.19438661526075787
markov networks:		0.17765378251039915
data cleaning:		0.17685954526258704
2d pca:		0.1701004182528179
markov decision processes:		0.1511645008204334
information extraction:		0.13423831610767478
support vector machines:		0.0887637617092758
boosting:		0.08716866304026843
distributed data locking:		0.06502219174845411
learning to rank:		0.06404264874232415
anomaly detection:		0.05481575333800362
database recovery:		0.04760784367223929
data exchange:		0.03731493000949169
recommender systems:		0.012603563454247952
information visualization:		2.2150

## The following is a test ----

## ----