In [1]:
%matplotlib inline

In [119]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.vq import kmeans2
from scipy.spatial.distance import cdist
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.cluster import Birch, SpectralClustering, AffinityPropagation, KMeans
from time import time

import datasets
import preprocessing
import interactive
import metrics

In [98]:
ARTICLES_IN_CORPUS = 2000
MAX_FEATURES = 1000
STOP_WORDS = 'english'
DATASET_PATH = 'resources/datasets/'

TWENTY_NEWS = fetch_20newsgroups(shuffle=True, remove=('headers', 'footers', 'quotes'))
ALL_THE_NEWS = pd.read_csv(f"{DATASET_PATH}articles1.csv", nrows=ARTICLES_IN_CORPUS)
BBC = pd.read_csv(f"{DATASET_PATH}bbc-text.csv", nrows=ARTICLES_IN_CORPUS)

In [195]:
pd.read_csv(f"{DATASET_PATH}bbc-text.csv").shape

(2225, 2)

In [99]:
texts = {
    'bbc': BBC.text,
    '20news': TWENTY_NEWS.data[:ARTICLES_IN_CORPUS],
    'allthenews': ALL_THE_NEWS.content
}
labels = {
    'bbc': BBC.category,
    '20news': pd.Series([TWENTY_NEWS['target_names'][label_index]
               for label_index in TWENTY_NEWS['target'][:ARTICLES_IN_CORPUS]])
}

vectorizers = {name: preprocessing.NLPProcessor(max_features=MAX_FEATURES, stop_words=STOP_WORDS)
               for name in texts}
features = {name: vectorizers[name].fit_transform(texts[name]) for name in texts}
terms = {name: vectorizers[name].vec.get_feature_names() for name in texts}

In [100]:
labels['bbc'].value_counts(normalize=True)

sport            0.2340
business         0.2275
politics         0.1880
entertainment    0.1765
tech             0.1740
Name: category, dtype: float64

In [101]:
labels['20news'].value_counts(normalize=True)

soc.religion.christian      0.0590
comp.os.ms-windows.misc     0.0565
rec.motorcycles             0.0555
sci.electronics             0.0550
rec.autos                   0.0545
misc.forsale                0.0540
comp.sys.mac.hardware       0.0540
rec.sport.hockey            0.0540
rec.sport.baseball          0.0530
sci.crypt                   0.0525
sci.med                     0.0520
talk.politics.mideast       0.0510
comp.windows.x              0.0505
talk.politics.guns          0.0500
comp.sys.ibm.pc.hardware    0.0475
comp.graphics               0.0460
sci.space                   0.0435
alt.atheism                 0.0430
talk.politics.misc          0.0380
talk.religion.misc          0.0305
dtype: float64

In [102]:
assert labels['bbc'].nunique() == 5
assert labels['20news'].nunique() == 20

In [103]:
assert list(map(len, texts.values())) == [ARTICLES_IN_CORPUS] * 3

In [108]:
labeled = {'bbc': 5, '20news': 20}

algos = {
    KMeans,
    AffinityPropagation,
    SpectralClustering,
    Birch
}

In [143]:
out = []

for algo in algos:
    for name in labeled:
        print(f'trying {name} and {algo}')
        clusterer = algo() if algo == AffinityPropagation else algo(n_clusters=labeled[name])
        wall_time = time()
        preds = clusterer.fit_predict(features[name])
        scorer = metrics.GroundTruthMetrics(labels[name], preds)
        ami = scorer.adjusted_mutual_info_score()
        ari = scorer.adjusted_rand_index()
        h, c, v = scorer.homogeneity_completeness_v_measure()
        wall_time = time() - wall_time
        record = {'name': name, 'algo': clusterer.__class__.__name__,
                  'ami': ami, 'ari': ari, 'homogeneity': h, 'completeness': c, 'vmeasure': v,
                  'time': wall_time}
        print(record)
        out.append(record)

trying bbc and <class 'sklearn.cluster.k_means_.KMeans'>




{'name': 'bbc', 'algo': 'KMeans', 'ami': 0.7589299120313606, 'ari': 0.7682124779363028, 'homogeneity': 0.759534199478192, 'completeness': 0.7771736272767589, 'vmeasure': 0.7682526744146604, 'time': 11.679721117019653}
trying 20news and <class 'sklearn.cluster.k_means_.KMeans'>




{'name': '20news', 'algo': 'KMeans', 'ami': 0.21275130249384677, 'ari': 0.07107686994002867, 'homogeneity': 0.23834874636853356, 'completeness': 0.2564456251196784, 'vmeasure': 0.24706624319564024, 'time': 3.3954741954803467}
trying bbc and <class 'sklearn.cluster.birch.Birch'>




{'name': 'bbc', 'algo': 'Birch', 'ami': 0.51881635984108, 'ari': 0.4595031362196219, 'homogeneity': 0.5200234554116147, 'completeness': 0.5676643141130006, 'vmeasure': 0.542800547013462, 'time': 1.9231159687042236}
trying 20news and <class 'sklearn.cluster.birch.Birch'>




{'name': '20news', 'algo': 'Birch', 'ami': 0.17227248717322152, 'ari': 0.04205658851852501, 'homogeneity': 0.19981686984644084, 'completeness': 0.2654638362734414, 'vmeasure': 0.22800925172220654, 'time': 2.324589729309082}
trying bbc and <class 'sklearn.cluster.affinity_propagation_.AffinityPropagation'>
{'name': 'bbc', 'algo': 'AffinityPropagation', 'ami': 0.21044185205322666, 'ari': 0.028890470717590714, 'homogeneity': 0.8705194802364914, 'completeness': 0.2561439634251427, 'vmeasure': 0.39582061734761703, 'time': 11.44703483581543}
trying 20news and <class 'sklearn.cluster.affinity_propagation_.AffinityPropagation'>
{'name': '20news', 'algo': 'AffinityPropagation', 'ami': 0.009386270083282204, 'ari': 0.0018507799576087693, 'homogeneity': 0.1595410871811317, 'completeness': 0.11078185247604079, 'vmeasure': 0.13076402029647982, 'time': 7.350111961364746}
trying bbc and <class 'sklearn.cluster.spectral.SpectralClustering'>




{'name': 'bbc', 'algo': 'SpectralClustering', 'ami': 0.6488100754841307, 'ari': 0.611968423020058, 'homogeneity': 0.6496908776524083, 'completeness': 0.6929824480322945, 'vmeasure': 0.6706387417508599, 'time': 0.5402779579162598}
trying 20news and <class 'sklearn.cluster.spectral.SpectralClustering'>
{'name': '20news', 'algo': 'SpectralClustering', 'ami': 0.19745767911207757, 'ari': 0.05034881593512607, 'homogeneity': 0.22385891600236385, 'completeness': 0.2655311098049059, 'vmeasure': 0.2429207922976345, 'time': 0.8547642230987549}




In [146]:
pd.DataFrame(out).sort_values(by='ami', ascending=False)

Unnamed: 0,algo,ami,ari,completeness,homogeneity,name,time,vmeasure
0,KMeans,0.75893,0.768212,0.777174,0.759534,bbc,11.679721,0.768253
6,SpectralClustering,0.64881,0.611968,0.692982,0.649691,bbc,0.540278,0.670639
2,Birch,0.518816,0.459503,0.567664,0.520023,bbc,1.923116,0.542801
1,KMeans,0.212751,0.071077,0.256446,0.238349,20news,3.395474,0.247066
4,AffinityPropagation,0.210442,0.02889,0.256144,0.870519,bbc,11.447035,0.395821
7,SpectralClustering,0.197458,0.050349,0.265531,0.223859,20news,0.854764,0.242921
3,Birch,0.172272,0.042057,0.265464,0.199817,20news,2.32459,0.228009
5,AffinityPropagation,0.009386,0.001851,0.110782,0.159541,20news,7.350112,0.130764


### iKMeans

#### bbc

In [172]:
focus = 'bbc'
user_input = [
    ['government', 'tony', 'blair', 'party', 'liberal'],
    ['film', 'award', 'actor', 'actress', 'singer', 'oscar'],
    ['game', 'match', 'cup', 'victory', 'football', 'soccer', 'season'],
    ['economy', 'growth', 'oil', 'shares', 'demand', 'analyst'],
    ['digital', 'internet', 'phone', 'technology', 'internet']
]

wall_time = time()
clusters_docs, cluster_key_terms, key_terms, silhouette_avg = interactive.icluster(features[focus].todense(),
                                                                                   np.array(terms[focus]).reshape(1, -1),
                                                                                 user_input,
                                                                                   labeled[focus],
                                                                                   1)
wall_time = time() - wall_time

In [180]:
preds_dict = {
    doc - 1: i
    for i, docs in enumerate(clusters_docs)
    for doc in docs
}
preds_label = [preds_dict[i] for i in range(ARTICLES_IN_CORPUS)]
scorer = metrics.GroundTruthMetrics(labels[focus], preds_label)
ami = scorer.adjusted_mutual_info_score()
ari = scorer.adjusted_rand_index()
h, c, v = scorer.homogeneity_completeness_v_measure()



In [182]:
ami

0.7657592463562864

In [183]:
ari

0.8060840810692051

In [185]:
h

0.7663462316644689

In [186]:
v

0.768047342702954

In [187]:
c

0.7697560226873126

### importance of K

In [191]:
out = []

for algo in algos:
    for name in labels:
        for k in [2, 5, 10, 15, 20]:
            print(f'trying {name} and {algo}')
            clusterer = algo() if algo == AffinityPropagation else algo(n_clusters=k)
            wall_time = time()
            preds = clusterer.fit_predict(features[name])
            scorer = metrics.GroundTruthMetrics(labels[name], preds)
            ami = scorer.adjusted_mutual_info_score()
            ari = scorer.adjusted_rand_index()
            h, c, v = scorer.homogeneity_completeness_v_measure()
            wall_time = time() - wall_time
            record = {'name': name, 'algo': clusterer.__class__.__name__, 'k': k,
                      'ami': ami, 'ari': ari, 'homogeneity': h, 'completeness': c, 'vmeasure': v,
                      'time': wall_time}
            print(record)
            out.append(record)

trying bbc and <class 'sklearn.cluster.k_means_.KMeans'>




{'name': 'bbc', 'algo': 'KMeans', 'k': 2, 'ami': 0.30969046273906115, 'ari': 0.30694241936911354, 'homogeneity': 0.3101222548930779, 'completeness': 0.7418478923618657, 'vmeasure': 0.4373955701447933, 'time': 9.986953973770142}
trying bbc and <class 'sklearn.cluster.k_means_.KMeans'>




{'name': 'bbc', 'algo': 'KMeans', 'k': 5, 'ami': 0.7242483542377041, 'ari': 0.7211374523726868, 'homogeneity': 0.724939669061785, 'completeness': 0.7484617801253705, 'vmeasure': 0.736512965273403, 'time': 11.569300413131714}
trying bbc and <class 'sklearn.cluster.k_means_.KMeans'>




{'name': 'bbc', 'algo': 'KMeans', 'k': 10, 'ami': 0.5374823131528914, 'ari': 0.44262602477843566, 'homogeneity': 0.749583123149188, 'completeness': 0.5393649636937496, 'vmeasure': 0.627331508739148, 'time': 14.50144100189209}
trying bbc and <class 'sklearn.cluster.k_means_.KMeans'>




{'name': 'bbc', 'algo': 'KMeans', 'k': 15, 'ami': 0.5001197852602904, 'ari': 0.3832097727504691, 'homogeneity': 0.8162412607629222, 'completeness': 0.5028422322585107, 'vmeasure': 0.6223117487178803, 'time': 12.86903977394104}
trying bbc and <class 'sklearn.cluster.k_means_.KMeans'>




{'name': 'bbc', 'algo': 'KMeans', 'k': 20, 'ami': 0.4598838372011992, 'ari': 0.32745997387994813, 'homogeneity': 0.8170073215919175, 'completeness': 0.46357978250559767, 'vmeasure': 0.5915225527996896, 'time': 19.62586784362793}
trying 20news and <class 'sklearn.cluster.k_means_.KMeans'>




{'name': '20news', 'algo': 'KMeans', 'k': 2, 'ami': 0.06170139079891386, 'ari': 0.033298412512813395, 'homogeneity': 0.06320213985460693, 'completeness': 0.2723045136060166, 'vmeasure': 0.10259246887923813, 'time': 4.214666366577148}
trying 20news and <class 'sklearn.cluster.k_means_.KMeans'>




{'name': '20news', 'algo': 'KMeans', 'k': 5, 'ami': 0.13676547453162194, 'ari': 0.04834750606211893, 'homogeneity': 0.14237132667469699, 'completeness': 0.30800935011289987, 'vmeasure': 0.19473171059008643, 'time': 4.727804899215698}
trying 20news and <class 'sklearn.cluster.k_means_.KMeans'>




{'name': '20news', 'algo': 'KMeans', 'k': 10, 'ami': 0.1813327835766502, 'ari': 0.05479155294813021, 'homogeneity': 0.1935524122577321, 'completeness': 0.287233879363421, 'vmeasure': 0.2312662037242268, 'time': 4.0502259731292725}
trying 20news and <class 'sklearn.cluster.k_means_.KMeans'>




{'name': '20news', 'algo': 'KMeans', 'k': 15, 'ami': 0.2070038779507654, 'ari': 0.0691815521168252, 'homogeneity': 0.22589985227890794, 'completeness': 0.28257525251248355, 'vmeasure': 0.25107898970366993, 'time': 3.72645902633667}
trying 20news and <class 'sklearn.cluster.k_means_.KMeans'>




{'name': '20news', 'algo': 'KMeans', 'k': 20, 'ami': 0.19197635085431405, 'ari': 0.05123698071150063, 'homogeneity': 0.21854416584404404, 'completeness': 0.24632584692712509, 'vmeasure': 0.23160485840593636, 'time': 5.685532331466675}
trying bbc and <class 'sklearn.cluster.birch.Birch'>




{'name': 'bbc', 'algo': 'Birch', 'k': 2, 'ami': 0.24146358193741452, 'ari': 0.18739946493021312, 'homogeneity': 0.2419384261691893, 'completeness': 0.7266430121619478, 'vmeasure': 0.36301101753964754, 'time': 1.956022024154663}
trying bbc and <class 'sklearn.cluster.birch.Birch'>




{'name': 'bbc', 'algo': 'Birch', 'k': 5, 'ami': 0.51881635984108, 'ari': 0.4595031362196219, 'homogeneity': 0.5200234554116147, 'completeness': 0.5676643141130006, 'vmeasure': 0.542800547013462, 'time': 2.0145342350006104}
trying bbc and <class 'sklearn.cluster.birch.Birch'>




{'name': 'bbc', 'algo': 'Birch', 'k': 10, 'ami': 0.46977213011519364, 'ari': 0.38246703994150527, 'homogeneity': 0.6421051641435618, 'completeness': 0.4719800703855055, 'vmeasure': 0.5440532396885793, 'time': 2.0546181201934814}
trying bbc and <class 'sklearn.cluster.birch.Birch'>




{'name': 'bbc', 'algo': 'Birch', 'k': 15, 'ami': 0.42897689555471397, 'ari': 0.3295618600327984, 'homogeneity': 0.6623423620614173, 'completeness': 0.4322863137807781, 'vmeasure': 0.5231391146154488, 'time': 1.995591163635254}
trying bbc and <class 'sklearn.cluster.birch.Birch'>




{'name': 'bbc', 'algo': 'Birch', 'k': 20, 'ami': 0.40059014653878783, 'ari': 0.2740012074593462, 'homogeneity': 0.7013430781030255, 'completeness': 0.4047721900953649, 'vmeasure': 0.513299439748931, 'time': 2.0153920650482178}
trying 20news and <class 'sklearn.cluster.birch.Birch'>




{'name': '20news', 'algo': 'Birch', 'k': 2, 'ami': 0.04946191752125397, 'ari': 0.025247647449663305, 'homogeneity': 0.050983192989703906, 'completeness': 0.2326767083939936, 'vmeasure': 0.08363960835066273, 'time': 2.440546989440918}
trying 20news and <class 'sklearn.cluster.birch.Birch'>




{'name': '20news', 'algo': 'Birch', 'k': 5, 'ami': 0.09434048775273876, 'ari': 0.030977001027812007, 'homogeneity': 0.10053385143767936, 'completeness': 0.29955125722728426, 'vmeasure': 0.15054317663833167, 'time': 2.3809549808502197}
trying 20news and <class 'sklearn.cluster.birch.Birch'>




{'name': '20news', 'algo': 'Birch', 'k': 10, 'ami': 0.15077153749928054, 'ari': 0.0422756699751281, 'homogeneity': 0.16371380276308908, 'completeness': 0.2700915946468168, 'vmeasure': 0.20385971367799988, 'time': 2.9286227226257324}
trying 20news and <class 'sklearn.cluster.birch.Birch'>




{'name': '20news', 'algo': 'Birch', 'k': 15, 'ami': 0.15932209062621125, 'ari': 0.040676412361322446, 'homogeneity': 0.1796347998646293, 'completeness': 0.25767312563238803, 'vmeasure': 0.2116909283126322, 'time': 2.8367371559143066}
trying 20news and <class 'sklearn.cluster.birch.Birch'>




{'name': '20news', 'algo': 'Birch', 'k': 20, 'ami': 0.17227248717322152, 'ari': 0.04205658851852501, 'homogeneity': 0.19981686984644084, 'completeness': 0.2654638362734414, 'vmeasure': 0.22800925172220654, 'time': 2.5198071002960205}
trying bbc and <class 'sklearn.cluster.affinity_propagation_.AffinityPropagation'>
{'name': 'bbc', 'algo': 'AffinityPropagation', 'k': 2, 'ami': 0.21044185205322666, 'ari': 0.028890470717590714, 'homogeneity': 0.8705194802364914, 'completeness': 0.2561439634251427, 'vmeasure': 0.39582061734761703, 'time': 11.69841480255127}
trying bbc and <class 'sklearn.cluster.affinity_propagation_.AffinityPropagation'>
{'name': 'bbc', 'algo': 'AffinityPropagation', 'k': 5, 'ami': 0.21044185205322666, 'ari': 0.028890470717590714, 'homogeneity': 0.8705194802364914, 'completeness': 0.2561439634251427, 'vmeasure': 0.39582061734761703, 'time': 11.996942043304443}
trying bbc and <class 'sklearn.cluster.affinity_propagation_.AffinityPropagation'>
{'name': 'bbc', 'algo': 'Affin



{'name': 'bbc', 'algo': 'SpectralClustering', 'k': 2, 'ami': 0.16715283545073034, 'ari': 0.10950351247575946, 'homogeneity': 0.16767457270810077, 'completeness': 0.5910228100299617, 'vmeasure': 0.2612359008670222, 'time': 0.4728667736053467}
trying bbc and <class 'sklearn.cluster.spectral.SpectralClustering'>




{'name': 'bbc', 'algo': 'SpectralClustering', 'k': 5, 'ami': 0.6488100754841308, 'ari': 0.611968423020058, 'homogeneity': 0.6496908776524084, 'completeness': 0.6929824480322946, 'vmeasure': 0.67063874175086, 'time': 0.5241401195526123}
trying bbc and <class 'sklearn.cluster.spectral.SpectralClustering'>




{'name': 'bbc', 'algo': 'SpectralClustering', 'k': 10, 'ami': 0.5057711235692459, 'ari': 0.37645179371152226, 'homogeneity': 0.6505983266454891, 'completeness': 0.5079681519363808, 'vmeasure': 0.5705036970231265, 'time': 0.647672176361084}
trying bbc and <class 'sklearn.cluster.spectral.SpectralClustering'>




{'name': 'bbc', 'algo': 'SpectralClustering', 'k': 15, 'ami': 0.44347487901501775, 'ari': 0.2595397807960822, 'homogeneity': 0.672389154888744, 'completeness': 0.44675672608535794, 'vmeasure': 0.5368279195772643, 'time': 0.6998882293701172}
trying bbc and <class 'sklearn.cluster.spectral.SpectralClustering'>




{'name': 'bbc', 'algo': 'SpectralClustering', 'k': 20, 'ami': 0.3995302657057766, 'ari': 0.20067136239015082, 'homogeneity': 0.6693706132525197, 'completeness': 0.40391515483996165, 'vmeasure': 0.503815373193162, 'time': 0.8138911724090576}
trying 20news and <class 'sklearn.cluster.spectral.SpectralClustering'>




{'name': '20news', 'algo': 'SpectralClustering', 'k': 2, 'ami': 0.0636950971125398, 'ari': 0.033748139444553675, 'homogeneity': 0.06519269579555775, 'completeness': 0.2815726921763949, 'vmeasure': 0.1058726360940998, 'time': 0.3772013187408447}
trying 20news and <class 'sklearn.cluster.spectral.SpectralClustering'>




{'name': '20news', 'algo': 'SpectralClustering', 'k': 5, 'ami': 0.1135251915625939, 'ari': 0.038440398070133164, 'homogeneity': 0.1194724698745725, 'completeness': 0.2857564399026321, 'vmeasure': 0.1684974928195648, 'time': 0.44328784942626953}
trying 20news and <class 'sklearn.cluster.spectral.SpectralClustering'>




{'name': '20news', 'algo': 'SpectralClustering', 'k': 10, 'ami': 0.1749758503365892, 'ari': 0.047388542268242434, 'homogeneity': 0.18746481072817245, 'completeness': 0.29881959434340916, 'vmeasure': 0.2303925773116682, 'time': 0.5511918067932129}
trying 20news and <class 'sklearn.cluster.spectral.SpectralClustering'>




{'name': '20news', 'algo': 'SpectralClustering', 'k': 15, 'ami': 0.192987615799543, 'ari': 0.047778184861222484, 'homogeneity': 0.21241936064269967, 'completeness': 0.28550871807664047, 'vmeasure': 0.24359975644571388, 'time': 0.6675839424133301}
trying 20news and <class 'sklearn.cluster.spectral.SpectralClustering'>
{'name': '20news', 'algo': 'SpectralClustering', 'k': 20, 'ami': 0.19488731387396535, 'ari': 0.048987571013379375, 'homogeneity': 0.2214202602188926, 'completeness': 0.26294126067139445, 'vmeasure': 0.24040110474973667, 'time': 0.7346968650817871}




In [200]:
df = pd.DataFrame(out).sort_values(by='ami', ascending=False)
df

Unnamed: 0,algo,ami,ari,completeness,homogeneity,k,name,time,vmeasure
1,KMeans,0.724248,0.721137,0.748462,0.72494,5,bbc,11.5693,0.736513
31,SpectralClustering,0.64881,0.611968,0.692982,0.649691,5,bbc,0.52414,0.670639
2,KMeans,0.537482,0.442626,0.539365,0.749583,10,bbc,14.501441,0.627332
11,Birch,0.518816,0.459503,0.567664,0.520023,5,bbc,2.014534,0.542801
32,SpectralClustering,0.505771,0.376452,0.507968,0.650598,10,bbc,0.647672,0.570504
3,KMeans,0.50012,0.38321,0.502842,0.816241,15,bbc,12.86904,0.622312
12,Birch,0.469772,0.382467,0.47198,0.642105,10,bbc,2.054618,0.544053
4,KMeans,0.459884,0.32746,0.46358,0.817007,20,bbc,19.625868,0.591523
33,SpectralClustering,0.443475,0.25954,0.446757,0.672389,15,bbc,0.699888,0.536828
13,Birch,0.428977,0.329562,0.432286,0.662342,15,bbc,1.995591,0.523139


In [202]:
df[df.algo == 'KMeans'].sort_values(by=['name', 'k'])

Unnamed: 0,algo,ami,ari,completeness,homogeneity,k,name,time,vmeasure
5,KMeans,0.061701,0.033298,0.272305,0.063202,2,20news,4.214666,0.102592
6,KMeans,0.136765,0.048348,0.308009,0.142371,5,20news,4.727805,0.194732
7,KMeans,0.181333,0.054792,0.287234,0.193552,10,20news,4.050226,0.231266
8,KMeans,0.207004,0.069182,0.282575,0.2259,15,20news,3.726459,0.251079
9,KMeans,0.191976,0.051237,0.246326,0.218544,20,20news,5.685532,0.231605
0,KMeans,0.30969,0.306942,0.741848,0.310122,2,bbc,9.986954,0.437396
1,KMeans,0.724248,0.721137,0.748462,0.72494,5,bbc,11.5693,0.736513
2,KMeans,0.537482,0.442626,0.539365,0.749583,10,bbc,14.501441,0.627332
3,KMeans,0.50012,0.38321,0.502842,0.816241,15,bbc,12.86904,0.622312
4,KMeans,0.459884,0.32746,0.46358,0.817007,20,bbc,19.625868,0.591523


### user testing values

In [212]:
x = [1, 2, 1, 1, 2, 2, 2, 2, 1, 1]
y = [2, 3, 1, 2, 3, 2, 4, 2, 1, 2]

In [213]:
np.mean(x)

1.5

In [214]:
np.std(x)

0.5

In [215]:
np.mean(y)

2.2

In [216]:
np.std(y)

0.8717797887081347

In [217]:
x = [5, 7, 4, 5, 8, 2, 3, 5, 6, 1]
np.mean(x)

4.6

In [218]:
np.std(x)

2.0591260281974

In [219]:
y = [6, 7, 9, 9, 3, 4, 9, 11, 7, 8]
np.mean(y)

7.3

In [220]:
np.std(y)

2.3259406699226015