In [1]:
import requests
from bs4 import BeautifulSoup as bs 

import numpy as np 
import pandas as pd 
import time, re, json, pickle

In [2]:
url = 'https://distrowatch.com/search.php?status=All'
base_url = 'https://distrowatch.com/'

r = requests.get(url)
soup = bs(r.text, 'html.parser')

In [3]:
b = soup.find_all('b')
b[21]

<b>1. <a href="mx">MX Linux</a> (1)</b>

In [4]:
ranking = soup.find_all('b')[21:]

distros = [a.get_text() for a in ranking if a.find('a') is not None]
hrefs = [a.find('a')['href'] for a in ranking if a.find('a') is not None]

ranked_distros = distros[:275]
ranked_hrefs = hrefs[:275]

## Analyzing Single Page

In [5]:
r = requests.get(base_url + hrefs[1])
soup = bs(r.content, 'html.parser')

In [6]:
# Distro Name
name = soup.find('h1').text

# OS Information
out = soup.select('ul')[1]

outline_ = out.find_all('b')
outline = [b.get_text() for b in outline_]

os_info_ = out.find_all('a')
os_info = [spec.get_text(strip=True) for spec in os_info_]

In [7]:
print(outline)
print()
print(os_info)

t = out.text
print()
print(t)

['OS Type:', 'Based on:', 'Origin:', 'Architecture:', 'Desktop:', 'Category:', 'Status:', 'Popularity:']

['Linux', 'Arch', 'Austria, Germany, France', 'aarch64', 'x86_64', 'Awesome', 'bspwm', 'Budgie', 'Cinnamon', 'GNOME', 'i3', 'KDE Plasma', 'LXQt', 'MATE', 'Openbox', 'Xfce', 'Desktop', 'Live Medium', 'Raspberry Pi', '2 (2,343 hits per day)']

OS Type: LinuxBased on: ArchOrigin: Austria, Germany, France
Architecture: aarch64, x86_64Desktop: Awesome, bspwm, Budgie, Cinnamon, GNOME, i3, KDE Plasma, LXQt, MATE, Openbox, XfceCategory: Desktop, Live Medium, Raspberry Pi Status: ActivePopularity: 2 (2,343 hits per day)



In [8]:
def get_os_info(text):
    t = text

    split_index = [i for i, e in enumerate(t) if e.islower() and t[i+1].isupper()
                                        or e.isnumeric() and t[i+1].isupper()]

    for i, j in enumerate(split_index, start=1):
        t = t[:j+i] + ' ' + t[j+i:]

    t = re.sub(r'\n', ' ', t)

    # a = re.search(r'\b(OS Type):', t).span()[0]
    b = re.search(r'\b(Based on):', t).span()[0]
    c = re.search(r'\b(Origin):', t).span()[0]
    d = re.search(r'\b(Architecture):', t).span()[0]
    e = re.search(r'\b(Desktop):', t).span()[0]
    f = re.search(r'\b(Category):', t).span()[0]
    g = re.search(r'\b(Status):', t).span()[0]
    h = re.search(r'\b(Popularity):', t).span()[0]

    idx = [b, c, d, e, f, g, h]

    split_list = [t[i:j].strip() for i, j in zip(idx, idx[1:]+[None])]

    return split_list

In [9]:
text = get_os_info(t)

# Regular dictionary
di = {}

for a in text:
    di[a.split(':')[0]] = a.split(':')[1].strip()


# Defaultdict with lists
from collections import defaultdict
text = get_os_info(t)
d = defaultdict(list)

for a in text:
    d[a.split(':')[0]].append(a.split(':')[1].strip())

di['Origin']

'Austria, Germany, France'

In [10]:
def os_to_dict(text):
    d = {}
    for val in text:
        d[val.split(':')[0]] = val.split(':')[1].strip()

    return d

test = os_to_dict(text)

test

{'Based on': 'Arch',
 'Origin': 'Austria, Germany, France',
 'Architecture': 'aarch64, x86_64',
 'Desktop': 'Awesome, bspwm, Budgie, Cinnamon, GNOME, i3, KDE Plasma, LXQt, MATE, Openbox, Xfce',
 'Category': 'Desktop, Live Medium, Raspberry Pi',
 'Status': 'Active',
 'Popularity': '2 (2,343 hits per day)'}

### Releases / Versions Dataframe

In [11]:
dfs = pd.read_html(base_url + ranked_hrefs[1])

release = dfs[15].set_index(0)
release_df = release.iloc[:15].copy()
release_df.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Feature,stable,20.2,19.0,18.1.0,17.1.12,16.10.3,15.12,0.8.11,0.8.8,Feature
Release Date,2020-12-17,2020-12-03,2020-02-25,2019-09-12,2018-08-19,2016-11-27,2015-12-23,2014-12-01,2013-11-26,Release Date
End Of Life,,,,,,,,,,End Of Life
Price (US$),Free,Free,Free,Free,Free,Free,Free,Free,Free,Price (US$)
Image Size (MB),,2600-3000,2500-2900,1800-2100,1600-2100,1533-2116,589-1856,472-1481,582-1861,Image Size (MB)


### Rating

In [12]:
rating = soup.find('div', attrs={'style': 'font-size: 64px; text-align: left'}).text

rating

'8.4'

### Overall Ranking df

In [13]:
rank_dfs = pd.read_html('https://distrowatch.com/dwres.php?resource=popularity')

mo_12 = rank_dfs[8].drop('Last 12 months', axis=1).rename(columns={
            'Last 12 months.1': 'Distro',
            'Last 12 months.2': '12 months'
})

mo_6 = rank_dfs[9].drop('Last 6 months', axis=1).rename(columns={
            'Last 6 months.1': 'Distro',
            'Last 6 months.2': '6 months'
})

mo_3 = rank_dfs[10].drop('Last 3 months', axis=1).rename(columns={
            'Last 3 months.1': 'Distro',
            'Last 3 months.2': '3 months'
})

mo_1 = rank_dfs[11].drop('Last 1 month', axis=1).rename(columns={
            'Last 1 month.1': 'Distro',
            'Last 1 month.2': '1 month'
})

In [14]:
from functools import reduce

rank_df = reduce(lambda x,y: pd.merge(x,y, on='Distro', how='outer'), [mo_12, mo_6, mo_3, mo_1])

# rank_df.to_csv('df-ranks.csv', index=False, header=rank_df.columns.values)

# The numbers represent the clicks per day
rank_df.head()

Unnamed: 0,Distro,12 months,6 months,3 months,1 month
0,MX Linux,3945,3409,3417,3294
1,Manjaro,2709,2343,2395,2571
2,Mint,2345,2061,1990,2087
3,Ubuntu,1608,1377,1414,1275
4,Debian,1433,1250,1275,1337


## All distros

In [15]:
def get_os_info_full(text):
    t = text

    split_index = [i for i, e in enumerate(t) if e.islower() and t[i+1].isupper()
                                        or e.isnumeric() and t[i+1].isupper()]

    for i, j in enumerate(split_index, start=1):
        t = t[:j+i] + ' ' + t[j+i:]

    t = re.sub(r'\n', ' ', t)

    # a = re.search(r'\b(OS Type):', t).span()[0]
    try:
        b = re.search(r'\b(Based on):', t).span()[0]
    except:
        b = None
    
    try:
        c = re.search(r'\b(Origin):', t).span()[0]
    except:
        c = None

    try: 
        d = re.search(r'\b(Architecture):', t).span()[0]
    except:
        d = None

    try:
        e = re.search(r'\b(Desktop):', t).span()[0]
    except:
        e = None
    
    try:
        f = re.search(r'\b(Category):', t).span()[0]
    except:
        f = None

    try:
        g = re.search(r'\b(Status):', t).span()[0]
    except:
        g = None

    try:
        h = re.search(r'\b(Popularity):', t).span()[0]
    except:
        h = None

    idx = [b, c, d, e, f, g, h]

    split_list = [t[i:j].strip() for i, j in zip(idx, idx[1:]+[None])]

    return split_list

#### I was going to use this because it looks much more elegant, however it returns multiple entries in the list and isn't split up right

In [16]:
def try_except(string):
    try:
        return re.search(rf'\b({string}):', out).span()[0]
    except:
        return None


def get_os_info_full_2(text):
    t = text

    split_index = [i for i, e in enumerate(t) if e.islower() and t[i+1].isupper()
                                        or e.isnumeric() and t[i+1].isupper()]

    for i, j in enumerate(split_index, start=1):
        t = t[:j+i] + ' ' + t[j+i:]

    t = re.sub(r'\n', ' ', t)

    b = try_except('Based on')
    c = try_except('Origin')
    d = try_except('Architecture')
    e = try_except('Desktop')
    f = try_except('Category')
    g = try_except('Status')
    h = try_except('Status')

    idx = [a, b, c, d, e, f, g, h]

    split_list = [t[i:j].strip() for i, j in zip(idx, idx[1:]+[None])]

    return split_list

# get_os_info_full_2(out)

In [18]:
def get_page(url):
    response = requests.get(url)

    if not response.ok:
        print('Server responded: ', response.status_code)
    else:
        soup = bs(response.text, 'html.parser')

    return soup


def get_distro(soup):
    name = soup.find('h1').text
    out = soup.select('ul')[1].text

    os_info = get_os_info_full(out)
    os_dict = os_to_dict(os_info)

    full_d = {}
    temp_d = {name: {}}
    for k, v in os_dict.items():
        temp_d[name].setdefault(k, []).append(v)

    full_d.update(temp_d)
    return temp_d


### Testing on small sample

In [19]:
test_hrefs = np.random.choice(ranked_hrefs, size=5).tolist()
test_urls = [base_url + item for item in test_hrefs]

all_d = {}

for url in test_urls:
    t_dict = get_distro(get_page(url))
    all_d.update(t_dict)

### Getting all of the ranked distros

In [20]:
len(ranked_hrefs)

275

In [150]:
full_urls = [base_url + item for item in ranked_hrefs]

full_d = {}

for url in full_urls:
    t_dict = get_distro(get_page(url))
    full_d.update(t_dict)

### Saving the full dictionary to json and pickle

In [152]:
with open('distro_dict.json', 'w', encoding='utf-8') as f:
    json.dump(full_d, f, ensure_ascii=False, indent=4)

with open('distro_dict.json', 'r') as f:
    data = json.load(f)

In [160]:
data['Manjaro Linux']['Based on']

['Arch']

In [164]:
with open('distro_dict.pickle', 'wb') as f:
    pickle.dump(full_d, f, protocol=pickle.HIGHEST_PROTOCOL)

with open('distro_dict.pickle', 'rb') as f:
    data2 = pickle.load(f)

In [276]:
os_info_df = pd.DataFrame(data)

# os_info_df.to_csv('df-os_info.csv', index=True, columns=os_info_df.columns.values)

os_info_df.head()

Unnamed: 0,MX Linux,Manjaro Linux,Linux Mint,Pop!_OS,Ubuntu,Debian,elementary OS,Fedora,EndeavourOS,Solus,...,Webconverger,YunoHost,OB2D Linux,OSGeoLive,ARMA aka Omoikane GNU/Linux,Raspberry Digital Signage,tuxtrans,DRBL Live,Canaima GNU/Linux,Baltix GNU/Linux
Based on,"[Debian (Stable), anti XOrigin]",[Arch],"[Debian, Ubuntu (LTS)]","[Ubuntu, Debian]",[Debian],[Independent],"[Debian, Ubuntu]",[Independent],[Arch],[Independent],...,[Debian (Testing)],[Debian],[Debian (Testing)],"[Debian, Lubuntu (LTS)]",[Debian],[Raspbian],"[Xubuntu, Debian]",[Debian],[Debian (Stable)],"[Debian, Ubuntu]"
OS Type,[Linux Based on],,,[Linux Based on],,,,,,[Linux Based on],...,,[Linux Based on],,[Linux Based on],[Linux Based on],,[Linux Based on],,[Linux Based on],[Linux Based on]
Architecture,"[i686, x86_64]","[aarch64, x86_64]","[i686, x86_64]",[x86_64],"[armhf, i686, powerpc, ppc64el, s390x, x86_64]","[aarch64, armel, armhf, i386, i686, mips, mips...",[x86_64],"[aarch64, armhfp, x86_64]","[aarch64, x86_64]",[x86_64],...,[i686],"[armhf, i686, x86_64]",[x86_64],[x86_64],[--],[armhf],[x86_64],"[i686, x86_64]",[x86_64],[x86_64]
Desktop,[Xfce],"[Awesome, bspwm, Budgie, Cinnamon, GNOME, i3, ...","[Cinnamon, MATE, Xfce]",[GNOMECategory],"[GNOME, Unity]","[After Step, Awesome, Blackbox, Cinnamon, Flux...",[Pantheon],"[Awesome, Cinnamon, Deepin, Enlightenment, GNO...","[Budgie, Cinnamon, Deepin, GNOME, i3, KDE Plas...","[Budgie, GNOME, KDE Plasma, MATECategory]",...,[Firefox],"[No Desktop, Web UICategory]",[Xfce],[LXDECategory],"[GNOME, KDECategory]",[Chromium],[MATECategory],[Xfce],"[GNOME, MATECategory]",[GNOMECategory]
Category,"[Desktop, From RAM, Live Medium]","[Desktop, Live Medium, Raspberry Pi]","[Beginners, Desktop, Live Medium]",,"[Beginners, Desktop, Server, Live Medium]","[Desktop, Live Medium, Server]","[Beginners, Desktop, Live Medium]","[Desktop, Server, Live Medium]","[Desktop, Live Medium]",,...,"[Live Medium, Specialist]",,[Server],,,[Raspberry Pi],,"[Specialist, Live Medium]",,


## Getting all versions / releases of each distro

In [235]:
def get_versions(href):

    dfs = pd.read_html(base_url + href)

    ix = [11, 12, 13, 14, 15, 16]

    for i in ix:
        try:
            dfs[i] = dfs[i].set_index(0)
            if 'Feature' in dfs[i].index.values:
                release_df = dfs[i]
                return release_df.iloc[:15]
            else:
                pass
        except:
            pass

### Testing on sample first

In [194]:
df_list = []

for val in test_hrefs:
    x = get_versions(val)
    df_list.append(x)

In [203]:
sample_df = pd.concat([item.T for item in df_list], keys=[item for item in test_hrefs], axis=0).reset_index(level=1, drop=True)

sample_df.sample(n=5)

Unnamed: 0,Feature,Release Date,End Of Life,Price (US$),Image Size (MB),Free Download,Installation,Default Desktop,Package Management,Release Model,Office Suite,Processor Architecture,Init Software,Journaled File Systems,Multilingual
gobo,12.0,2005-06-02,,Free,,ISO,Graphical,,,Fixed,OO.o*,i686,SysV,"ext3, ReiserFS","de, en, pt"
rds,11.2,2018-04-11,,Free,674-674,IMG,--,Chromium,DEB (apt),Fixed,--,armhf,systemd,"ext3, ext4",--
sophos,6.0,2005-07-06,,,,ISO,,--,--,Fixed,--,i386,other,,
gobo,6.0,2003-05-19,,Free,,ISO,Graphical,,,Fixed,OO.o*,i686,SysV,,
sophos,7.5,2009-10-01,,,,ISO,,--,--,Fixed,--,i386,other,,


### Getting all versions

In [236]:
df_list = []

for idx, val in enumerate(ranked_hrefs):
    x = get_versions(val)
    df_list.append(x)

    if (idx+1) % 10 == 0:
        print(f'Finished {val}, Number: {idx+1}')

Finished solus, Number: 10
Finished puppy, Number: 20
Finished peppermint, Number: 30
Finished devuan, Number: 40
Finished pureos, Number: 50
Finished ultimate, Number: 60
Finished alt, Number: 70
Finished nomadbsd, Number: 80
Finished makulu, Number: 90
Finished volumio, Number: 100
Finished rescuezilla, Number: 110
Finished solydxk, Number: 120
Finished swift, Number: 130
Finished pardus, Number: 140
Finished parabola, Number: 150
Finished opnsense, Number: 160
Finished porteuskiosk, Number: 170
Finished archstrike, Number: 180
Finished rancheros, Number: 190
Finished hardenedbsd, Number: 200
Finished vine, Number: 210
Finished rebeccablackos, Number: 220
Finished zeroshell, Number: 230
Finished elastix, Number: 240
Finished linhes, Number: 250
Finished olpc, Number: 260
Finished omoikane, Number: 270


In [237]:
with open('versions.pickle', 'wb') as f:
    pickle.dump(df_list, f, protocol=pickle.HIGHEST_PROTOCOL)

with open('versions.pickle', 'rb') as f:
    data2 = pickle.load(f)

#### The first go around I had some missing dataframes, so I added 11 to the ranged to check and no longer have any NoneTypes

In [238]:
count = 0
idxs = []

for idx, item in enumerate(df_list):
    if isinstance(item, type(None)):
        count += 1
        idxs.append(idx)

print(count)
print(idxs)

0
[]


In [257]:
full_df = pd.concat([el.T for el in df_list], 
                    keys=[el for el in ranked_hrefs], axis=0).reset_index(level=1, drop=True)

full_df.index.name = 'distro'
# full_df.to_csv('df-all_versions.csv', index=True, columns=full_df.columns.values)

full_df.sample(n=5)

Unnamed: 0_level_0,Feature,Release Date,End Of Life,Price (US$),Image Size (MB),Free Download,Installation,Default Desktop,Package Management,Release Model,Office Suite,Processor Architecture,Init Software,Journaled File Systems,Multilingual
distro,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
solaris,11,2011-11-09,,,,"ISO, registration required",Graphical,GNOME,"IPS, pkgadm",Fixed,--,"sparc64, x86_64",other,"ZFS, UFS","de, en, es, fr, it, pt"
ubuntumate,18.10cosmic,2018-10-18,2019-07,Free,2000-2100,ISO,Graphical,MATE,"DEB (apt), snap",Fixed,LibreOffice,x86_64,systemd,"Btrfs, ext3, ext4, JFS, ReiserFS, XFS",Yes
smeserver,4.1.2,2001-03-30,,,,ISO,Text mode,--,RPM,Fixed,--,i386,SysV,,
xigmanas,12.1.0.4,2020-12-12,,Free,200-600,ISO,Text mode,--,--,Fixed,--,"i386, x86_64",RC,--,--
lite,2.8,2016-01-31,,Free,,ISO,Graphical,Xfce,DEB,Fixed,LibreOffice,"i386, x86_64",systemd,,


## Loading it back in

In [21]:
releases = pd.read_csv('full_releases_df.csv', index_col=0)
os_info = pd.read_csv('os_info_df.csv', index_col=0)
distro_rank = pd.read_csv('distro_rank_df.csv', index_col=0)

# df[df.index == 'arch']['Journaled File Systems'].value_counts()

In [22]:
idxs = np.unique(releases.index.values, return_index=True)[1]
distro_hrefs = [releases.index.values[idx] for idx in sorted(idxs)]

print(len(os_info.columns.values), len(distro_hrefs), distro_rank.shape[0])

275 275 276


In [23]:
href2distro = dict(zip(distro_hrefs, os_info.columns.values))

In [24]:
distro_rank = distro_rank.rename(index={b: a for a, b in href2distro.items()})
distro_rank.index = distro_rank.index.str.lower()

distro_rank.head(5)

Unnamed: 0_level_0,12 months,6 months,3 months,1 month
Distro,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mx,3959,3413,3407,3464
manjaro,2713,2341,2451,2532
mint,2355,2094,1948,1987
ubuntu,1610,1378,1410,1271
debian,1435,1246,1258,1297


In [25]:
os_info = os_info.rename(columns={b: a for a, b in href2distro.items()})

ranks = pd.DataFrame(np.arange(1, os_info.shape[1]+1)).rename(columns={0: 'ranks'})
ranks = ranks.T
ranks.columns = os_info.columns

os_info = pd.concat([ranks, os_info])
os_info = pd.concat([os_info, distro_rank.T])

cols = os_info.T.columns.tolist()
cols = [cols[0]] + cols[9:][::-1] + cols[1:9]

os_info = os_info.T[cols].T

# I guess the column lengths changed, so:
os_info.iloc[0] = np.where(os_info.iloc[0].isnull(), [x for x in np.arange(1, 290)], os_info.iloc[0])

os_info.head()

Unnamed: 0,mx,manjaro,mint,popos,ubuntu,debian,elementary,fedora,endeavour,solus,...,ubuntu dp,suse,debian edu,fatdog64,secure-k,photon,source mage,super grub2,rocks cluster,pakos
ranks,1,2,3,4,5,6,7,8,9,10,...,280,281,282,283,284,285,286,287,288,289
1 month,3464,2532,1987,2242,1271,1297,1129,908,1085,771,...,148,68,105,39,36,35,33,32,36,3
3 months,3407,2451,1948,2204,1410,1258,1109,1070,1024,785,...,174,62,95,40,36,34,33,30,32,1
6 months,3413,2341,2094,1821,1378,1246,1133,966,877,863,...,176,82,79,38,34,32,31,30,30,0
12 months,3959,2713,2355,1400,1610,1435,1337,1028,727,1073,...,117,83,75,47,35,32,32,31,30,0


### Getting all the ratings

In [26]:
full_urls = [base_url + item for item in ranked_hrefs]


def get_rating(soup):
    name = soup.find('h1').text
    rating = soup.find('div', attrs={'style': 'font-size: 64px; text-align: left'}).text

    d = {name: rating}

    return d

In [28]:
test_ratings = np.random.choice(full_urls, 5).tolist()

ratings_dict = {}

[ratings_dict.update(get_rating(get_page(el))) for el in test_ratings]

In [27]:
full_ratings = {}

for i, el in enumerate(full_urls):
    full_ratings.update(get_rating(get_page(el)))

    if (i+1) % 10 == 0:
        print(f'Distro {el} completed, number {i+1}')

In [38]:
with open('ratings.json', 'w', encoding='utf-8') as f:
    json.dump(full_ratings, f, ensure_ascii=False, indent=4)

In [29]:
with open('ratings.json', 'r') as f:
    ratings = json.load(f)

### Adding ratings to the dataframe

In [30]:
distro2href = {b: a for a, b in href2distro.items()}

r_keys = list(ratings.keys())
d_keys = list(distro2href.keys())

ra_keys = distro_rank.T.columns.values

# Symmetric difference between the two
{*d_keys} ^ {*r_keys}

{'Baltix GNU/Linux', 'Scientific Linux'}

In [31]:
print('Scientific Linux' in d_keys)
print('Baltix GNU/Linux' in r_keys)

# Scientific Linux = scientific
# Baltix GNU/Linux = baltix

print(len(r_keys), len(d_keys), len(ra_keys), os_info.shape[1])

print('scientific' in os_info.columns.values)
print('baltix' in os_info.columns.values)

# {*os_info.columns.values} ^ {*href2distro}

False
False
275 275 276 289
False
True


In [32]:
ratings['Baltix GNU/Linux'] = np.nan
href2distro['scientific'] = 'Scientific Linux'

distro2href = {b: a for a, b in href2distro.items()}

In [33]:
ratings_dict = {distro2href.get(k, v): v for k, v in ratings.items()}

# s_ratings = dict(sorted(ratings.items()))
# s_distro2href = dict(sorted(distro2href.items()))

# dict(zip(s_distro2href.values(), s_ratings.values()))

In [34]:
from collections import Counter

print(Counter(list(ratings.values())).most_common(1))

na = [k for k, v in ratings.items() if v == 'N/A']

[('N/A', 32)]


In [35]:
os_turned = os_info.T.copy()
os_turned['rating'] = os_turned.index.map(ratings_dict)

# Popularity column is redundant since I have the rank and hits per day in separate columns
os_turned = os_turned.drop('Popularity', axis=1)
os_turned['rating'] = os_turned['rating'].replace('N/A', np.nan).fillna(np.nan).astype(np.float32)

os_turned[os_turned.index == 'mx']

Unnamed: 0,ranks,1 month,3 months,6 months,12 months,Based on,OS Type,Architecture,Desktop,Category,Status,Origin,rating
mx,1,3464,3407,3413,3959,"['Debian (Stable), anti XOrigin']",['Linux Based on'],"['i686, x86_64']",['Xfce'],"['Desktop, From RAM, Live Medium']",['Active'],,8.7


In [36]:
full_releases = releases.copy()

In [37]:
idxs = np.unique(releases.index.values, return_index=True)[1]
distro_hrefs = [releases.index.values[idx] for idx in sorted(idxs)]

recent_release_idx = [(full_releases.index == i).argmax() for i in distro_hrefs]
# full_releases.groupby(full_releases.index).first()
recent_releases = full_releases.iloc[recent_release_idx]

info_releases = pd.concat([os_turned, recent_releases], join='outer', axis=1)
info_releases = info_releases.drop(info_releases.tail(14).index) # Last 14 are all nan
info_releases.head()

Unnamed: 0,ranks,1 month,3 months,6 months,12 months,Based on,OS Type,Architecture,Desktop,Category,...,Free Download,Installation,Default Desktop,Package Management,Release Model,Office Suite,Processor Architecture,Init Software,Journaled File Systems,Multilingual
mx,1,3464,3407,3413,3959,"['Debian (Stable), anti XOrigin']",['Linux Based on'],"['i686, x86_64']",['Xfce'],"['Desktop, From RAM, Live Medium']",...,ISO,Graphical,"KDE Plasma, Xfce",DEB,Semi-Rolling,LibreOffice,"i386, x86_64",SysV,"ext3, ext4, JFS, ReiserFS, XFS","en, ca, cs, de, es, fr, hu, nl, pt_br"
manjaro,2,2532,2451,2341,2713,['Arch'],,"['aarch64, x86_64']","['Awesome, bspwm, Budgie, Cinnamon, GNOME, i3,...","['Desktop, Live Medium, Raspberry Pi']",...,ISO,Graphical,,"Pacman, snap",Rolling,"FreeOffice, LibreOffice",x86_64,systemd,"Btrfs, ext3, ext4, JFS, ReiserFS, XFS","de, en, fr"
mint,3,1987,1948,2094,2355,"['Debian, Ubuntu (LTS)']",,"['i686, x86_64']","['Cinnamon, MATE, Xfce']","['Beginners, Desktop, Live Medium']",...,ISO,Graphical,"Cinnamon, MATE, Xfce",DEB,Fixed,LibreOffice,x86_64,systemd,"Btrfs, ext3, ext4, JFS, ReiserFS, XFS",Yes
popos,4,2242,2204,1821,1400,"['Ubuntu, Debian']",['Linux Based on'],['x86_64'],['GNOMECategory'],,...,ISO,Graphical,GNOME,DEB (apt),Fixed,LibreOffice,x86_64,systemd,"Btrfs, ext4",Yes
ubuntu,5,1271,1410,1378,1610,['Debian'],,"['armhf, i686, powerpc, ppc64el, s390x, x86_64']","['GNOME, Unity']","['Beginners, Desktop, Server, Live Medium']",...,ISO,Graphical,GNOME,DEB,Rolling,LibreOffice,"amd64, arm64, ppc64el, s390x",systemd,"Btrfs, ext3, ext4, JFS, ReiserFS, XFS",Yes


----
----
## Cleaning the Data
----
### Desktops:
Most common: KDE, Xfce, Budgie, Cinnamon, GNOME, MATE, LXDE, LXQt, Openbox, Fluxbox

----

In [38]:
i_release = info_releases.copy()
i_release['ddesktop'] = i_release['Desktop'].astype(str) + ' ' + i_release['Default Desktop'].astype(str)
i_release = i_release.replace(r"\[|\]|\'|\,|\(|\)", '', regex=True)

i_release = i_release.rename(columns={'1 month': '1_month', '3 months': '3_months', '6 months': '6_months', '12 months': '12_months'}).drop(['Desktop', 'Default Desktop'], axis=1)

i_release.columns = i_release.columns.str.lower()

In [39]:
def other(df:pd.core.frame.DataFrame, l:list, col, new_col:str):
    other = df[col].str.contains('|'.join(l), case=False, na=False).astype(int)
    df[new_col] = 0
    df.loc[other[other == 0].index, new_col] = 1

    df = df.drop(col, axis=1)

    return df

In [40]:
# Counter(' '.join(i_release['ddesktop']).split()).most_common()

i_release['desk_kde'] = i_release['ddesktop'].str.contains('KDE', case=False, na=False).astype(int)
i_release['desk_xfce'] = i_release['ddesktop'].str.contains('Xfce', case=False, na=False).astype(int)
i_release['desk_budgie'] = i_release['ddesktop'].str.contains('Budgie', case=False, na=False).astype(int)
i_release['desk_cinnamon'] = i_release['ddesktop'].str.contains('Cinnamon', case=False, na=False).astype(int)
i_release['desk_gnome'] = i_release['ddesktop'].str.contains('GNOME', case=False, na=False).astype(int)
i_release['desk_mate'] = i_release['ddesktop'].str.contains('MATE', case=False, na=False).astype(int)
i_release['desk_lxde'] = i_release['ddesktop'].str.contains('LXDE', case=False, na=False).astype(int)
i_release['desk_lxqt'] = i_release['ddesktop'].str.contains('LXQt', case=False, na=False).astype(int)
i_release['desk_openbox'] = i_release['ddesktop'].str.contains('openbox', case=False, na=False).astype(int)
i_release['desk_fluxbox'] = i_release['ddesktop'].str.contains('fluxbox', case=False, na=False).astype(int)
i_release['desk_nodesk'] = i_release['ddesktop'].str.contains('no desktop', case=False, na=False).astype(int)
i_release['desk_web'] = i_release['ddesktop'].str.contains('web', case=False, na=False).astype(int)
i_release['desk_i3'] = i_release['ddesktop'].str.contains('i3', case=False, na=False).astype(int)

# Other desktops
desktops = ['KDE', 'Xfce', 'Budgie', 'Cinnamon', 'GNOME', 'MATE', 'LXDE', 'LXQt', 'openbox', 'fluxbox', 'no desktop', 'web', 'i3']

i_release = other(df=i_release, l=desktops, col='ddesktop', new_col='desk_other')

i_release.sample(2)

Unnamed: 0,ranks,1_month,3_months,6_months,12_months,based on,os type,architecture,category,status,...,desk_gnome,desk_mate,desk_lxde,desk_lxqt,desk_openbox,desk_fluxbox,desk_nodesk,desk_web,desk_i3,desk_other
ghostbsd,49,329.0,229.0,232.0,254.0,,BSDBased on,x86_64,Desktop Live Medium,Active,...,0,1,0,0,0,0,0,0,0,0
hamonikr,259,28.0,28.0,26.0,13.0,Debian Mint,,x86_64,Desktop Live Medium,Active,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# other = i_release['ddesktop'].str.contains('|'.join(desktops), case=False, na=False).astype(int)
# i_release['desk_other'] = 0
# i_release.loc[other[other == 0].index, 'desk_other'] = 1

# i_release

----
### Based On:
Most common: Fedora, Red Hat, Debian, Ubuntu, Independent, Arch (pacman), Gentoo, Slackware 

----

In [42]:
# Counter(' '.join(i_release['Based on']).split()).most_common()

i_release['based_cent'] = i_release['based on'].str.contains('Cent|Clear|Scientific', case=False, na=False).astype(int)
# Fedora and Red Hat are different (red hat spnosors fedora) so I'm combining them
i_release['based_fedora'] = i_release['based on'].str.contains('Fedora|Red|Hat', case=False, na=False).astype(int)
# Ubuntu is Debian based, and there is a line of Unbuntu-based distros but i'm combining them
i_release['based_debian'] = i_release['based on'].str.contains('Debian|Ubuntu', case=False, na=False).astype(int)
i_release['based_indep'] = i_release['based on'].str.contains('Independent', case=False, na=False).astype(int)
i_release['based_pacman'] = i_release['based on'].str.contains('Arch', case=False, na=False).astype(int)
i_release['based_gentoo'] = i_release['based on'].str.contains('Gentoo', case=False, na=False).astype(int)
i_release['based_slack'] = i_release['based on'].str.contains('Slackware', case=False, na=False).astype(int)

# Other 'based_on'
based_on = ['Cent', 'Clear', 'Scientific', 'Fedora', 'Red', 'Hat', 'Debian', 'Ubuntu', 'Independent', 'Arch', 'Gentoo', 'Slackware']

i_release = other(df=i_release, l=based_on, col='based on', new_col='based_other')
i_release.sample(2)

Unnamed: 0,ranks,1_month,3_months,6_months,12_months,os type,architecture,category,status,origin,...,desk_i3,desk_other,based_cent,based_fedora,based_debian,based_indep,based_pacman,based_gentoo,based_slack,based_other
nethserver,166,181.0,78.0,52.0,44.0,Linux Based on,x86_64,,Active,,...,0,0,1,0,0,0,0,0,0,0
ipfire,113,133.0,115.0,97.0,98.0,Linux Based on,armv5tel i586 x86_64,,Active,Germany,...,0,0,0,0,0,1,0,0,0,0


----
### OS Type

Doesn't look useful, so I will drop it

----

In [43]:
print(Counter(' '.join(i_release['os type'].astype(str)).split()).most_common())

i_release = i_release.drop('os type', axis=1)

[('nan', 170), ('on', 105), ('Based', 84), ('Linux', 83), ('BSDBased', 15), ('Other', 4), ('OSBased', 4), ('Solaris', 1), ('KISSBased', 1), ('GRUBBased', 1)]


----
### Architecture
----

In [44]:
i_release['processor'] = i_release['architecture'].astype(str) + ' ' + i_release['processor architecture'].astype(str)

i_release = i_release.drop(['architecture', 'processor architecture'], axis=1)

Counter(' '.join(i_release['processor'].astype(str)).split()).most_common(5)

[('x86_64', 502), ('i686', 111), ('i386', 74), ('armhf', 54), ('aarch64', 36)]

In [45]:
i_release['arc_x86'] = i_release['processor'].str.contains('x86_64', case=False, na=False).astype(int)
i_release['arc_i686'] = i_release['processor'].str.contains('i686', case=False, na=False).astype(int)
i_release['arc_i386'] = i_release['processor'].str.contains('i386', case=False, na=False).astype(int)
i_release['arc_arm'] = i_release['processor'].str.contains('armfh', case=False, na=False).astype(int)
i_release['arc_aarch'] = i_release['processor'].str.contains('aarch64', case=False, na=False).astype(int)

# Other architecture
arc = ['x86_64', 'i686', 'i386', 'armfh', 'aarch64']
i_release = other(df=i_release, l=arc, col='processor', new_col='arc_other')

i_release.sample(2)

Unnamed: 0,ranks,1_month,3_months,6_months,12_months,category,status,origin,rating,feature,...,based_pacman,based_gentoo,based_slack,based_other,arc_x86,arc_i686,arc_i386,arc_arm,arc_aarch,arc_other
antix,18,517.0,588.0,567.0,706.0,,Active,Greece,8.3,19.3,...,0,0,0,0,1,1,0,0,0,0
redhat,63,,,,,,Active,USA,7.8,RHEL-8.3,...,0,0,0,0,1,0,1,0,1,0


----
### Category
----

In [46]:
Counter(' '.join(i_release['category'].astype(str)).split()).most_common(5)

[('Live', 126), ('Medium', 125), ('Desktop', 119), ('nan', 80), ('Server', 41)]

In [47]:
i_release['cat_live'] = i_release['category'].str.contains('Live', case=False, na=False).astype(int)
i_release['cat_med'] = i_release['category'].str.contains('Medium', case=False, na=False).astype(int)
i_release['cat_desk'] = i_release['category'].str.contains('Desktop', case=False, na=False).astype(int)
i_release['cat_serv'] = i_release['category'].str.contains('Server', case=False, na=False).astype(int)

cat = ['Live', 'Medium', 'Desktop', 'Server']

i_release = other(df=i_release, l=cat, col='category', new_col='cat_other')

i_release.sample(2)

Unnamed: 0,ranks,1_month,3_months,6_months,12_months,status,origin,rating,feature,release date,...,arc_i686,arc_i386,arc_arm,arc_aarch,arc_other,cat_live,cat_med,cat_desk,cat_serv,cat_other
archbang,65,154.0,235.0,183.0,169.0,Active,Canada,9.2,711,2020-11-07,...,0,0,0,0,0,1,1,1,0,0
artix,51,212.0,216.0,230.0,207.0,Active,Global,8.9,20200210,2020-02-10,...,0,0,0,0,0,1,1,1,0,0


----
### Status
All are active, so I am going to drop this column

----


In [48]:
print(Counter(' '.join(i_release['status'].astype(str)).split()).most_common(5))

i_release = i_release.drop('status', axis=1)

[('Active', 275)]


----
### Origin

Since I am from the United States and for simplicities sake, I am going to do either USA or not

----

In [49]:
print(Counter(' '.join(i_release['origin'].astype(str)).split()).most_common(5))

i_release = other(df=i_release, l=['USA'], col='origin', new_col='org_usa')

[('USA', 68), ('nan', 29), ('France', 19), ('United', 15), ('Kingdom', 15)]


----
### Ranks and Clicks
----

In [50]:
# Clicks
i_release.iloc[:, 1:5] = i_release.iloc[:, 1:5].fillna(0).astype(int)

# Ranks
na_df = i_release.loc[i_release['rating'].isna(), 'ranks']
na_ranks = list(zip(na_df.index, na_df))

fill_ranks = {}

for dist, rank in na_ranks:
    fill_val = round(i_release.loc[i_release['ranks'].between(rank-10, rank+10), 'rating'].mean(), 1)
    fill_ranks[dist] = fill_val

i_release['rating'] = i_release['rating'].fillna(fill_ranks)

----
### Feature


----

In [51]:
print(Counter(' '.join(i_release['feature'].astype(str)).split()).most_common(5))

i_release['feature'] = i_release['feature'].str.extract('(\d+.?\d+.?\d+)', expand=False).fillna(1.0)

# Show what the other versions look like since they don't follow the traditional incremental versions
start20_n = i_release[i_release['feature'].str.startswith('2020', na=False)].index.tolist()
start20 = [full_releases.groupby(full_releases.index).get_group(name=el)[['Feature', 'Release Date']] for el in start20_n]

[('20.10groovy', 7), ('current', 4), ('20.04', 4), ('4.1', 3), ('3.0', 3)]


#### I decided to create a column that will show the number of releases for the distribution in the past year instead of using the version

In [52]:
rel2020 = full_releases['Release Date'].str.startswith('2020', na=False).groupby(full_releases.index).sum()
rel2020.name = 'rel_2020'

i_release = pd.concat([i_release, rel2020], join='outer', axis=1)
i_release = i_release.drop('feature', axis=1)

----
### Release date

I'm not sure if the month would have any correlation, but I'm going to add it

#### Could create season

----

In [53]:
i_release['release_date'] = pd.to_datetime(i_release['release date'])
i_release['release_month'] = i_release['release_date'].dt.month

i_release = i_release.drop('release date', axis=1)
i_release = i_release.drop('release_date', axis=1)

----
### End of Life

Doesn't seem useful so I'm going to drop it

----

In [54]:
i_release = i_release.drop('end of life', axis=1)

----
### Price

94% of them are free so I'm going to drop it as well

----

In [55]:
i_release = i_release.drop('price (us$)', axis=1)

----
### Image Size (MB)

Since most are a range, I'm going to average it

----

In [56]:
i_release = i_release.rename(columns={'image size (mb)': 'image_size'})
i_release['image_size']

mx          1500-1900
manjaro           NaN
mint        1800-2000
popos       2200-2700
ubuntu            NaN
              ...    
rds           674-674
tuxtrans    2900-3000
drbl          700-800
canaima     2700-2800
baltix      3500-3600
Name: image_size, Length: 275, dtype: object

In [57]:
fill_avg = i_release['image_size'].str.split('-', expand=True).astype(float).mean(axis=0)
fill_val = int((fill_avg[0] + fill_avg[1]) / 2)

i_release['image_size'] = (i_release['image_size'].str.split('-', expand=True).fillna(fill_val)
                                                         .astype(int).mean(axis=1).astype(int))

----
### Free Download
----

In [58]:
i_release['down_iso'] = i_release['free download'].str.contains('ISO', case=False, na=False).astype(int)
i_release['down_img'] = i_release['free download'].str.contains('IMG', case=False, na=False).astype(int)

i_release = other(df=i_release, l=['ISO', 'IMG'], col='free download', new_col='down_oth')

----
### Installation
----

In [59]:
i_release['inst_graph'] = i_release['installation'].str.contains('Graphic', case=False, na=False).astype(int)
i_release['inst_text'] = i_release['installation'].str.contains('Text', case=False, na=False).astype(int)

i_release = other(df=i_release, l=['Graphic', 'Text'], col='installation', new_col='inst_oth')

----
### Package Management
----

In [60]:
Counter(' '.join(i_release['package management'].astype(str)).split()).most_common(5)

[('DEB', 125), ('RPM', 34), ('apt', 29), ('Pacman', 22), ('--', 21)]

In [61]:
i_release['pack_deb'] = i_release['package management'].str.contains('DEB', case=False, na=False).astype(int)
i_release['pack_rpm'] = i_release['package management'].str.contains('RPM', case=False, na=False).astype(int)
i_release['pack_apt'] = i_release['package management'].str.contains('Pacman', case=False, na=False).astype(int)
i_release['pack_pacman'] = i_release['package management'].str.contains('APT', case=False, na=False).astype(int)

i_release = other(df=i_release, l=['DEB', 'RPM', 'Pacman', 'APT'], col='package management', new_col='pack_other')

----
### Release Model
----

In [62]:
Counter(' '.join(i_release['release model'].astype(str)).split()).most_common(15)

[('Fixed', 217), ('Rolling', 55), ('Semi-Rolling', 4), ('Semi', 1), ('--', 1)]

In [63]:
i_release['rel_fix'] = i_release['release model'].str.contains('Fixed', case=False, na=False).astype(int)
i_release['rel_roll'] = i_release['release model'].str.contains('Rolling', case=False, na=False).astype(int)

i_release = other(df=i_release, l=['Fixed', 'Rolling'], col='release model', new_col='rel_other')

----
### Office Suite

Doesn't seem to be unique/useful

----

In [64]:
i_release = i_release.drop('office suite', axis=1)

----
### Init Software
----

In [65]:
Counter(' '.join(i_release['init software'].astype(str)).split()).most_common(5)

[('systemd', 172), ('SysV', 38), ('other', 37), ('RC', 10), ('OpenRC', 10)]

In [66]:
i_release['init_sysd'] = i_release['init software'].str.contains('systemd', case=False, na=False).astype(int)
i_release['init_sysv'] = i_release['init software'].str.contains('sysv', case=False, na=False).astype(int)
i_release['init_oprc'] = i_release['init software'].str.contains('OpenRC', case=False, na=False).astype(int)

i_release = other(df=i_release, l=['systemd', 'sysv', 'openrc'], col='init software', new_col='init_other')

----
### Journaled File System
----

In [67]:
Counter(' '.join(i_release['journaled file systems'].astype(str)).split()).most_common(5)

[('ext4', 151), ('ext3', 122), ('XFS', 76), ('ReiserFS', 71), ('Btrfs', 70)]

In [68]:
i_release['jour_ext4'] = i_release['journaled file systems'].str.contains('ext4', case=False, na=False).astype(int)
i_release['jour_ext3'] = i_release['journaled file systems'].str.contains('ext3', case=False, na=False).astype(int)
i_release['jour_xfs'] = i_release['journaled file systems'].str.contains('XFS', case=False, na=False).astype(int)
i_release['jour_btrfs'] = i_release['journaled file systems'].str.contains('btrfs', case=False, na=False).astype(int)
i_release['jour_reiser'] = i_release['journaled file systems'].str.contains('Reiser', case=False, na=False).astype(int)

# Possibly add JFS / NFS

i_release = other(df=i_release, l=['ext4', 'ext3', 'xfs', 'btrfs', 'reiser'], col='journaled file systems', new_col='journ_oth')

----
### Multilingual
----

In [69]:
i_release['multiling'] = np.where(~i_release['multilingual'].str.contains('No|--', case=False, na=True), 1, 0)

i_release = i_release.drop('multilingual', axis=1)

In [70]:
i_release.sample(2)

Unnamed: 0,ranks,1_month,3_months,6_months,12_months,rating,image_size,desk_kde,desk_xfce,desk_budgie,...,init_sysv,init_oprc,init_other,jour_ext4,jour_ext3,jour_xfs,jour_btrfs,jour_reiser,journ_oth,multiling
tinycore,83,0,0,0,0,9.4,110,0,0,0,...,0,0,1,0,0,0,0,0,1,1
photonos,227,0,0,0,0,6.5,3150,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [71]:
i_release.to_csv('df-ohe.csv', index=True, columns=i_release.columns.values)