# Analyse Python Version

This notebook classifies repos as either Python 3 compatible (select=True) or not (select=False).

Inputs:
* `results_version.csv` (for DS and non-DS)

Outputs:
* `ds_version_tab.csv`, `nonds_version_tab.csv`

In [1]:
import pandas as pd
import numpy as np
import sys
from os.path import join
import pathlib
import altair as alt

In [2]:
# Root folder of the pipeline outputs; the inputs read below and the CSVs
# written by this notebook are all addressed relative to it.
DATA_DIR = "../output/"
MERGED_DIR = join(DATA_DIR, "merged")  # not referenced in the cells shown here
NB_OUT = join(DATA_DIR, "notebooks_out")

# Create the notebook output folder (and any missing parents) up front so the
# later `to_csv` calls cannot fail on a missing directory.
pathlib.Path(NB_OUT).mkdir(exist_ok=True, parents=True)

In [3]:
def summarize(either, neither, python2, python3):
    """Label a repo by the mix of Python versions found among its files.

    Parameters are the per-repo counts for each version category.

    Returns one of:
      * 'neither' - at least one file is counted as `neither` (takes precedence)
      * 'mix'     - both `python2` and `python3` counts are positive
      * 'only2'   - `python2` is positive and `python3` is exactly zero
      * 'only3'   - `python3` is positive and `python2` is exactly zero
      * 'either'  - `python2` and `python3` are zero but `either` is positive
      * 'empty'   - anything else (e.g. all counts zero)
    """
    if neither > 0:
        return 'neither'

    has_py2 = python2 > 0
    has_py3 = python3 > 0

    # The remaining cases are mutually exclusive, so their order is irrelevant.
    if has_py2 and has_py3:
        label = 'mix'
    elif has_py2 and python3 == 0:
        label = 'only2'
    elif has_py3 and python2 == 0:
        label = 'only3'
    elif python2 == 0 and python3 == 0 and either > 0:
        label = 'either'
    else:
        label = 'empty'
    return label


# Element-wise version of `summarize` for array-like count columns.
summarize_vec = np.vectorize(summarize)

def select(either, neither, python2, python3):
    """Decide whether a repo is treated as Python 3 compatible.

    A repo is selected when it has at least one `either` or `python3` file
    and its `python2` count does not exceed its `python3` count.
    The `neither` count is accepted for signature symmetry with `summarize`
    but does not influence the result.

    Returns a plain `bool`.
    """
    # No file that could run under Python 3 at all -> reject.
    if not (either + python3 > 0):
        return False
    # Otherwise accept unless Python-2-only files outnumber Python-3-only ones.
    return bool(python2 <= python3)


# Element-wise version of `select` for array-like count columns.
select_vec = np.vectorize(select)

def tabluate_version(ver_df):
    """Build a per-repo table of version counts plus `summary`/`select` labels.

    Parameters
    ----------
    ver_df : pandas.DataFrame
        Must contain the columns ``repo``, ``ver`` and ``path``. Rows are
        counted per (repo, ver) pair using the non-null ``path`` values.
        ``ver`` is expected to take the values 'either', 'neither',
        'python2' and 'python3'.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``repo`` with one count column per ``ver`` value
        (missing combinations are 0), a ``summary`` column produced by
        `summarize` and a boolean ``select`` column produced by `select`.
    """
    expected_kinds = ["either", "neither", "python2", "python3"]

    # Count paths per (repo, ver) and spread the `ver` level into columns.
    tab = ver_df.groupby(["repo", "ver"])["path"].count().unstack("ver")

    # A category that never occurs in the input would otherwise be missing as
    # a column and break the look-ups below; add it as an all-zero column.
    # `union` keeps any unexpected extra categories and the sorted column order.
    tab = tab.reindex(columns=tab.columns.union(expected_kinds)).fillna(0)

    # Materialise the count tuples once, before the label columns are added.
    count_rows = list(
        zip(tab["either"], tab["neither"], tab["python2"], tab["python3"])
    )

    # Call the scalar classifiers directly: this yields plain str/bool values
    # instead of the 0-d arrays np.vectorize returns for scalar inputs.
    tab["summary"] = [summarize(*counts) for counts in count_rows]
    tab["select"] = [select(*counts) for counts in count_rows]
    return tab

In [4]:
# Read the version results for the DS and non-DS repo sets.
ds_version = pd.read_csv(join(DATA_DIR, "ds-t5/results_version.csv"))
nonds_version = pd.read_csv(join(DATA_DIR, "nonds-t5/results_version.csv"))

# Tabulate each set per repo and persist it; the `repo` index is written as
# the first CSV column (pandas' default for `to_csv`).
ds_version_tab = ds_version.pipe(tabluate_version)
ds_version_tab.to_csv(join(NB_OUT, "ds_version_tab.csv"))

nonds_version_tab = nonds_version.pipe(tabluate_version)
nonds_version_tab.to_csv(join(NB_OUT, "nonds_version_tab.csv"))

In [5]:
# Show the DS per-repo table (long frames are displayed truncated, head and tail only).
ds_version_tab

ver,either,neither,python2,python3,summary,select
repo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
118130,16.0,0.0,28.0,1.0,mix,False
192904,143.0,0.0,0.0,19.0,only3,True
329033,355.0,0.0,8.0,0.0,only2,False
379988,107.0,0.0,1.0,0.0,only2,False
462713,29.0,0.0,2.0,0.0,only2,False
...,...,...,...,...,...,...
157936206,412.0,0.0,0.0,1.0,only3,True
159004094,30.0,0.0,0.0,4.0,only3,True
159175746,26.0,0.0,0.0,3.0,only3,True
160251929,31.0,0.0,0.0,0.0,either,True


In [6]:
# Bar chart: number of DS projects per version summary label.
# The sort list names every label `summarize` can return ('mix' and 'empty'
# included) so the bar order is fixed and does not depend on row order.
chart = alt.Chart(ds_version_tab).mark_bar().encode(
    alt.X(
        "summary",
        axis=alt.Axis(title='Project Python Version(s)'),
        sort=["either", "only2", "only3", "neither", "mix", "empty"],
    ),
    y=alt.Y('count()', title="Number of Projects (DS)"),
)
chart

In [7]:
# Bar chart: how many DS projects are / are not classified as Py3 compatible.
chart = (
    alt.Chart(ds_version_tab)
    .mark_bar()
    .encode(
        x=alt.X("select", title="Py3 Compatible"),
        y=alt.Y('count()', title="Number of Projects (DS)"),
    )
)
chart

In [8]:
# Bar chart: number of non-DS projects per version summary label.
# The sort list names every label `summarize` can return ('mix' and 'empty'
# included) so the bar order is fixed and does not depend on row order.
chart = alt.Chart(nonds_version_tab).mark_bar().encode(
    alt.X(
        "summary",
        axis=alt.Axis(title='Project Python Version(s)'),
        sort=["either", "only2", "only3", "neither", "mix", "empty"],
    ),
    y=alt.Y('count()', title="Number of Projects (Non-DS)"),
)
chart

In [9]:
# Bar chart: how many non-DS projects are / are not classified as Py3 compatible.
chart = (
    alt.Chart(nonds_version_tab)
    .mark_bar()
    .encode(
        x=alt.X("select", title="Py3 Compatible"),
        y=alt.Y('count()', title="Number of Projects (Non-DS)"),
    )
)
chart