In [1]:
import pandas as pd
import json
import urllib
import progressbar
import os
import numpy as np
import click

from loguru import logger
from urllib.error import URLError
from urllib.error import HTTPError

In [2]:
URL_sisyphus = 'https://rdb.altlinux.org/api/export/branch_binary_packages/sisyphus' #sisyphus
URL_sp10 = 'https://rdb.altlinux.org/api/export/branch_binary_packages/p10' #p10

In [3]:
logger.add('debug.log', format="{time} {level} {message}", level='DEBUG')

1

In [4]:
def show_progress(block_num, block_size, total_size):
    
    '''Function to generate progress bar'''
    
    global pbar
    if pbar is None:
        logger.debug('Creating progress bar')
        pbar = progressbar.ProgressBar(maxval=total_size, widgets=[
    'Downloading:', 
    progressbar.Bar(left='[', marker='=', right=']'), # Progress
    progressbar.Percentage(), # Percentage
    ' Bytes: ',
    progressbar.SimpleProgress(), # Number of bytes
    ' Network: ',
    progressbar.FileTransferSpeed(), # Transfer speed-o-meeter
    ])
        pbar.start() # Lauching progress bar

    downloaded = block_num * block_size
    if downloaded < total_size:
        pbar.update(downloaded)
    else:
        pbar.finish()
        pbar = None

In [20]:
# @click.command()
# @click.option('--force', '-f', default=False, show_default=True, help="Use after reading README")
def download_data(flag=False):
    
    """Entry point function and file upload
    and compare lists of versions of packages
     -force
    
    """
    
    try:
        logger.info('Access successful')
        logger.info('Starting to download firts package from %s', URL_sisyphus)
        tmp_sis = urllib.request.urlretrieve(URL_sisyphus, 'chache_sis.tmp', show_progress) #Downloading the firts part of data
        logger.info('Download from %s complete', URL_sisyphus)
        df_sis =  preparing_data(tmp_sis)                              # Making dataframes
        
        
        logger.info('Starting to download second package from %s', URL_sp10)
        tmp_p10 = urllib.request.urlretrieve(URL_sp10, 'chache_p10.tmp', show_progress) #Downloading the second part of data
        logger.info('Download from %s complete', URL_sp10)
        df_p10 = preparing_data(tmp_p10)     # Making dataframes
        
        if flag:
            data_comparator(df_sis, df_p10, flag)
        elif not flag:
            data_comparator(df_sis, df_p10)
            
    except HTTPError as exception:
        raise exception
    except URLError as exception:
        if isinstance(exception.reason, exception.timeout):
            logger.error('Timeout Error: Data of %s not retrieved because %s\nURL: %s', name, error, url)
        else:
            logger.error('URL Error: Data of %s not retrieved because %s\nURL: %s', name, error, url)      

In [6]:
def making_dataframe(json_object: dict, path):
    
    """Function to convert json to Dataframe for processing"""
    
    logger.info('Making Dataframe of %s', path)
    
    df0 = pd.DataFrame(json_object['packages'])  # Making DataFrame from JSON used only 'packages' because there is all data which we need
    df = df0.sort_values(by=['name', 'version']) # Sorting data
    df = df.drop_duplicates(keep='last')         # Deleting duplicated rows
    df = df.dropna(subset=['name', 'version', 'release']) # Deleting NaN rows by subset columns
    df = df.reset_index(drop=True)               # Resetting indexies in table
    
    name = path.split('.')[0]                    # Name of future .csv table with got data
    
    df.to_csv(f'{name}.csv')                     # Saving of csv table with data for future
    
    #checking if file exist or not
    if(os.path.isfile(path)):
    
        #os.remove() function to remove the file
        os.remove(path)
    
        logger.info("Temporary file (%s) deleted successfully", path)
    else:
        logger.error("Temporary file does not exist")
        
    logger.info('Making DataFrame already finished')
    
    return df

In [7]:
def preparing_data(tmp: tuple) -> pd.core.frame.DataFrame:
    
    path = tmp[0] # Getter of path and name of future file
    
    logger.info('Converting byte-like string %s to JSON', path)
    
    with open(path, 'r') as file:
        json_object = json.load(file) # Making str -> dict convertation
        logger.info('JSON saved')
    
    return making_dataframe(json_object, path)

In [8]:
def data_comparator(df_sis: pd.core.frame.DataFrame, df_p10: pd.core.frame.DataFrame, cross_analysis=False):
    
    """Function for generating reports on branches in the DataFrame format"""
    
    dfs = pd.merge(df_sis, df_p10, how='inner')         # Find the intersection of tables
    
    #Add intersections as duplicates and delete them *crutch
    df_sis = df_sis.append(dfs)                      
    df_p10 = df_p10.append(dfs)
    
    df_sis = df_sis.drop_duplicates(keep=False)
    df_p10 = df_p10.drop_duplicates(keep=False)
    
    # Reseting indexies
    df_sis = df_sis.reset_index(drop=True)
    df_p10 = df_p10.reset_index(drop=True)
    
    result_json_sis = df_sis.to_json(orient="columns")
    result_json_p10 = df_p10.to_json(orient="columns")
    
    with open('result_json_sis.txt', 'w') as file:
        file.write(result_json_sis)
    with open('result_json_p10.txt', 'w') as file:
        file.write(result_json_p10)
    
    """Cross-analysis works O (n ^ 2), for some reason pandas stubbornly refuses to compare sets,
        and I didn’t fasten the SKL because of time pressure, I’m sorry
        This is a head-on solution and it is not optimal, there is a much easier way to solve this problem"""
    
    res = False
    
    if cross_analysis:
        
        df_sis_croped = df_sis.drop(columns=['epoch', 'release', 'disttag', 'buildtime', 'source'])
        df_p10_croped = df_p10.drop(columns=['epoch', 'release', 'disttag', 'buildtime', 'source'])
        
        df_sis_croped['newer'] = True
        
        for i, x in df_sis_croped.iterrows():
            for j, y in df_sis_croped.iterrows():
                if x[0] == y[0] and x[1] == y[1] and x[2] < y[2]:
                    df_sis_croped.drop(index=[i], inplace = True)
        res = df_sis_croped.to_json(orient="columns")
    
    return result_json_sis, result_json_p10, res

In [21]:
if __name__ == '__main__':
    pbar = None # Making entitiy for future progress bar
    download_data() # Entry-point of script

2022-10-17 08:47:22.502 | INFO     | __main__:download_data:12 - Access successful
2022-10-17 08:47:22.503 | INFO     | __main__:download_data:13 - Starting to download firts package from %s
2022-10-17 08:47:29.371 | DEBUG    | __main__:show_progress:7 - Creating progress bar
2022-10-17 08:49:08.817 | INFO     | __main__:download_data:15 - Download from %s complete
2022-10-17 08:49:08.817 | INFO     | __main__:preparing_data:5 - Converting byte-like string %s to JSON
2022-10-17 08:49:09.089 | INFO     | __main__:preparing_data:9 - JSON saved
2022-10-17 08:49:09.089 | INFO     | __main__:making_dataframe:5 - Making Dataframe of %s
2022-10-17 08:49:10.645 | INFO     | __main__:making_dataframe:23 - Temporary file (%s) deleted successfully
2022-10-17 08:49:10.645 | INFO     | __main__:making_dataframe:27 - Making DataFrame already finished
2022-10-17 08:49:10.689 | INFO     | __main__:download_data:19 - Starting to download second package from %s
2022-10-17 08:49:17.259 | DEBUG    | __mai