# Prepare subsets from CSV

## Import and Functions

In [1]:
import csv
import time
import winsound
import pathlib
import pandas as pd

def get_header_and_lookup_values(file: str):
    t = pd.read_csv(file, dtype=str)
    filter_column=t.columns[0]
    wanted_values = t[filter_column].to_list()
    return filter_column, wanted_values

def filter_source_file(source_file: str, filter_column: str, wanted_values: list, chunksize=10**5):
    print('Looking for mathces in column {} inside file:{}'.format(filter_column, source_file))
    filtered_data = pd.DataFrame(index=None)
    with pd.read_csv(source_file, chunksize=chunksize, dtype=str) as reader:
        for chunk in reader:
            to_be_added = chunk.loc[chunk[filter_column].isin(wanted_values)]
            filtered_data = pd.concat([filtered_data, to_be_added])
            print('Rows in filtered data: {}'.format(filtered_data.shape[0]))
    print('done reading {}'.format(source_file))
    return filtered_data

def write_df_to_csv(df, file_full_path: str, str_addition='_subset'):
    full_path = pathlib.Path(file_full_path)
    file_name = full_path.stem
    subset_full_path = "output/" + file_name + str_addition + '.csv'
    print('Writing: {}..'.format(subset_full_path),end='')
    df.to_csv(subset_full_path, index=False)
    print('done')
    



## Used files

In [2]:
#Training:
YTMC_file = 'data/2022-08-17_YTMC_oilseed_SUBSET.csv'
NFMC_file = 'data/2022-08-17_NFMC_oilseed_SUBSET.csv'
INVB_file = 'data/2022-08-17_INVB_oilseed_SUBSET.csv'
# Real:
# YTMC_file = 'C:/Users/amnon/Desktop/ACCOUNTS/BASF/WS3_migration/Crop_zip_files/oil_seeds_170822/2022-08-17_YTMC_oilseed.csv'
# NFMC_file = 'C:/Users/amnon/Desktop/ACCOUNTS/BASF/WS3_migration/Crop_zip_files/oil_seeds_170822/2022-08-17_NFMC_oilseed.csv'
# INVB_file = 'C:/Users/amnon/Desktop/ACCOUNTS/BASF/WS3_migration/Crop_zip_files/oil_seeds_170822/2022-08-17_INVB_oilseed.csv'
wanted_YTMC_values_file= 'Oilseeds_YTMC_doc_ids.csv'
wanted_NFMC_values_file= 'Oilseeds_NFMC_doc_ids.csv'

# Main

In [3]:
startTime = time.time()

## YTMC

### Get header for filtering and values

In [4]:
filter_column, wanted_values = get_header_and_lookup_values(wanted_YTMC_values_file)
# filter_column = 'doc_id'
# wanted_values = ['507733', '507736']

### read source and get df of the subset

In [5]:
df = filter_source_file(YTMC_file, filter_column, wanted_values)

Looking for mathces in column doc_id inside file:data/2022-08-17_YTMC_oilseed_SUBSET.csv
Rows in filtered data: 2772
done reading data/2022-08-17_YTMC_oilseed_SUBSET.csv


### write subset to csv

In [6]:
write_df_to_csv(df, YTMC_file)

Writing: output/2022-08-17_YTMC_oilseed_SUBSET_subset.csv..done


### Add PD_BAYER_UID values to list

In [7]:
YTMC_pd_BAYER_UID = [item for item in df['C_PD_BAYER_UID'].unique().tolist() if item]

## NFMC

### Get header for filtering and values

In [8]:
filter_column, wanted_values = get_header_and_lookup_values(wanted_NFMC_values_file)

### read source and get df of the subset

In [9]:
df = filter_source_file(NFMC_file, filter_column, wanted_values)

Looking for mathces in column doc_id inside file:data/2022-08-17_NFMC_oilseed_SUBSET.csv
Rows in filtered data: 157
done reading data/2022-08-17_NFMC_oilseed_SUBSET.csv


### Write subset to csv

In [10]:
write_df_to_csv(df, NFMC_file)

Writing: output/2022-08-17_NFMC_oilseed_SUBSET_subset.csv..done


### Add PD_BAYER_UID values to list

In [11]:
NFMC_pd_BAYER_UID = [item for item in df['C_PD_BAYER_UID'].unique().tolist() if item]

## INVB file by list

In [12]:
# remove blanks from list
needed_pd_BAYER_UID = YTMC_pd_BAYER_UID + NFMC_pd_BAYER_UID
# Create subset from another file (INVB)
    # Read source (INVB) and get df of the subset
invb_df = filter_source_file(INVB_file, 'C_PD_BAYER_UID', needed_pd_BAYER_UID)
    # write subset to CSV
write_df_to_csv(invb_df,INVB_file, '_subset')

Looking for mathces in column C_PD_BAYER_UID inside file:data/2022-08-17_INVB_oilseed_SUBSET.csv
Rows in filtered data: 2888
done reading data/2022-08-17_INVB_oilseed_SUBSET.csv
Writing: output/2022-08-17_INVB_oilseed_SUBSET_subset.csv..done


## Finish

In [13]:
winsound.Beep(440, 1000)
executionTime = (time.time() - startTime)
print('Execution time in seconds: ' + str(executionTime))

Execution time in seconds: 2.3558247089385986
