# Merge datasets
This notebook performs the following tasks:
- Create `pickle.gz` files of the `test` and `solution` datasets
- Merge the submission datasets with the solution and test datasets

In [1]:
import pandas as pd
import numpy as np
import glob
import gzip
import pickle
from datetime import datetime

import sys
sys.path.append("..\\source\\")
import utils as utils

In [2]:
# Relative Windows-style folders. File locations in the cells below are built by
# plain string concatenation onto these, so the trailing separator is required.
path_in = "..\\data\\original\\"
path_out = "..\\data\\processed\\"

## Compress solution data

In [None]:
# Only `row_id` and `meter_reading` are read from the solution CSV.
# The backslash is escaped explicitly: "\s" is not a valid escape sequence and
# triggers a warning on current Python versions.
solution = pd.read_csv(path_in + "solution\\solution.csv", usecols=["row_id", "meter_reading"])

In [26]:
# Replace the value -9999 with NaN (only this exact value is swapped,
# not every negative reading)
solution = solution.replace(to_replace=-9999.0, value=np.nan)

In [27]:
# Summarise the column dtypes and memory footprint of the solution frame
solution.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41697600 entries, 0 to 41697599
Data columns (total 2 columns):
 #   Column         Dtype  
---  ------         -----  
 0   row_id         int64  
 1   meter_reading  float64
dtypes: float64(1), int64(1)
memory usage: 636.3 MB


In [30]:
# Percentage of missing values per column
solution.isna().sum().mul(100).div(len(solution))

row_id           0.000000
meter_reading    2.026966
dtype: float64

In [31]:
# Store the cleaned solution as a gzip-compressed pickle (compression level 9).
# The context manager closes the archive even if pickling fails, and
# pickle.dump streams into it instead of first building the whole byte string.
with gzip.GzipFile(path_in + 'solution\\solution.pickle.gz', 'wb', 9) as file:
    pickle.dump(solution, file)

## Compress test data

In [None]:
# Read the test CSV with all of its columns
df = pd.read_csv(path_in + "test\\test.csv")

In [None]:
# Store the test data as a gzip-compressed pickle (compression level 9) next to
# the original CSV: that is the location the "Merge with test" section loads it
# from, and it mirrors where the solution pickle is kept.
# The context manager closes the archive even if pickling fails, and
# pickle.dump streams into it instead of first building the whole byte string.
with gzip.GzipFile(path_in + 'test\\test.pickle.gz', 'wb', 9) as file:
    pickle.dump(df, file)

## Add solution to top50 submissions

In [32]:
# Load solution from the gzip-compressed pickle written in the
# "Compress solution data" section
solution = pd.read_pickle(path_in + "solution\\solution.pickle.gz")

In [3]:
# Top50 files. Sorted because glob returns matches in arbitrary order and the
# merge loop below slices this list by position.
top50_files = sorted(glob.glob(path_in + "top50_submissions\\*"))
len(top50_files)

50

In [15]:
# Batch window over `top50_files` (0-based, end-exclusive). It covers every
# submission by default so a fresh run does not skip the first file; narrow it
# only to resume an interrupted run.
start = 0
end = len(top50_files)
total = len(top50_files)

for number, datafile in enumerate(top50_files[start:end], start=start + 1):

    # file id: file name up to the first "." and then up to the first "_"
    name = datafile.split("\\")[-1].split(".")[0].split("_")[0]

    # Print progress
    progress = round(number * 100 / total, 2)
    print(f"{name} - {progress}% ({number} of {total})")

    # Load data; the submission's `meter_reading` is renamed so it does not
    # collide with the solution column of the same name
    df = pd.read_pickle(datafile).rename(columns={"meter_reading": "submission"})
    print("Data loaded")

    # Merge: the inner join keeps only row_ids present in both frames
    df = pd.merge(df, solution, how="inner", on="row_id")
    print("Data merged")

    # Export df. The context manager closes the archive even if pickling fails,
    # and pickle.dump streams into it instead of building the byte string first.
    out_file = path_out + f'merged\\submissions_solution\\sub{name}_solution.pickle.gz'
    with gzip.GzipFile(out_file, 'wb', 6) as file:
        pickle.dump(df, file)
    print("Data exported")

    print("")

NameError: name 'top50_files' is not defined

## Merge with test

In [4]:
# Sub + solution files. The string literal now opens and closes with the same
# quote character (the mismatched quotes were a SyntaxError). Sorted because
# glob returns matches in arbitrary order and the loop below slices by position.
files = sorted(glob.glob(path_out + "merged\\submissions_solution\\*"))
len(files)

50

In [6]:
# Test data: load the compressed pickle and pass it straight through
# utils.reduce_mem_usage to shrink its memory footprint
test = utils.reduce_mem_usage(pd.read_pickle(path_in + "test\\test.pickle.gz"))

Mem. usage decreased to 596.49 Mb (53.1% reduction)


In [5]:
# Summarise the column dtypes and memory footprint of the test frame
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41697600 entries, 0 to 41697599
Data columns (total 4 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   row_id       int32 
 1   building_id  int16 
 2   meter        int8  
 3   timestamp    object
dtypes: int16(1), int32(1), int8(1), object(1)
memory usage: 596.5+ MB


In [7]:
# Batch window over `files` (0-based, end-exclusive).
# NOTE(review): with start=0 / end=1 only the first file is merged; widen `end`
# (e.g. to len(files)) to process every submission.
start = 0
end = 1
total = len(files)

for number, datafile in enumerate(files[start:end], start=start + 1):

    # file id: file name up to the first "." and then up to the first "_"
    name = datafile.split("\\")[-1].split(".")[0].split("_")[0]

    # Print progress
    progress = round(number * 100 / total, 2)
    print(f"{name} - {progress}% ({number} of {total})")

    # Load data
    df = pd.read_pickle(datafile)
    df = utils.reduce_mem_usage(df)
    print("Data loaded")

    # Merge: inner join on row_id, which is dropped once the test columns
    # have been attached
    df = pd.merge(df, test, how="inner", on="row_id").drop("row_id", axis=1)
    print("Data merged")

    # Export df. The context manager closes the archive even if pickling fails,
    # and pickle.dump streams into it instead of building the byte string first.
    out_file = path_out + f'merged\\{name}_merged.pickle.gz'
    with gzip.GzipFile(out_file, 'wb', 6) as file:
        pickle.dump(df, file)
    print("Data exported")

    print("")

sub13577404 - 2.0% (1 of 50)
Mem. usage decreased to 795.32 Mb (37.5% reduction)
Data loaded
Data merged
Data exported

