# Clean Orbis Data

This is an **old version** of the pre-processing notebook for the Orbis data files.

Here, we used _Overviews_ to retrieve information about which companies are German.


In [None]:
import os
import pyunpack
import pandas as pd

## 1. Load parts of Orbis dataset

The Orbis dataset is stored at the path:
```python
../data/raw/orbis/
```

The data are read into a pandas **DataFrame**.


### Extract the .rar

In [None]:
# Source archive and the directory its contents are unpacked into
overviews_rar = '../data/raw/orbis/Overviews.rar'
data_intermediate_dir = '../data/intermediate/orbis'

# Make sure the target directory is there before extracting into it
os.makedirs(data_intermediate_dir, exist_ok=True)

# Unpack the whole archive into the intermediate directory
pyunpack.Archive(overviews_rar).extractall(data_intermediate_dir)

### Get overview

In [None]:
overviews_path = '../data/intermediate/orbis/Overviews.txt'

# Preview only the first 5 rows of the tab-separated file to inspect its
# columns without loading the whole (large) file.
# `on_bad_lines='warn'` (pandas >= 1.3) replaces `error_bad_lines=False`,
# which was deprecated in pandas 1.3 and removed in 2.0. Malformed lines are
# still skipped with a warning instead of aborting the read.
df_chunk = pd.read_csv(overviews_path, nrows=5,
                       on_bad_lines='warn',
                       sep='\t')

df_chunk

## 2. Filter by the country

Information about the **main domestic country**, the **main foreign countries or regions**, and other interesting values is stored in Overviews.rar.

In [None]:
chunk_list = []  # the filtered rows of each chunk are collected here

# Stream the large tab-separated file in chunks of 1000 rows so that it never
# has to fit into memory at once.
# `on_bad_lines='warn'` (pandas >= 1.3) replaces `error_bad_lines=False`,
# which was deprecated in pandas 1.3 and removed in 2.0. Malformed lines are
# still skipped with a warning instead of aborting the read.
df_chunk = pd.read_csv(overviews_path, chunksize=1000,
                       on_bad_lines='warn',
                       sep='\t')

for chunk in df_chunk:
    # A row is kept if its BvD ID starts with 'DE' ...
    # (`astype(str)` keeps this working for chunks where the ID column is not
    # of string type, e.g. when every ID in the chunk is missing.)
    german_filter = chunk['BvD ID number'].astype(str).str.startswith(
        'DE', na=False)

    # ... or if any of its columns contains 'Germany' or 'Deutschland'.
    # The matches are accumulated in one boolean row mask. Removing matched
    # rows by BvD ID (as was done before) also discarded not-yet-matched rows
    # that shared an ID, or had a missing ID, before their remaining columns
    # were checked.
    for column_name in chunk.columns:
        german_filter |= chunk[column_name].astype(str).str.contains(
            r'Germany|Deutschland', na=False)

    # Append the filtered chunk once; its rows stay in file order
    chunk_list.append(chunk[german_filter])

# Concatenate all the filtered chunks
df_concat = pd.concat(chunk_list)

df_concat


## 3. Check interesting values

In [None]:
# Get the rows whose BvD ID does not start with 'DE'.
# The previous form `~mask == True` was evaluated as `(~mask) == True` and
# raised a TypeError as soon as the mask contained NaN (missing IDs).
# Converting to str and passing `na=False` yields a plain boolean mask in
# which missing IDs count as "not DE".
new_df = df_concat[
    ~df_concat['BvD ID number'].astype(str).str.startswith('DE', na=False)
]

new_df.head()

In [None]:
# Show the distinct values that occur in the 'Main domestic country' column
# of the filtered rows
df_concat['Main domestic country'].unique()

### Check column values

Check values in the different columns to choose the useful columns.

In [None]:
# Keep the column labels around
column_names = df_concat.columns

# Report the number of columns and rows, followed by an empty line
print(
    f"Number of the columns: {len(column_names)}",
    f"Number of rows: {len(df_concat)}",
    "",
    sep="\n",
)

# Data type of every column
print(df_concat.dtypes)

### Check the uniqueness and NaN values

In [None]:
# Report, column by column, whether all of its values are distinct
for i in column_names:
    print(f'{i} is unique: {df_concat[i].is_unique}')

# Show the row labels (index) of the concatenated frame
df_concat.index.values

In [None]:
# Flag every row that has at least one missing value and report how many
# such rows there are
row_contains_NaN = df_concat.isna().any(axis='columns')
print(f"Row contains NaN: {len(df_concat[row_contains_NaN])}")

In [None]:
# Flag every row that consists solely of missing values and report how many
# such rows there are
row_is_NaN = df_concat.isna().all(axis='columns')
print(f"Row is NaN: {len(df_concat[row_is_NaN])}")

## 3. Save the BvD ID of German companies


The processed data is stored in a CSV file at the path:
```python
../data/intermediate/orbis
```

In [None]:
# Positional indices of the columns to keep; position 0 is presumably the
# BvD ID column - verify against the file header
columns_to_take = [0]

# Select those columns by position
new_df = df_concat.iloc[:, columns_to_take]

print(f'Num. of rows: {len(new_df)}')

new_df.head(10)

In [None]:
# Output location: <intermediate_dir>/<df_file>
df_file = "orbis_german_bvid.csv"
intermediate_dir = "../data/intermediate/orbis"

# Make sure the target directory is there before writing
os.makedirs(intermediate_dir, exist_ok=True)

# Write the frame (index included) as tab-separated text
output_path = os.path.join(intermediate_dir, df_file)
new_df.to_csv(output_path, sep='\t')

## 4. Remove intermediate file

Remove the extracted intermediate file _Overviews.txt_.

In [None]:
# Delete the intermediate file at `overviews_path`; it is no longer needed
# once the BvD IDs have been saved.
# NOTE: os.remove raises FileNotFoundError if the file is already gone,
# e.g. when this cell is run a second time.
os.remove(overviews_path)