In [1]:
import pandas as pd
import numpy as np
import gzip
import json
import shutil
from pathlib import Path

The aim is to wrangle the raw CSV data and save the cleaned result, then read that cleaned CSV back with `csv.DictReader` and store its rows in the JSON file we create.

# Processing the data in the CSV file

The overall purpose of this wrangle() function is to read a CSV file, perform various data cleaning and transformation steps, and return the resulting DataFrame with the modifications applied.

In [2]:
def wrangle(file_path, n_features=64):
    """Load the raw bankruptcy CSV and return a cleaned, typed DataFrame.

    Steps, in order:

    1. Read the CSV at ``file_path``.
    2. Rename the first ``n_features`` columns, by position, to
       ``Attr_1`` ... ``Attr_<n_features>``.
    3. Insert a 1-based ``company_id`` column at position 0.
    4. Rename the ``class`` column to ``bankrupt``.
    5. Replace ``'?'`` placeholders with real missing values (``NaN``).
    6. Cast the feature columns to ``float64`` and ``bankrupt`` to ``bool``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the raw CSV file.
    n_features : int, default 64
        Number of leading columns that hold the numeric attributes.

    Returns
    -------
    pandas.DataFrame
        Columns ``company_id``, ``Attr_1`` ... ``Attr_<n_features>``, then
        whatever columns followed the features in the file (with ``class``
        renamed to ``bankrupt``).

    Raises
    ------
    IndexError
        If the file has fewer than ``n_features`` columns.
    KeyError
        If the file has no ``class`` column (so ``bankrupt`` never exists).
    """
    df = pd.read_csv(file_path, low_memory=False)

    if len(df.columns) < n_features:
        raise IndexError(
            f"expected at least {n_features} columns, found {len(df.columns)}"
        )

    # Rename by position in one assignment instead of n_features separate
    # in-place rename() calls; this also cannot be confused by an existing
    # column that already carries one of the new names.
    columns = list(df.columns)
    columns[:n_features] = [f"Attr_{i}" for i in range(1, n_features + 1)]
    df.columns = columns

    # insert column 'company_id' (1-based row counter)
    df.insert(0, 'company_id', range(1, len(df) + 1))

    # rename 'class' column to 'bankrupt'
    df = df.rename(columns={'class': 'bankrupt'})

    # '?' marks a missing value in the raw file: store a real NaN rather than
    # the string 'NaN', so the frame never holds a fake textual placeholder.
    df = df.replace('?', np.nan)

    # Feature columns sit right after company_id, i.e. positions 1..n_features.
    df = df.astype({c: "float64" for c in df.columns[1:n_features + 1]})
    # NOTE(review): astype(bool) treats any non-zero / non-empty value as True;
    # this assumes the label was parsed as numeric 0/1 - confirm for new files.
    df['bankrupt'] = df['bankrupt'].astype(bool)

    return df

In [27]:
# Clean the raw CSV with wrangle(), then show the frame's shape and first rows.
# NOTE(review): the absolute path is specific to one machine.
df = wrangle(r"E:\WorldQuant\project_5\data\Poland_bankruptcy.csv")
print(df.shape)
df.head()

(10503, 66)


Unnamed: 0,company_id,Attr_1,Attr_2,Attr_3,Attr_4,Attr_5,Attr_6,Attr_7,Attr_8,Attr_9,...,Attr_56,Attr_57,Attr_58,Attr_59,Attr_60,Attr_61,Attr_62,Attr_63,Attr_64,bankrupt
0,1,0.17419,0.41299,0.14371,1.348,-28.982,0.60383,0.21946,1.1225,1.1961,...,0.16396,0.37574,0.83604,7e-06,9.7145,6.2813,84.291,4.3303,4.0341,False
1,2,0.14624,0.46038,0.2823,1.6294,2.5952,0.0,0.17185,1.1721,1.6018,...,0.027516,0.271,0.90108,0.0,5.9882,4.1103,102.19,3.5716,5.95,False
2,3,0.000595,0.22612,0.48839,3.1599,84.874,0.19114,0.004572,2.9881,1.0077,...,0.007639,0.000881,0.99236,0.0,6.7742,3.7922,64.846,5.6287,4.4581,False
3,4,0.024526,0.43236,0.27546,1.7833,-10.105,0.56944,0.024526,1.3057,1.0509,...,0.048398,0.043445,0.9516,0.14298,4.2286,5.0528,98.783,3.695,3.4844,False
4,5,0.18829,0.41504,0.34231,1.9279,-58.274,0.0,0.23358,1.4094,1.3393,...,0.17648,0.32188,0.82635,0.073039,2.5912,7.0756,100.54,3.6303,4.6375,False


In [4]:
# Print per-column non-null counts and dtypes of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10503 entries, 0 to 10502
Data columns (total 66 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   company_id  10503 non-null  int64  
 1   Attr_1      10503 non-null  float64
 2   Attr_2      10503 non-null  float64
 3   Attr_3      10503 non-null  float64
 4   Attr_4      10485 non-null  float64
 5   Attr_5      10478 non-null  float64
 6   Attr_6      10503 non-null  float64
 7   Attr_7      10503 non-null  float64
 8   Attr_8      10489 non-null  float64
 9   Attr_9      10500 non-null  float64
 10  Attr_10     10503 non-null  float64
 11  Attr_11     10503 non-null  float64
 12  Attr_12     10485 non-null  float64
 13  Attr_13     10460 non-null  float64
 14  Attr_14     10503 non-null  float64
 15  Attr_15     10495 non-null  float64
 16  Attr_16     10489 non-null  float64
 17  Attr_17     10489 non-null  float64
 18  Attr_18     10503 non-null  float64
 19  Attr_19     10460 non-nul

### Save the changes made by the `wrangle` function to a new CSV file, `Poland_bankruptcy_modified.csv`

In [5]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
df.to_csv(r"E:\WorldQuant\project_5\data\Poland_bankruptcy_modified.csv", index=False)

###### After processing the CSV file, we extract random rows, sort them by the first column, save them to a new CSV file, and remove the extracted rows from the original file.

In [6]:
import csv
import random

input_file = r'E:\WorldQuant\project_5\data\Poland_bankruptcy_modified.csv'
output_file = r'E:\WorldQuant\project_5\data\Poland_bankruptcy_modified_test.csv'

num_random_rows = 526  # Define the number of random rows you want to extract
random_seed = 42  # Fixed seed so every run extracts the same rows

# NOTE: this cell rewrites input_file in place. Running it again without first
# regenerating input_file removes a further num_random_rows rows from it.

# Step 1: Read the CSV file completely, and close it before anything is rewritten.
# newline='' is what the csv module expects for files it parses.
with open(input_file, 'r', newline='') as file:
    rows = list(csv.reader(file))

header_row = rows[0]
data_rows = rows[1:]

num_data_rows = len(data_rows)
# A private Random instance makes the draw reproducible without reseeding the
# module-level generator. sample() raises ValueError when num_random_rows
# exceeds the number of data rows.
random_indices = random.Random(random_seed).sample(range(num_data_rows), num_random_rows)
extracted_indices = set(random_indices)  # O(1) membership test for Step 4

random_rows = [data_rows[index] for index in random_indices]

# Step 2: Sort the extracted rows based on the first column
sorted_rows = sorted(random_rows, key=lambda row: float(row[0]))

# Step 3: Save the sorted extracted rows in a new CSV file with the header
with open(output_file, 'w', newline='') as output:
    writer = csv.writer(output)
    writer.writerow(header_row)  # Write the header row
    writer.writerows(sorted_rows)

# Step 4: Remove the extracted rows from the original CSV file
remaining_rows = [row for index, row in enumerate(data_rows) if index not in extracted_indices]

with open(input_file, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header_row)  # Write the header row
    writer.writerows(remaining_rows)

In [26]:
# Reload Poland_bankruptcy_modified.csv from disk and show its shape and first rows.
original_csv = pd.read_csv(r'E:\WorldQuant\project_5\data\Poland_bankruptcy_modified.csv')
print(original_csv.shape)
original_csv.head()

(9977, 66)


Unnamed: 0,company_id,Attr_1,Attr_2,Attr_3,Attr_4,Attr_5,Attr_6,Attr_7,Attr_8,Attr_9,...,Attr_56,Attr_57,Attr_58,Attr_59,Attr_60,Attr_61,Attr_62,Attr_63,Attr_64,bankrupt
0,1,0.17419,0.41299,0.14371,1.348,-28.982,0.60383,0.21946,1.1225,1.1961,...,0.16396,0.37574,0.83604,7e-06,9.7145,6.2813,84.291,4.3303,4.0341,False
1,2,0.14624,0.46038,0.2823,1.6294,2.5952,0.0,0.17185,1.1721,1.6018,...,0.027516,0.271,0.90108,0.0,5.9882,4.1103,102.19,3.5716,5.95,False
2,3,0.000595,0.22612,0.48839,3.1599,84.874,0.19114,0.004572,2.9881,1.0077,...,0.007639,0.000881,0.99236,0.0,6.7742,3.7922,64.846,5.6287,4.4581,False
3,4,0.024526,0.43236,0.27546,1.7833,-10.105,0.56944,0.024526,1.3057,1.0509,...,0.048398,0.043445,0.9516,0.14298,4.2286,5.0528,98.783,3.695,3.4844,False
4,5,0.18829,0.41504,0.34231,1.9279,-58.274,0.0,0.23358,1.4094,1.3393,...,0.17648,0.32188,0.82635,0.073039,2.5912,7.0756,100.54,3.6303,4.6375,False


In [8]:
import csv
filename = r'E:\WorldQuant\project_5\data\Poland_bankruptcy_modified_test.csv'
column_name = 'bankrupt'  # label column that must not appear in the test file

with open(filename, 'r', newline='') as file:
    rows = list(csv.reader(file))

# Locate the label column from the header row rather than trusting a fixed
# position, so a change in column layout cannot silently drop a feature.
header = rows[0] if rows else []
column_index = header.index(column_name) if column_name in header else None

# When the column is already gone (e.g. the cell is re-run) the file is left as is.
if column_index is not None:
    rows = [row[:column_index] + row[column_index+1:] for row in rows]

    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(rows)

In [25]:
extracted_csv = pd.read_csv(r'E:\WorldQuant\project_5\data\Poland_bankruptcy_modified_test.csv')
print(extracted_csv.shape)
extracted_csv.head()

(526, 65)


Unnamed: 0,company_id,Attr_1,Attr_2,Attr_3,Attr_4,Attr_5,Attr_6,Attr_7,Attr_8,Attr_9,...,Attr_55,Attr_56,Attr_57,Attr_58,Attr_59,Attr_60,Attr_61,Attr_62,Attr_63,Attr_64
0,9,0.23895,0.55473,0.40697,1.7609,-22.907,0.0,0.29714,0.80268,2.3317,...,3315.2,0.14038,0.53663,0.87292,0.044584,4.4527,7.1847,83.727,4.3594,40.097
1,28,0.15079,0.5842,0.23091,1.4485,85.889,0.0,0.18543,0.71174,0.87949,...,6854.9,0.21593,0.36266,0.79215,0.0,12.094,1.308,213.66,1.7083,3.459
2,40,0.013141,0.38626,0.38047,2.243,44.2,0.0,0.013141,1.5889,0.88321,...,3220.0,-0.13272,0.021412,0.9872,0.0,3.3917,4.1778,126.5,2.8855,2.8178
3,81,-0.010051,0.5457,-0.0015,0.99725,-31.118,0.12076,-0.010051,0.83252,3.3555,...,-2.081,-0.008543,-0.022123,1.003,0.0,11.869,13.582,59.359,6.149,7.3617
4,83,0.033372,0.089748,0.10903,2.2789,-2.6961,0.28508,0.033372,10.142,0.28484,...,2097.1,-0.31878,0.036663,0.92596,0.0,2.5771,4.0406,109.24,3.3413,0.35351


### Create JSON file 

In [10]:
# Skeleton of the training JSON document: a schema section, an empty "data"
# list (filled in by a later cell) and citation metadata.
# Field list: the integer key, Attr_1..Attr_64 as numbers, then the boolean label.
train_schema_fields = [{'name': 'company_id', 'type': 'integer'}]
train_schema_fields.extend(
    {'name': f'Attr_{number}', 'type': 'number'} for number in range(1, 65)
)
train_schema_fields.append({'name': 'bankrupt', 'type': 'boolean'})

Poland_bankruptcy_modified = {
    "schema": {
        "fields": train_schema_fields,
        "primary_key": ["company_id"],
        "pandas_version": "0.20.0",
    },
    "data": [],
    "metadata": {
        "title": "Ensemble Boosted Trees with Synthetic Features Generation in Application to Bankruptcy Prediction",
        "authors": "Zieba, M., Tomczak, S. K., & Tomczak, J. M.",
        "journal": "Expert Systems with Applications",
        "publicationYear": 2016,
        "dataYear": 2009,
        "articleLink": "doi:10.1016/j.eswa.2016.04.001",
        "datasetLink": "https://archive.ics.uci.edu/ml/datasets/Polish+companies+bankruptcy+data",
    },
}

# Write the skeleton to disk, pretty-printed with a 4-space indent.
with open(r"E:\WorldQuant\project_5\data\Poland_bankruptcy_modified.json", "w") as file:
    json.dump(Poland_bankruptcy_modified, file, ensure_ascii=False, indent=4)

# Note

The `csv.DictReader` class in Python's `csv` module reads every value from a CSV file as a string. This is intentional: CSV files are plain text and carry no explicit data types, so all values are initially treated as strings.

So if we append the data from a CSV file as dictionaries, the data types of values will be treated as strings. To avoid this problem, there are two solutions: one is to fix the data types of values before appending them to the JSON file, and the other is to fix the data types of values after appending them to the JSON file.

##### In this Notebook, we will fix the data types of values before appending them to the JSON file.

By the end of the loop, the `dict_data_1` list will contain dictionaries, with each dictionary representing a row of data from the CSV file.

In [11]:
import csv

# One dict per CSV row, with values converted from text to Python types.
dict_data_1 = []

with open(r"E:\WorldQuant\project_5\data\Poland_bankruptcy_modified.csv", newline='') as f:
    for row in csv.DictReader(f):
        typed_row = {}
        for column, text in row.items():
            if text == "":
                # Empty cell -> missing value
                typed_row[column] = None
            elif text == "True":
                typed_row[column] = True
            elif text == "False":
                typed_row[column] = False
            elif column == "company_id":
                typed_row[column] = int(text)
            else:
                # Every remaining column is parsed as a float
                typed_row[column] = float(text)
        dict_data_1.append(typed_row)

In [12]:

# Load the JSON skeleton, attach the typed rows under "data" and write it back.
json_file = Path(r"E:\WorldQuant\project_5\data\Poland_bankruptcy_modified.json")
json_data = json.loads(json_file.read_text())

data = list(dict_data_1)  # shallow copy of the row list
json_data["data"] = data

serialized = json.dumps(json_data)
# Last expression: write_text returns the number of characters written.
json_file.write_text(serialized)

12901871

In [13]:
def wrangle_1(filename):
    """Load the ``"data"`` records of a JSON document into a DataFrame.

    Both plain ``.json`` files and gzip-compressed ``.json.gz`` files are
    accepted; the choice is made from the file name's ``.gz`` suffix.

    Parameters
    ----------
    filename : str or path-like
        Path to a JSON document with a top-level ``"data"`` entry.

    Returns
    -------
    pandas.DataFrame
        Frame built from ``data["data"]``. No index is set; the default
        RangeIndex is kept.

    Raises
    ------
    KeyError
        If the document has no ``"data"`` entry.
    """
    # Pick the opener from the suffix so compressed files work transparently;
    # "rt" is ordinary text mode for open() and decompress-to-text for gzip.
    opener = gzip.open if str(filename).endswith(".gz") else open
    with opener(filename, "rt") as f:
        data = json.load(f)

    # Load the records into a DataFrame
    df = pd.DataFrame.from_dict(data["data"])

    return df

In [28]:
# Read the training JSON back into a DataFrame and show its shape and first rows.
df1 = wrangle_1(r"E:\WorldQuant\project_5\data\Poland_bankruptcy_modified.json")
print(df1.shape)
df1.head()

(9977, 66)


Unnamed: 0,company_id,Attr_1,Attr_2,Attr_3,Attr_4,Attr_5,Attr_6,Attr_7,Attr_8,Attr_9,...,Attr_56,Attr_57,Attr_58,Attr_59,Attr_60,Attr_61,Attr_62,Attr_63,Attr_64,bankrupt
0,1,0.17419,0.41299,0.14371,1.348,-28.982,0.60383,0.21946,1.1225,1.1961,...,0.16396,0.37574,0.83604,7e-06,9.7145,6.2813,84.291,4.3303,4.0341,False
1,2,0.14624,0.46038,0.2823,1.6294,2.5952,0.0,0.17185,1.1721,1.6018,...,0.027516,0.271,0.90108,0.0,5.9882,4.1103,102.19,3.5716,5.95,False
2,3,0.000595,0.22612,0.48839,3.1599,84.874,0.19114,0.004572,2.9881,1.0077,...,0.007639,0.000881,0.99236,0.0,6.7742,3.7922,64.846,5.6287,4.4581,False
3,4,0.024526,0.43236,0.27546,1.7833,-10.105,0.56944,0.024526,1.3057,1.0509,...,0.048398,0.043445,0.9516,0.14298,4.2286,5.0528,98.783,3.695,3.4844,False
4,5,0.18829,0.41504,0.34231,1.9279,-58.274,0.0,0.23358,1.4094,1.3393,...,0.17648,0.32188,0.82635,0.073039,2.5912,7.0756,100.54,3.6303,4.6375,False


In [15]:
# Print per-column non-null counts and dtypes of the frame loaded from JSON.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9977 entries, 0 to 9976
Data columns (total 66 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   company_id  9977 non-null   int64  
 1   Attr_1      9977 non-null   float64
 2   Attr_2      9977 non-null   float64
 3   Attr_3      9977 non-null   float64
 4   Attr_4      9959 non-null   float64
 5   Attr_5      9955 non-null   float64
 6   Attr_6      9977 non-null   float64
 7   Attr_7      9977 non-null   float64
 8   Attr_8      9963 non-null   float64
 9   Attr_9      9974 non-null   float64
 10  Attr_10     9977 non-null   float64
 11  Attr_11     9977 non-null   float64
 12  Attr_12     9959 non-null   float64
 13  Attr_13     9936 non-null   float64
 14  Attr_14     9977 non-null   float64
 15  Attr_15     9969 non-null   float64
 16  Attr_16     9963 non-null   float64
 17  Attr_17     9963 non-null   float64
 18  Attr_18     9977 non-null   float64
 19  Attr_19     9936 non-null  

In [16]:
# Write a gzip-compressed copy of the training JSON next to the original file.
with open(r"E:\WorldQuant\project_5\data\Poland_bankruptcy_modified.json", 'rb') as raw_stream, \
        gzip.open(r"E:\WorldQuant\project_5\data\Poland_bankruptcy_modified.json.gz", 'wb') as packed_stream:
    # Stream the bytes across instead of loading the whole file into memory.
    shutil.copyfileobj(raw_stream, packed_stream)


In [17]:
# Skeleton of the test JSON document: a schema section, an empty "data" list
# (filled in by a later cell) and citation metadata.
# Field list: the integer key followed by Attr_1..Attr_64 as numbers (no label).
test_schema_fields = [{'name': 'company_id', 'type': 'integer'}]
test_schema_fields.extend(
    {'name': f'Attr_{number}', 'type': 'number'} for number in range(1, 65)
)

Poland_bankruptcy_modified_test = {
    "schema": {
        "fields": test_schema_fields,
        "primary_key": ["company_id"],
        "pandas_version": "0.20.0",
    },
    "data": [],
    "metadata": {
        "title": "Ensemble Boosted Trees with Synthetic Features Generation in Application to Bankruptcy Prediction",
        "authors": "Zieba, M., Tomczak, S. K., & Tomczak, J. M.",
        "journal": "Expert Systems with Applications",
        "publicationYear": 2016,
        "dataYear": 2009,
        "articleLink": "doi:10.1016/j.eswa.2016.04.001",
        "datasetLink": "https://archive.ics.uci.edu/ml/datasets/Polish+companies+bankruptcy+data",
    },
}

# Write the skeleton to disk, pretty-printed with a 4-space indent.
with open(r"E:\WorldQuant\project_5\data\Poland_bankruptcy_modified_test.json", "w") as file:
    json.dump(Poland_bankruptcy_modified_test, file, ensure_ascii=False, indent=4)

In [18]:
import csv

# One dict per row of the test CSV, with values converted from text to Python types.
dict_data_2 = []

with open(r"E:\WorldQuant\project_5\data\Poland_bankruptcy_modified_test.csv", newline='') as f:
    for row in csv.DictReader(f):
        typed_row = {}
        for column, text in row.items():
            if text == "":
                # Empty cell -> missing value
                typed_row[column] = None
            elif column == "company_id":
                typed_row[column] = int(text)
            else:
                # Every remaining column is parsed as a float
                typed_row[column] = float(text)
        dict_data_2.append(typed_row)

In [19]:
# Load the test JSON skeleton, attach the typed rows under "data" and write it back.
json_file = Path(r"E:\WorldQuant\project_5\data\Poland_bankruptcy_modified_test.json")
json_data = json.loads(json_file.read_text())

data = list(dict_data_2)  # shallow copy of the row list
json_data["data"] = data

serialized = json.dumps(json_data)
# Last expression: write_text returns the number of characters written.
json_file.write_text(serialized)

673715

In [20]:
def wrangle_2(filename):
    """Load the ``"data"`` records of a JSON document into a DataFrame.

    Both plain ``.json`` files and gzip-compressed ``.json.gz`` files are
    accepted; the choice is made from the file name's ``.gz`` suffix.

    Parameters
    ----------
    filename : str or path-like
        Path to a JSON document with a top-level ``"data"`` entry.

    Returns
    -------
    pandas.DataFrame
        Frame built from ``data["data"]``. No index is set; the default
        RangeIndex is kept.

    Raises
    ------
    KeyError
        If the document has no ``"data"`` entry.
    """
    # Pick the opener from the suffix so compressed files work transparently;
    # "rt" is ordinary text mode for open() and decompress-to-text for gzip.
    opener = gzip.open if str(filename).endswith(".gz") else open
    with opener(filename, "rt") as f:
        data = json.load(f)

    # Load the records into a DataFrame
    df = pd.DataFrame.from_dict(data["data"])

    return df

In [24]:
# Read the test JSON back into a DataFrame and show its shape and first rows.
df2 = wrangle_2(r"E:\WorldQuant\project_5\data\Poland_bankruptcy_modified_test.json")
print(df2.shape)
df2.head()

(526, 65)


Unnamed: 0,company_id,Attr_1,Attr_2,Attr_3,Attr_4,Attr_5,Attr_6,Attr_7,Attr_8,Attr_9,...,Attr_55,Attr_56,Attr_57,Attr_58,Attr_59,Attr_60,Attr_61,Attr_62,Attr_63,Attr_64
0,9,0.23895,0.55473,0.40697,1.7609,-22.907,0.0,0.29714,0.80268,2.3317,...,3315.2,0.14038,0.53663,0.87292,0.044584,4.4527,7.1847,83.727,4.3594,40.097
1,28,0.15079,0.5842,0.23091,1.4485,85.889,0.0,0.18543,0.71174,0.87949,...,6854.9,0.21593,0.36266,0.79215,0.0,12.094,1.308,213.66,1.7083,3.459
2,40,0.013141,0.38626,0.38047,2.243,44.2,0.0,0.013141,1.5889,0.88321,...,3220.0,-0.13272,0.021412,0.9872,0.0,3.3917,4.1778,126.5,2.8855,2.8178
3,81,-0.010051,0.5457,-0.0015,0.99725,-31.118,0.12076,-0.010051,0.83252,3.3555,...,-2.081,-0.008543,-0.022123,1.003,0.0,11.869,13.582,59.359,6.149,7.3617
4,83,0.033372,0.089748,0.10903,2.2789,-2.6961,0.28508,0.033372,10.142,0.28484,...,2097.1,-0.31878,0.036663,0.92596,0.0,2.5771,4.0406,109.24,3.3413,0.35351


In [22]:
# Print per-column non-null counts and dtypes of the test frame loaded from JSON.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 526 entries, 0 to 525
Data columns (total 65 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   company_id  526 non-null    int64  
 1   Attr_1      526 non-null    float64
 2   Attr_2      526 non-null    float64
 3   Attr_3      526 non-null    float64
 4   Attr_4      526 non-null    float64
 5   Attr_5      523 non-null    float64
 6   Attr_6      526 non-null    float64
 7   Attr_7      526 non-null    float64
 8   Attr_8      526 non-null    float64
 9   Attr_9      526 non-null    float64
 10  Attr_10     526 non-null    float64
 11  Attr_11     526 non-null    float64
 12  Attr_12     526 non-null    float64
 13  Attr_13     524 non-null    float64
 14  Attr_14     526 non-null    float64
 15  Attr_15     526 non-null    float64
 16  Attr_16     526 non-null    float64
 17  Attr_17     526 non-null    float64
 18  Attr_18     526 non-null    float64
 19  Attr_19     524 non-null    f

In [23]:
# Write a gzip-compressed copy of the test JSON next to the original file.
with open(r"E:\WorldQuant\project_5\data\Poland_bankruptcy_modified_test.json", 'rb') as raw_stream, \
        gzip.open(r"E:\WorldQuant\project_5\data\Poland_bankruptcy_modified_test.json.gz", 'wb') as packed_stream:
    # Stream the bytes across instead of loading the whole file into memory.
    shutil.copyfileobj(raw_stream, packed_stream)
