In [1]:
# Import the libraries used by the rest of the notebook.
import sdv
import pandas as pd

# Print the installed SDV version.
print(sdv.version.public)



1.11.0


In [2]:
# Unpack the zipped data folder into the current working directory.
import os
import zipfile

zip_path = 'To Benedict.zip'
extract_path = os.getcwd()

with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)


### Loading the data

In [3]:
# Load the hMOF training table from the extracted data folder.
train = pd.read_csv('To Benedict/hMOFs data/train_scl1.csv')



In [4]:
# Preview the first rows of the training table.
train.head()

Unnamed: 0,id,lcd,pld,void_fraction,surface_area_m2cm3,pressure_2.5(bar)
0,hMOF-1000002,1.874265,1.599817,4.513268,3.23268,1.05588
1,hMOF-1000004,1.874265,1.599817,4.328546,3.358655,1.086126
2,hMOF-1000006,1.753345,1.599817,4.626599,3.150817,1.099162
3,hMOF-1000010,1.874265,1.599817,4.32831,3.344909,1.027537
4,hMOF-1000011,1.753345,1.599817,4.585523,3.129274,1.069447


In [5]:
# train_columns = ['id','lcd', 'pld', 'void_fraction', 'surface_area_m2g','pressure_0.01(bar)']
# train.columns = train_columns


In [6]:
# Write the training table to a new CSV file without the index column.
train.to_csv('new_train.csv', index=False)

# NOTE(review): this list is not used by any cell visible in this notebook, and it
# names 'surface_area_m2g' and 'pressure_0.01(bar)', which are not among the columns
# shown by train.head() above -- confirm whether it is still needed.
column_names = ['id','lcd', 'pld', 'void_fraction', 'surface_area_m2g','surface_area_m2cm3', 'pressure_0.01(bar)']


### Metadata

In [7]:
from sdv.metadata import SingleTableMetadata

# Start from an empty single-table metadata object.
metadata = SingleTableMetadata()

# Auto-detect the column types from the CSV file written in the previous cell.
metadata.detect_from_csv(filepath='new_train.csv')

In [8]:
# Convert the detected metadata to a plain Python dictionary for inspection.
python_dict = metadata.to_dict()

Set `pii=True` on a column in your corrections if you want that column to be anonymized.
   


In [9]:
# Display the metadata dictionary.
python_dict

{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'primary_key': 'id',
 'columns': {'id': {'sdtype': 'id'},
  'lcd': {'sdtype': 'numerical'},
  'pld': {'sdtype': 'numerical'},
  'void_fraction': {'sdtype': 'numerical'},
  'surface_area_m2cm3': {'sdtype': 'numerical'},
  'pressure_2.5(bar)': {'sdtype': 'numerical'}}}

In [10]:
# #in a case where there is mistake in the auto detection of metadata
# metadata.update_columns(
#     column_names=['age', 'transactions', 'session_length'],
#     sdtype='numerical',
#     computer_representation='Float'
# )

In [11]:
# #setting primary key
# metadata.set_primary_key(column_name='guest_email')

In [12]:
# Validate the metadata definition itself.
metadata.validate()

In [13]:
# Validate the training data against the metadata.
metadata.validate_data(data=train)

In [14]:
# from sdv.metadata import SingleTableMetadata

# metadata.save_to_json(filepath='my_metadata_v1.json')




In [15]:
# Reload the metadata from its JSON snapshot. The cell that writes the snapshot
# is commented out, so on a fresh kernel the file may not exist yet; in that case
# persist the metadata detected and validated above before loading it back.
import os

from sdv.metadata import SingleTableMetadata

metadata_path = 'my_metadata_v1.json'
if not os.path.exists(metadata_path):
    # save_to_json refuses to overwrite an existing file, hence the existence guard.
    metadata.save_to_json(filepath=metadata_path)

metadata = SingleTableMetadata.load_from_json(filepath=metadata_path)

### Creating a synthesizer
An SDV synthesizer is an object that we use to create synthetic data. It learns patterns from the real data and replicates them to generate synthetic data.



In [16]:
# Fit a Gaussian copula synthesizer on the training table and sample from it.
# A faster alternative, not used here, is the preset:
#     SingleTablePreset(metadata, name='FAST_ML')
from sdv.lite import SingleTablePreset
from sdv.single_table import GaussianCopulaSynthesizer

# Create the synthesizer from the metadata, then train it on the real data.
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(train)

# Generate 21148 synthetic rows.
synthetic_data = synthesizer.sample(num_rows=21148)

In [17]:
# Write the synthetic table to CSV without the index column.
synthetic_data.to_csv('synthetic_data.csv', index=False)

In [18]:
# Run the diagnostic report comparing the synthetic data with the real data.
from sdv.evaluation.single_table import run_diagnostic

diagnostic = run_diagnostic(train, synthetic_data, metadata)

Generating report ...
(1/2) Evaluating Data Validity: : 100%|██████████| 6/6 [00:00<00:00, 53.37it/s]
(2/2) Evaluating Data Structure: : 100%|██████████| 1/1 [00:00<00:00, 130.32it/s]

Overall Score: 100.0%

Properties:
- Data Validity: 100.0%
- Data Structure: 100.0%


In [19]:
# Measure data quality, i.e. the statistical similarity between the real and
# the synthetic data.
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    real_data=train,
    synthetic_data=synthetic_data,
    metadata=metadata,
)

Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 6/6 [00:00<00:00, 39.83it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 15/15 [00:00<00:00, 43.63it/s]

Overall Score: 93.73%

Properties:
- Column Shapes: 94.3%
- Column Pair Trends: 93.16%


In [20]:
# Show the per-column details behind the 'Column Shapes' property of the quality report.
quality_report.get_details('Column Shapes')

Unnamed: 0,Column,Metric,Score
0,lcd,KSComplement,0.949972
1,pld,KSComplement,0.857906
2,void_fraction,KSComplement,0.979052
3,surface_area_m2cm3,KSComplement,0.959854
4,pressure_2.5(bar),KSComplement,0.968177


In [21]:
# #generate a plot
# from sdv.evaluation.single_table import get_column_pair_plot

# fig = get_column_pair_plot(
#     real_data=train,
#     synthetic_data=synthetic_data,
#     column_names=['surface_area_m2cm3', 'void_fraction'],
#     metadata=metadata
# )

# fig.show()

In [22]:
# from sdv.evaluation.single_table import get_column_plot

# fig = get_column_plot(
#     real_data=train,
#     synthetic_data=synthetic_data,
#     column_name='pressure_2.5(bar)',
#     metadata=metadata
# )

# fig.show()

In [23]:
# Display the generated synthetic table.
synthetic_data

Unnamed: 0,id,lcd,pld,void_fraction,surface_area_m2cm3,pressure_2.5(bar)
0,sdv-id-0,3.036759,2.784457,6.007545,3.861824,0.622209
1,sdv-id-1,1.839953,1.210207,4.671190,3.610159,1.054537
2,sdv-id-2,2.620401,1.819342,4.883504,2.547022,0.378229
3,sdv-id-3,1.775954,1.720503,5.039513,4.691008,1.018116
4,sdv-id-4,1.666225,1.576467,5.470821,3.170497,1.177341
...,...,...,...,...,...,...
21143,sdv-id-21143,1.314956,1.114136,3.763455,4.560103,1.254863
21144,sdv-id-21144,1.635590,1.085299,4.588752,3.364793,1.090449
21145,sdv-id-21145,2.607567,2.331363,6.295555,4.460693,0.977427
21146,sdv-id-21146,2.176922,1.425379,4.291933,2.778922,1.154610


In [24]:
# Display the real training table.
train

Unnamed: 0,id,lcd,pld,void_fraction,surface_area_m2cm3,pressure_2.5(bar)
0,hMOF-1000002,1.874265,1.599817,4.513268,3.232680,1.055880
1,hMOF-1000004,1.874265,1.599817,4.328546,3.358655,1.086126
2,hMOF-1000006,1.753345,1.599817,4.626599,3.150817,1.099162
3,hMOF-1000010,1.874265,1.599817,4.328310,3.344909,1.027537
4,hMOF-1000011,1.753345,1.599817,4.585523,3.129274,1.069447
...,...,...,...,...,...,...
21143,hMOF-5061635,3.808990,3.495896,6.314403,3.536744,0.542799
21144,hMOF-5061648,1.632424,1.125797,5.068869,5.958382,1.227704
21145,hMOF-5061649,1.269663,1.007292,3.177161,2.306537,0.973083
21146,hMOF-5061655,3.688070,3.377391,6.093433,3.842655,0.536680


In [25]:
# Write the synthetic table to CSV without the index column.
# NOTE(review): this is the same call, to the same file name, as an earlier cell in
# this notebook -- one of the two is probably redundant.
synthetic_data.to_csv('synthetic_data.csv', index=False)

In [26]:
# Save the fitted synthesizer for future use, then reload it from disk.
# The object being saved is a GaussianCopulaSynthesizer, so it is loaded back
# through that same class rather than through SingleTablePreset.
from sdv.single_table import GaussianCopulaSynthesizer

synthesizer.save('my_synthesizer.pkl')

# NOTE: loading unpickles the file, so only load synthesizer files you created or trust.
synthesizer = GaussianCopulaSynthesizer.load('my_synthesizer.pkl')

In [29]:
core_mof = pd.read_csv('To Benedict/CoRE-MOFs data/CoRE_MOFs_pred.csv')