# Analysing the crystal properties

In [1]:
## Basic imports and function definitions

import pandas as pd
import pylab as plt
from pathlib import Path
from config import INPUT_DIR, OUTPUT_DIR, INTERESTING_COLUMNS

# Number of most-frequent `sg` values (presumably space groups) that modify_df
# keeps. The top-N filter only runs when this is > 0, so the negative value
# here leaves all groups in place.
N_SG = -5
# Default for modify_df's `cubic` flag: when true, only rows whose alpha, beta
# and gamma all equal 90 are kept. The default is bound when modify_df is
# defined, so changing CUBIC afterwards requires re-running that definition.
CUBIC = True

def read_file(fpath):
    """Load a tabular data file into a DataFrame, chosen by file extension.

    Parameters
    ----------
    fpath : str or pathlib.Path
        Path to a ``.csv`` or ``.xlsx`` file. The extension is matched
        case-insensitively, so ``Data.CSV`` is read as well.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or ``None`` when the extension is not supported.
        Callers must check for ``None`` before using the result.
    """
    # Normalise case once so '.CSV' / '.Xlsx' are not silently rejected.
    lowered = str(fpath).lower()
    if lowered.endswith('.csv'):
        return pd.read_csv(fpath)
    if lowered.endswith('.xlsx'):
        return pd.read_excel(fpath)
    # Unsupported extension: signal with None rather than raising.
    return None

def modify_df(df, cubic = CUBIC, dump_to_file = False):
    """Reduce the raw table to the columns and rows used for modelling.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data; must contain every column listed in INTERESTING_COLUMNS,
        including ``alpha``, ``beta``, ``gamma`` and ``sg`` when they are used.
    cubic : bool
        When true, keep only rows where alpha, beta and gamma all equal 90.
        NOTE(review): this checks the angles only, not the cell lengths, so
        it selects every right-angled cell rather than strictly cubic ones.
    dump_to_file : bool
        When true, write the result to an Excel file under OUTPUT_DIR.

    Returns
    -------
    pandas.DataFrame
        The filtered frame. The index is reset whenever a row filter ran.
    """
    subset = df[INTERESTING_COLUMNS]

    if cubic:
        # One combined mask selects the same rows as three successive filters.
        right_angles = subset.alpha.eq(90) & subset.beta.eq(90) & subset.gamma.eq(90)
        subset = subset[right_angles].reset_index(drop=True)

    ## keep only the N_SG most frequent groups (value_counts sorts descending)
    sg_frequencies = subset.sg.value_counts()
    if N_SG > 0:
        print(f"Extracting top {N_SG} groups")
        top_groups = sg_frequencies.index[:N_SG].tolist()
        subset = subset[subset.sg.isin(top_groups)].reset_index(drop=True)

    if dump_to_file:
        # File name records the settings that produced the dump.
        subset.to_excel(Path(OUTPUT_DIR,f'data_to_model_{cubic}_{N_SG}.xlsx'))

    return subset

## Extract only the relevant data and print a sample for inspection

In [2]:
# Locate and load the raw table; read_file gives None for unsupported extensions.
fpath = Path(INPUT_DIR) / 'Data.csv'
raw_df = read_file(fpath)
assert raw_df is not None, "Raw data not read properly"

# Reduce to the columns/rows of interest using modify_df's defaults.
modified_df = modify_df(raw_df)

# Preview the first few rows.
modified_df.head()

  if (await self.run_code(code, result,  async_=asy)):


Unnamed: 0,a,b,c,alpha,beta,gamma,sg
0,5.596,10.29,3.806,90.0,90.0,90.0,P b a m
1,5.334,6.308,7.196,90.0,90.0,90.0,P b n m
2,7.84,14.431,6.237,90.0,90.0,90.0,C m c a
3,7.922,7.239,6.243,90.0,90.0,90.0,P m n a
4,10.033,10.033,10.033,90.0,90.0,90.0,P m -3 m


## Numerical investigations

In [11]:
## Count and frequencies of the SG available in the data

from collections import Counter

# Number of rows for each `sg` value in the filtered data.
sg_count = Counter(modified_df['sg'])
# Count of counts: maps "rows per sg value" -> "how many sg values have exactly
# that many rows". Built from sg_count; the earlier `dd` is not defined anywhere
# in the visible notebook, so the cell raised NameError on a fresh kernel.
count_count = Counter(sg_count.values())
# Sort by the row count (the dict key) so the printout reads in ascending order.
count__count_sorted = dict(sorted(count_count.items(), key=lambda item: item[0]))

print(count__count_sorted)

{1: 55, 2: 23, 3: 15, 4: 12, 5: 14, 6: 12, 7: 8, 8: 8, 9: 4, 10: 8, 11: 2, 12: 5, 13: 4, 14: 9, 15: 4, 16: 1, 17: 1, 18: 5, 21: 1, 22: 3, 23: 1, 24: 2, 25: 2, 26: 2, 27: 4, 30: 3, 32: 1, 34: 2, 36: 1, 37: 1, 38: 1, 39: 1, 40: 1, 42: 2, 43: 2, 46: 1, 47: 1, 51: 1, 52: 1, 53: 1, 57: 2, 61: 1, 62: 1, 66: 1, 70: 2, 72: 1, 75: 1, 76: 2, 77: 1, 82: 1, 87: 1, 89: 1, 91: 1, 92: 1, 95: 1, 96: 1, 97: 1, 102: 1, 104: 1, 105: 1, 107: 1, 109: 1, 136: 1, 159: 1, 170: 1, 179: 1, 231: 1, 280: 1, 289: 1, 461: 1, 556: 1, 823: 1, 1233: 1, 1506: 1}


## Basic investigative plots for the data

In [4]:
# plt.hist(modified_df['sg'])
# plt.show()