In [1]:
import numpy as np
import glob
from tqdm import tqdm

First, we define the basic parameters of our simulations:

In [2]:
num_bands = '5'
num_params = 4
num_k_points = 31

The directory below should be changed if ran locally:

In [3]:
import os
directory = "/home/ben/Desktop/Notes/Applied ML/AppliedML2021/ML_2021_Big_Project/raw_data/" #This should be changed accordingly

# We count the number of files finishing with ".txt"

num_files =  len(glob.glob1(directory,"*.txt")) 
print('Number of files in directory: ',num_files)

# We create the arrays to store all parameters and frequencies

params = np.zeros((int(num_files), int(num_params)))  
frequencies = np.zeros((int(num_files),int(int(num_bands)*num_k_points)))

index = 0

# We run through all files in directory
    
for file in tqdm(os.listdir(directory)):
    
    # We pick all ".txt" files
    
    if file.endswith(".txt"):
        
        # We get the value of the parameters by stripping the filename text
        
        x = np.array((file.split(".tx")[0]).split("_"))
        x = np.array([e[1:] for e in x]).astype(float)
        params [index, :] = x
        
        # We load the files to numpy and flatten it, so it can be stored in (num_files, k_points) format
        
        freq = np.loadtxt(directory+file).flatten()
        frequencies [index,:] = freq
        
        index += 1

  5%|▍         | 155/3240 [00:00<00:01, 1549.14it/s]

Number of files in directory:  3239


100%|██████████| 3240/3240 [00:01<00:00, 2116.99it/s]


As we can se we generate an array for the frequencies, whose shape is:

In [4]:
np.shape(frequencies)

(3239, 155)

In [5]:
(num_files, num_k_points*int(num_bands))

(3239, 155)

As we can se we generate an array for the parameters, whose shape is:

In [6]:
np.shape(params)

(3239, 4)

In [7]:
(num_files,num_params)

(3239, 4)

Now we convert the parameters to a pandas dataframe:

In [8]:
import pandas as pd

columns_params = ["n", "e", "r [nm]", "t[º]"]
df_params = pd.DataFrame(params, columns = columns_params)

In [9]:
df_params

Unnamed: 0,n,e,r [nm],t[º]
0,7.4014,0.35338,20.2618,75.2023
1,11.6902,0.99652,77.7695,3.5179
2,6.1927,0.31256,129.5113,82.1149
3,19.7595,0.82878,91.6994,48.9540
4,13.7727,0.32737,80.2894,68.5358
...,...,...,...,...
3234,14.1821,0.58939,100.2772,69.3037
3235,11.2997,0.65983,43.5147,8.9384
3236,4.8226,0.86481,89.6153,37.0728
3237,18.2879,0.98177,34.5011,19.7491


We do the same with the frequencies:

In [10]:
columns_k_points = [] #Here we will store all column labels

for i in range(np.shape(frequencies)[1]):
    
    #We get the band number
    
    band = i//int(num_k_points)
    
    #We obtain the k_number in each of the bands
    
    k_number = i - band * num_k_points
    
    #We append the label of the column
    
    columns_k_points.append('Band_'+str(band)+"_k_"+str(k_number))

In [11]:
df_frequencies = pd.DataFrame(frequencies, columns = columns_k_points)

In [12]:
df_frequencies

Unnamed: 0,Band_0_k_0,Band_0_k_1,Band_0_k_2,Band_0_k_3,Band_0_k_4,Band_0_k_5,Band_0_k_6,Band_0_k_7,Band_0_k_8,Band_0_k_9,...,Band_4_k_21,Band_4_k_22,Band_4_k_23,Band_4_k_24,Band_4_k_25,Band_4_k_26,Band_4_k_27,Band_4_k_28,Band_4_k_29,Band_4_k_30
0,0.0,0.25910,0.51819,0.77729,1.03638,1.29547,1.55455,1.81363,2.07270,2.33174,...,7.84786,7.52634,7.20901,6.89646,6.58935,6.28850,6.02166,5.73665,5.46049,5.21749
1,0.0,0.29122,0.58217,0.87254,1.16198,1.45004,1.73600,2.01853,2.29418,2.54952,...,8.00736,7.73822,7.46038,7.17809,6.89442,6.78260,6.75986,6.52295,6.29366,6.17325
2,0.0,0.28548,0.57076,0.85561,1.13972,1.42261,1.70340,1.98027,2.24829,2.48946,...,7.99410,7.73389,7.46315,7.18395,6.89933,6.61304,6.72267,6.48437,6.26683,6.16810
3,0.0,0.29789,0.59544,0.89228,1.18795,1.48188,1.77313,2.05991,2.33765,2.58941,...,8.08047,7.81203,7.53079,7.24485,6.95887,6.97181,6.93343,6.68770,6.46084,6.34544
4,0.0,0.26866,0.53727,0.80578,1.07412,1.34220,1.60987,1.87683,2.14231,2.40270,...,7.88221,7.56867,7.25482,6.94395,6.63799,6.34148,6.33800,6.05831,5.79489,5.62800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3234,0.0,0.29070,0.58113,0.87101,1.15996,1.44746,1.73262,2.01365,2.28590,2.53216,...,8.00143,7.72381,7.43841,7.15042,6.86885,6.73886,6.82793,6.57006,6.32272,6.17921
3235,0.0,0.26433,0.52863,0.79288,1.05705,1.32110,1.58498,1.84858,2.11167,2.37308,...,7.86401,7.54583,7.22964,6.91740,6.61026,6.30943,6.16410,5.92381,5.66819,5.51063
3236,0.0,0.29535,0.59038,0.88473,1.17800,1.46966,1.75890,2.04415,2.32147,2.57579,...,8.04332,7.77087,7.48963,7.20482,6.92180,6.94235,6.89541,6.64275,6.41654,6.30327
3237,0.0,0.26397,0.52792,0.79182,1.05565,1.31937,1.58292,1.84621,2.10899,2.37014,...,7.86309,7.54459,7.22820,6.91583,6.60857,6.30736,6.14820,5.91066,5.65532,5.49714


Now we save the bands and the parameters in .txt files:

In [13]:
df_params.to_csv("params_data.csv", index=False)

In [14]:
df_frequencies.to_csv("frequencies_data.csv", index=False)