In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the bond-length / bond-angle table from disk into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='Bond-length-and-angles.csv')

# Preview the first five rows to check how the columns were parsed.
dataset.head(n=5)

Unnamed: 0,Molecular components,Length 1 (Å),Unnamed: 2,Length 2 (Å),Unnamed: 4,Angle (°)
0,H – N(1) – C(2),H – N(1),1.082,N(1) – C(2),1.249,129.1
1,H – N(1) – C(13),H – N(1),1.082,N(1) – C(13),1.275,118.3
2,C(13) – N(1) – C(2),C(13) – N(1),1.275,N(1) – C(2),1.249,112.6
3,N(1) – C(2) = O,N(1) – C(2),1.249,C(2) = O,1.23,120.5
4,C(7) – C(2) = O,C(7) – C(2),1.52,C(2) = O,1.23,127.5


We need an X and a Y component: the "Length 1 (Å)" column will become our X component, and the "Length 2 (Å)" column will become our Y component.

In [3]:
# .iloc[:, 1] = every row of the column at zero-based position 1, i.e. the
# second column ('Length 1' bond labels in the preview above).
# .iloc is the purely positional indexer; it replaces the deprecated .ix,
# which newer pandas releases no longer provide.
X = dataset.iloc[:, 1].values

# [:5] = the first five entries (positions 0 through 4)
X[:5]

array(['H \xe2\x80\x93 N(1)', 'H \xe2\x80\x93 N(1)',
       'C(13) \xe2\x80\x93 N(1)', 'N(1) \xe2\x80\x93 C(2)',
       'C(7) \xe2\x80\x93 C(2)'], dtype=object)

In [4]:
# .iloc[:, 3] = every row of the column at zero-based position 3, i.e. the
# fourth column ('Length 2' bond labels in the preview above).
# .iloc is the purely positional indexer; it replaces the deprecated .ix,
# which newer pandas releases no longer provide.
Y = dataset.iloc[:, 3].values

# [:5] = the first five entries (positions 0 through 4)
Y[:5]

array(['N(1) \xe2\x80\x93 C(2)', 'N(1) \xe2\x80\x93 C(13)',
       'N(1) \xe2\x80\x93 C(2)', 'C(2) = O', 'C(2) = O'], dtype=object)

Next, we remove the duplicate entries from both components so that each bond label appears only once.

In [5]:
# Report how many X labels there are before de-duplication.
rows_before = X.shape[0]
print('before unique got: ' + str(rows_before) + ' rows')

# np.unique returns the sorted distinct values, dropping repeated labels.
X = np.unique(X)
Y = np.unique(Y)

# Report how many distinct X labels remain.
rows_after = X.shape[0]
print('after unique got: ' + str(rows_after) + ' rows')

before unique got: 41 rows
after unique got: 34 rows


We have now seen 5 samples from both components, but the labels contain a non-ASCII character, '\xe2\x80\x93' (the UTF-8 byte sequence of an en dash). Let's replace it with a plain '-'.

In [6]:
# UTF-8 byte sequence of the en dash (U+2013) found in the raw bond labels.
EN_DASH = '\xe2\x80\x93'

# Clean each array over its OWN length. X and Y were de-duplicated
# independently, so they are not guaranteed to hold the same number of
# labels; driving both from X.shape[0] would either leave trailing Y labels
# uncleaned or index past the end of Y.
for labels in (X, Y):
    for i in range(labels.shape[0]):
        # swap the non-ASCII dash for a plain ASCII hyphen, in place
        labels[i] = labels[i].replace(EN_DASH, '-')

print(Y[:5])
print(X[:5])

['C(10) - C(11)' 'C(11) - C(12)' 'C(12) - C(13)' 'C(13) - C(12)'
 'C(13) - N(1)']
['C(10) - C(11)' 'C(11) - C(12)' 'C(12) - C(13)' 'C(13) - N(1)'
 'C(14) - C(15)']


In [7]:
# Tokenise every bond label once, e.g. 'C(10) - C(11)' -> ['C(10)', '-', 'C(11)'].
first_bonds = [label.split(' ') for label in Y]
second_bonds = [label.split(' ') for label in X]

# whole_molecules[r][c] holds the chain made from Y[r] followed by X[c],
# or the placeholder 'X' when the two bonds do not share the joining atom.
whole_molecules = []
X_count = 0  # number of placeholder cells written
for head_tokens in first_bonds:
    row = []
    for tail_tokens in second_bonds:
        # The bonds chain only if the last atom of the first bond is the
        # first atom of the second bond.
        if head_tokens[-1] != tail_tokens[0]:
            row.append('X')
            X_count += 1
            continue
        # Drop the shared atom from the first bond so it is not repeated,
        # then glue the tokens together without spaces.
        row.append(''.join(head_tokens[:-1] + tail_tokens))
    whole_molecules.append(row)

In [8]:
# Allow as many columns on screen as there are X labels, so the grid is not elided.
column_count = X.shape[0]
pd.set_option('display.max_columns', column_count)

# Rows are labelled by the first bond (Y), columns by the second bond (X).
molecules = pd.DataFrame(data=whole_molecules, index=Y, columns=X)
molecules

Unnamed: 0,C(10) - C(11),C(11) - C(12),C(12) - C(13),C(13) - N(1),C(14) - C(15),C(15) - C(20),C(16) - C(17),C(17) - C(20),C(19) - C(20),C(2) - C(7),C(20) - C(19),C(20) - C(21),C(21) - C(20),C(21) - N(4),C(21) - O,C(3) - C(14),C(5) - C(6),C(5) - N(4),C(7) - C(2),C(7) - C(8),C(8) - (C9),C(8) - C(13),C(8) - C(7),C(9) - C(10),C(9) - C(8),H - C(21),H - N(1),N - C(21),N(1) - C(2),N(4) - C(3),N(4) - C(5),O - C(16),O - C(21),OH - C(7)
C(10) - C(11),X,C(10)-C(11)-C(12),X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X
C(11) - C(12),X,X,C(11)-C(12)-C(13),X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X
C(12) - C(13),X,X,X,C(12)-C(13)-N(1),X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X
C(13) - C(12),X,X,C(13)-C(12)-C(13),X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X
C(13) - N(1),X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,C(13)-N(1)-C(2),X,X,X,X,X
C(14) - C(15),X,X,X,X,X,C(14)-C(15)-C(20),X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X
C(15) - C(20),X,X,X,X,X,X,X,X,X,X,C(15)-C(20)-C(19),C(15)-C(20)-C(21),X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X
C(16) - C(17),X,X,X,X,X,X,X,C(16)-C(17)-C(20),X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X
C(17) - C(20),X,X,X,X,X,X,X,X,X,X,C(17)-C(20)-C(19),C(17)-C(20)-C(21),X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X
C(19) - C(18),X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X


In [9]:
# The grid has one cell per (Y label, X label) pair, so its size is
# rows * columns. Squaring X.shape[0] is only right when Y happens to have
# exactly as many distinct labels as X.
total_cells = Y.shape[0] * X.shape[0]

print('total X count: ' + str(X_count))
# every cell that is not an 'X' placeholder holds a chained molecule
print('total legit molecules: ' + str(total_cells - X_count))

total X count: 1112
total legit molecules: 44


In [10]:
# Persist the molecule grid (including its row and column labels) as CSV.
molecules.to_csv(path_or_buf='molecules.csv')