In [1]:
import numpy as np
import pandas as pd

Create data in the same form as in my script. The 'data' array is in the same form as the X data would be after being flattened, and the 'ohe_data' array is the same form as the y data appears from the input data set.

In [2]:
# Toy feature matrix: 6 samples x 3 features, standing in for the flattened X data.
data = np.array([
    [1.3, 6.2, 7.3],
    [4.2, 8.1, 0.3],
    [3.7, 1.1, 2.8],
    [13.2, 8.2, 3.1],
    [1.3, 5.1, 3.9],
    [3.2, 4.1, 12.5],
])

# One-hot encoded labels: one row per sample, exactly one 1 per row.
ohe_data = np.array([
    [0, 0, 0, 1, 0],
    [1, 0, 0, 0, 0],
    [0, 0, 1, 0, 0],
    [0, 0, 1, 0, 0],
    [0, 0, 0, 0, 1],
    [0, 1, 0, 0, 0],
])

# Names for the three feature columns.
heading = list('abc')


Since the y data from the input dataset is one-hot encoded, I use code like the following to convert it into a form that can be used in the 'class' column of a pandas dataframe. 

In [3]:
# Class labels stored as bytes; position i corresponds to one-hot column i.
classes = np.array([b'g', b'q', b'w', b'z', b't'])

# Decode all one-hot rows in one step: the column holding the row maximum is
# the class index. Wrapping in list() keeps y_class a plain Python list whose
# elements are numpy.bytes_ scalars.
label_idx = ohe_data.argmax(axis=1)
y_class = list(classes[label_idx])

print(y_class)
print(type(y_class[0]))

[b'z', b'g', b'w', b'w', b't', b'q']
<class 'numpy.bytes_'>


I build the dataframe from the feature data and then add the 'class' column.

In [4]:
# Build the feature frame; the columns take their names from `heading`.
df = pd.DataFrame(data, columns=heading)
print(df)

      a    b     c
0   1.3  6.2   7.3
1   4.2  8.1   0.3
2   3.7  1.1   2.8
3  13.2  8.2   3.1
4   1.3  5.1   3.9
5   3.2  4.1  12.5


In [5]:
# Attach the decoded labels as a new 'class' column (modifies df in place).
# The values are the numpy.bytes_ objects taken from `classes`, not text strings.
df['class'] = y_class
print(df)

      a    b     c class
0   1.3  6.2   7.3  b'z'
1   4.2  8.1   0.3  b'g'
2   3.7  1.1   2.8  b'w'
3  13.2  8.2   3.1  b'w'
4   1.3  5.1   3.9  b't'
5   3.2  4.1  12.5  b'q'


In [6]:
# Show the per-column dtypes of the in-memory frame.
print(df.dtypes)

a        float64
b        float64
c        float64
class     object
dtype: object


Comparing the 'class' column against a label taken from the original 'classes' array works as expected: the values stored in the dataframe are equal to the labels I used to set them.

In [7]:
# Element-wise comparison of the 'class' column against the first class label.
first_label = classes[0]
df['class'] == first_label

0    False
1     True
2    False
3    False
4    False
5    False
Name: class, dtype: bool

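However, after saving the dataframe as a CSV and reloading it, the same comparison no longer matches any row, even though the printed dataframe looks identical. (A likely cause: the CSV stores each label as its text representation, i.e. the characters `b'g'`, so the reloaded column holds ordinary strings rather than bytes objects, and such a string never compares equal to a bytes label.)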

In [8]:
# Round-trip the frame through a CSV file on disk (the row index is not written).
csv_path = 'b_string_test.csv'
df.to_csv(csv_path, index=False)
df_reloaded = pd.read_csv(csv_path)  # nrows is left at its default (None): read every row
print(df_reloaded)

      a    b     c class
0   1.3  6.2   7.3  b'z'
1   4.2  8.1   0.3  b'g'
2   3.7  1.1   2.8  b'w'
3  13.2  8.2   3.1  b'w'
4   1.3  5.1   3.9  b't'
5   3.2  4.1  12.5  b'q'


In [9]:
# Show the per-column dtypes of the frame read back from the CSV file.
print(df_reloaded.dtypes)

a        float64
b        float64
c        float64
class     object
dtype: object


In [10]:
# Same comparison as on the original frame, now against the reloaded one.
# NOTE(review): presumably the reloaded 'class' values are text such as "b'g'"
# rather than bytes objects, which would explain why nothing matches — confirm
# with type(df_reloaded['class'][0]).
df_reloaded['class']==classes[0]

0    False
1    False
2    False
3    False
4    False
5    False
Name: class, dtype: bool