In [4]:
import ast
import pprint

import numpy as np
import pandas as pd
import pyreadr

In [20]:
# Keep printed frames compact: 53-character width, at most 5 columns and
# 25 rows shown before pandas truncates the display.
display_options = {
    'display.width': 53,
    'display.max_columns': 5,
    'display.max_rows': 25,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [8]:
# read_r returns a dict-like of the objects stored in the file; an .rds file
# holds a single unnamed object, which is stored under the key None.
rds_objects = pyreadr.read_r('data/nls97.rds')
nls97r = rds_objects[None]

In [10]:
# Inspect the dtype pandas assigned to each column read from the .rds file.
nls97r.dtypes

R0000100    int32
R0536300    int32
R0536401    int32
R0536402    int32
R1235800    int32
R1482600    int32
R9793800    int32
R9793900    int32
R9871900    int32
R9872000    int32
R9872200    int32
R9872400    int32
S8646900    int32
S8647000    int32
S8647100    int32
S8647200    int32
S8647300    int32
S8647400    int32
S8647500    int32
S8647600    int32
S8647700    int32
S8647800    int32
T6651700    int32
U1836800    int32
U1836900    int32
U1837000    int32
U1837100    int32
U1837200    int32
U1837300    int32
U1845400    int32
U1852400    int32
U1852600    int32
U1852700    int32
U2166200    int32
U2166300    int32
U2166400    int32
U2166500    int32
U2857300    int32
U2962800    int32
U2962900    int32
U2963000    int32
Z9063900    int32
dtype: object

In [12]:
# Preview the first 10 rows of the loaded data (values are still numeric codes).
nls97r.head(10)

Unnamed: 0,R0000100,R0536300,R0536401,R0536402,R1235800,R1482600,R9793800,R9793900,R9871900,R9872000,...,U1852700,U2166200,U2166300,U2166400,U2166500,U2857300,U2962800,U2962900,U2963000,Z9063900
0,1,2,9,1981,1,4,350,470,309,310,...,-5,-5,-5,-5,-5,-5,-5,-5,-5,52
1,2,1,7,1982,1,2,460,440,217,280,...,-4,-4,-4,-4,-4,-4,4,2,6,0
2,3,2,9,1983,1,2,-4,-4,-4,-4,...,0,-4,-4,-4,-4,-1,6,2,6,0
3,4,2,2,1981,1,2,-4,-4,253,216,...,1,-4,-4,-4,-4,-4,3,2,6,4
4,5,1,10,1982,1,2,-4,-4,243,235,...,0,-4,-4,-4,-4,-4,2,2,5,12
5,6,2,1,1982,1,2,-4,-4,162,100,...,0,-4,-4,-4,-4,3,4,2,6,6
6,7,1,4,1983,1,2,-4,-4,-4,-4,...,-5,-5,-5,-5,-5,-5,-5,-5,-5,0
7,8,2,6,1981,1,4,620,620,281,304,...,-5,-5,-5,-5,-5,-5,-5,-5,-5,39
8,9,1,10,1982,1,4,380,460,286,290,...,-4,-4,-4,-4,-4,-4,6,2,4,0
9,10,1,3,1984,1,4,600,560,362,375,...,0,-4,-4,-4,-4,-4,6,2,6,0


In [16]:
# load the value labels
# The file holds a Python dict literal of the form {column: {code: label}}.
# Parse it with ast.literal_eval, which accepts only literals, instead of
# eval, which would execute any code that ends up in the file.
with open('data/nlscodes.txt', 'r') as reader:
    setvalues = ast.literal_eval(reader.read())

In [18]:
# Pretty-print the nested {column name: {code: label}} mapping read from
# data/nlscodes.txt.
pprint.pprint(setvalues)

{'R0536300': {0.0: 'No Information', 1.0: 'Male', 2.0: 'Female'},
 'R1235800': {0.0: 'Oversample', 1.0: 'Cross-sectional'},
 'S8646900': {1.0: '1. Definitely',
              2.0: '2. Probably ',
              3.0: '3. Probably not',
              4.0: '4. Definitely not'},
 'S8647000': {1.0: '1. Definitely',
              2.0: '2. Probably ',
              3.0: '3. Probably not',
              4.0: '4. Definitely not'},
 'S8647100': {1.0: '1. Definitely',
              2.0: '2. Probably ',
              3.0: '3. Probably not',
              4.0: '4. Definitely not'},
 'S8647200': {1.0: '1. Definitely',
              2.0: '2. Probably ',
              3.0: '3. Probably not',
              4.0: '4. Definitely not'},
 'S8647300': {1.0: '1. Definitely',
              2.0: '2. Probably ',
              3.0: '3. Probably not',
              4.0: '4. Definitely not'},
 'S8647400': {1.0: '1. Definitely',
              2.0: '2. Probably ',
              3.0: '3. Probably not',
              4.0

In [22]:
# Read the R data again so the cleaning steps below start from the raw codes.
# The single object in an .rds file is returned under the key None.
rds_objects = pyreadr.read_r('data/nls97.rds')
nls97r = rds_objects[None]

In [26]:
# Descriptive names for the 42 coded columns, one row per family of names.
# NOTE(review): presumably listed in the same order as the columns of nls97r
# so they can be assigned positionally — confirm before using.
newcols = [
    'personid', 'gender', 'birthmonth', 'birthyear',
    'sampletype', 'category',
    'satverbal', 'satmath',
    'gpaoverall', 'gpaeng', 'gpamath', 'gpascience',
    'govjobs', 'govprices', 'govhealth', 'goveld', 'govind',
    'govunemp', 'govinc', 'govcollege', 'govhousing', 'govenvironment',
    'bacredits',
    'coltype1', 'coltype2', 'coltype3',
    'coltype4', 'coltype5', 'coltype6',
    'highestgrade',
    'maritalstatus', 'childnumhome', 'childnumaway',
    'degreecol1', 'degreecol2', 'degreecol3', 'degreecol4',
    'wageincome',
    'weeklyhrscomputer', 'weeklyhrstv', 'nightlyhrssleep',
    'weeksworkedlastyear',
]

In [28]:
# Set value labels: setvalues maps each column name to a {code: label} dict,
# so every listed code in those columns is swapped for its label in place.
nls97r.replace(to_replace=setvalues, inplace=True)

In [30]:
# Preview the first rows to confirm that coded values now show their labels.
nls97r.head()

Unnamed: 0,R0000100,R0536300,...,U2963000,Z9063900
0,1,Female,...,-5,52
1,2,Male,...,6,0
2,3,Female,...,6,0
3,4,Female,...,6,4
4,5,Male,...,5,12


In [32]:
# Every integer code from -9 through -1 is replaced with NaN, in all columns
# and in place.
negative_codes = list(range(-9, 0))
nls97r.replace(to_replace=negative_codes, value=np.nan, inplace=True)

In [34]:
# Convert every column that has value labels (the keys of setvalues) to the
# category dtype. A single astype call with a {column: dtype} mapping avoids
# building a throwaway sub-frame just to iterate its column names, and raises
# KeyError up front if a key of setvalues is not a column of nls97r.
nls97r = nls97r.astype({col: 'category' for col in setvalues})

In [36]:
# Re-check dtypes after the conversion: the columns named in setvalues are
# expected to be category now.
nls97r.dtypes

R0000100       int32
R0536300    category
R0536401       int32
R0536402       int32
R1235800    category
              ...   
U2857300    category
U2962800    category
U2962900    category
U2963000     float64
Z9063900     float64
Length: 42, dtype: object