## Working On Cody's Code

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the complete tab-separated survey file; the file's first column becomes the row index.
NS14 = pd.read_csv(
    'data/NSDUH-2014-DS0001-data-excel.tsv',
    sep='\t',
    index_col=0,
)

In [3]:
# (number of rows, number of columns) of the full table
NS14.shape

(55271, 3147)

In [4]:
# Peek at the first five column labels
NS14.columns[:5]

Index([u'QUESTID2', u'CIGEVER', u'CIGOFRSM', u'CIGWILYR', u'CIGTRY'], dtype='object')

In [5]:
# Peek at the first five row labels
NS14.index[:5]

Int64Index([1, 2, 3, 4, 5], dtype='int64', name=u'CASEID')

In [6]:
# Recency-of-use variables to keep, one per substance.
# Value codes for these variables:
#   1 = used within the past 30 days
#   2 = used more than 30 days ago but within the past 12 months
#   3 = used 12 or more months ago
#   4 = used more than 3 years ago
#   9 = never used the substance (a category code, not a position on the 1-4 scale)
selected_columns = [
    'IRCIGRC',
    'IRCGRRC',
    'IRALCRC',
    'IRMJRC',
    'IRCOCRC',
    'IRCRKRC',
    'IRHERRC',
    'IRHALRC',
    'IRINHRC',
    'IRANLRC',
    'IROXYRC',
    'IRTRNRC',
    'IRSTMRC',
    'IRSEDRC',
]

In [None]:
# Keep every row, but only the recency columns listed in selected_columns
DrugRecency = NS14.loc[:, selected_columns]
DrugRecency.shape

(55271, 14)

There is another way to do this: pass `usecols` to `read_csv` so that only the columns we need are loaded.  It is faster and avoids bringing the entire dataset into memory...

In [None]:
# Baseline timing: read every column of the file.
# NOTE: %timeit only measures the statement; the NS14 assigned inside it is not kept in the notebook namespace.
%timeit NS14 = pd.read_csv('data/NSDUH-2014-DS0001-data-excel.tsv', sep='\t', index_col=0)

In [None]:
# Comparison timing: read only the columns named in selected_columns.
# NOTE: as with any %timeit statement, the DrugRecency assigned here is not kept in the notebook namespace.
%timeit DrugRecency = pd.read_csv('data/NSDUH-2014-DS0001-data-excel.tsv', usecols=selected_columns, sep='\t', index_col=0)

In [None]:
# Load only the selected columns straight from disk (index_col is still positional here).
DrugRecency = pd.read_csv(
    'data/NSDUH-2014-DS0001-data-excel.tsv',
    sep='\t',
    usecols=selected_columns,
    index_col=0,
)

In [None]:
# selected_columns has 14 entries, so 14 columns are expected if none were lost
DrugRecency.shape

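Looks like we lost one of the columns.  Probably the index column: `CASEID` was not in `usecols`, so `index_col=0` appears to have turned the first of the columns we did load into the index...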

In [None]:
# Inspect the index (including its name) to see which column ended up there
DrugRecency.index

Try it again, including the actual index column this time...

In [None]:
# CASEID must be requested explicitly through usecols; otherwise it is not
# loaded and cannot serve as the index.
selected_columns = [
    'CASEID',
    'IRCIGRC', 'IRCGRRC', 'IRALCRC', 'IRMJRC', 'IRCOCRC', 'IRCRKRC',
    'IRHERRC', 'IRHALRC', 'IRINHRC', 'IRANLRC', 'IROXYRC', 'IRTRNRC',
    'IRSTMRC', 'IRSEDRC',
]

# Name the index column rather than passing position 0: a positional index_col
# silently promotes whichever loaded column happens to come first, which is how
# a data column was lost in the previous attempt.
DrugRecency = pd.read_csv(
    'data/NSDUH-2014-DS0001-data-excel.tsv',
    sep='\t',
    usecols=selected_columns,
    index_col='CASEID',
)

# Expect 14 data columns again (15 requested, minus CASEID which is now the index).
DrugRecency.shape

In [None]:
# First five index labels -- check whether CASEID is the index again
DrugRecency.index[:5]

In [None]:
# First five column labels -- check that the data columns are all still columns
DrugRecency.columns[:5]

In [None]:
# Work on an independent copy of the first five rows, because some of the
# values are about to be changed and DrugRecency itself must stay untouched.
sample = DrugRecency.iloc[:5].copy()

In [None]:
# Look up the row whose index label is 1 (label-based, not positional)
sample.loc[1]

In [None]:
# Here's a better way to iterate over a DataFrame.  iterrows is a Python generator
# that yields (index label, row as a Series) pairs.
for index, row in sample.iterrows():
    # Build the output line explicitly so it prints the same under Python 2 and
    # Python 3; the bare `print a, b` statement is a SyntaxError on Python 3.
    print('{0} {1}'.format(index, row['IRCIGRC']))

In [None]:
# Replace the 9's with NaN
sample.replace(9, np.nan, inplace=True)

In [None]:
# Display the five-row sample to check that the 9's now show up as NaN
sample

In [None]:
# Since it works, we can now apply it to the full data set
DrugRecency.replace(9, np.nan, inplace=True)