In [1]:
import pandas as pd

# The U.S. National Epidemiological Survey on Alcohol and Related Conditions (NESARC) is a survey
# designed to determine the magnitude of alcohol use and psychiatric disorders in the U.S. population. 
# It is a representative sample of the non-institutionalized population 18 years and older.
#
# Selected variables 
# MAJORDEP12 - MAJOR DEPRESSION IN THE LAST 12 MONTHS (NON-HIERARCHICAL) (
#     0 - No,
#     1 - Yes)
# ALCABDEP12DX - ALCOHOL ABUSE/DEPENDENCE IN THE LAST 12 MONTHS (
#     0 - No alcohol diagnosis,
#     1 - Alcohol abuse only,
#     2 - Alcohol dependence only,
#     3 - Alcohol abuse and dependence)
# TAB12MDX - NICOTINE DEPENDENCE IN THE LAST 12 MONTHS (
#     0 - No nicotine dependence,
#     1 - Nicotine dependence)

# Only the three diagnosis variables described above are needed for this analysis
cols = [
    'MAJORDEP12',
    'ALCABDEP12DX',
    'TAB12MDX',
]

# Read just those columns from the NESARC extract
df = pd.read_csv(
    'nesarc.csv',
    low_memory=False,
    usecols=cols,
)

In [2]:
# Look at the data: print the frame summary (index range, per-column non-null
# counts, dtypes and memory usage) to confirm the three columns loaded as expected
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43093 entries, 0 to 43092
Data columns (total 3 columns):
MAJORDEP12      43093 non-null int64
ALCABDEP12DX    43093 non-null int64
TAB12MDX        43093 non-null int64
dtypes: int64(3)
memory usage: 1.3 MB


In [3]:
# Look for missing values: count the null (NaN) entries in each column.
# A count of 0 means the column has no null entries.
df.isnull().sum()

MAJORDEP12      0
ALCABDEP12DX    0
TAB12MDX        0
dtype: int64

In [4]:
# Look at the MAJORDEP12 frequency distributions (0 - No, 1 - Yes).
# With sort=False, value_counts() does not order rows by category code (the order
# depends on the data and the pandas version), so sort by the index explicitly to
# keep the table in 0, 1 order.
majordep_counts = df.MAJORDEP12.value_counts(sort=False).sort_index()
# normalize=True returns fractions of the total (0..1), not values scaled to 100
majordep_fractions = df.MAJORDEP12.value_counts(sort=False, normalize=True).sort_index()

print('Counts for major depression in the last 12 month:\n{0}'.format(
        majordep_counts))
print('Percentages for major depression prior in the last 12 month:\n{0}'.format(
        majordep_fractions))

Counts for major depression in the last 12 month:
0    39608
1     3485
Name: MAJORDEP12, dtype: int64
Percentages for major depression prior in the last 12 month:
0    0.919128
1    0.080872
Name: MAJORDEP12, dtype: float64


In [5]:
# Look at the ALCABDEP12DX frequency distributions
# (0 - No alcohol diagnosis, 1 - Abuse only, 2 - Dependence only, 3 - Abuse and dependence).
# With sort=False, value_counts() does not order rows by category code (the order
# depends on the data and the pandas version), so sort by the index explicitly to
# keep the table in 0, 1, 2, 3 order.
alcabdep_counts = df.ALCABDEP12DX.value_counts(sort=False).sort_index()
# normalize=True returns fractions of the total (0..1), not values scaled to 100
alcabdep_fractions = df.ALCABDEP12DX.value_counts(sort=False, normalize=True).sort_index()

print('Counts for alcohol abuse/dependence in the last 12 month:\n{0}'.format(
        alcabdep_counts))
print('Percentages for alcohol abuse/dependence in the last 12 month:\n{0}'.format(
        alcabdep_fractions))

Counts for alcohol abuse/dependence in the last 12 month:
0    39766
1     1843
2      553
3      931
Name: ALCABDEP12DX, dtype: int64
Percentages for alcohol abuse/dependence in the last 12 month:
0    0.922795
1    0.042768
2    0.012833
3    0.021604
Name: ALCABDEP12DX, dtype: float64


In [6]:
# Look at the TAB12MDX frequency distributions
# (0 - No nicotine dependence, 1 - Nicotine dependence).
# With sort=False, value_counts() does not order rows by category code (the order
# depends on the data and the pandas version), so sort by the index explicitly to
# keep the table in 0, 1 order.
tab12m_counts = df.TAB12MDX.value_counts(sort=False).sort_index()
# normalize=True returns fractions of the total (0..1), not values scaled to 100
tab12m_fractions = df.TAB12MDX.value_counts(sort=False, normalize=True).sort_index()

print('Counts for nicotine dependence in the last 12 month:\n{0}'.format(
        tab12m_counts))
print('Percentages for nicotine dependence in the last 12 month:\n{0}'.format(
        tab12m_fractions))

Counts for nicotine dependence in the last 12 month:
0    38131
1     4962
Name: TAB12MDX, dtype: int64
Percentages for nicotine dependence in the last 12 month:
0    0.884854
1    0.115146
Name: TAB12MDX, dtype: float64


In [7]:
# Model interpretation for the frequency distributions
#
# A representative sample of 43093 adults (18 years and older) was asked a variety of questions
# on medical conditions to diagnose the presence of major depression in the last 12 months.
# Of the total number, 39608 or 92% fell into category 0 (No) and 3485 or 8%
# fell into category 1 (Yes).
# 
# For the next question the same group of respondents was asked about alcohol abuse/dependence
# conditions in the last 12 months. 39766 or 92% fell into category 0 (No alcohol diagnosis),
# 1843 or 4% fell into category 1 (Alcohol abuse only), 553 or about 1% fell into category 2
# (Alcohol dependence only), and 931 or about 2% fell into category 3 (Alcohol abuse and dependence).
#
# The next question asked the group about nicotine dependence conditions in the last 12 months.
# 38131 or 88% fell into category 0 (No nicotine dependence) and 4962 or about 12% fell
# into category 1 (Nicotine dependence).