In [1]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Statistical analysis
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Memory optimization
import gc
import dask.dataframe as dd  # For large datasets

In [15]:
df = dd.read_csv(r"C:\Users\zanny\Desktop\School\NCAIR Cohort\Data Science Beginners\Project\National Survey of Drug Use and Health\NSDUH_2015-2019.csv", assume_missing=True, low_memory=False)

In [5]:
df.columns

Index(['cigever', 'cigofrsm', 'cigwilyr', 'cigtry', 'cigyfu', 'cigmfu',
       'cigrec', 'cig30use', 'cg30est', 'cig30av',
       ...
       'casuprob2', 'rcvysubprb', 'camhprob2', 'rcvymhprb', 'almedyr2',
       'opmedyr2', 'alopmedyr', 'kratflg', 'kratyr', 'kratmon'],
      dtype='object', length=2812)

In [17]:
df.head()

Unnamed: 0,cigever,cigofrsm,cigwilyr,cigtry,cigyfu,cigmfu,cigrec,cig30use,cg30est,cig30av,...,casuprob2,rcvysubprb,camhprob2,rcvymhprb,almedyr2,opmedyr2,alopmedyr,kratflg,kratyr,kratmon
0,1.0,99.0,99.0,16.0,2014.0,1.0,2.0,93.0,93.0,93.0,...,,,,,,,,,,
1,1.0,99.0,99.0,15.0,9999.0,99.0,3.0,93.0,93.0,93.0,...,,,,,,,,,,
2,2.0,99.0,99.0,991.0,9991.0,91.0,91.0,91.0,91.0,91.0,...,,,,,,,,,,
3,2.0,3.0,4.0,991.0,9991.0,91.0,91.0,91.0,91.0,91.0,...,,,,,,,,,,
4,1.0,99.0,99.0,17.0,9999.0,99.0,1.0,22.0,99.0,3.0,...,,,,,,,,,,


In [21]:
list(df.columns)

['cigever',
 'cigofrsm',
 'cigwilyr',
 'cigtry',
 'cigyfu',
 'cigmfu',
 'cigrec',
 'cig30use',
 'cg30est',
 'cig30av',
 'cig30br2',
 'cig30tpe',
 'cig30men',
 'cig30mln',
 'cig30ro2',
 'cigdlymo',
 'cigage',
 'cigdlyfu',
 'cigdlmfu',
 'cig100lf',
 'smklssevr',
 'smklsstry',
 'smklssyfu',
 'smklssmfu',
 'smklssrec',
 'smklss30n',
 'smklss30e',
 'cigarevr',
 'cigartry',
 'cigaryfu',
 'cigarmfu',
 'cigarrec',
 'cgr30use',
 'ci30est',
 'cgr30br2',
 'pipever',
 'pipe30dy',
 'alcever',
 'alctry',
 'alcyfu',
 'alcmfu',
 'alcrec',
 'alcyrtot',
 'altotfg',
 'alfqflg',
 'albstway',
 'aldaypyr',
 'aldaypmo',
 'aldaypwk',
 'alcdays',
 'al30est',
 'aldysfg',
 'alcus30d',
 'alcbng30d',
 'mjever',
 'mjage',
 'mjyfu',
 'mjmfu',
 'mjrec',
 'mjyrtot',
 'mrtotfg',
 'mjfqflg',
 'mrbstway',
 'mrdaypyr',
 'mrdaypmo',
 'mrdaypwk',
 'mjday30a',
 'mr30est',
 'cocever',
 'cocage',
 'cocyfu',
 'cocmfu',
 'cocrec',
 'cocyrtot',
 'cctotfg',
 'ccfqflg',
 'ccbstway',
 'ccdaypyr',
 'ccdaypmo',
 'ccdaypwk',
 'cocus30a

In [25]:
for i, col in enumerate(df.columns):
    print(f"{i}: {col}")

0: cigever
1: cigofrsm
2: cigwilyr
3: cigtry
4: cigyfu
5: cigmfu
6: cigrec
7: cig30use
8: cg30est
9: cig30av
10: cig30br2
11: cig30tpe
12: cig30men
13: cig30mln
14: cig30ro2
15: cigdlymo
16: cigage
17: cigdlyfu
18: cigdlmfu
19: cig100lf
20: smklssevr
21: smklsstry
22: smklssyfu
23: smklssmfu
24: smklssrec
25: smklss30n
26: smklss30e
27: cigarevr
28: cigartry
29: cigaryfu
30: cigarmfu
31: cigarrec
32: cgr30use
33: ci30est
34: cgr30br2
35: pipever
36: pipe30dy
37: alcever
38: alctry
39: alcyfu
40: alcmfu
41: alcrec
42: alcyrtot
43: altotfg
44: alfqflg
45: albstway
46: aldaypyr
47: aldaypmo
48: aldaypwk
49: alcdays
50: al30est
51: aldysfg
52: alcus30d
53: alcbng30d
54: mjever
55: mjage
56: mjyfu
57: mjmfu
58: mjrec
59: mjyrtot
60: mrtotfg
61: mjfqflg
62: mrbstway
63: mrdaypyr
64: mrdaypmo
65: mrdaypwk
66: mjday30a
67: mr30est
68: cocever
69: cocage
70: cocyfu
71: cocmfu
72: cocrec
73: cocyrtot
74: cctotfg
75: ccfqflg
76: ccbstway
77: ccdaypyr
78: ccdaypmo
79: ccdaypwk
80: cocus30a
81: cc3

In [27]:
df["irsex"].isnull().sum()

<dask_expr.expr.Scalar: expr=(~ NotNull(frame=ReadCSV(f4e0842)['irsex'])).sum(), dtype=int64>

In [29]:
df["irsex"].value_counts().compute()

irsex
1.0    134657
2.0    148111
Name: count, dtype: int64

In [None]:
women_df = df[df["irsex"] == 2]
men_df = df[df["irsex"] == 1]

In [None]:
women_df.to_csv("women_data.csv", single_file=True)
men_df.to_csv("men_data.csv", single_file=True)

['C:\\Users\\zanny\\Desktop\\School\\NCAIR Cohort\\Data Science Beginners\\Project\\National Survey of Drug Use and Health\\men_data.csv']