# Smoking Status Dataset

In [131]:
# import packages
import numpy as np
import pandas as pd
import plotly.express as px
import researchpy as rp
import scipy
from scipy import stats
from scipy.stats import ttest_ind
import sklearn
from sklearn import compose
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn import ensemble
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

## Data Exploration

Data for this analysis were sourced from Kaggle (https://www.kaggle.com/datasets/kukuroo3/body-signal-of-smoking). The objective of this analysis is to predict smoking status using various body signal measurements. No data dictionary was supplied by the original poster.

In [132]:
# load the smoking-status table from the local CSV file
data = pd.read_csv(filepath_or_buffer='Smoking_Status_Data.csv')

# preview the first five rows of the frame
display(data.head(n=5))

# report the frame's (rows, columns) shape
display(data.shape)

Unnamed: 0,ID,gender,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),...,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,oral,dental caries,tartar,smoking
0,0,F,40,155,60,81.3,1.2,1.0,1.0,1.0,...,12.9,1.0,0.7,18.0,19.0,27.0,Y,0,Y,0
1,1,F,40,160,60,81.0,0.8,0.6,1.0,1.0,...,12.7,1.0,0.6,22.0,19.0,18.0,Y,0,Y,0
2,2,M,55,170,60,80.0,0.8,0.8,1.0,1.0,...,15.8,1.0,1.0,21.0,16.0,22.0,Y,0,N,1
3,3,M,40,165,70,88.0,1.5,1.5,1.0,1.0,...,14.7,1.0,1.0,19.0,26.0,18.0,Y,0,Y,0
4,4,F,40,155,60,86.0,1.0,1.0,1.0,1.0,...,12.5,1.0,0.6,16.0,14.0,22.0,Y,0,N,0


(55692, 27)

In [133]:
# Summarize column dtypes and non-null counts for the frame as loaded.
# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in display() would emit a stray "None" after the report.
data.info()

# Build the cleaned frame in a single chained expression so that re-running
# this cell on an already-cleaned frame gives the same result instead of
# raising a KeyError on the missing ID column.
data = (
    data
    # drop ID column (presumably a row identifier rather than a predictor);
    # errors='ignore' keeps the cell re-runnable once ID is already gone
    .drop(columns='ID', errors='ignore')
    .astype({
        # cast integer predictors to float
        'age': float,
        'height(cm)': float,
        'weight(kg)': float,
        # treat numerically coded categorical columns as object dtype
        'hearing(left)': object,
        'hearing(right)': object,
        'dental caries': object,
        'smoking': object,
    })
)

# check for missing values
display(data.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55692 entries, 0 to 55691
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   55692 non-null  int64  
 1   gender               55692 non-null  object 
 2   age                  55692 non-null  int64  
 3   height(cm)           55692 non-null  int64  
 4   weight(kg)           55692 non-null  int64  
 5   waist(cm)            55692 non-null  float64
 6   eyesight(left)       55692 non-null  float64
 7   eyesight(right)      55692 non-null  float64
 8   hearing(left)        55692 non-null  float64
 9   hearing(right)       55692 non-null  float64
 10  systolic             55692 non-null  float64
 11  relaxation           55692 non-null  float64
 12  fasting blood sugar  55692 non-null  float64
 13  Cholesterol          55692 non-null  float64
 14  triglyceride         55692 non-null  float64
 15  HDL                  55692 non-null 

None

gender                 0
age                    0
height(cm)             0
weight(kg)             0
waist(cm)              0
eyesight(left)         0
eyesight(right)        0
hearing(left)          0
hearing(right)         0
systolic               0
relaxation             0
fasting blood sugar    0
Cholesterol            0
triglyceride           0
HDL                    0
LDL                    0
hemoglobin             0
Urine protein          0
serum creatinine       0
AST                    0
ALT                    0
Gtp                    0
oral                   0
dental caries          0
tartar                 0
smoking                0
dtype: int64

In [134]:
# class balance: show how many rows carry each smoking label
smoking_counts = data['smoking'].value_counts()
display(smoking_counts)

# boolean mask that is True where the smoking label equals 1
smoker_mask = data['smoking'] == 1

# share of rows labelled 1, rounded to three decimals
print(f"Smoking Rate: {np.mean(smoker_mask).round(3)}")

# positions where the mask is True; np.where called with only a condition
# returns a tuple of index arrays
smokeIdx = np.where(smoker_mask)

0    35237
1    20455
Name: smoking, dtype: int64

Smoking Rate: 0.367
