In [None]:
!pip install category_encoders

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from category_encoders import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from yellowbrick.model_selection import feature_importances
from yellowbrick.features import pca_decomposition
from yellowbrick.target import class_balance
from yellowbrick.target.feature_correlation import feature_correlation
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Read the raw CSV (path is relative to the working directory) and print its column summary.
data = pd.read_csv(filepath_or_buffer='cancer patient data sets.csv')
data.info()

In [None]:
# Summary statistics, transposed so that each column of `data` becomes a row.
display(data.describe().transpose())

In [None]:
# Preview the first five rows (the default for head()).
data.head()

# 함수 선언

In [None]:
## Preprocessing
def eda(df):
  """Drop unused columns and add the `dangerous` and `agegrp` features.

  The frame is modified in place and the same object is returned, so pass a
  copy if the raw data must be preserved.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns removed below, a string column `Level`
      and a numeric column `Age`.

  Returns
  -------
  pandas.DataFrame
      The same `df` object with two added columns:
      `dangerous` is 1 when `Level` contains 'Medium' or 'High', otherwise 0
      (a missing `Level` counts as 0);
      `agegrp` is 'teen' (<20), 'youth' (<30), 'middle' (<40), 'old' (<50)
      or 'very old' (everything else, including a missing `Age`).

  Raises
  ------
  KeyError
      If any of the columns to drop, `Level` or `Age` is absent.
  """
  df.drop(['index','Patient Id','Obesity','Alcohol use','Balanced Diet','Weight Loss','Clubbing of Finger Nails'], axis=1, inplace=True)

  # na=False: without it a missing Level yields NaN, which np.where treats as
  # truthy and would wrongly flag the row as dangerous.
  is_dangerous = df['Level'].str.contains('Medium|High', na=False)
  df['dangerous'] = np.where(is_dangerous, 1, 0).astype(int)

  # np.select picks the first matching condition, mirroring an if/elif chain,
  # without looping over rows in Python.
  age = df['Age']
  df['agegrp'] = np.select(
      [age < 20, age < 30, age < 40, age < 50],
      ['teen', 'youth', 'middle', 'old'],
      default='very old',
  )

  return df

In [None]:
## Machine-learning model
def model(X_train, y_train):
  """Fit an ordinal-encoder + decision-tree pipeline on the training data and return it."""
  encoder = OrdinalEncoder()
  classifier = DecisionTreeClassifier(random_state=42)
  # Pipeline.fit returns the pipeline itself, so the fitted object is returned directly.
  return make_pipeline(encoder, classifier).fit(X_train, y_train)

# 실행

In [None]:
# Work on a deep copy so the raw `data` frame stays untouched.
data_ = data.copy(deep=True)

In [None]:
# eda() edits its argument in place and returns it, so `df` and `data_` are the same object.
df = eda(data_)
df.head()

In [None]:
# Correlate numeric columns only: `Level` and `agegrp` hold strings, and
# pandas >= 2.0 raises on them instead of silently skipping them.
corr = df.select_dtypes(include='number').corr()
corr.style.background_gradient()

In [None]:
## Train / validation / test split
# Hold out 20% as the test set, then take 25% of the remainder (20% overall) for validation.
train_val, test = train_test_split(df, test_size=0.2, random_state=2)
train, val = train_test_split(train_val, test_size=0.25, random_state=2)
tuple(part.shape for part in (train, val, test))

In [None]:
# Baseline feature set: every column except the target.
# NOTE(review): this still includes `dangerous`, which eda() derives from `Level`,
# so the baseline model can read part of the target from its inputs.
target = 'Level'
features = df.columns.drop(target)
X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]

In [None]:
# `model` names the factory function until this cell rebinds it to the fitted
# pipeline. Keep a handle on the function so that re-running the cell refits
# instead of failing with "'Pipeline' object is not callable".
if not isinstance(model, Pipeline):
  build_model = model
model = build_model(X_train, y_train)
# Prints the validation accuracy.
print('검증 정확도: ', model.score(X_val, y_val))

In [None]:
import graphviz
from sklearn.tree import export_graphviz

# Fitted tree from the baseline pipeline.
tree = model.named_steps['decisiontreeclassifier']

dot_data = export_graphviz(
    tree,
    # Plain list: some scikit-learn releases reject a pandas Index here.
    feature_names=list(X_train.columns),
    # Class names must follow the estimator's own class order (tree.classes_);
    # y_train.unique() is in order of first appearance and can mislabel nodes.
    class_names=[str(label) for label in tree.classes_],
    filled=True,
    proportion=True
)

graphviz.Source(dot_data)

In [None]:
# Second model: drop the target, the target-derived `dangerous` flag and two
# further features, and cap the tree depth at 3.
features = df.columns.drop([target, 'dangerous', 'Snoring', 'Swallowing Difficulty'])
X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]

# Pipeline.fit returns the pipeline itself, so `pipe` is the fitted object.
pipe = make_pipeline(
    OrdinalEncoder(),
    DecisionTreeClassifier(max_depth=3, random_state=42),
).fit(X_train, y_train)

# Prints the validation accuracy.
print('검증 정확도: ', pipe.score(X_val, y_val))

In [None]:
import graphviz
from sklearn.tree import export_graphviz

# Fitted depth-limited tree from `pipe`.
tree = pipe.named_steps['decisiontreeclassifier']

dot_data = export_graphviz(
    tree,
    # Plain list: some scikit-learn releases reject a pandas Index here.
    feature_names=list(X_train.columns),
    # Class names must follow the estimator's own class order (tree.classes_);
    # y_train.unique() is in order of first appearance and can mislabel nodes.
    class_names=[str(label) for label in tree.classes_],
    filled=True,
    proportion=True
)

graphviz.Source(dot_data)

In [None]:
# Relative frequency of each target class.
df.loc[:, target].value_counts(normalize=True)

In [None]:
# One panel per target class, each showing the age-group histogram.
# FacetGrid.map returns the grid, so the call can be chained onto the constructor.
g = sns.FacetGrid(df, col=target).map(plt.hist, 'agegrp', bins=10, alpha=0.5)
g

In [None]:
# Encode the ordinal target as integers (Low=0, Medium=1, High=2).
# Skip when `Level` is already numeric so that re-running the cell is a no-op.
# Any label outside the mapping becomes NaN and makes astype(int) raise, so
# unexpected values are never silently kept.
level_codes = {'Low': 0, 'Medium': 1, 'High': 2}
if not pd.api.types.is_numeric_dtype(df['Level']):
  df['Level'] = df['Level'].map(level_codes).astype(int)

In [None]:
# Sum of the target per age group, largest first, drawn as a line plot.
t = (
    df[['agegrp', target]]
    .groupby(['agegrp'], as_index=False)
    .sum()
    .sort_values(by=target, ascending=False)
)
t.plot(x='agegrp', linewidth=4, marker='o')

In [None]:
# Mean of the target per air-pollution value, largest first, drawn as a line plot.
ap = (
    df[['Air Pollution', target]]
    .groupby(['Air Pollution'], as_index=False)
    .mean()
    .sort_values(by=target, ascending=False)
)
ap.plot(x='Air Pollution', y='Level', linewidth=3, marker='o')

In [None]:
# Histogram of the `Level` column of `ap`.
ap.hist(column='Level', bins=8)

In [None]:
# Mean of the target per gender value, largest first.
# Note: this rebinds `ap`, which previously held the air-pollution table.
ap = (
    df[['Gender', target]]
    .groupby(['Gender'], as_index=False)
    .mean()
    .sort_values(by=target, ascending=False)
)
ap.plot(x='Gender', linewidth=3, marker='o')

In [None]:
# Table of the mean target per air-pollution value, largest first.
(
    df[['Air Pollution', target]]
    .groupby(['Air Pollution'], as_index=False)
    .mean()
    .sort_values(by=target, ascending=False)
)

In [None]:
# Target histograms in a grid: rows by `Air Pollution`, columns by `dangerous`,
# coloured by `Air Pollution`. map() and add_legend() both return the grid, so
# the calls are chained; the assignment produces no cell output.
grid = (
    sns.FacetGrid(df, col='dangerous', row='Air Pollution', hue="Air Pollution", height=2, aspect=2)
    .map(plt.hist, target, alpha=.5, bins=20)
    .add_legend()
)

In [None]:
# Descriptive statistics of the target within each air-pollution value.
(
    df[['Air Pollution', target]]
    .groupby(['Air Pollution'], as_index=False)
    .describe()
)

In [None]:
# Drop the derived `dangerous` flag in place. errors='ignore' makes re-running
# the cell a no-op instead of a KeyError once the column is gone.
df.drop(columns=['dangerous'], inplace=True, errors='ignore')

In [None]:
# `agegrp` is a string column; pandas >= 2.0 raises on it in corr(), so the
# Pearson correlation is restricted to the numeric columns.
corr = df.select_dtypes(include='number').corr(method='pearson')
corr['Air Pollution'].sort_values(ascending=False)

In [None]:
# `agegrp` is a string column; pandas >= 2.0 raises on it in corr(), so the
# Pearson correlation is restricted to the numeric columns.
corr = df.select_dtypes(include='number').corr(method='pearson')
corr[target].sort_values(ascending=False)