In [28]:
%reload_ext autoreload
%autoreload 2

In [45]:
import warnings
import funcs

import pandas as pd 
import numpy as np
from ydata_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.ensemble import (
    GradientBoostingClassifier,
    RandomForestClassifier,
    VotingClassifier,
    ExtraTreesClassifier,
    StackingClassifier,
)

from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN, SMOTETomek  

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
warnings.filterwarnings('ignore', category=FutureWarning)

In [30]:
df = pd.read_csv('data/train.csv').drop(columns=['id'])
df_val = pd.read_csv('data/test.csv').drop(columns=['id'])
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,is_rich
0,41,Private,130408,HS-grad,9,Divorced,Sales,Unmarried,Black,Female,0,0,38,United-States,0
1,17,Private,57723,11th,7,Never-married,Sales,Own-child,White,Male,0,0,30,United-States,0
2,34,Private,127875,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,20,United-States,0
3,42,Private,142424,Assoc-voc,11,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,0
4,39,Self-emp-not-inc,185366,Assoc-acdm,12,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,35,United-States,1


In [35]:
df_train_prep = funcs.preprocess(df)
df_test_prep = funcs.preprocess(df_val)
df_test_prep = df_test_prep.drop(columns=["native-country", "race", "workclass", "education"])
df_train_prep.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,is_rich
0,41,Private,130408,0,9,2,1,1,Black,0,0,0,38,United-States,0
1,17,Private,57723,1,7,0,1,1,White,1,0,0,30,United-States,0
2,34,Private,127875,0,13,0,1,1,White,0,0,0,20,United-States,0
3,42,Private,142424,0,11,0,1,1,White,0,0,0,40,United-States,0
4,39,Self-emp-not-inc,185366,0,12,1,0,0,White,1,0,0,35,United-States,1


In [32]:
# smote = SMOTE(random_state=137)

# X_to_gender = df_train_prep.drop(columns=["native-country", "race", "workclass", "education"])
# X_gender = X_to_gender.drop(columns=["gender"])
# y_gender = X_to_gender["gender"]
# X_gender_smote, y_gender_smote = smote.fit_resample(X_gender, y_gender)
# df_train_gender = pd.concat([X_gender_smote, y_gender_smote], axis=1).reset_index(drop=True)
# df_train_gender.head()


In [36]:
X = df_train_prep.drop(columns=["is_rich", "native-country", "race", "workclass", "education"])
y = df_train_prep["is_rich"]

In [37]:
sm_enn = SMOTEENN(random_state=137)

X_train_sm_enn, y_train_sm_enn = sm_enn.fit_resample(X, y)

In [38]:
cb = CatBoostClassifier(
    logging_level="Silent",
    random_state=137,
    eval_metric="F1",
    iterations=100,          # Количество деревьев
    learning_rate=1,       # Скорость обучения
    depth=6,                  # Глубина дерева
    l2_leaf_reg=3,            # Регуляризация
    border_count=254,         # Количество бинов для численных признаков
    loss_function="Logloss",
    cat_features=[
        # "workclass",
        "marital-status",
        "occupation",
        # "education",
        "educational-num",
        "marital-status",
        "relationship",
        "gender",
    ],
)



In [39]:
# cb.fit(X_train_sm_enn, y_train_sm_enn)
# imp = cb.feature_importances_
# for feat, score in zip(X_train_sm_enn.columns, imp):
#     print(f"{feat}: {score:.2f}")

age: 14.99
fnlwgt: 9.71
educational-num: 9.31
marital-status: 10.04
occupation: 1.88
relationship: 17.19
gender: 2.67
capital-gain: 14.69
capital-loss: 4.25
hours-per-week: 15.27


In [51]:
rf = RandomForestClassifier(random_state=137)
xgb = XGBClassifier(random_state=137)
lgb = LGBMClassifier(random_state=137)
gb = GradientBoostingClassifier(random_state=137)
xt = ExtraTreesClassifier(random_state=137)
cb = CatBoostClassifier(
    logging_level="Silent",
    random_state=137,
    eval_metric="F1",
    iterations=100,          # Количество деревьев
    learning_rate=1,       # Скорость обучения
    depth=6,                  # Глубина дерева
    l2_leaf_reg=3,            # Регуляризация
    border_count=254,         # Количество бинов для численных признаков
    loss_function="Logloss",
    cat_features=[
        # "workclass",
        "marital-status",
        "occupation",
        # "education",
        "educational-num",
        "marital-status",
        "relationship",
        "gender",
    ],
)

estimators = [
        ("rf", rf),
        ("cb", cb),
        ("xgb", xgb),
        ("lgb", lgb),
        ("gb", gb),
        ("xt", xt),
    ]

stack = StackingClassifier(
    estimators=estimators,
    final_estimator=xt,
    cv=5,
    n_jobs=-1,
)

vote = VotingClassifier(
    estimators=estimators,
    voting="soft",
    n_jobs=-1,
)

stack.fit(X_train_sm_enn, y_train_sm_enn)
vote.fit(X_train_sm_enn, y_train_sm_enn)

f1_stack = cross_val_score(stack, X_train_sm_enn, y_train_sm_enn, cv=5, scoring="f1").mean()
f1_vote = cross_val_score(vote, X_train_sm_enn, y_train_sm_enn, cv=5, scoring="f1").mean()




F1 score for StackingClassifier: 0.93
F1 score for VotingClassifier: 0.93



In [52]:
print(f"""
F1 score for StackingClassifier: {f1_stack:.6f}
F1 score for VotingClassifier: {f1_vote:.6f}
""")


F1 score for StackingClassifier: 0.932092
F1 score for VotingClassifier: 0.932194



In [40]:
f1_sm_enn = cross_val_score(cb, X_train_sm_enn, y_train_sm_enn, cv=5, scoring="f1").mean()
print(f"""
F1 score for SMOTEENN: {f1_sm_enn:.2f}
""")



F1 score for SMOTEENN: 0.93



In [53]:
y_pred_val = vote.predict(df_test_prep)

In [54]:

y_pred_df = pd.DataFrame({'id': range(0, len(y_pred_val)), 'is_reach': y_pred_val})
y_pred_df.to_csv('y_pred_classification.csv', index=False)

In [57]:
pred_2 = pd.read_csv('y_pred_classification.csv', index_col=0)
pred_1 = pd.read_csv('y_pred_regression.csv', index_col=0)
f1_files = f1_score(pred_1, pred_2)
f1_files

0.9162812948493063

In [None]:
# from feature_engine.creation import DecisionTreeFeatures

# cols = [i for i in df.columns if df[i].dtype == "object"]

# X = df.drop(labels=["is_rich"], axis=1)
# y = df["is_rich"]

# dtf = DecisionTreeFeatures(features_to_combine=3, regression=False)
# dtf.fit(X, y)

# X = dtf.transform(X)
# X.head()
# print(X.columns)

In [None]:
# df_fe = X.drop(columns=[i for i in X.columns if not i.startswith("tree")])
# df_fe.head()

Unnamed: 0,tree(age),tree(fnlwgt),tree(educational-num),tree(capital-gain),tree(capital-loss),tree(hours-per-week),"tree(['age', 'fnlwgt'])","tree(['age', 'educational-num'])","tree(['age', 'capital-gain'])","tree(['age', 'capital-loss'])",...,"tree(['fnlwgt', 'educational-num', 'capital-gain'])","tree(['fnlwgt', 'educational-num', 'capital-loss'])","tree(['fnlwgt', 'educational-num', 'hours-per-week'])","tree(['fnlwgt', 'capital-gain', 'capital-loss'])","tree(['fnlwgt', 'capital-gain', 'hours-per-week'])","tree(['fnlwgt', 'capital-loss', 'hours-per-week'])","tree(['educational-num', 'capital-gain', 'capital-loss'])","tree(['educational-num', 'capital-gain', 'hours-per-week'])","tree(['educational-num', 'capital-loss', 'hours-per-week'])","tree(['capital-gain', 'capital-loss', 'hours-per-week'])"
0,0.318676,0.243602,0.183898,0.20333,0.226193,0.17485,0.318676,0.212374,0.320927,0.34291,...,0.16002,0.164452,0.175461,0.189885,0.183524,0.198138,0.149564,0.135483,0.147478,0.171976
1,0.051051,0.21902,0.056809,0.20333,0.226193,0.17485,0.051051,0.00524,0.004047,0.005467,...,0.047272,0.063898,0.040656,0.189885,0.068013,0.062271,0.043686,0.038091,0.043066,0.061699
2,0.318676,0.243602,0.408646,0.20333,0.226193,0.17485,0.318676,0.443757,0.238332,0.223124,...,0.359248,0.392273,0.199367,0.189885,0.068013,0.062271,0.334991,0.28513,0.305622,0.061699
3,0.318676,0.243602,0.183898,0.20333,0.226193,0.17485,0.318676,0.310123,0.320927,0.34291,...,0.16002,0.246499,0.175461,0.189885,0.183524,0.198138,0.149564,0.135483,0.147478,0.171976
4,0.318676,0.243602,0.183898,0.20333,0.226193,0.17485,0.318676,0.310123,0.320927,0.34291,...,0.16002,0.246499,0.119497,0.189885,0.068013,0.198138,0.149564,0.038091,0.043066,0.061699


In [None]:
# report = ProfileReport(df, title="Is Rich Report", explorative=True)
# report.to_notebook_iframe()

In [None]:
# cb.fit(df_fe, y)
# important_features = [f"{feat}: {score:.2f}" for feat, score in zip(df_fe.columns, cb.feature_importances_)]
# important_features_score = [f"{feat}: {score:.2f}" for feat, score in zip(df_fe.columns, cb.feature_importances_) if score > 5]
# important_features_score

['tree(capital-gain): 11.44',
 "tree(['age', 'educational-num']): 5.14",
 "tree(['age', 'hours-per-week']): 5.34",
 "tree(['fnlwgt', 'capital-loss']): 7.24",
 "tree(['age', 'fnlwgt', 'hours-per-week']): 8.94",
 "tree(['age', 'capital-gain', 'hours-per-week']): 14.95"]

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(
#     df[['age', 'capital-gain', 'hours-per-week', 'educational-num']], y, test_size=0.2
# )

# cb.fit(X_train, y_train)
# f1 = f1_score(y_test, cb.predict(X_test))
# f1

0.5513619954053167

In [None]:
# df_prep = df.copy()
# df_prep["native-country"] = (df_prep["native-country"] == "United-States").astype(int)
# df_prep["gender"] = (df_prep["gender"] == "Male").astype(int)
# df_prep["marital-status"] = df_prep["marital-status"].apply(
#     lambda x: 1 if x.startswith("Married") else (0 if x.startswith("Never") else 2)
# )

# df_prep["workclass"] = df_prep["workclass"].apply(
#     lambda x: (
#         0
#         if x == "Private"
#         else (1 if x.startswith("Self") else (2 if "gov" in x else 3))
#     )
# )

# df_prep["education"] = df_prep["education"].map(
#     {
#         "HS-grad": 0,
#         "Some-college": 0,
#         "Bachelors": 0,
#         "Masters": 0,
#         "Assoc-voc": 0,
#         "11th": 1,
#         "Assoc-acdm": 0,
#         "10th": 1,
#         "7th-8th": 1,
#         "Prof-school": 2,
#         "9th": 1,
#         "12th": 1,
#         "Doctorate": 0,
#         "5th-6th": 3,
#         "1st-4th": 3,
#         "Preschool": 3,
#     }
# )

# df_prep["occupation"] = df_prep["occupation"].map(
#     {
#         "Prof-specialty": 0,
#         "Craft-repair": 0,
#         "Exec-managerial": 1,
#         "Adm-clerical": 1,
#         "Sales": 1,
#         "Other-service": 0,
#         "Machine-op-inspct": 0,
#         "Transport-moving": 0,
#         "Handlers-cleaners": 2,
#         "Farming-fishing": 0,
#         "Tech-support": 0,
#         "Protective-serv": 3,
#         "Priv-house-serv": 0,
#         "Armed-Forces": 3,
#         '?': 2,
#     }
# )

# df_prep["relationship"] = df_prep["relationship"].map(
#     {
#         "Husband": 0,
#         "Not-in-family": 1,
#         "Wife": 2,
#         "Own-child": 1,
#         "Unmarried": 1,
#         "Other-relative": 1,
#     }
# )

# df_prep["race"] = df_prep["race"].apply(
#     lambda x: (
#         0
#         if x == "White"
#         else (1 if x == "Black" else (2 if x == "Asian-Pac-Islander" else 3))
#     )
# )

# df_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39074 entries, 0 to 39073
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   age              39074 non-null  int64
 1   workclass        39074 non-null  int64
 2   fnlwgt           39074 non-null  int64
 3   education        39074 non-null  int64
 4   educational-num  39074 non-null  int64
 5   marital-status   39074 non-null  int64
 6   occupation       39074 non-null  int64
 7   relationship     39074 non-null  int64
 8   race             39074 non-null  int64
 9   gender           39074 non-null  int32
 10  capital-gain     39074 non-null  int64
 11  capital-loss     39074 non-null  int64
 12  hours-per-week   39074 non-null  int64
 13  native-country   39074 non-null  int32
 14  is_rich          39074 non-null  int64
dtypes: int32(2), int64(13)
memory usage: 4.2 MB
