In [2]:
import os
import pandas as pd
import numpy as np
import sklearn.impute as sk_imp
import sklearn.preprocessing as sk_pre
import sklearn.model_selection as sk_ms
import xgboost as xb
# from sklearn.impute import SimpleImputer

In [3]:
# Raw input tables. The demographics, location, satisfaction, services and
# status tables are joined on "Customer ID" in later cells.
dem_df = pd.read_csv("demographics.csv")
loc_df = pd.read_csv("location.csv")
pop_df = pd.read_csv("population.csv")
sat_df = pd.read_csv("satisfaction.csv")
srv_df = pd.read_csv("services.csv")
status_df = pd.read_csv("status.csv")  # training target ("Churn Category")
test_id_df = pd.read_csv("Test_IDs.csv")  # customers to predict
train_id_df = pd.read_csv("Train_IDs.csv")  # customers used for training

# Demographics: drop the bookkeeping column and recompute the age flag.
# A missing "Age" compares as False, so the flag itself never contains NaN.
dem_df = dem_df.drop(columns=["Count"])
dem_df["Under 30"] = dem_df["Age"].lt(30)

# Location: drop columns that carry no usable signal for the model.
loc_df = loc_df.drop(columns=[
    "Count",
    "Country",   # only "United States" in the data, thus meaningless
    "State",     # only "California" in the data, thus meaningless
    # NOTE(review): the original comments repeated the "California only"
    # rationale for the next two columns, which does not describe them.
    # Presumably they are dropped as redundant / too high-cardinality --
    # TODO confirm.
    "Lat Long",
    "City",
])

# Services: "Quarter" only contains "Q3", thus meaningless.
srv_df = srv_df.drop(columns=["Count", "Quarter"])

df_list = [dem_df, loc_df, pop_df, sat_df, srv_df, status_df]

In [4]:
def outer_merge(left, right):
    """Full outer join of two frames on their shared "Customer ID" column.

    Customers present in only one of the frames are kept; the columns coming
    from the other frame are NaN for those rows.
    """
    return pd.merge(left, right, how='outer', on='Customer ID')

def inner_merge(left, right):
    """Inner join of two frames on their shared "Customer ID" column.

    Only customers present in both frames survive; the result carries a
    fresh 0..n-1 index.
    """
    return pd.merge(left, right, how='inner', on='Customer ID')

# Build one wide per-customer table: start from demographics and outer-join
# the location, satisfaction and services tables onto it in turn.
merge_df = dem_df
for other_df in (loc_df, sat_df, srv_df):
    merge_df = outer_merge(merge_df, other_df)

In [5]:
# Training rows: customers listed in Train_IDs, keeping only rows with at
# least 20 non-missing values (thresholds of 30 and 35 were also tried).
train_df = inner_merge(merge_df, train_id_df).dropna(axis=0, thresh=20)

# Test rows: customers listed in Test_IDs, with the same column layout as the
# training features. This must run before the target is joined onto train_df.
test_df = inner_merge(merge_df, test_id_df)[train_df.columns]

# Attach the target; customers without a status row are dropped by the
# inner join.
train_df = inner_merge(train_df, status_df)

In [6]:
# Columns treated as categorical (imputed with the most frequent value and
# one-hot encoded). Every other feature column is treated as numeric.
# "City" (location) is deliberately not listed; population contributes none.
category_feature_selection = [
    # --- demographics (Y/N style flags) ---
    "Gender",
    "Under 30",
    "Senior Citizen",
    "Married",
    "Dependents",
    # --- satisfaction (1 ~ 5) ---
    "Satisfaction Score",
    # --- services ---
    "Referred a Friend",       # Y/N
    "Offer",                   # None, Offer A ~ E
    "Phone Service",           # Y/N
    "Multiple Lines",          # Y/N
    "Internet Service",        # Y/N
    "Internet Type",           # No, DSL, Fiber Optic, Cable
    "Online Security",         # Y/N
    "Online Backup",           # Y/N
    "Device Protection Plan",  # Y/N
    "Premium Tech Support",    # Y/N
    "Streaming TV",            # Y/N
    "Streaming Movies",        # Y/N
    "Streaming Music",         # Y/N
    "Unlimited Data",          # Y/N
    "Contract",                # Month-to-Month, One Year, Two Year
    "Paperless Billing",       # Y/N
    "Payment Method",          # Bank Withdrawal, Credit Card, Mailed Check
]

# Split the training columns into categorical and numeric features,
# preserving column order. The join key and the target are not features.
non_feature_columns = ("Customer ID", "Churn Category")
feature_columns = [
    name for name in train_df.columns if name not in non_feature_columns
]

category_feature = [
    name for name in feature_columns if name in category_feature_selection
]
numeric_feature = [
    name for name in feature_columns if name not in category_feature_selection
]

In [7]:
def preprocessing(train_df):
    """Impute missing values and one-hot encode a feature frame.

    The columns to use are taken from the module-level ``category_feature``
    and ``numeric_feature`` lists. Despite the parameter name, this function
    is applied to both the training and the test frame.

    Categorical columns are imputed with their most frequent value and then
    one-hot encoded; numeric columns are imputed with their mean. Both
    imputers and the one-hot encoding are fitted on the frame passed in, so
    two calls on different frames may yield different dummy columns.

    Returns a features-only DataFrame (dummy columns first, then numeric
    columns) with a fresh 0..n-1 index.
    """
    def _impute(frame, strategy):
        # Fit a SimpleImputer on `frame` itself and rebuild a labelled frame.
        imputer = sk_imp.SimpleImputer(missing_values=np.nan, strategy=strategy)
        return pd.DataFrame(imputer.fit_transform(frame), columns=frame.columns)

    categorical_part = pd.get_dummies(
        _impute(train_df[category_feature], 'most_frequent')
    )
    numeric_part = _impute(train_df[numeric_feature], 'mean')

    return pd.concat([categorical_part, numeric_part], axis=1)  # features only

In [8]:
x_train = preprocessing(train_df)

# Integer class codes used as the multi-class target.
churn_category_codes = {
    "No Churn": 0,
    "Competitor": 1,
    "Dissatisfaction": 2,
    "Attitude": 3,
    "Price": 4,
    "Other": 5,
}
y_train = train_df["Churn Category"].map(churn_category_codes)
# `.map` silently turns unknown labels into NaN; fail loudly instead.
assert y_train.notna().all(), "unmapped or missing 'Churn Category' values in train_df"

# `preprocessing` one-hot encodes each frame independently, so a category
# level that appears in only one of train/test would give x_test a different
# set (or order) of columns than the model was trained on. Align x_test to
# the training layout: levels unseen in test become all-zero columns, and
# levels unseen in train are dropped.
x_test = preprocessing(test_df).reindex(columns=x_train.columns, fill_value=0)

In [9]:
from sklearn.metrics import f1_score
def f1_eval(y_pred, dtrain):
    """Custom XGBoost evaluation metric returning ``1 - F1`` (lower is better).

    Parameters
    ----------
    y_pred : array-like
        Model output. Either one score per row (binary case) or per-class
        scores of shape ``(n_rows, n_classes)``; a flattened per-class array
        is reshaped to that 2-D form.
    dtrain : object exposing ``get_label()``
        Supplies the true labels (e.g. an ``xgboost.DMatrix``).

    Returns
    -------
    tuple
        ``('f1_err', err)``.

    For one score per row the behaviour is unchanged: scores are rounded to
    0/1 and ``f1_score`` is called with its default averaging. For per-class
    scores (as produced by ``multi:softprob``) rounding does not give class
    labels, and the default binary averaging rejects multiclass targets, so
    the predicted label is the arg-max class and micro-averaged F1 is used,
    matching the ``f1_micro`` scoring used for cross-validation.
    """
    y_true = np.asarray(dtrain.get_label())
    scores = np.asarray(y_pred)

    # Some XGBoost versions hand multi-class predictions over flattened.
    if scores.ndim == 1 and y_true.size and scores.size != y_true.size:
        scores = scores.reshape(y_true.size, -1)

    if scores.ndim == 2:
        labels = scores.argmax(axis=1)
        err = 1 - f1_score(y_true, labels, average='micro')
    else:
        err = 1 - f1_score(y_true, np.round(scores))
    return 'f1_err', err

In [10]:
# No hold-out split is made here (a 67/33 train_test_split was tried and
# disabled); generalisation is estimated with cross-validation instead.
xgb_params = {
    "use_label_encoder": False,
    "n_estimators": 200,
    "eval_metric": 'auc',
    "objective": "multi:softprob",
    "num_class": 6,
}
xgboost_model = xb.XGBClassifier(**xgb_params)
xgboost_model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='auc', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=12,
              num_class=6, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=None, subsample=1, tree_method='exact',
              use_label_encoder=False, ...)

In [11]:
# Class predictions for the test customers.
predicted = xgboost_model.predict(x_test)

# Score on the data the model was fitted on ('訓練集' = "training set").
# This is not an estimate of generalisation performance.
train_score = xgboost_model.score(x_train, y_train)
print('訓練集: ', train_score)

訓練集:  1.0


In [12]:
predicted_df = pd.DataFrame(predicted, columns=["Churn Category"])

# `pd.concat(axis=1)` aligns on index labels, not on position. Pair IDs and
# predictions positionally so that a filtered or re-indexed test_df cannot
# silently produce NaN-padded or mismatched rows.
assert len(predicted_df) == len(test_df), "one prediction per test customer expected"
test_ids = test_df["Customer ID"].reset_index(drop=True)
out_df = pd.concat([test_ids, predicted_df], axis=1)

out_df.to_csv("out.csv", index=False)
out_df

Unnamed: 0,Customer ID,Churn Category
0,5066-GFJMM,3
1,1730-VFMWO,1
2,7037-MTYVW,0
3,4981-FLTMF,1
4,8443-WVPSS,1
...,...,...
1404,9919-YLNNG,0
1405,8049-WJCLQ,0
1406,7016-NVRIC,0
1407,4976-LNFVV,2


In [13]:
# 3-fold cross-validated micro-averaged F1 of the same model configuration.
scores = sk_ms.cross_val_score(
    xgboost_model,
    x_train,
    y_train,
    cv=3,
    scoring='f1_micro',
    verbose=3,
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ................................ score: (test=0.814) total time=   2.8s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.8s remaining:    0.0s


[CV] END ................................ score: (test=0.814) total time=   2.7s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.5s remaining:    0.0s


[CV] END ................................ score: (test=0.804) total time=   2.7s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    8.4s finished


In [14]:
# (rows, columns) of the training frame, including "Customer ID" and the target.
train_df.shape

(3691, 40)