In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, make_scorer, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [2]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

In [3]:
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [4]:
df.head()

Unnamed: 0,ClientPeriod,MonthlySpending,TotalSpent,Sex,IsSeniorCitizen,HasPartner,HasChild,HasPhoneService,HasMultiplePhoneNumbers,HasInternetService,HasOnlineSecurityService,HasOnlineBackup,HasDeviceProtection,HasTechSupportAccess,HasOnlineTV,HasMovieSubscription,HasContractPhone,IsBillingPaperless,PaymentMethod,Churn
0,55,19.5,1026.35,Male,0,Yes,Yes,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,No,Mailed check,0
1,72,25.85,1872.2,Male,0,Yes,No,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),0
2,1,75.9,75.9,Male,0,No,No,Yes,No,Fiber optic,No,No,No,Yes,No,No,Month-to-month,Yes,Electronic check,1
3,32,79.3,2570.0,Female,1,Yes,No,Yes,Yes,Fiber optic,No,No,Yes,No,No,No,Month-to-month,No,Mailed check,0
4,60,115.25,6758.45,Female,0,Yes,Yes,Yes,Yes,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Two year,No,Credit card (automatic),0


In [5]:
data = df.apply(LabelEncoder().fit_transform)

In [6]:
X = np.asarray(data.drop(['Churn'], axis=1))
y = np.asarray(data['Churn'])

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## DecisionTreeClassifier

In [24]:
clf = DecisionTreeClassifier()
scorer = make_scorer(roc_auc_score)

grid_params = {'criterion':['gini', 'entropy'], 'max_depth':range(2, 10), 'min_samples_leaf':range(1, 10), 'min_samples_split':range(2, 10) }

grid_obj = GridSearchCV(clf, grid_params, scoring=scorer)
grid_obj = grid_obj.fit(X_train, y_train)

best_clf = grid_obj.best_estimator_
best_clf.fit(X_train,y_train)

best_train_prediction = best_clf.predict(X_train)
best_test_prediction = best_clf.predict(X_test)

print('Training roc_auc_score score:', roc_auc_score(best_train_prediction, y_train))
print('Test roc_auc_score score:', roc_auc_score(best_test_prediction, y_test))

Training roc_auc_score score: 0.75902800269065
Test roc_auc_score score: 0.7196735593677198


In [25]:
grid_obj.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [26]:
features = list(zip(grid_obj.best_estimator_.feature_importances_, data.columns))
for x in features:
    if x[0]!=0:
        print(x[1], str(round(x[0], 3)*100)+'%')

ClientPeriod 15.299999999999999%
MonthlySpending 10.2%
TotalSpent 3.3000000000000003%
Sex 0.2%
IsSeniorCitizen 1.2%
HasInternetService 4.9%
HasOnlineSecurityService 12.5%
HasOnlineBackup 0.8%
HasDeviceProtection 0.1%
HasTechSupportAccess 0.8%
HasMovieSubscription 0.5%
HasContractPhone 47.099999999999994%
IsBillingPaperless 1.3%
PaymentMethod 1.9%


In [43]:
data_test = df_test.apply(LabelEncoder().fit_transform)
prediction = best_clf.predict(np.asarray(data_test))
pd.DataFrame(prediction, columns=['Churn']).to_csv('/content/drive/MyDrive/ml/churn/prediction1.csv')

## LogisticRegression

In [36]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
test_pred = logreg.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
grid_params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }

grid_obj = GridSearchCV(logreg, grid_params, scoring=scorer)
grid_obj = grid_obj.fit(X_train, y_train)

best_clf = grid_obj.best_estimator_
best_clf.fit(X_train,y_train)

In [42]:
best_train_prediction = best_clf.predict(X_train)
best_test_prediction = best_clf.predict(X_test)

print('Training roc_auc_score score:', roc_auc_score(best_train_prediction, y_train))
print('Test roc_auc_score score:', roc_auc_score(best_test_prediction, y_test))

Training roc_auc_score score: 0.7485193701226309
Test roc_auc_score score: 0.7328646254258269


In [None]:
data_test = df_test.apply(LabelEncoder().fit_transform)
prediction = best_clf.predict(np.asarray(data_test))
pd.DataFrame(prediction, columns=['Churn']).to_csv('/content/drive/MyDrive/ml/churn/predictionlogreg.csv')

## KNeighborsClassifier

In [50]:
knn = KNeighborsClassifier(n_neighbors=3)

grid_params = {
    'n_neighbors': range(1, 10), 
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

grid_obj = GridSearchCV(knn, grid_params, scoring=scorer)
grid_obj = grid_obj.fit(X_train, y_train)

best_clf = grid_obj.best_estimator_
best_clf.fit(X_train,y_train)

best_train_prediction = best_clf.predict(X_train)
best_test_prediction = best_clf.predict(X_test)

print('Training roc_auc_score score:', roc_auc_score(best_train_prediction, y_train))
print('Test roc_auc_score score:', roc_auc_score(best_test_prediction, y_test))

Training roc_auc_score score: 0.9991997439180538
Test roc_auc_score score: 0.6545794148380355


In [51]:
best_clf

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                     weights='distance')

In [52]:
data_test = df_test.apply(LabelEncoder().fit_transform)
prediction = best_clf.predict(np.asarray(data_test))
pd.DataFrame(prediction, columns=['Churn']).to_csv('/content/drive/MyDrive/ml/churn/predictionknn.csv')

### Логистическая регрессия дает лучший результат