In [3]:
import pandas as pd
from scipy import stats
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [4]:
churn_train = pd.read_csv('/Users/alenadenisova/train.csv')
churn_train.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [5]:
churn_test = pd.read_csv('/Users/alenadenisova/test.csv')
churn_test.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,Lucchese,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75
1,165035,15782418,Nott,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27
2,165036,15807120,K?,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09
3,165037,15808905,O'Donnell,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57
4,165038,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0


In [6]:
churn_train.shape

(165034, 14)

In [7]:
churn_test.shape

(110023, 13)

In [8]:
churn_train.dtypes

id                   int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                float64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard          float64
IsActiveMember     float64
EstimatedSalary    float64
Exited               int64
dtype: object

In [9]:
churn_train['Exited'].value_counts()

0    130113
1     34921
Name: Exited, dtype: int64

In [10]:
churn_train.isnull().sum()

id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [11]:
churn_test.isnull().sum()

id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64

In [12]:
gender_numeric = {'Male':0, 'Female':1}
churn_train['Gender'] = churn_train['Gender'].map(gender_numeric)
churn_test['Gender'] = churn_test['Gender'].map(gender_numeric)

geography_numeric = {'France':0, 'Spain':1, 'Germany':2}
churn_train['Geography'] = churn_train['Geography'].map(geography_numeric)
churn_test['Geography'] = churn_test['Geography'].map(geography_numeric)

churn_train.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,0,0,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,0,0,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,0,0,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,0,0,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,1,0,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [13]:
churn_test.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,Lucchese,586,0,1,23.0,2,0.0,2,0.0,1.0,160976.75
1,165035,15782418,Nott,683,0,1,46.0,2,0.0,1,1.0,0.0,72549.27
2,165036,15807120,K?,656,0,1,34.0,7,0.0,2,1.0,0.0,138882.09
3,165037,15808905,O'Donnell,681,0,0,36.0,8,0.0,1,1.0,0.0,113931.57
4,165038,15607314,Higgins,752,2,0,38.0,10,121263.62,1,1.0,0.0,139431.0


In [14]:
churn_prediction = churn_test.drop(columns = ['CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'])
churn_prediction.head()

Unnamed: 0,id
0,165034
1,165035
2,165036
3,165037
4,165038


In [15]:
churn_train = churn_train.drop(columns = ['CustomerId', 'Surname'])
churn_test = churn_test.drop(columns = ['CustomerId', 'Surname'])
churn_train.head()

Unnamed: 0,id,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,668,0,0,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,627,0,0,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,678,0,0,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,581,0,0,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,716,1,0,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [16]:
churn_test.head()

Unnamed: 0,id,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,586,0,1,23.0,2,0.0,2,0.0,1.0,160976.75
1,165035,683,0,1,46.0,2,0.0,1,1.0,0.0,72549.27
2,165036,656,0,1,34.0,7,0.0,2,1.0,0.0,138882.09
3,165037,681,0,0,36.0,8,0.0,1,1.0,0.0,113931.57
4,165038,752,2,0,38.0,10,121263.62,1,1.0,0.0,139431.0


In [17]:
churn_train = churn_train.drop(columns = ['id'])
churn_test = churn_test.drop(columns = ['id'])
churn_train.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,668,0,0,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,627,0,0,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,678,0,0,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,581,0,0,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,716,1,0,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [18]:
churn_test.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,586,0,1,23.0,2,0.0,2,0.0,1.0,160976.75
1,683,0,1,46.0,2,0.0,1,1.0,0.0,72549.27
2,656,0,1,34.0,7,0.0,2,1.0,0.0,138882.09
3,681,0,0,36.0,8,0.0,1,1.0,0.0,113931.57
4,752,2,0,38.0,10,121263.62,1,1.0,0.0,139431.0


In [19]:
X_train = churn_train.drop('Exited', axis = 1)
y_train = churn_train['Exited']

#X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.4, random_state = 42)
#X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5, random_state = 42)

In [20]:
X_train.shape

(165034, 10)

In [21]:
y_train.shape

(165034,)

In [22]:
X_test = churn_test
X_test.shape

(110023, 10)

In [318]:
#y_val.shape

(2000,)

In [320]:
#y_test.shape

(2001,)

In [24]:
X_train.to_csv('/Users/alenadenisova/churn_train_features.csv', index = False)
X_test.to_csv('/Users/alenadenisova/churn_test_features.csv', index = False)

y_train.to_csv('/Users/alenadenisova/churn_train_labels.csv', index = False)


In [25]:
from sklearn.linear_model import LogisticRegression

In [26]:
import joblib
from sklearn.model_selection import GridSearchCV

In [27]:
tr_features = pd.read_csv('/Users/alenadenisova/churn_train_features.csv')
tr_labels = pd.read_csv('/Users/alenadenisova/churn_train_labels.csv') #header=None
#column vector type

print('features: ', tr_features.shape)
print('labels: ', tr_labels.shape)

features:  (165034, 10)
labels:  (165034, 1)


In [328]:
tr_features.head(10)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,834,0,0,23.0,4,131254.81,1,1.0,0.0,20199.3
1,609,0,1,39.0,8,141675.23,1,0.0,1.0,175664.25
2,676,2,1,26.0,1,108348.66,1,0.0,0.0,60231.74
3,516,1,1,33.0,3,0.0,2,1.0,1.0,58685.59
4,831,0,0,30.0,2,0.0,2,0.0,1.0,3430.38
5,698,0,0,37.0,8,0.0,2,0.0,0.0,145004.39
6,628,2,0,40.0,5,181768.32,2,1.0,1.0,129107.97
7,700,0,1,42.0,8,0.0,2,1.0,1.0,105305.72
8,531,2,1,31.0,7,117052.82,1,1.0,0.0,118508.09
9,568,2,1,26.0,10,109819.16,2,1.0,0.0,154491.39


In [28]:
tr_labels.head(10)

Unnamed: 0,Exited
0,0
1,0
2,0
3,0
4,0
5,1
6,0
7,0
8,0
9,0


In [29]:
def print_results(results):
    print('BEST PARAMS: {}]'.format(results.best_params_))
    
    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+-{}) for {}'.format(round(mean,3), round(std * 2, 3), params))

In [30]:
lr = LogisticRegression()
parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

cv = GridSearchCV(lr, parameters, cv=5)
cv.fit(tr_features, tr_labels.values.ravel()) #.values.ravel - чтобы сделать array из column vector 

print_results(cv)

BEST PARAMS: {'C': 0.1}]
0.786 (+-0.003) for {'C': 0.001}
0.786 (+-0.003) for {'C': 0.01}
0.786 (+-0.003) for {'C': 0.1}
0.786 (+-0.003) for {'C': 1}
0.786 (+-0.003) for {'C': 10}
0.786 (+-0.003) for {'C': 100}
0.786 (+-0.003) for {'C': 1000}


In [31]:
cv.best_estimator_

In [32]:
joblib.dump(cv.best_estimator_, '/Users/alenadenisova/LR_churn_model.pkl')

['/Users/alenadenisova/LR_churn_model.pkl']

In [34]:
model = joblib.load('LR_churn_model.pkl')
print(model)

LogisticRegression(C=0.1)


In [35]:
tr_features = pd.read_csv('/Users/alenadenisova/churn_test_features.csv')
results = model.predict_proba(tr_features)
results

array([[0.90870423, 0.09129577],
       [0.77519575, 0.22480425],
       [0.87797274, 0.12202726],
       ...,
       [0.91332877, 0.08667123],
       [0.92072621, 0.07927379],
       [0.74138302, 0.25861698]])

In [36]:
print(len(results))

110023


In [37]:
print(type(results))

<class 'numpy.ndarray'>


In [38]:
xs  =[b for a,b in results]
>>> xs

[0.09129577060540636,
 0.22480425047515654,
 0.12202726344488284,
 0.1231188257629533,
 0.14037149512064503,
 0.0927207690173233,
 0.19519338557489047,
 0.40966081129253673,
 0.14724683379099032,
 0.10119636729732778,
 0.08244434146890317,
 0.15134371336695399,
 0.06921497685464749,
 0.22045517066898332,
 0.3573114009239916,
 0.19486467557858184,
 0.16426486400732562,
 0.1875863266065148,
 0.09921862217756845,
 0.13510277035358573,
 0.21140093829131645,
 0.24488503808367643,
 0.06461537029636365,
 0.12448814149122302,
 0.183995176766367,
 0.36324388065856994,
 0.33197319556542193,
 0.3398653148821154,
 0.17927602397749673,
 0.24368645811591003,
 0.23724039711148673,
 0.24497967963534464,
 0.16928339868496275,
 0.13992483246605944,
 0.23243742116863692,
 0.11123861175895992,
 0.13388790000058032,
 0.19975034762161972,
 0.14148069896767315,
 0.46743241974103195,
 0.17364578892397037,
 0.22141412694502183,
 0.3124942909696448,
 0.0944730075107934,
 0.09199040319172717,
 0.309969034526403,

In [39]:
print(type(xs))

<class 'list'>


In [40]:
churn_prediction.head()

Unnamed: 0,id
0,165034
1,165035
2,165036
3,165037
4,165038


In [44]:
churn_prediction['Exited0'] = xs
churn_prediction['Exited'] = churn_prediction['Exited0'].round(2)
churn_prediction = churn_prediction.drop(columns = ['Exited0'])
churn_pred_lr = churn_prediction
churn_pred_lr.head()

Unnamed: 0,id,Exited
0,165034,0.09
1,165035,0.22
2,165036,0.12
3,165037,0.12
4,165038,0.14


In [None]:
churn_pred_lr.to_csv('churn_pred_lr.csv', index = False)

In [33]:
#SUPPORT VECTOR MACHINES

from sklearn.svm import SVC #support vector classifier
SVC()

In [None]:
print_results(cv)

In [354]:
svc = SVC()
parameters = { 
    'C': [0.1, 1, 10]
}

cv = GridSearchCV(svc, parameters, cv=5)
cv.fit(tr_features, tr_labels.values.ravel()) #.values.ravel - чтобы сделать array из column vector 

print_results(cv)

BEST PARAMS: {'C': 0.1}]
0.795 (+-0.001) for {'C': 0.1}
0.795 (+-0.001) for {'C': 1}
0.795 (+-0.001) for {'C': 10}


In [355]:
cv.best_estimator_ 

In [356]:
joblib.dump(cv.best_estimator_, '/Users/alenadenisova/Downloads/Ex_Files_Machine_Learning_Algorithms/Exercise Files/SVM_churn_model.pkl')

['/Users/alenadenisova/Downloads/Ex_Files_Machine_Learning_Algorithms/Exercise Files/SVM_churn_model.pkl']

In [59]:
#MULTILAYER PERCEPTRON:

from sklearn.neural_network import MLPRegressor, MLPClassifier
print(MLPRegressor())
print(MLPClassifier())

MLPRegressor()
MLPClassifier()


In [62]:
tr_features = pd.read_csv('/Users/alenadenisova/churn_train_features.csv')
tr_labels = pd.read_csv('/Users/alenadenisova/churn_train_labels.csv')

In [63]:
mlp = MLPClassifier()
parameters = {
    'hidden_layer_sizes': [(10,), (50,), (100,)], #10,50,100-nodes, 1 layer
    'activation': ['relu', 'tanh', ''],
    'learning_rate': ['constant', 'invscaling', 'adaptive']
}    
#invscaling - larger steps at first, smaller later
#adaptive - keeps LR constant, while the training loss keeps decreasing, 
#and whem it stops, LR will be decreased, so that it takes smaller steps

cv = GridSearchCV(mlp, parameters, cv=5)
cv.fit(tr_features, tr_labels.values.ravel()) #.values.ravel - чтобы сделать array из column vector 

print_results(cv)

45 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/alenadenisova/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/alenadenisova/anaconda3/lib/python3.10/site-packages/sklearn/neural_network/_multilayer_perceptron.py", line 745, in fit
    self._validate_params()
  File "/Users/alenadenisova/anaconda3/lib/python3.10/site-packages/sklearn/base.py", line 581, in _validate_params
    validate_parameter_constraints(
  File "/Users/alenadenisova/anaconda3/lib/python3.10/site-packages/sklearn/utils/

BEST PARAMS: {'activation': 'tanh', 'hidden_layer_sizes': (10,), 'learning_rate': 'constant'}]
0.683 (+-0.365) for {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'constant'}
0.78 (+-0.053) for {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'invscaling'}
0.677 (+-0.363) for {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'adaptive'}
0.72 (+-0.163) for {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'constant'}
0.775 (+-0.07) for {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'invscaling'}
0.673 (+-0.324) for {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'adaptive'}
0.765 (+-0.102) for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'constant'}
0.723 (+-0.251) for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'invscaling'}
0.765 (+-0.085) for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'a

In [64]:
cv.best_estimator_

In [66]:
joblib.dump(cv.best_estimator_,'/Users/alenadenisova/MLP_churn_model.pkl')

['/Users/alenadenisova/MLP_churn_model.pkl']

In [67]:
model = cv.best_estimator_
model

In [68]:
tr_features = pd.read_csv('/Users/alenadenisova/churn_test_features.csv')
results = model.predict_proba(tr_features)
results

array([[0.84404878, 0.15595122],
       [0.84404878, 0.15595122],
       [0.84404878, 0.15595122],
       ...,
       [0.84404878, 0.15595122],
       [0.84404878, 0.15595122],
       [0.72755316, 0.27244684]])

In [69]:
xs_mlp  =[b for a,b in results]
>>> xs_mlp

[0.15595122024525013,
 0.15595122024525013,
 0.15595122024525013,
 0.15595122024525013,
 0.2724468369090433,
 0.15595122024525013,
 0.15595122024525013,
 0.15595122024525013,
 0.2724468369090433,
 0.15595122024525013,
 0.2724468369090433,
 0.15595122024525013,
 0.15595122024525013,
 0.2724468369090433,
 0.2724468369090433,
 0.15595122024525013,
 0.2724468369090433,
 0.15595122024525013,
 0.15595122024525013,
 0.2822051739570311,
 0.15595122024525013,
 0.2724468369090433,
 0.15595122024525013,
 0.15595122024525013,
 0.2724468369090433,
 0.2724468369090433,
 0.15595122024525013,
 0.15595122024525013,
 0.2724468369090433,
 0.15595122024525013,
 0.15595122024525013,
 0.15595122024525013,
 0.15595122024525013,
 0.2724468369090433,
 0.2724468369090433,
 0.15595122024525013,
 0.15595122024525013,
 0.15595122024525013,
 0.2724468369090433,
 0.2724468369090433,
 0.15595122024525013,
 0.2724468369090433,
 0.2822051739570311,
 0.15595122024525013,
 0.15595122024525013,
 0.15595122024525013,
 0.15

In [70]:
churn_prediction.head()

Unnamed: 0,id,Exited
0,165034,0.09
1,165035,0.22
2,165036,0.12
3,165037,0.12
4,165038,0.14


In [72]:
churn_pred_mlp = churn_prediction.drop(columns = ['Exited'])
churn_pred_mlp

Unnamed: 0,id
0,165034
1,165035
2,165036
3,165037
4,165038
...,...
110018,275052
110019,275053
110020,275054
110021,275055


In [74]:
churn_pred_mlp['Exited'] = xs_mlp

churn_pred_mlp.head(10)

Unnamed: 0,id,Exited
0,165034,0.155951
1,165035,0.155951
2,165036,0.155951
3,165037,0.155951
4,165038,0.272447
5,165039,0.155951
6,165040,0.155951
7,165041,0.155951
8,165042,0.272447
9,165043,0.155951


In [45]:
#RANDOM FOREST:

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
print(RandomForestClassifier())
print(RandomForestRegressor())

RandomForestClassifier()
RandomForestRegressor()


In [47]:
tr_features = pd.read_csv('/Users/alenadenisova/churn_train_features.csv')
tr_labels = pd.read_csv('/Users/alenadenisova/churn_train_labels.csv') 

rf = RandomForestClassifier()
parameters = {
    'n_estimators': [5, 50, 250],
    'max_depth': [2, 4, 8, 16, 32, None]
}
cv = GridSearchCV(rf, parameters, cv = 5)
cv.fit(tr_features, tr_labels.values.ravel())

print_results(cv)

BEST PARAMS: {'max_depth': 16, 'n_estimators': 250}]
0.808 (+-0.027) for {'max_depth': 2, 'n_estimators': 5}
0.808 (+-0.007) for {'max_depth': 2, 'n_estimators': 50}
0.802 (+-0.001) for {'max_depth': 2, 'n_estimators': 250}
0.84 (+-0.008) for {'max_depth': 4, 'n_estimators': 5}
0.847 (+-0.004) for {'max_depth': 4, 'n_estimators': 50}
0.847 (+-0.002) for {'max_depth': 4, 'n_estimators': 250}
0.86 (+-0.003) for {'max_depth': 8, 'n_estimators': 5}
0.862 (+-0.003) for {'max_depth': 8, 'n_estimators': 50}
0.862 (+-0.004) for {'max_depth': 8, 'n_estimators': 250}
0.854 (+-0.003) for {'max_depth': 16, 'n_estimators': 5}
0.862 (+-0.003) for {'max_depth': 16, 'n_estimators': 50}
0.863 (+-0.003) for {'max_depth': 16, 'n_estimators': 250}
0.841 (+-0.001) for {'max_depth': 32, 'n_estimators': 5}
0.858 (+-0.004) for {'max_depth': 32, 'n_estimators': 50}
0.859 (+-0.003) for {'max_depth': 32, 'n_estimators': 250}
0.84 (+-0.002) for {'max_depth': None, 'n_estimators': 5}
0.858 (+-0.003) for {'max_dept

In [48]:
cv.best_estimator_

In [49]:
joblib.dump(cv.best_estimator_, '/Users/alenadenisova/RF_churn_model.pkl')

['/Users/alenadenisova/RF_churn_model.pkl']

In [51]:
model = rf
model

In [52]:
tr_features = pd.read_csv('/Users/alenadenisova/churn_test_features.csv')
results = model.predict_proba(tr_features)
results

array([[0.98270665, 0.01729335],
       [0.12779001, 0.87220999],
       [0.97590796, 0.02409204],
       ...,
       [0.98635398, 0.01364602],
       [0.83919945, 0.16080055],
       [0.82614913, 0.17385087]])

In [53]:
xs  =[b for a,b in results]
>>> xs

[0.017293353348143744,
 0.8722099857720464,
 0.024092035559953727,
 0.1879198668582944,
 0.33515707044760196,
 0.04460598371948298,
 0.051898121496185216,
 0.07420089515587219,
 0.5625019238247988,
 0.004416462738910466,
 0.14127223023433383,
 0.014047958177241556,
 0.01535896974100876,
 0.18673106642256626,
 0.6911932940190528,
 0.03798361246233838,
 0.09046244363498608,
 0.35629638069316694,
 0.014188596892029082,
 0.0742584826764143,
 0.01204601368858175,
 0.15162621402564852,
 0.2248597102991411,
 0.01967459414026639,
 0.5313262481393539,
 0.13995187731906078,
 0.9428314336174726,
 0.587673795883045,
 0.12798308034206443,
 0.46201821767482004,
 0.008373854520423326,
 0.029542274356785657,
 0.2025153668016506,
 0.12941917177246845,
 0.0380699581306421,
 0.008272889617768977,
 0.844355471860312,
 0.03180327524847774,
 0.11972063197527547,
 0.9216017120394755,
 0.5773498267059173,
 0.16084698518953713,
 0.15012519859550438,
 0.003031717512234232,
 0.3006332701554959,
 0.34495193100715

In [55]:
churn_prediction.head(10)

Unnamed: 0,id,Exited
0,165034,0.09
1,165035,0.22
2,165036,0.12
3,165037,0.12
4,165038,0.14
5,165039,0.09
6,165040,0.2
7,165041,0.41
8,165042,0.15
9,165043,0.1


In [56]:
churn_pred_rf = churn_prediction.drop(columns = ['Exited'])
churn_pred_rf.head(10)

Unnamed: 0,id
0,165034
1,165035
2,165036
3,165037
4,165038
5,165039
6,165040
7,165041
8,165042
9,165043


In [57]:
churn_pred_rf['Exited0'] = xs
churn_pred_rf['Exited'] = churn_pred_rf['Exited0'].round(2)
churn_pred_rf = churn_pred_rf.drop(columns = ['Exited0'])
churn_pred_rf.head()

Unnamed: 0,id,Exited
0,165034,0.02
1,165035,0.87
2,165036,0.02
3,165037,0.19
4,165038,0.34


In [58]:
churn_pred_rf.head(10)

Unnamed: 0,id,Exited
0,165034,0.02
1,165035,0.87
2,165036,0.02
3,165037,0.19
4,165038,0.34
5,165039,0.04
6,165040,0.05
7,165041,0.07
8,165042,0.56
9,165043,0.0


In [343]:
#BOOSTING

from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

gb = GradientBoostingClassifier()
parameters={
    'n_estimators':[5, 50, 250, 500],
    'max_depth':[1, 3, 5, 7, 9],
    'learning_rate': [0.01, 0.1, 1, 10, 100]
}
#max_depth = 1-"Decision Stump"

cv = GridSearchCV(gb, parameters, cv = 5)
cv.fit(tr_features, tr_labels.values.ravel())

print_results(cv)

BEST PARAMS: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 500}]
0.795 (+-0.001) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 5}
0.795 (+-0.001) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 50}
0.811 (+-0.005) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 250}
0.832 (+-0.008) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 500}
0.795 (+-0.001) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 5}
0.796 (+-0.005) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
0.856 (+-0.016) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 250}
0.858 (+-0.013) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}
0.795 (+-0.001) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 5}
0.809 (+-0.011) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 50}
0.861 (+-0.017) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 250}
0.861 (+-0.017) for {'learning_rate': 0.01,

In [344]:
cv.best_estimator_

In [345]:
joblib.dump(cv.best_estimator_, '/Users/alenadenisova/Downloads/Ex_Files_Machine_Learning_Algorithms/Exercise Files/GB_churn_model.pkl')

['/Users/alenadenisova/Downloads/Ex_Files_Machine_Learning_Algorithms/Exercise Files/GB_churn_model.pkl']

In [346]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from time import time

In [357]:
val_features = pd.read_csv('/Users/alenadenisova/Downloads/Ex_Files_Machine_Learning_Algorithms/Exercise Files/churn_val_features.csv')
val_labels = pd.read_csv('/Users/alenadenisova/Downloads/Ex_Files_Machine_Learning_Algorithms/Exercise Files/churn_val_labels.csv') #header=None
#column vector type

test_features =pd.read_csv('/Users/alenadenisova/Downloads/Ex_Files_Machine_Learning_Algorithms/Exercise Files/churn_test_features.csv')
test_labels = pd.read_csv('/Users/alenadenisova/Downloads/Ex_Files_Machine_Learning_Algorithms/Exercise Files/churn_test_labels.csv')

print('val features shape: ', val_features.shape)
print('val labels shape: ', val_labels.shape)
print('test features shape: ', test_features.shape)
print('test labels shape: ', test_labels.shape)

val features shape:  (2000, 10)
val labels shape:  (2000, 1)
test features shape:  (2001, 10)
test labels shape:  (2001, 1)


In [358]:
models = {}

for mdl in ['LR', 'SVM', 'MLP', 'RF', 'GB']:
    models[mdl] = joblib.load('/Users/alenadenisova/Downloads/Ex_Files_Machine_Learning_Algorithms/Exercise Files/{}_churn_model.pkl'.format(mdl))
    

In [359]:
models

{'LR': LogisticRegression(C=0.01),
 'SVM': SVC(C=0.1),
 'MLP': MLPClassifier(activation='tanh', hidden_layer_sizes=(10,)),
 'RF': RandomForestClassifier(n_estimators=250),
 'GB': GradientBoostingClassifier(learning_rate=0.01, max_depth=7, n_estimators=500)}

In [360]:
def evaluate_model(name, model, features, labels):
    start = time()
    pred = model.predict(features)
    end = time()
    accuracy = round(accuracy_score(labels, pred),3)
    precision = round(precision_score(labels, pred),3)
    recall = round(recall_score(labels, pred),3)
    print('{} -- Accuracy: {} /  Precision: {} /  Recall: {} / Latency: {} ms'.format(name, 
                                                                                      accuracy,  
                                                                                      precision, 
                                                                                      recall, 
                                                                                      round((end - start)*1000, 1)))

In [361]:
for name, mdl in models.items():
    evaluate_model(name, mdl, val_features, val_labels)

LR -- Accuracy: 0.794 /  Precision: 0.338 /  Recall: 0.07 / Latency: 19.5 ms
SVM -- Accuracy: 0.806 /  Precision: 0.0 /  Recall: 0.0 / Latency: 255.4 ms
MLP -- Accuracy: 0.806 /  Precision: 0.0 /  Recall: 0.0 / Latency: 0.8 ms
RF -- Accuracy: 0.866 /  Precision: 0.748 /  Recall: 0.468 / Latency: 66.9 ms
GB -- Accuracy: 0.858 /  Precision: 0.699 /  Recall: 0.468 / Latency: 33.4 ms


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [363]:
#Лучшую модель:
evaluate_model('Random Forest', models['RF'], test_features, test_labels)

Random Forest -- Accuracy: 0.861 /  Precision: 0.758 /  Recall: 0.488 / Latency: 78.3 ms


In [None]:
churn_pred_rf.to_csv('submission.csv', index = False)