In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, mean_squared_error
from sklearn.pipeline import Pipeline

In [2]:
# Read the comma-separated dataset and discard the 'Unnamed: 0' and 'Id'
# columns in a single pass (presumably a leftover index and a row id —
# confirm against the raw file).
data = pd.read_csv('dataset.csv').drop(columns=['Unnamed: 0', 'Id'])

In [3]:
# Positional bookkeeping of the column labels: first label, last label, and
# everything in between.
# NOTE(review): these are captured before 'CustomerId' and 'Surname' are
# dropped further down, and no visible cell reads them afterwards.
id_column = data.columns[0]
target_column = data.columns[-1]
feature_columns = list(data.columns[1:-1])

In [4]:
# Discard every row that contains at least one missing value; `data` itself
# is modified (inplace=True), so re-running this cell is a no-op.
data.dropna(inplace=True)

In [5]:
# Replace each categorical text column with integer codes. A fresh encoder is
# fitted per column, so the code assignments are independent of each other.
for column_name in ('Gender', 'Geography'):
    data[column_name] = LabelEncoder().fit_transform(data[column_name])

# Identifier-like columns are removed from the frame before modelling.
data.drop(columns=['CustomerId', 'Surname'], inplace=True)

In [6]:
# Target vector is the 'Exited' column; the feature matrix is every other
# column that is still present in `data`.
y = data['Exited']
X = data.drop(columns='Exited')

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
# NOTE(review): no `stratify` argument is passed, so the class ratio of
# 'Exited' may differ between the train and test parts — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=12345,
    test_size=0.2,
)

In [8]:
# Candidate models as (display name, pipeline) pairs. Every pipeline
# standardizes the features and then fits a single classifier.
#
# The decision tree and the random forest are randomized estimators; without
# a seed their scores change on every run. They are seeded with the same value
# used for the train/test split (12345) so the reported metrics are repeatable
# under Restart & Run All.
pipelines = [
    ('Логистическая регрессия', Pipeline([
        ('scaler', StandardScaler()),
        # class_weight='balanced' reweights classes inversely to their frequency.
        ('classifier', LogisticRegression(solver='liblinear', class_weight='balanced'))
    ])),
    ('Дерево принятия решений', Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', DecisionTreeClassifier(random_state=12345))
    ])),
    ('Случайный лес', Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(random_state=12345))
    ]))
]

In [9]:
# NOTE(review): `index` is not read by any visible cell; kept in case a cell
# outside this view depends on it.
index = 1

# Fit each candidate on the training part and report its hold-out metrics.
for name, pipeline in pipelines:
    # Pipeline.fit returns the fitted pipeline itself, so the calls can be chained.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    # One print call with newline separators emits the same text as printing
    # each line on its own, including the trailing blank line.
    print(
        f"# {name}",
        f"  F1 score = {f1_score(y_test, y_pred)}",
        f"  MSE      = {mean_squared_error(y_test, y_pred)}",
        f"  accuracy = {pipeline.score(X_test, y_test)}",
        "",
        sep="\n",
    )

# Логистическая регрессия
  F1 score = 0.5454545454545454
  MSE      = 0.2631578947368421
  accuracy = 0.7368421052631579

# Дерево принятия решений
  F1 score = 0.5
  MSE      = 0.21052631578947367
  accuracy = 0.7894736842105263

# Случайный лес
  F1 score = 0.28571428571428575
  MSE      = 0.2631578947368421
  accuracy = 0.7368421052631579



In [14]:
# Show the current state of `data` as this cell's output (a bare expression on
# the last line of a cell is rendered by the notebook).
data

Unnamed: 0,Score,Geography,Gender,Age,Tenure,Balance,NumOfProducts,Has,IsActiveMember,EstimatedSalary,Exited
0,619,0,0,42,1.0,83807.86,1,0,1,112542.58,0
1,608,2,0,41,8.0,159660.80,3,1,0,113931.57,1
2,502,0,0,42,1.0,0.00,2,0,0,93826.63,0
3,699,0,0,39,2.0,125510.82,1,1,1,79084.10,0
4,850,2,0,43,8.0,113755.78,2,1,0,149756.71,1
...,...,...,...,...,...,...,...,...,...,...,...
94,800,0,0,39,5.0,0.00,2,1,0,96270.64,0
95,771,0,1,35,10.0,57369.61,1,1,1,101699.77,0
96,516,0,1,36,7.0,0.00,1,0,1,42085.58,1
97,709,0,0,42,3.0,75075.31,2,1,0,92888.52,1
