# описание проекта
- Оператор мобильной связи «Мегалайн» выяснил: многие клиенты пользуются архивными тарифами. Они хотят построить систему, способную проанализировать поведение клиентов и предложить пользователям новый тариф: «Смарт» или «Ультра».
- В вашем распоряжении данные о поведении клиентов, которые уже перешли на эти тарифы (из проекта курса «Статистический анализ данных»). Нужно построить модель для задачи классификации, которая выберет подходящий тариф. Предобработка данных не понадобится — вы её уже сделали.
- Постройте модель с максимально большим значением accuracy. Чтобы сдать проект успешно, нужно довести долю правильных ответов по крайней мере до 0.75. Проверьте accuracy на тестовой выборке самостоятельно.

# 1.Сначала изучу данные и разделю по выборкам

In [1]:
import pandas as pd
df = pd.read_csv('users_behavior.csv')
df.head(10)

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
0,40.0,311.9,83.0,19915.42,0
1,85.0,516.75,56.0,22696.96,0
2,77.0,467.66,86.0,21060.45,0
3,106.0,745.53,81.0,8437.39,1
4,66.0,418.74,1.0,14502.75,0
5,58.0,344.56,21.0,15823.37,0
6,57.0,431.64,20.0,3738.9,1
7,15.0,132.4,6.0,21911.6,0
8,7.0,43.39,3.0,2538.67,1
9,90.0,665.41,38.0,17358.61,0


In [4]:
df.describe()

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
count,3214.0,3214.0,3214.0,3214.0,3214.0
mean,63.038892,438.208787,38.281269,17207.673836,0.306472
std,33.236368,234.569872,36.148326,7570.968246,0.4611
min,0.0,0.0,0.0,0.0,0.0
25%,40.0,274.575,9.0,12491.9025,0.0
50%,62.0,430.6,30.0,16943.235,0.0
75%,82.0,571.9275,57.0,21424.7,1.0
max,244.0,1632.06,224.0,49745.73,1.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   calls     3214 non-null   float64
 1   minutes   3214 non-null   float64
 2   messages  3214 non-null   float64
 3   mb_used   3214 non-null   float64
 4   is_ultra  3214 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 125.7 KB


- данных не так много. Здесь явная задача классификации. За целевой признак беру is_ultra. Остальные признаки буду использовать для решений по столбцу is_ultra.

In [32]:
from sklearn.model_selection import train_test_split
df_train_data, df_div_data = train_test_split(df, test_size=0.4, random_state=2281456)
df_valid_data, df_test_data = train_test_split(df_div_data, test_size=0.5, random_state=2281456)
print('Train data size', df_train_data.shape[0])
print('Valid data size', df_valid_data.shape[0])
print('Test data size', df_test_data.shape[0])

Train data size 1928
Valid data size 643
Test data size 643


- Разбил по принципу 3/1/1

- Раздам признаки далее

In [33]:
df_train_data_features = df_train_data.drop(['is_ultra'], axis=1)
df_train_data_target = df_train_data['is_ultra']
df_valid_data_features = df_valid_data.drop(['is_ultra'], axis=1)
df_valid_data_target = df_valid_data['is_ultra']
df_test_data_features = df_test_data.drop(['is_ultra'], axis=1)
df_test_data_target = df_test_data['is_ultra']


- обучу по df_train_data, валидация по df_valid_data. Лучшую из моделей будет протестирована на df_test_data.

# 2. обучение моделей и подбор лучшего гиперпараметра


In [15]:
best_depth = 0
best_accuracy = 0
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score
for depth in range(1,20):
    model = DecisionTreeClassifier(random_state=2281456, max_depth=depth)
    model.fit(df_train_data_features, df_train_data_target)
    valid_predictions = model.predict(df_valid_data_features)
    accuracy = accuracy_score(df_valid_data_target, valid_predictions)
    print('Depth', depth, 'Accuracy', accuracy)
    if accuracy > best_accuracy:
        best_depth = depth
        best_accuracy = accuracy
print('Best depth', best_depth, 'Best acc', best_accuracy)

Depth 1 Accuracy 0.7558320373250389
Depth 2 Accuracy 0.7838258164852255
Depth 3 Accuracy 0.7947122861586314
Depth 4 Accuracy 0.7853810264385692
Depth 5 Accuracy 0.7947122861586314
Depth 6 Accuracy 0.7900466562986003
Depth 7 Accuracy 0.7916018662519441
Depth 8 Accuracy 0.8009331259720062
Depth 9 Accuracy 0.7947122861586314
Depth 10 Accuracy 0.7900466562986003
Depth 11 Accuracy 0.7916018662519441
Depth 12 Accuracy 0.776049766718507
Depth 13 Accuracy 0.7573872472783826
Depth 14 Accuracy 0.7418351477449455
Depth 15 Accuracy 0.7418351477449455
Depth 16 Accuracy 0.7465007776049767
Depth 17 Accuracy 0.7433903576982893
Depth 18 Accuracy 0.7356143079315708
Depth 19 Accuracy 0.7433903576982893
Best depth 8 Best acc 0.8009331259720062


- Возьму лучшую глубину в качестве начального гипермараметра

In [24]:
best_tree_model = DecisionTreeClassifier(random_state=2281456, max_depth=8)
best_tree_model.fit(df_train_data_features, df_train_data_target)
valid_predictions = best_tree_model.predict(df_valid_data_features)
accuracy = accuracy_score(df_valid_data_target, valid_predictions)
print('Depth 8', 'Accuracy', accuracy)

Depth 8 Accuracy 0.8009331259720062


In [25]:
best_tree_model = DecisionTreeClassifier(random_state=2281456, max_depth=8, criterion="entropy")
best_tree_model.fit(df_train_data_features, df_train_data_target)
valid_predictions = best_tree_model.predict(df_valid_data_features)
accuracy = accuracy_score(df_valid_data_target, valid_predictions)
print('Depth 8', 'Accuracy', accuracy)

Depth 8 Accuracy 0.7869362363919129


In [26]:
best_tree_model = DecisionTreeClassifier(random_state=2281456, max_depth=8, splitter='random')
best_tree_model.fit(df_train_data_features, df_train_data_target)
valid_predictions = best_tree_model.predict(df_valid_data_features)
accuracy = accuracy_score(df_valid_data_target, valid_predictions)
print('Depth 8', 'Accuracy', accuracy)

Depth 8 Accuracy 0.7838258164852255


## Gini показал себя лучше всех. Дальше буду учитывать именно его

- перехожу к оценке RandomForestClassifier

In [81]:
best_estim = 0
best_accuracy = 0
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score
for estim in range(1,20):
    model = RandomForestClassifier(random_state=2281459, n_estimators=estim)
    model.fit(df_train_data_features, df_train_data_target)
    valid_predictions = model.predict(df_valid_data_features)
    accuracy = accuracy_score(df_valid_data_target, valid_predictions)
    print('Estimators', estim, 'Accuracy', accuracy)
    if accuracy > best_accuracy:
        best_estim = estim
        best_accuracy = accuracy
print('Best estimators number', best_estim, 'Best acc', best_accuracy)

Estimators 1 Accuracy 0.7045101088646968
Estimators 2 Accuracy 0.7667185069984448
Estimators 3 Accuracy 0.7527216174183515
Estimators 4 Accuracy 0.7900466562986003
Estimators 5 Accuracy 0.7729393468118196
Estimators 6 Accuracy 0.7931570762052877
Estimators 7 Accuracy 0.7869362363919129
Estimators 8 Accuracy 0.807153965785381
Estimators 9 Accuracy 0.8055987558320373
Estimators 10 Accuracy 0.7978227060653188
Estimators 11 Accuracy 0.7962674961119751
Estimators 12 Accuracy 0.7993779160186625
Estimators 13 Accuracy 0.7978227060653188
Estimators 14 Accuracy 0.8040435458786936
Estimators 15 Accuracy 0.8040435458786936
Estimators 16 Accuracy 0.80248833592535
Estimators 17 Accuracy 0.80248833592535
Estimators 18 Accuracy 0.8087091757387247
Estimators 19 Accuracy 0.8133748055987559
Best estimators number 19 Best acc 0.8133748055987559


In [86]:
best_forest_model = RandomForestClassifier(random_state=2281459, n_estimators=19)
best_forest_model.fit(df_train_data_features, df_train_data_target)
valid_predictions = best_forest_model.predict(df_valid_data_features)
accuracy = accuracy_score(df_valid_data_target, valid_predictions)
print('N_estim 19', 'Accuracy', accuracy)

N_estim 19 Accuracy 0.8133748055987559


In [87]:
best_forest_model = RandomForestClassifier(random_state=2281459, n_estimators=19, criterion='entropy')
best_forest_model.fit(df_train_data_features, df_train_data_target)
valid_predictions = best_forest_model.predict(df_valid_data_features)
accuracy = accuracy_score(df_valid_data_target, valid_predictions)
print('N_estim 19', 'Accuracy', accuracy)

N_estim 19 Accuracy 0.8087091757387247


In [88]:
best_forest_model = RandomForestClassifier(random_state=2281459, n_estimators=19, bootstrap=False)
best_forest_model.fit(df_train_data_features, df_train_data_target)
valid_predictions = best_forest_model.predict(df_valid_data_features)
accuracy = accuracy_score(df_valid_data_target, valid_predictions)
print('N_estim 19', 'Accuracy', accuracy)

N_estim 19 Accuracy 0.7729393468118196


In [85]:
best_forest_model = RandomForestClassifier(random_state=2281459, n_estimators=19, warm_start=True)
best_forest_model.fit(df_train_data_features, df_train_data_target)
valid_predictions = best_forest_model.predict(df_valid_data_features)
accuracy = accuracy_score(df_valid_data_target, valid_predictions)
print('N_estim 19', 'Accuracy', accuracy)

N_estim 19 Accuracy 0.8133748055987559


## Беру последнее из значений за лучшее.

- перехожу к логистической регрессии

In [75]:
best_max_iter = 0
best_accuracy = 0
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
for max_iter in range(1,21):
    model = LogisticRegression(random_state=100500, max_iter=max_iter)
    model.fit(df_train_data_features, df_train_data_target)
    valid_predictions = model.predict(df_valid_data_features)
    accuracy = accuracy_score(df_valid_data_target, valid_predictions)
    print('Itter count', max_iter, 'Accuracy', accuracy)
    if accuracy > best_accuracy:
        best_max_iter = max_iter
        best_accuracy = accuracy
print('Best count iter', best_max_iter, 'Best acc', best_accuracy)

Itter count 1 Accuracy 0.7107309486780715
Itter count 2 Accuracy 0.7107309486780715
Itter count 3 Accuracy 0.7107309486780715
Itter count 4 Accuracy 0.7107309486780715
Itter count 5 Accuracy 0.7107309486780715
Itter count 6 Accuracy 0.7107309486780715
Itter count 7 Accuracy 0.7107309486780715
Itter count 8 Accuracy 0.7107309486780715
Itter count 9 Accuracy 0.7107309486780715
Itter count 10 Accuracy 0.7107309486780715
Itter count 11 Accuracy 0.7107309486780715
Itter count 12 Accuracy 0.7107309486780715
Itter count 13 Accuracy 0.7107309486780715
Itter count 14 Accuracy 0.713841368584759
Itter count 15 Accuracy 0.7216174183514774
Itter count 16 Accuracy 0.7247278382581649
Itter count 17 Accuracy 0.7247278382581649
Itter count 18 Accuracy 0.7262830482115086
Itter count 19 Accuracy 0.7262830482115086
Itter count 20 Accuracy 0.7278382581648523
Best count iter 20 Best acc 0.7278382581648523


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [76]:
best_reg_model = LogisticRegression(random_state=100500)
best_reg_model.fit(df_train_data_features, df_train_data_target)
valid_predictions = best_reg_model.predict(df_valid_data_features)
accuracy = accuracy_score(df_valid_data_target, valid_predictions)
print('iter count', 100, 'Accuracy', accuracy)

iter count 100 Accuracy 0.7231726283048211


## В ходе выполненной работы я высянил, что:
- Точность модели решающего дерева при глубине 8 равна 0.801
- Точность модели случайного леса при количестве деревьев 19 равна 0.813
- Точность модели логистической регрессии равна 0.723
- Лучшим из результатов считаю модель случайного леса с глубиной 19

# 4.Проверка моделей на тестовой выборке

In [90]:
test_predictions = best_tree_model.predict(df_test_data_features)
accuracy = accuracy_score(df_test_data_target, test_predictions)
print('DecTree',accuracy)
test_predictions = best_forest_model.predict(df_test_data_features)
accuracy = accuracy_score(df_test_data_target, test_predictions)
print('RandomForest acc',accuracy)
test_predictions = best_reg_model.predict(df_test_data_features)
accuracy = accuracy_score(df_test_data_target, test_predictions)
print('Reg acc',accuracy)

DecTree 0.6967340590979783
RandomForest acc 0.7682737169517885
Reg acc 0.6967340590979783


# Исходя из данных проверки моделей могу сделать вывод, что условиям задания соответствует только модель случайного леса