In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('/datasets/users_behavior.csv')

In [3]:
df.shape

(3214, 5)

In [4]:
df.head(10)

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
0,40.0,311.9,83.0,19915.42,0
1,85.0,516.75,56.0,22696.96,0
2,77.0,467.66,86.0,21060.45,0
3,106.0,745.53,81.0,8437.39,1
4,66.0,418.74,1.0,14502.75,0
5,58.0,344.56,21.0,15823.37,0
6,57.0,431.64,20.0,3738.9,1
7,15.0,132.4,6.0,21911.6,0
8,7.0,43.39,3.0,2538.67,1
9,90.0,665.41,38.0,17358.61,0


In [5]:
#Defining features and target
features = df.drop(['is_ultra'],axis=1)
target = df['is_ultra']

I will split the data with a 3:1:1 ratio in two steps.

In [13]:
#Splitting data into main and test sets
test_size = 0.2
features_main, features_test, target_main, target_test = \
    train_test_split(features,target, test_size=test_size, random_state=12345)

In [7]:
#Splitting main data into train and validation sets
valid_size = 0.2
features_train, features_valid, target_train, target_valid = \
    train_test_split(features_main,target_main, test_size=valid_size, random_state=12345)

In [8]:
#Finding the max depth for best accuracy(Decision Tree model)
for depth in range(1, 10):
    model=DecisionTreeClassifier(random_state=12345, max_depth=depth)
    model.fit(features_train,target_train)
    predictions_valid = model.predict(features_valid)
    
    print("max_depth =", depth, ": ", end='')
    print(accuracy_score(target_valid, predictions_valid)) 

max_depth = 1 : 0.7223300970873786
max_depth = 2 : 0.7475728155339806
max_depth = 3 : 0.7553398058252427
max_depth = 4 : 0.7533980582524272
max_depth = 5 : 0.7572815533980582
max_depth = 6 : 0.7611650485436893
max_depth = 7 : 0.7650485436893204
max_depth = 8 : 0.7631067961165049
max_depth = 9 : 0.7533980582524272


In [9]:
#Finding the best accuracy on Random Forest model
best_score = 0
best_est = 0
for est in range(1, 10): 
    model = RandomForestClassifier(random_state=12345, n_estimators=est)
    model.fit(features_train,target_train) 
    score = model.score(features_valid,target_valid)
    if score > best_score:
        best_score = score
        best_est = est


print("Accuracy of the best model (n_estimators = {}): {}".format(best_est, best_score))

Accuracy of the best model (n_estimators = 9): 0.7708737864077669


In [10]:
#Finding accuracy of Logistic Regression model
model = LogisticRegression(random_state=12345, solver='liblinear')
model.fit(features_train,target_train)
score_train = model.score(features_valid, target_valid)  

print("Accuracy of the logistic regression model:", score_train)

Accuracy of the logistic regression model: 0.7184466019417476


Random forest model returned the best accuracy with 9 estimators therefore I will continue with this model.

In [11]:
test_model = RandomForestClassifier(random_state=12345, n_estimators=9)
test_model.fit(features_train,target_train)
test_predictions = test_model.predict(features_test)

In [12]:
#Finding the accuracy
test_model.score(features_test,target_test)

0.7713841368584758

### Conclusion

The data was split into train, validation, and test sets witha 3:1:1 ratio. The model was trained with the train set and hyperparamters were tuned using the validation set. The accuracy of different models were compared and random forest model with 9 estimators was found to be best accurate among others with an accuracy of 0.77. Finally, the model was used on the test set and the accuracy was found to be 0.77. 