In [1]:
# Import pandas plus the scikit-learn pieces used below: the data splitter, the three
# classifiers to compare (decision tree, random forest, logistic regression), and the accuracy metric
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the data and preview it: the first rows, the column dtypes / non-null
# counts, and the class balance of the `is_ultra` target column.
df = pd.read_csv('/datasets/users_behavior.csv')
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()
print(df['is_ultra'].value_counts())

   calls  minutes  messages   mb_used  is_ultra
0   40.0   311.90      83.0  19915.42         0
1   85.0   516.75      56.0  22696.96         0
2   77.0   467.66      86.0  21060.45         0
3  106.0   745.53      81.0   8437.39         1
4   66.0   418.74       1.0  14502.75         0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   calls     3214 non-null   float64
 1   minutes   3214 non-null   float64
 2   messages  3214 non-null   float64
 3   mb_used   3214 non-null   float64
 4   is_ultra  3214 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 125.7 KB
None
0    2229
1     985
Name: is_ultra, dtype: int64


In [4]:
# Separate the label column from the predictor columns.
target = df['is_ultra']
features = df.drop(columns=['is_ultra'])

In [5]:
# Two-stage split: hold out 40% of the rows first, then cut that hold-out in
# half, giving 60% train / 20% validation / 20% test overall.
(features_train, features_temp,
 target_train, target_temp) = train_test_split(
    features, target, random_state=12345, test_size=0.4)

(features_valid, features_test,
 target_valid, target_test) = train_test_split(
    features_temp, target_temp, random_state=12345, test_size=0.5)

In [6]:
# Decision Tree: sweep max_depth over 1..20 and keep the tree with the highest
# validation accuracy. Only a strictly better score replaces the current best,
# so the shallowest tree wins a tie.

best_tree, best_tree_acc = None, 0
for depth in range(1, 21):
    # fit() returns the estimator itself, so construction and training chain.
    model = DecisionTreeClassifier(
        random_state=12345, max_depth=depth).fit(features_train, target_train)
    predictions_valid = model.predict(features_valid)
    acc = accuracy_score(target_valid, predictions_valid)
    if acc > best_tree_acc:
        best_tree, best_tree_acc = model, acc

print("Best Decision Tree validation accuracy:", best_tree_acc)

Best Decision Tree validation accuracy: 0.7853810264385692


In [7]:
# We can see that the best Decision Tree's validation accuracy is about 78.5%, i.e. above 75%.

In [8]:
# Random Forest: try every (n_estimators, max_depth) pair from the two lists
# and keep the forest with the highest validation accuracy. Only a strictly
# better score replaces the current best, so the earliest pair wins a tie.

best_forest, best_forest_acc = None, 0
# Flattened form of the nested loops; pairs are visited in the same order
# (all depths for 10 trees, then all depths for 50 trees, and so on).
for est, depth in ((n_trees, tree_depth)
                   for n_trees in [10, 50, 100, 200]
                   for tree_depth in [5, 10, 15, 20]):
    # fit() returns the estimator itself, so construction and training chain.
    model = RandomForestClassifier(
        random_state=12345, n_estimators=est, max_depth=depth,
    ).fit(features_train, target_train)
    predictions_valid = model.predict(features_valid)
    acc = accuracy_score(target_valid, predictions_valid)
    if acc > best_forest_acc:
        best_forest, best_forest_acc = model, acc

print("Best Random Forest validation accuracy:", best_forest_acc)

Best Random Forest validation accuracy: 0.7947122861586314


In [10]:
# We can see that the best Random Forest model is slightly more accurate on the validation set
# than the best Decision Tree (about 0.795 vs. 0.785). It took longer to train than the Decision Tree.
# It could probably be more accurate if I were to increase the number of runs (i.e. try more
# hyperparameter combinations), but that would also increase the time it would take to test.

In [11]:
# Logistic Regression: a single fit on the training split (no hyperparameter
# sweep), scored on the validation split.
log_reg = LogisticRegression(max_iter=1000, random_state=12345).fit(
    features_train, target_train)
log_reg_pred = log_reg.predict(features_valid)
log_reg_acc = accuracy_score(y_true=target_valid, y_pred=log_reg_pred)
print("Logistic Regression validation accuracy:", log_reg_acc)

Logistic Regression validation accuracy: 0.7107309486780715


In [12]:
# Based on this, we can see that the logistic regression is the least accurate of the 3 models
# on the validation set (about 0.711). Together with the results above, this means that the
# random forest (about 0.795) is the most accurate of the 3.

In [13]:
# Final check: score the selected random forest on the held-out test split,
# which was not used for training or for choosing hyperparameters.
final_model = best_forest
test_predictions = final_model.predict(features_test)
final_acc = accuracy_score(y_true=target_test, y_pred=test_predictions)
print("Final Test Accuracy:", final_acc)

Final Test Accuracy: 0.7900466562986003


In [15]:
# Since the random forest was the best model on the validation set, we use it here as the final
# model for the test predictions. After evaluating on the held-out test set, the accuracy went
# down by about 0.47 percentage points (0.7947 -> 0.7900), but it remained very close to the
# validation accuracy.