# Sprint 7 Project

In [1]:
#import necessary libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


## Open and look through data file

In [2]:
df = pd.read_csv('/datasets/users_behavior.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   calls     3214 non-null   float64
 1   minutes   3214 non-null   float64
 2   messages  3214 non-null   float64
 3   mb_used   3214 non-null   float64
 4   is_ultra  3214 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 125.7 KB


In [4]:
df.sample(10)

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
419,91.0,686.38,62.0,20499.92,0
76,73.0,633.22,21.0,18421.17,0
1857,62.0,352.66,82.0,28205.37,0
1136,89.0,602.81,56.0,19293.91,0
68,138.0,1009.11,64.0,27807.13,1
2621,76.0,451.09,84.0,23212.12,0
3132,47.0,324.43,62.0,11778.95,0
2504,111.0,758.66,47.0,16045.16,0
2002,34.0,274.53,13.0,5762.76,0
2318,39.0,242.87,11.0,15370.83,0


## Split source data into training, validation, and test sets

In [5]:
#split source data into training set, and temp set to get validation and test sets
df_train, df_temp = train_test_split(df, test_size=0.4, random_state=315)
#use temp set to split into validation and test sets
df_valid, df_test = train_test_split(df_temp, test_size=0.5, random_state=315)

In [6]:
#variables for features and target of training set
train_features = df_train.drop(['is_ultra'],axis=1)
train_target = df_train['is_ultra']

In [7]:
#variables for features and target of validation set
valid_features = df_valid.drop(['is_ultra'],axis=1)
valid_target = df_valid['is_ultra']

In [8]:
#variables for features and target of test set
test_features = df_test.drop(['is_ultra'],axis=1)
test_target = df_test['is_ultra']

In [9]:
print(train_features.shape)
print(train_target.shape)
print(valid_features.shape)
print(valid_target.shape)
print(test_features.shape)
print(test_target.shape)

(1928, 4)
(1928,)
(643, 4)
(643,)
(643, 4)
(643,)


By using 'test_size = 0.4' and then 'test_size= 0.5' we are able to split our data into 3 sets, with the ratio of 3:1:1

## Investigate different models

In [10]:
model = DecisionTreeClassifier(random_state = 315)
model.fit(train_features, train_target)

DecisionTreeClassifier(random_state=315)

In [11]:
from sklearn._config import get_config, set_config
set_config(print_changed_only=False)
model

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, random_state=315,
                       splitter='best')

In [12]:
for depth in range(1, 6):
    model = DecisionTreeClassifier(random_state=315, max_depth=depth)
    model.fit(train_features, train_target)
    valid_predictions = model.predict(valid_features)
    print('max_depth =', depth, ': ', end='')
    print(accuracy_score(valid_target, valid_predictions))

max_depth = 1 : 0.7589424572317263
max_depth = 2 : 0.7931570762052877
max_depth = 3 : 0.8118195956454122
max_depth = 4 : 0.8040435458786936
max_depth = 5 : 0.8040435458786936


Max depth of 3 seems to give us the most accurate decision tree model. 

In [13]:
best_score = 0
best_est = 0
for est in range(1, 50):
    model2 = RandomForestClassifier(random_state=315, n_estimators=est)
    model2.fit(train_features, train_target)
    score = model2.score(valid_features, valid_target)
    if score > best_score:
        best_score = score
        best_est = est

print("Accuracy of the best model on the validation set (n_estimators = {}): {}".format(best_est, best_score))

Accuracy of the best model on the validation set (n_estimators = 47): 0.8118195956454122


In [14]:
model3 = LogisticRegression(random_state=315, solver="liblinear")
model3.fit(train_features, train_target)
train_score = model3.score(train_features, train_target)
valid_score = model3.score(valid_features, valid_target)

print("Accuracy of the logistic regression model on the training set:",train_score,)
print("Accuracy of the logistic regression model on the validation set:",valid_score,)

Accuracy of the logistic regression model on the training set: 0.7012448132780082
Accuracy of the logistic regression model on the validation set: 0.7060653188180405


I have made 3 models: A decision tree model, and random forest model, and a logistic regression model. The logisitic regression model is the least accurate of the 3, therefore I will not be continuing with this model. As for the decision tree and random forest models, they each produced the same accuracy score under certain conditions. The decision tree with a max depth of 3, and the random tree with 47 trees in the forest both produced the the highest accuracy score. For my final model, I will continue with the decision tree model. Had the random forest model produced a higher accuracy score than I would've proceded with that one, but because the scores are the same I am opting for the model with higher speed. 

## Test Model Quality 

In [15]:
final_model = DecisionTreeClassifier(random_state=315, max_depth=3)
final_model.fit(train_features, train_target)
test_predictions = final_model.predict(test_features)
print(accuracy_score(test_target, test_predictions))

0.7978227060653188


The accuracy score of the model using the test set is just under 80%. While we would certainly be happier with a better accuracy score, it is still higher than the threshold of 75%. 

## Sanity Check

In [16]:
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(train_features, train_target)

dummy_predictions = dummy_clf.predict(valid_features)
dummy_accuracy = accuracy_score(valid_target, dummy_predictions)
print(dummy_accuracy)

0.6920684292379471


From the sanity check we see that our model's accuracy score was higher than the sanity check's accuracy score. In other words, the model we have made performs better than chance. 