In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Import the titanic dataset into a pandas dataframe
dataframe = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
# Convert the sex column into boolean values (True where Sex == 'male')
dataframe['male'] = dataframe['Sex'] == 'male'
# Separate the features and target into numpy arrays
x = dataframe[['Pclass', 'Age', 'Siblings/Spouses','Parents/Children', 'Fare', 'male']].values
y = dataframe['Survived'].values
# Split the features and target into train (for training the model) and test (for evaluating the model).
# random_state pins the shuffle so every run produces the same split.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

# Instantiate a decision tree model.
# The tree permutes features at each split and can break ties between equally
# good splits differently from run to run, so it is seeded too; without this
# the prediction printed below is not guaranteed to be reproducible.
model = DecisionTreeClassifier(random_state=22)
# Train the decision tree model using the fit method:
model.fit(x_train, y_train)
# Predict for a single passenger. The values follow the feature column order used for x:
#   Pclass=3, Age=22, Siblings/Spouses=1, Parents/Children=0, Fare=7.25, male=True
predictionArr = model.predict([[3, 22, 1, 0, 7.25, True]])
print(predictionArr)





[0]


In [None]:
# Using the titanic dataset, create a logistic regression model and decision tree model and compare
#  their evaluations (accuracy, precision, recall)

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Import the titanic dataset into a dataframe
dataframe = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
dataframe['male'] = dataframe['Sex'] == 'male'
# Divide the data into features and target (note that x is a 2d numpy arr, and y is a 1d numpy arr)
x = dataframe[['Pclass', 'male', 'Age', 'Siblings/Spouses','Parents/Children', 'Fare']].values
y = dataframe['Survived'].values


def _score_fold(model, x_train, y_train, x_test, y_test):
    """Fit ``model`` on one training fold and score it on the matching test fold.

    Args:
        model: An unfitted estimator exposing ``fit`` and ``predict``.
        x_train, y_train: Features and target of the training fold.
        x_test, y_test: Features and target of the held-out fold.

    Returns:
        Tuple ``(accuracy, precision, recall)`` computed on the held-out fold.
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    return (
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
    )


# Per-fold scores for each model; they are averaged once the loop finishes
logisticRegressionAccuracy, logisticRegressionPrecision, logisticRegressionRecall = [], [], []
decisionTreeAccuracy, decisionTreePrecision, decisionTreeRecall = [], [], []

# Seeded shuffle: both models are evaluated on exactly the same five folds on every run
kf = KFold(n_splits=5, shuffle=True, random_state=10)

for train_index, test_index in kf.split(x):
    x_train, x_test, y_train, y_test = x[train_index], x[test_index], y[train_index], y[test_index]

    # Create and evaluate the logistic regression model
    logisticRegressionModel = LogisticRegression()
    accuracy, precision, recall = _score_fold(logisticRegressionModel, x_train, y_train, x_test, y_test)
    logisticRegressionAccuracy.append(accuracy)
    logisticRegressionPrecision.append(precision)
    logisticRegressionRecall.append(recall)

    # Create and evaluate the Decision tree model.
    # The tree is seeded as well: tie-breaking between equally good splits is
    # otherwise random, which would make the comparison change between runs.
    decisionTreeModel = DecisionTreeClassifier(random_state=10)
    accuracy, precision, recall = _score_fold(decisionTreeModel, x_train, y_train, x_test, y_test)
    decisionTreeAccuracy.append(accuracy)
    decisionTreePrecision.append(precision)
    decisionTreeRecall.append(recall)

# Print the results (mean of each metric over the five folds)
print('Logistic Regression Accuracy:', np.mean(logisticRegressionAccuracy))
print('Decision Tree Accuracy:', np.mean(decisionTreeAccuracy))

print('\nLogistic Regression Precision:', np.mean(logisticRegressionPrecision))
print('Decision Tree Precision:', np.mean(decisionTreePrecision))

print('\nLogistic Regression Recall:', np.mean(logisticRegressionRecall))
print('Decision Tree Recall:', np.mean(decisionTreeRecall))




Logistic Regression Accuracy: 0.7970354853043865
Decision Tree Accuracy: 0.7857677902621722

Logistic Regression Precision: 0.7618898922983288
Decision Tree Precision: 0.7271825525389317

Logistic Regression Recall: 0.6900529617441382
Decision Tree Recall: 0.7114489064856713


In [None]:
# Create a decision tree model that uses gini to measure the quality/purity/homogeneity of the splits,
#   and create a decision tree model that uses entropy to measure the quality/purity/homogeneity of the splits
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Import the titanic dataset into a pandas dataframe
dataframe = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
dataframe['male'] = dataframe['Sex'] == 'male'
# Separate the features and target into numpy arrays
x = dataframe[['Pclass', 'male', 'Age', 'Siblings/Spouses','Parents/Children', 'Fare']].values
y = dataframe['Survived'].values

# Per-fold scores for each criterion; they are averaged once the loop finishes
giniAccuracy, giniPrecision, giniRecall, entropyAccuracy, entropyPrecision, entropyRecall = [], [], [], [], [], []
# shuffle=True without a seed would draw different folds on every run, so the
# gini/entropy comparison could flip between executions. Pin the shuffle.
kf = KFold(n_splits=5, shuffle=True, random_state=10)

for train_indices, test_indices in kf.split(x):
    x_train, y_train, x_test, y_test = x[train_indices], y[train_indices], x[test_indices], y[test_indices]

    # Create and evaluate the Decision tree model that uses gini to measure the quality of splits.
    # The tree itself is seeded too, since ties between equally good splits are broken at random.
    decisionTreeModelGini = DecisionTreeClassifier(criterion='gini', random_state=10)
    decisionTreeModelGini.fit(x_train, y_train)

    y_pred = decisionTreeModelGini.predict(x_test)
    giniAccuracy.append(accuracy_score(y_test, y_pred))
    giniPrecision.append(precision_score(y_test, y_pred))
    giniRecall.append(recall_score(y_test, y_pred))

    # Create and evaluate the Decision tree model that uses entropy to measure the quality of splits
    # (same seed, so the criterion is the only difference between the two models)
    decisionTreeModelEntropy = DecisionTreeClassifier(criterion='entropy', random_state=10)
    decisionTreeModelEntropy.fit(x_train, y_train)

    y_pred = decisionTreeModelEntropy.predict(x_test)
    entropyAccuracy.append(accuracy_score(y_test, y_pred))
    entropyPrecision.append(precision_score(y_test, y_pred))
    entropyRecall.append(recall_score(y_test, y_pred))

# Each line prints mean accuracy, precision, recall (in that order):
# first line is the gini model, second line is the entropy model.
print(np.mean(giniAccuracy), np.mean(giniPrecision), np.mean(giniRecall))
print(np.mean(entropyAccuracy), np.mean(entropyPrecision), np.mean(entropyRecall))



0.7778708817368121 0.7178768273768273 0.7159371705819793
0.7789690852536024 0.7257887178716494 0.6927321544807883


In [None]:
# Create a decision tree and print out a visualization of the decision tree
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Extra imports needed only for rendering the tree
from sklearn.tree import export_graphviz
import graphviz

# Load the titanic data and derive a boolean 'male' column from 'Sex'
dataframe = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
dataframe['male'] = dataframe['Sex'].eq('male')

# Only two features are used here; the same names label the nodes in the rendering.
# x is a 2d numpy array of features, y is a 1d numpy array holding the target.
feature_names = ['Pclass', 'male']
x = dataframe[feature_names].to_numpy()
y = dataframe['Survived'].to_numpy()

# Build the decision tree and fit it on the whole dataset (no train/test split in this cell)
model = DecisionTreeClassifier().fit(x, y)

# Export the fitted tree as DOT text, wrap it in a graphviz Source and
# render it to a PNG named after 'decisionTree'
dot_file = export_graphviz(model, feature_names=feature_names)
graph = graphviz.Source(dot_file)
graph.render(filename='decisionTree', format='png', cleanup=True)





In [None]:
# Create a decision tree model and prune it using the three prepruning techniques
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Import the titanic dataset into a dataframe
dataframe = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
dataframe['male'] = dataframe['Sex'] == 'male'
# Divide the data into features and target (note that x is a 2d numpy arr, and y is a 1d numpy arr)
x = dataframe[['Pclass', 'male']].values
y = dataframe['Survived'].values

# Create a decision tree with the following prepruning properties:
#    1) max depth = 3
#    2) minimum samples per leaf = 2
#    3) maximum number of leaf nodes = 10
model = DecisionTreeClassifier(max_depth=3, min_samples_leaf=2, max_leaf_nodes=10)
# Train the model
model.fit(x, y)

# Print out a visualization of the decision tree
from sklearn.tree import export_graphviz
import graphviz
dot_file = export_graphviz(model, feature_names=['Pclass', 'male'])
graph = graphviz.Source(dot_file)
# Render under a distinct name: the unpruned-tree cell earlier in this notebook
# writes 'decisionTree', and reusing that name here would overwrite its image
# whenever the notebook is run top to bottom.
graph.render(filename='decisionTreePruned', format='png', cleanup=True)


In [9]:
# Use the GridSearchCV class to see which pruning parameters will give us the best model
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV

# Import the titanic dataset into a dataframe
dataframe = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
dataframe['male'] = dataframe['Sex'] == 'male'
# Divide the data into features and target (note that x is a 2d numpy arr, and y is a 1d numpy arr)
x = dataframe[['Pclass', 'male', 'Age', 'Siblings/Spouses','Parents/Children', 'Fare']].values
y = dataframe['Survived'].values

# Candidate pre-pruning settings; the grid search evaluates every combination (3 * 2 * 4 = 24)
param_grid = {'max_depth': [5, 15, 25], 'min_samples_leaf':[1, 3], 'max_leaf_nodes': [10, 20, 35, 50]}

# Create the grid search object.
# The estimator is seeded so that repeated runs pick the same winning
# combination; an unseeded tree breaks ties between equally good splits at
# random, which can change both best_params_ and best_score_ between runs.
model = DecisionTreeClassifier(random_state=10)
# scoring='f1' ranks parameter combinations by F1 score; cv=5 requests 5-fold cross validation
gridSearch = GridSearchCV(model, param_grid, scoring='f1', cv=5)
gridSearch.fit(x, y)

# Use the best_params_ attribute/variable to see which combination of parameters is the best
best_params = gridSearch.best_params_
print(best_params)
print(gridSearch.best_score_)

{'max_depth': 15, 'max_leaf_nodes': 35, 'min_samples_leaf': 1}
0.772739325261858
