In [None]:
%%capture
# !pip install pymongo pprint dateparser matplotlib pandas sklearn numpy seaborn

In [1]:
import pymongo
import pprint
import dateparser
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# %matplotlib inline

In [2]:
course_cluster_uri = "mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin"
course_client = pymongo.MongoClient(course_cluster_uri)
titanic = course_client['coursera-agg']['titanic']

In [7]:
initial_project = {
    "$project": {
        "_id": 0,
        "name": 0,
        "point_of_embarkation": 0,
        "ticket_number": 0,
        "passenger_id": 0,
        "cabin": 0,
    }
}

In [8]:
# todo - correct the age.

# age_correction = {
#     "$addFields" : {
#         "age" : {"$cond" : [{"$eq" : [{"$type" : "$age"}, "string"]}, 0, "$age"]}
#     }
# }


In [9]:
# todo - one hot encode gender_female. 1 if female, 0 if male
# one_hot_female = {
#     "$addFields" : {
#         "gender_female" : {
#             "$cond" : [{"$eq" : ["$gender", "female"]}, 1, 0]
#         }
#     }
#     
# }

In [10]:
# todo - the inverse of above. 1 if male, 0 if female
# one_hot_male = {
#     "$addFields" : {
#         "gender_male" : {
#             "$cond" : [{"$eq" : ["$gender", "male"]}, 1, 0]
#         }
#     }
# }

In [14]:
encoding_stage = {
    "$addFields": {
        "gender_female": {"$cond" : [{"$eq" : ["$gender", "female"]}, 1, 0]},
        "gender_male": {"$cond" : [{"$eq" : ["$gender", "male"]}, 1, 0]},
        "age": {"$cond" : [{"$eq" : [{"$type" : "$age"}, "string"]}, 0, "$age"]}
    }
}

In [15]:
final_project = {
    "$project": {
        "gender": 0
    }
}

In [16]:
pipeline = [initial_project, encoding_stage, final_project]
display(list(titanic.aggregate(pipeline)))

[{'survived': 0,
  'class': 3,
  'age': 0,
  'siblings_spouse': 0,
  'parents_children': 0,
  'fare_paid': 8.05,
  'gender_female': 0,
  'gender_male': 1},
 {'survived': 0,
  'class': 3,
  'age': 0,
  'siblings_spouse': 2,
  'parents_children': 0,
  'fare_paid': 21.6792,
  'gender_female': 0,
  'gender_male': 1},
 {'survived': 0,
  'class': 1,
  'age': 54,
  'siblings_spouse': 0,
  'parents_children': 0,
  'fare_paid': 51.8625,
  'gender_female': 0,
  'gender_male': 1},
 {'survived': 0,
  'class': 2,
  'age': 21,
  'siblings_spouse': 0,
  'parents_children': 0,
  'fare_paid': 73.5,
  'gender_female': 0,
  'gender_male': 1},
 {'survived': 1,
  'class': 3,
  'age': 17,
  'siblings_spouse': 4,
  'parents_children': 2,
  'fare_paid': 7.925,
  'gender_female': 1,
  'gender_male': 0},
 {'survived': 0,
  'class': 1,
  'age': 45,
  'siblings_spouse': 1,
  'parents_children': 0,
  'fare_paid': 83.475,
  'gender_female': 0,
  'gender_male': 1},
 {'survived': 1,
  'class': 2,
  'age': 17,
  'sibl

In [17]:
df = pd.DataFrame.from_dict(list(titanic.aggregate(pipeline)))
df.head()

Unnamed: 0,age,class,fare_paid,gender_female,gender_male,parents_children,siblings_spouse,survived
0,0.0,3,8.05,0,1,0,0,0
1,0.0,3,21.6792,0,1,0,2,0
2,54.0,1,51.8625,0,1,0,0,0
3,21.0,2,73.5,0,1,0,0,0
4,17.0,3,7.925,1,0,2,4,1


In [18]:
X = df.drop('survived', axis=1)

In [19]:
y = df['survived']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

In [21]:
dtree = DecisionTreeClassifier()

In [22]:
%%capture
dtree.fit(X_train, y_train)

In [23]:
predictions = dtree.predict(X_test)

In [24]:
print(confusion_matrix(y_test, predictions))
print("\n")
print(classification_report(y_test, predictions))

[[74 12]
 [18 30]]


              precision    recall  f1-score   support

           0       0.80      0.86      0.83        86
           1       0.71      0.62      0.67        48

    accuracy                           0.78       134
   macro avg       0.76      0.74      0.75       134
weighted avg       0.77      0.78      0.77       134



In [25]:
rfc = RandomForestClassifier(n_estimators=20)

In [26]:
%%capture
rfc.fit(X_train, y_train)

In [27]:
rfc_pred = rfc.predict(X_test)

In [28]:
print(confusion_matrix(y_test, rfc_pred))
print("\n")
print(classification_report(y_test, rfc_pred, target_names=['test', 'predictions']))

[[78  8]
 [14 34]]


              precision    recall  f1-score   support

        test       0.85      0.91      0.88        86
 predictions       0.81      0.71      0.76        48

    accuracy                           0.84       134
   macro avg       0.83      0.81      0.82       134
weighted avg       0.83      0.84      0.83       134



In [29]:
iterations = 1000
dtree_avg_accuracy = 0
rfc_avg_accuracy = 0
for _ in range(iterations):
    dtree.fit(X_train, y_train)
    dtree_avg_accuracy += dtree.score(X_test, y_test)
    rfc.fit(X_train, y_train)
    rfc_avg_accuracy += rfc.score(X_test, y_test)
    
print(f"""
After {iterations} iterations:
  Single Decision Tree accuracy: {dtree_avg_accuracy / iterations}
  Random Forest accuracy:        {rfc_avg_accuracy / iterations}
  
  Lab Answer:  dtree={round(dtree_avg_accuracy / iterations, 2)}, rfc={round(rfc_avg_accuracy / iterations, 2)}
""")


After 1000 iterations:
  Single Decision Tree accuracy: 0.7964179104477618
  Random Forest accuracy:        0.8344850746268618
  
  Lab Answer:  dtree=0.8, rfc=0.83

