# connect to database

In [57]:
from neo4j.v1 import GraphDatabase, basic_auth

In [58]:
# ----- connection -------
# Connection settings are taken from the environment so that real credentials
# do not have to be stored in the notebook. The values used for a default
# local installation remain as fall-backs.
import os

db_location = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "password")

# Bolt driver used by every session opened in the cells below.
driver = GraphDatabase.driver(db_location, auth=basic_auth(username, password))

# create data

For classification, we should build an example that trains our skill in recognizing how the data has to be divided into different classes.

The important thing is that we have a fixed set of classes: classification is what you use when you want to predict which specific class your product belongs to.

Let's take an example and you will see :)

In [59]:
# start from an empty graph: delete every node together with its relationships

with driver.session() as session:
    delete_all_query = "MATCH (n) DETACH DELETE n"
    session.run(delete_all_query)

In [61]:
# We want to predict which class each person belongs to.
# There is a single type of object in the graph: people.
# X_data = the properties (features) that influence the answer.
# Y_data = the answer itself, here one of a fixed set of categories.
# Choose the properties that can really affect the answer (Y data).
# Y data = [1, 2, 3], i.e. three possible classes of people.
# X data example = age, monthly_income, working_hours_per_day, latitude, longitude
# NOTE(review): latitude is drawn from 0..180 and longitude from 0..90, which
# are not real geographic ranges -- harmless for synthetic data, but confirm
# before reusing these values as coordinates.

with driver.session() as session:

    # create people with random property values (range(1,20) is inclusive -> 20 people)
    session.run("FOREACH(person IN range(1,20) | CREATE (:Person {latitude: round(rand()*180), longitude: round(rand()*90), age: round(rand()*15+23), monthly_income: round(rand()*1000*16+1000), working_hours_per_day: round(rand()*16+2)}))")

    # create random Y_data: give every person one of the classes.
    # rand() is in [0, 1), so floor(rand() * size(classes)) yields each list
    # index with equal probability; round(rand()*10) % 3 would favour the
    # first two classes. toInteger() is used because toInt() is deprecated
    # and not available in newer Neo4j versions.
    session.run("MATCH (person :Person) "
                "WITH person, [1, 2, 3] AS classes "
                "WITH person, classes, toInteger(floor(rand() * size(classes))) AS position "
                "SET person.class = classes[position] ")

# load data

In [62]:
import pandas as pd

In [64]:
with driver.session() as session:

    # read every person's features and class back from the graph
    with session.begin_transaction() as get_data:
        result = get_data.run(
            "MATCH (person :Person) "
            "RETURN person.latitude as latitude, person.longitude as longitude, person.age as age, person.monthly_income as monthly_income, person.working_hours_per_day as working_hours_per_day, person.class as class "
        )
        # one dict per record -> one data frame row per person
        training_data = pd.DataFrame(
            [dict(record.items()) for record in result]
        )

# Set X_data (features) and Y_data (responses)

In [65]:
# Split the training table into two data frames:
#   X_data - the feature columns (every column except the response)
#   Y_data - the response column(s)

responses_columns = ['class']
features_columns = [column for column in training_data.columns
                    if column not in responses_columns]
Y_data = training_data[responses_columns]
X_data = training_data[features_columns]

In [68]:
# preview the first 3 rows of the feature table
X_data.head(3)

Unnamed: 0,age,latitude,longitude,monthly_income,working_hours_per_day
0,30.0,177.0,28.0,3629.0,12.0
1,23.0,37.0,11.0,13602.0,8.0
2,32.0,74.0,2.0,9353.0,13.0


In [69]:
# preview the first 5 rows of the response table
Y_data.head(5)

Unnamed: 0,class
0,2
1,1
2,1
3,2
4,2


# Decision Trees Model

split data into “pure” regions

In [70]:
from sklearn import tree

In [71]:
# build a decision tree classifier, then train it on the features and responses
model_4 = tree.DecisionTreeClassifier()
model_4 = model_4.fit(X_data, Y_data)

# Prediction
We have to use the same kind of data (the same feature columns) to predict a new result.

In [72]:
# new observations to classify: one list per feature, one entry per person

new_data = pd.DataFrame(data={
    'latitude': [37, 11, 10],
    'longitude': [33, 15, 17],
    'age': [63, 17, 33],
    'monthly_income': [3900, 1000, 21200],
    'working_hours_per_day': [10, 6, 7],
})

In [73]:
new_data

Unnamed: 0,age,latitude,longitude,monthly_income,working_hours_per_day
0,63,37,33,3900,10
1,17,11,15,1000,6
2,33,10,17,21200,7


In [74]:
# predict a class for every row of new_data with the fitted tree
model_4.predict(new_data)

array([2, 3, 2])

Let's present the output data in a cleaner way, as a data frame.

In [75]:
# keep the predicted classes for the rows of new_data in a variable
output = model_4.predict(new_data)

In [76]:
# put the predicted classes into a single-column data frame named 'class'
output_data = pd.DataFrame(output, columns=['class'])

In [77]:
output_data

Unnamed: 0,class
0,2
1,3
2,2


# Save Model

In [46]:
import pickle

In [47]:
# ------ model name -------
# file name used below both to save and to reload the pickled model
model_pkl_filename = 'prediction_model.pkl'

In [49]:
# serialise the trained tree into the model file (binary mode is required for pickle)
with open(model_pkl_filename, 'wb') as pickled_model:
    pickle.Pickler(pickled_model).dump(model_4)

# Load Model

In [50]:
# ----- load prediction model -------
# NOTE: unpickling executes code stored in the file, so only load model files
# that you created yourself or otherwise trust.
with open(model_pkl_filename, 'rb') as model_pkl:
    prediction_model = pickle.Unpickler(model_pkl).load()

In [51]:
prediction_model

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

# Use Prediction Model

In [52]:
# a single new observation to classify, with the same feature names as before

new_data = pd.DataFrame(data={
    'latitude': [48],
    'longitude': [52],
    'age': [29],
    'monthly_income': [12000],
    'working_hours_per_day': [5],
})

In [53]:
new_data

Unnamed: 0,age,latitude,longitude,monthly_income,working_hours_per_day
0,29,48,52,12000,5


In [54]:
# classify the new observation with the model that was reloaded from the pickle file
result = prediction_model.predict(new_data)

In [55]:
result

array([2.])