# connect to database

In [1]:
from neo4j.v1 import GraphDatabase, basic_auth

In [2]:
# ----- connection -------

db_location = "bolt://localhost:7687"
username = "neo4j"
password = "password"
driver = GraphDatabase.driver(db_location, auth=basic_auth(username, password))

# create data

In [99]:
with driver.session() as session:
    
    # erase database
    with session.begin_transaction() as clear_db:
        clear_db.run("MATCH (n) DETACH DELETE n")
    
    # create year
    with session.begin_transaction() as create_year:
        create_year.run("CREATE (year :Year) set year.year=2018") 
    
    # create months
    with session.begin_transaction() as create_months:
        create_months.run("MATCH (year :Year) "
                              "WHERE year.year = 2018 "
                              "FOREACH (month IN range (4,12) | MERGE (:Month {month: month, year: 2018})-[:BELONGS_TO]->(year))"
                             )
        
    # create days in April
    with session.begin_transaction() as create_days:
        create_days.run("MATCH (month :Month) "
                         "WHERE month.month = 4 AND month.year = 2018 "
                         "FOREACH(day IN range (1,30) | MERGE (:Day {day: day, month: 4, year: 2018})-[:BELONGS_TO]->(month))"
                        ) 
    
    # create sportsmans
    with session.begin_transaction() as create_sportsmans:
        create_sportsmans.run("WITH ['Anna', 'Tom', 'Nick', 'Kateryna', 'Maryna', 'Jay'] AS name "
                         "FOREACH (i IN range(1,24) | CREATE (s :Sportsman {sportsmen_id: i, FullName: name[i % size(name)], age: round(rand()*34)+14, duration_training: round(rand()*8)+1}))"
                        ) 
        
    for i in range(1,84):
        
        # create training
        with session.begin_transaction() as create_training:
            create_training.run("WITH round(rand()*23+1) as sportsmen_id, round(rand()*29+1) as Day, 4 as Month, 2018 as Year "
                                "MATCH (sportsman :Sportsman), (year :Year), (month :Month), (day :Day) "
                                "WHERE sportsman.sportsmen_id = sportsmen_id "
                                "AND year.year=Year AND month.month=Month AND month.year=Year "
                                "AND day.day=Day AND day.month=Month AND day.year=Year "
                                "MERGE(sportsman)-[:HAS_TRAINING]->(day)"
                               ) 
    
    

# load data

In [100]:
import pandas as pd

In [101]:
with driver.session() as session:
    
    # get data from database
    with session.begin_transaction() as get_data:
        result = get_data.run("MATCH(sportsman :Sportsman)-[ht :HAS_TRAINING]-(day :Day)-[:BELONGS_TO]->(:Month)-[:BELONGS_TO]->(:Year) "
                              "WITH sportsman.sportsman_id as sportsman_id, sportsman.FullName as name, sportsman.age as age, sportsman.duration_training as duration_training, COUNT(ht) as training "
                              "RETURN sportsman_id, name, age, duration_training, training"
                            )
        training_data = pd.DataFrame([{k: v for k, v in r.items()} for r in result])

In [102]:
training_data

Unnamed: 0,age,duration_training,name,sportsman_id,training
0,19.0,3.0,Nick,,3
1,47.0,7.0,Anna,,5
2,26.0,3.0,Maryna,,3
3,46.0,7.0,Maryna,,1
4,23.0,9.0,Kateryna,,7
5,43.0,5.0,Tom,,8
6,45.0,7.0,Maryna,,1
7,23.0,1.0,Nick,,2
8,24.0,2.0,Maryna,,1
9,36.0,7.0,Anna,,3


# Set X_data (features) and Y_data (responses)

In [103]:
# create 2 variables called X_data and Y_data:
# X_data shall be a matrix with features columns
# and Y_data shall be a matrix with responses columns

features_columns = [x for x in training_data.columns if x not in ['sportsman_id','name', 'training']]
responses_columns = ['training']
X_data = training_data[features_columns]
Y_data = training_data[responses_columns]

In [104]:
X_data.head()

Unnamed: 0,age,duration_training
0,19.0,3.0
1,47.0,7.0
2,26.0,3.0
3,46.0,7.0
4,23.0,9.0


In [105]:
Y_data

Unnamed: 0,training
0,3
1,5
2,3
3,1
4,7
5,8
6,1
7,2
8,1
9,3


In [106]:
y = Y_data["training"].values

In [107]:
y

array([3, 5, 3, 1, 7, 8, 1, 2, 1, 3, 1, 3, 3, 3, 5, 4, 2, 3, 3, 2, 3, 6, 4,
       1])

# Linear Regression Model

In [108]:
# import Linear Regression
from sklearn.linear_model import LinearRegression

# give a name to Linear regression model
model_2 = LinearRegression()

# put our data to train model
model_2.fit(X_data, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

# Prediction
we have to use the same type of data to predict new result

In [109]:
# create new data 

new_data = pd.DataFrame(data = {'age': [23, 31, 17], 'duration_training': [3, 1, 2]})

In [110]:
new_data

Unnamed: 0,age,duration_training
0,23,3
1,31,1
2,17,2


In [111]:
# predict a value of getting values 0 or 1
model_2.predict(new_data)

array([ 2.80193371,  1.91632515,  2.52753352])

let's write the output data in more correct way with data frame

In [112]:
output = model_2.predict(new_data)

In [113]:
output_data = pd.DataFrame({'training': output})

In [114]:
output_data

Unnamed: 0,training
0,2.801934
1,1.916325
2,2.527534


next, let's play with predicted data to be correct

number of traning can't be float = need apply round to dataFrame

In [115]:
output_data['round_training'] = output_data['training'].round()

In [116]:
output_data

Unnamed: 0,training,round_training
0,2.801934,3.0
1,1.916325,2.0
2,2.527534,3.0


# Save Model

In [117]:
import pickle

In [118]:
# ------ model name -------
model_pkl_filename = 'prediction_model.pkl'

In [120]:
with open(model_pkl_filename, 'wb') as pickled_model:
    pickle.dump(model_2, pickled_model)

# Load Model

In [121]:
# ----- load prediction model -------
with open(model_pkl_filename, 'rb') as model_pkl:
    prediction_model = pickle.load(model_pkl)

In [122]:
prediction_model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

# Use Prediction Model

let's predict how many traning you will do

set up your age and how many hours you usually train

In [131]:
# create new data 

new_data = pd.DataFrame(data = {'age': [29], 'duration_training': [1]})

In [132]:
new_data

Unnamed: 0,age,duration_training
0,29,1


In [133]:
prediction_model.predict(new_data)

array([ 1.95000597])