In [78]:
import numpy as np
from numpy.random import randn, gamma
from sklearn.linear_model import LogisticRegression
from neo4j.v1 import GraphDatabase, basic_auth
import pandas as pd

In [79]:
db_location = "bolt://localhost:7687"
username = "neo4j"
password = "P^5lw9MJi@BQ"
driver = GraphDatabase.driver(db_location, auth=basic_auth(username, password))

In [80]:
# CREATE DATA

with driver.session() as session:
    
    # Erase database
    with session.begin_transaction() as clear_db:
        clear_db.run("MATCH (n) DETACH DELETE n")
    
    # Add users with properties score_math up to 10 and score_English up to 10
    with session.begin_transaction() as create_users:
        create_users.run("FOREACH(r IN range(1,10)| "
                         "CREATE (user :User {user_id: r, time_spent: rand()*500, friends_number: rand()*100}))") 
    
    # Add random relationships between users
    with session.begin_transaction() as add_relationships:
         add_relationships.run("MATCH(user_1 :User), (user_2 :User) "
                                "WHERE rand()<0.3 AND NOT user_1.user_id = user_2.user_id "
                                "CREATE(user_1)-[:IS_FRIEND]->(user_2)")

In [81]:
#IMPORT DATA

with driver.session() as session:
    
    # Get vectors with data from database
    with session.begin_transaction() as get_data:
        result = get_data.run(
            "MATCH(user_1 :User), (user_2 :User) "
            "WHERE NOT user_1.user_id = user_2.user_id "
            "WITH user_1.user_id as friend_id, user_2.user_id as user_id, "
                "user_1.time_spent as friend_time_spent, user_2.time_spent as user_time_spent, "
                "user_1.friends_number as friend_friends_number, user_2.friends_number as user_friends_number, "
                "CASE WHEN (user_1)-[:IS_FRIEND]->(user_2) THEN 1 ELSE 0 END as is_friend "
            "RETURN friend_id, user_id, friend_time_spent, user_time_spent, friend_friends_number, user_friends_number, is_friend "
        )
        training_data = pd.DataFrame([{k: v for k, v in r.items()} for r in result])

In [90]:
# training_data

In [84]:
features_columns = [x for x in training_data.columns if x not in ['friend_id','user_id', 'is_friend']]
responses_columns = ['is_friend']
X_data = training_data[features_columns]
Y_data = training_data[responses_columns]

In [85]:
X_data.head()

Unnamed: 0,friend_friends_number,friend_time_spent,user_friends_number,user_time_spent
0,4.927981,54.378804,75.198953,361.988421
1,4.927981,54.378804,25.326768,155.986747
2,4.927981,54.378804,37.839174,86.504659
3,4.927981,54.378804,52.491045,257.267734
4,4.927981,54.378804,6.433075,197.152716


In [92]:
y = Y_data["is_friend"].values

In [93]:
y

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [94]:
model_lr = LogisticRegression()

model_lr.fit(X_data, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [97]:
new_data = pd.DataFrame(data = {'friend_friends_number': [283, 59, 94], 'friend_time_spent': [490, 283, 15], 
                                'user_friends_number': [635, 44, 137], 'user_timespent': [252, 65, 98]})

In [98]:
new_data

Unnamed: 0,friend_friends_number,friend_time_spent,user_friends_number,user_timespent
0,283,490,635,252
1,59,283,44,65
2,94,15,137,98


In [99]:
model_lr.predict(new_data)

array([0, 0, 0], dtype=int64)

In [106]:
predicted_data = pd.DataFrame({'interested_in': model_lr.predict_proba(new_data)[:, 1]})

In [107]:
predicted_data

Unnamed: 0,interested_in
0,0.002252
1,0.05517
2,0.05328
