In [9]:
!pip install neo4j-driver



# connect to database

In [22]:
from neo4j.v1 import GraphDatabase, basic_auth

In [23]:
# ----- connection -------
# Connection settings are read from the environment so that real credentials
# never have to be saved inside the notebook. The fallbacks are the local
# development values this notebook was originally written against.
import os

db_location = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "password")

# A single driver is created here; the cells below open their sessions from it.
driver = GraphDatabase.driver(db_location, auth=basic_auth(username, password))

# create data

In [28]:
# Cypher statements that rebuild the demo graph.
# Order matters: wipe everything, create the users, then link them.
rebuild_statements = (
    # erase database
    "MATCH (n) DETACH DELETE n",
    # add users whose score_math and score_English are random values scaled
    # by 10 (rand()*10); user_id takes each value produced by range(1,10)
    "FOREACH(r IN range(1,10)| "
    "CREATE (u :User {user_id: r, score_math: rand()*10, score_English: rand()*10}))",
    # add random directed relationships: an ordered pair of distinct users
    # is linked whenever rand()<0.3 holds for that pair
    "MATCH(user_1 :User), (user_2 :User) "
    "WHERE rand()<0.3 AND NOT user_1.user_id=user_2.user_id "
    "CREATE(user_1)-[:INTERESTED_IN]->(user_2)",
)

with driver.session() as session:
    # Every statement gets its own transaction, so the three steps are
    # applied one after another rather than as a single unit.
    for cypher in rebuild_statements:
        with session.begin_transaction() as tx:
            tx.run(cypher)

# load data

In [29]:
import pandas as pd

In [30]:
# Query producing one row per ordered pair of distinct users: both users' ids
# and scores, plus interested_in = 1 when an INTERESTED_IN relationship goes
# from the first user to the second, otherwise 0.
pairs_query = (
    "MATCH(user_1 :User), (user_2 :User) "
    "WHERE NOT user_1.user_id=user_2.user_id "
    "WITH user_1.user_id as interested_user_id, user_2.user_id as user_id, "
    "user_1.score_math as interested_score_math, user_2.score_math as user_score_math, "
    "user_1.score_English as interested_score_English, user_2.score_English as user_score_English, "
    "CASE WHEN (user_1)-[:INTERESTED_IN]->(user_2) THEN 1 ELSE 0 END as interested_in "
    "RETURN interested_user_id, user_id, interested_score_math, user_score_math, interested_score_English, user_score_English, interested_in "
)

with driver.session() as session:

    # get vectors with data from database
    with session.begin_transaction() as get_data:
        result = get_data.run(pairs_query)
        # Turn every returned record into a plain dict while the transaction
        # is still open, then build the frame from the collected rows.
        records = [dict(r.items()) for r in result]
        training_data = pd.DataFrame(records)

In [31]:
# Preview the first rows only instead of rendering the whole frame
# (one row per ordered pair of users).
training_data.head(10)

Unnamed: 0,interested_in,interested_score_English,interested_score_math,interested_user_id,user_id,user_score_English,user_score_math
0,0,6.276941,8.256997,10,5,8.239424,2.590635
1,0,6.276941,8.256997,10,6,2.796148,7.553336
2,0,6.276941,8.256997,10,7,9.625737,3.836696
3,0,6.276941,8.256997,10,8,9.601327,3.077997
4,0,6.276941,8.256997,10,9,2.069655,1.196208
5,0,6.276941,8.256997,10,1,6.953483,0.726032
6,0,6.276941,8.256997,10,2,9.616818,0.297277
7,0,6.276941,8.256997,10,3,9.427460,0.213491
8,0,6.276941,8.256997,10,4,1.107701,0.261009
9,0,8.239424,2.590635,5,10,6.276941,8.256997


In [32]:
# Check the row count, dtypes and non-null counts of the loaded columns.
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 7 columns):
interested_in               90 non-null int64
interested_score_English    90 non-null float64
interested_score_math       90 non-null float64
interested_user_id          90 non-null int64
user_id                     90 non-null int64
user_score_English          90 non-null float64
user_score_math             90 non-null float64
dtypes: float64(4), int64(3)
memory usage: 5.0 KB


# Set X_data (features) and Y_data (responses)

In [34]:
# Split the loaded frame into model inputs and the target:
#   X_data - the feature columns, i.e. every column that is neither a user
#            identifier nor the response;
#   Y_data - the response column, kept as a one-column frame.
non_feature_columns = {'interested_user_id', 'user_id', 'interested_in'}
responses_columns = ['interested_in']
features_columns = [
    column
    for column in training_data.columns
    if column not in non_feature_columns
]

X_data = training_data[features_columns]
Y_data = training_data[responses_columns]

In [35]:
# Peek at the first rows to confirm that only the score columns remain.
X_data.head()

Unnamed: 0,interested_score_English,interested_score_math,user_score_English,user_score_math
0,6.276941,8.256997,8.239424,2.590635
1,6.276941,8.256997,2.796148,7.553336
2,6.276941,8.256997,9.625737,3.836696
3,6.276941,8.256997,9.601327,3.077997
4,6.276941,8.256997,2.069655,1.196208


In [37]:
# Preview the first responses only instead of rendering the whole frame.
Y_data.head(10)

Unnamed: 0,interested_in
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [38]:
# Pull the single response column out as a flat array of labels for the model.
y = Y_data.loc[:, "interested_in"].values

In [39]:
# Inspect the label vector (1 = an INTERESTED_IN relationship exists for the pair, 0 = none).
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1])

# Logistic Regression Model

In [40]:
# Logistic regression classifier from scikit-learn
from sklearn.linear_model import LogisticRegression

# Build the estimator with its default hyper-parameters.
model_1 = LogisticRegression()

# Train on the score features against the "interested_in" labels.
# fit() is deliberately the cell's last expression so that the fitted
# estimator (with its settings) is shown as the cell output.
model_1.fit(X=X_data, y=y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Prediction
To predict results for new data, the input must have the same feature columns as the data the model was trained on.

In [41]:
# create new data
# Three hypothetical (interested user, candidate user) pairs to score.
#
# `columns=features_columns` forces the same column order the model was
# trained on. The order of training_data's columns depends on the pandas
# version, so relying on the key order of this dict could feed each score into
# the wrong coefficient (older scikit-learn pairs inputs by position) or make
# prediction fail on mismatched feature names (newer scikit-learn).
new_data = pd.DataFrame(
    data={
        'interested_score_English': [9.3, 8.2, 0.1],
        'interested_score_math': [0.45, 4.6, 8.3],
        'user_score_English': [2.7, 4.2, 5.3],
        'user_score_math': [7.4, 6.5, 5.4],
    },
    columns=features_columns,
)

In [42]:
# Display the rows that are about to be scored.
new_data

Unnamed: 0,interested_score_English,interested_score_math,user_score_English,user_score_math
0,9.3,0.45,2.7,7.4
1,8.2,4.6,4.2,6.5
2,0.1,8.3,5.3,5.4


In [43]:
# Predict the class label for each row of new_data
# (1 = interested, 0 = not interested, as encoded in the training labels).
model_1.predict(new_data)

array([0, 0, 0])

In [44]:
# predict_proba returns the probabilities of both labels (0 and 1) for every row;
# keep only column 1, the probability of label 1.
model_1.predict_proba(new_data)[:, 1]

array([ 0.06479153,  0.06791929,  0.31946321])

Let's present the predicted probabilities more clearly, as a data frame.

In [19]:
# Probability of label 1 for every row of new_data, stored so it can be put into a data frame.
output = model_1.predict_proba(new_data)[:, 1]

In [20]:
# Wrap the probabilities in a one-column frame named after the response they estimate.
output_data = pd.DataFrame(data=dict(interested_in=output))

In [21]:
# Show the result: one predicted probability per row of new_data, in the same row order.
output_data

Unnamed: 0,interested_in
0,0.08667
1,0.067126
2,0.195868
