### Export the data to Python (Jupyter Notebook + PySpark)

In [1]:
!pip install neo4j pandas



In [2]:
from neo4j import GraphDatabase
import pandas as pd

# Fill these with your Aura/Desktop credentials
uri = "bolt://localhost:7687"  # e.g., bolt://localhost:7687 or Neo4j Aura bolt URL
username = "neo4j"
password = "12345678"

driver = GraphDatabase.driver(uri, auth=(username, password))

In [3]:
def fetch_friend_edges(tx):
    query = """
    MATCH (u1:User)-[:FRIEND]-(u2:User)
    WHERE u1.id < u2.id
    RETURN u1.id AS user1, u2.id AS user2
    """
    return list(tx.run(query))

with driver.session() as session:
    edges = session.read_transaction(fetch_friend_edges)

df = pd.DataFrame(edges)
df.drop_duplicates(inplace=True)
df.head()


  edges = session.read_transaction(fetch_friend_edges)


Unnamed: 0,0,1
0,61,62
1,61,63
2,61,64
3,61,65
4,61,66


In [4]:
def fetch_users(tx):
    query = """
    MATCH (u:User)
    RETURN u.id AS id, u.age AS age, u.location AS location, u.interests AS interests
    """
    return list(tx.run(query))

with driver.session() as session:
    users = session.read_transaction(fetch_users)

users_df = pd.DataFrame(users)
users_df.head()


  users = session.read_transaction(fetch_users)


Unnamed: 0,0,1,2,3
0,1,26,Seattle,"[travel, tech]"
1,2,26,Chicago,"[travel, sports]"
2,3,47,Austin,"[reading, travel]"
3,4,39,SF,"[tech, music]"
4,5,28,LA,"[tech, cooking]"


In [5]:
df.to_csv("friend_edges.csv", index=False)
users_df.to_csv("user_attributes.csv", index=False)

### Prepare Data for Link Prediction

In [6]:
df = pd.DataFrame(edges, columns=["user1", "user2"])

In [7]:
df.drop_duplicates(inplace=True)
df.head()

Unnamed: 0,user1,user2
0,61,62
1,61,63
2,61,64
3,61,65
4,61,66


In [8]:
users = list(set(df['user1'].tolist() + df['user2'].tolist()))

In [9]:
users

['170',
 '143',
 '157',
 '140',
 '113',
 '128',
 '73',
 '184',
 '133',
 '84',
 '106',
 '155',
 '98',
 '144',
 '148',
 '187',
 '96',
 '122',
 '153',
 '111',
 '162',
 '190',
 '152',
 '138',
 '139',
 '77',
 '199',
 '102',
 '192',
 '64',
 '134',
 '62',
 '172',
 '97',
 '179',
 '191',
 '186',
 '69',
 '104',
 '101',
 '141',
 '92',
 '70',
 '175',
 '173',
 '91',
 '68',
 '163',
 '105',
 '120',
 '99',
 '156',
 '178',
 '174',
 '71',
 '61',
 '116',
 '150',
 '154',
 '127',
 '130',
 '145',
 '135',
 '160',
 '167',
 '79',
 '147',
 '86',
 '136',
 '66',
 '95',
 '196',
 '193',
 '107',
 '158',
 '195',
 '115',
 '89',
 '100',
 '78',
 '74',
 '149',
 '177',
 '142',
 '126',
 '185',
 '159',
 '88',
 '188',
 '181',
 '72',
 '165',
 '112',
 '117',
 '151',
 '198',
 '63',
 '168',
 '90',
 '80',
 '137',
 '83',
 '87',
 '108',
 '119',
 '200',
 '75',
 '109',
 '103',
 '93',
 '183',
 '123',
 '182',
 '132',
 '67',
 '85',
 '110',
 '176',
 '189',
 '169',
 '146',
 '114',
 '125',
 '65',
 '121',
 '166',
 '76',
 '124',
 '129',
 '16

In [10]:
import random

# Set of existing friend pairs for lookup
positive_set = set(tuple(sorted([a, b])) for a, b in zip(df['user1'], df['user2']))

# Generate negative pairs
neg_samples = set()
while len(neg_samples) < len(positive_set):
    u1, u2 = random.sample(users, 2)
    pair = tuple(sorted([u1, u2]))
    if pair not in positive_set:
        neg_samples.add(pair)

# Create DataFrame for negative samples
neg_df = pd.DataFrame(list(neg_samples), columns=['user1', 'user2'])
neg_df['label'] = 0


In [11]:
df['label'] = 1
all_df = pd.concat([df, neg_df], ignore_index=True)
all_df = all_df.sample(frac=1).reset_index(drop=True)  # shuffle the data
all_df.head()

Unnamed: 0,user1,user2,label
0,158,61,1
1,199,94,1
2,187,188,1
3,140,177,1
4,140,152,1


### Train PySpark ML Pipeline

In [12]:
import os
os.environ["PYSPARK_PYTHON"] = "python"


In [13]:
!pip install pyspark



In [14]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("FriendLinkPrediction").getOrCreate()

In [15]:
spark_df = spark.createDataFrame(all_df)
spark_df.printSchema()
spark_df.show(5)

root
 |-- user1: string (nullable = true)
 |-- user2: string (nullable = true)
 |-- label: long (nullable = true)

+-----+-----+-----+
|user1|user2|label|
+-----+-----+-----+
|  158|   61|    1|
|  199|   94|    1|
|  187|  188|    1|
|  140|  177|    1|
|  140|  152|    1|
+-----+-----+-----+
only showing top 5 rows



In [16]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# Convert user1 and user2 to numeric indices
indexer1 = StringIndexer(inputCol="user1", outputCol="user1_index")
indexer2 = StringIndexer(inputCol="user2", outputCol="user2_index")

# Assemble into a single feature vector
assembler = VectorAssembler(inputCols=["user1_index", "user2_index"], outputCol="features")

# Logistic Regression model
lr = LogisticRegression(featuresCol="features", labelCol="label")

# Pipeline
pipeline = Pipeline(stages=[indexer1, indexer2, assembler, lr])

In [17]:
model = pipeline.fit(spark_df)

In [18]:
predictions = model.transform(spark_df)
predictions.select("user1", "user2", "label", "prediction", "probability").show(10)

+-----+-----+-----+----------+--------------------+
|user1|user2|label|prediction|         probability|
+-----+-----+-----+----------+--------------------+
|  158|   61|    1|       1.0|[0.28387352059302...|
|  199|   94|    1|       0.0|[0.65870439753559...|
|  187|  188|    1|       1.0|[0.44532801374742...|
|  140|  177|    1|       0.0|[0.69615190190412...|
|  140|  152|    1|       1.0|[0.39369394129160...|
|  161|  171|    1|       0.0|[0.56258892155990...|
|  126|   71|    0|       0.0|[0.50988054587811...|
|  146|   98|    0|       1.0|[0.42117700766169...|
|  134|   73|    0|       1.0|[0.36884661727711...|
|  140|  149|    1|       0.0|[0.78233808080798...|
+-----+-----+-----+----------+--------------------+
only showing top 10 rows



### Export results and Streamlit Dashboard

In [19]:
predictions_pd = predictions.select("user1", "user2", "label", "prediction", "probability").toPandas()
predictions_pd['prob_score'] = predictions_pd['probability'].apply(lambda x: x[1])  # probability of label=1
predictions_pd.to_csv("friend_recommendations.csv", index=False)

In [21]:
!pip install streamlit

Collecting streamlit
  Obtaining dependency information for streamlit from https://files.pythonhosted.org/packages/13/e6/69fcbae3dd2fcb2f54283a7cbe03c8b944b79997f1b526984f91d4796a02/streamlit-1.45.1-py3-none-any.whl.metadata
  Downloading streamlit-1.45.1-py3-none-any.whl.metadata (8.9 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Obtaining dependency information for altair<6,>=4.0 from https://files.pythonhosted.org/packages/aa/f3/0b6ced594e51cc95d8c1fc1640d3623770d01e4969d29c0bd09945fafefa/altair-5.5.0-py3-none-any.whl.metadata
  Downloading altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.5.0 (from streamlit)
  Obtaining dependency information for blinker<2,>=1.5.0 from https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl.metadata
  Downloading blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Obtaining dependency in

In [22]:
import streamlit as st
import pandas as pd

# Load prediction data
df = pd.read_csv("friend_recommendations.csv")

# Sort by highest recommendation probability
df = df.sort_values(by="prob_score", ascending=False)

st.title("🤝 Friend Recommendation System")
st.markdown("This app shows predicted friend recommendations based on user interactions.")

# User selection
users = sorted(df['user1'].unique())
selected_user = st.selectbox("Select a User:", users)

# Filter for top recommendations for this user
recommendations = df[(df['user1'] == selected_user) & (df['prediction'] == 1)]
recommendations = recommendations[['user2', 'prob_score']].sort_values(by='prob_score', ascending=False)

st.subheader(f"Top Recommended Friends for User {selected_user}")
st.dataframe(recommendations.head(10))

# Optional: Show false negatives or interesting patterns
st.markdown("----")
show_all = st.checkbox("Show all predictions?")
if show_all:
    st.dataframe(df.head(100))

2025-05-27 11:48:51.945 
  command:

    streamlit run C:\Users\koner\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-05-27 11:48:51.960 Session state does not function when running a script without `streamlit run`


In [23]:
!jupyter nbconvert --to script SocialNetworkNeo4j.ipynb

[NbConvertApp] Converting notebook SocialNetworkNeo4j.ipynb to script
[NbConvertApp] Writing 4815 bytes to SocialNetworkNeo4j.py
