# Social Network Analysis. Predict salary and new connections 

**Predicting missing salaries and new email connections from a company's email network**

**Network Analysis. Supervised Learning. Regression (Salary prediction) and Classification (New connections prediction)**


Data from [Applied Social Network Analysis in Python | Coursera](https://www.coursera.org/learn/python-social-network-analysis/):

`net_emails.txt`: network where each node corresponds to a person at the company, and each edge indicates that at least one email has been sent between two people. 
The network also contains the node attributes Department (*name*) and ManagementSalary (1 = Receiving a management salary)

`net_future_connections.csv`: future connections of pairs of nodes that are currently unconnected (1 = an edge between those two nodes will exist in the future)



In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys
import numpy as np
import pandas as pd

import networkx as nx

sys.path.append("../")
import helper_ds

helper_ds.set_parent_execution_path()
helper_ds.info_system()

# A. Salary Prediction

## A1. Data Processing

#### Load graph 

In [None]:
# Load the pickled email network into memory.
# NOTE(review): read_gpickle unpickles the file, so only load it from a trusted source.
# NOTE(review): read_gpickle and nx.info are not available in NetworkX >= 3.0 - confirm
# the NetworkX version this notebook is pinned to.
graph = nx.read_gpickle("data/net_emails.txt")

# Print NetworkX's textual summary of the loaded graph
print(nx.info(graph))

#### Extract node attributes and features to a dataframe

In [None]:
# Peek at the first three (node, attributes) pairs. The node container is turned into
# a list first so the slice is valid whether nodes() returns a list (NetworkX 1.x)
# or a view object (NetworkX >= 2.0), which cannot be sliced.
list(graph.nodes(data=True))[:3]

In [None]:
# Dataframe with node attributes (one row per node)
df = pd.DataFrame(index=list(graph.nodes()))  # df: complete df

# The attribute names are taken from the first node's attribute dict. Iterating the
# (node, data) pairs works with both the list-based (NetworkX 1.x) and the view-based
# (NetworkX >= 2.0) API, whereas positional indexing only works with the former.
attributes = list(next(iter(graph.nodes(data=True)))[1])
for a in attributes:
    df[a] = pd.Series(nx.get_node_attributes(graph, a))

# node features
df["clustering"] = pd.Series(nx.clustering(graph))
# dict() accepts both the plain dict and the DegreeView that degree() may return
df["degree"] = pd.Series(dict(graph.degree()))

df.head()

#### Explore the target and separate the prediction set

In [None]:
target = ["ManagementSalary"]
features = [col for col in df if col not in target]

print(df[target].squeeze().value_counts(dropna=False))

In [None]:
n_rows_original = df.shape[0]

# Rows without a label form the prediction set; rows labelled 0 or 1 are kept for modelling
df_pred = df[df["ManagementSalary"].isnull()]
df = df[(df["ManagementSalary"] == 0) | (df["ManagementSalary"] == 1)]

# Sanity check: every original row must end up in exactly one of the two sets.
# (`assert a, b == c` would only test that `a` is truthy and use `b == c` as the message.)
assert df.shape[0] + df_pred.shape[0] == n_rows_original, "labelled + unlabelled rows do not add up"

#### Split data into training and test set

In [None]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.2, stratify=df[target], random_state=0)

del df

`df_pred`: prediction set (no labels) <br>
`df_train`: training_set  <br>
`df_test`: test_set  <br>

#### Classify features
Convert categorical variables to the 'category' dtype and sort the columns: numerical + categorical + target

In [None]:
import helper_ds

cat = ["Department", "ManagementSalary"]
num = ["clustering", "degree"]

df_train = helper_ds.sort_columns_by_type(df_train, target, categorical=cat)

pd.DataFrame(dict(df_train.dtypes), index=["Type"])[df_train.columns].head()

In [None]:
# df_train, dict_categories = helper_ds.remove_categories(df_train, target, ratio=0.01, show=True,
#                                                      dict_categories=None)

### Show training data

#### Numerical Features

In [None]:
df_train[num].describe(percentiles=[0.5])

In [None]:
helper_ds.show_numerical(df_train[num], kde=True)

In [None]:
helper_ds.show_target_vs_numerical(df_train, target, jitter=0.2, fit_reg=False, point_size=100)

In [None]:
helper_ds.correlation(df_train, target)

#### Categorical Features

In [None]:
df_train[cat].describe()

In [None]:
helper_ds.show_categorical(df_train[cat], target, sharey=True)

In [None]:
helper_ds.show_target_vs_categorical(df_train, target)

#### Missing values

In [None]:
high_missing = helper_ds.missing(df_train, limit=0.4)
# helper_ds.fill_simple(df_train, target, missing_categorical=999, inplace=True)

In [None]:
copy_df = df_train.copy()  # checkpoint
del df_train

## A2. Neural Network

In [None]:
df_train = copy_df.copy()  # Restore checkpoint
data = df_train.copy()
# from now on use data instead of df

### Prepare data for Neural Network

#### Scale numerical variables

In [None]:
data, scale_param = helper_ds.scale(data)

#### Create dummy features

In [None]:
# features only; target encoded later
data, dict_dummies = helper_ds.replace_by_dummies(data, target)

# save features order for tests and predictions
model_features = [f for f in data if f not in target]

data.head(3)

#### Split the data into training and validation sets

In [None]:
def validation_split(data, val_size=0.15):
    """
    Split `data` into stratified training and validation subsets and return them as
    arrays: (x_train, y_train, x_val, y_val).

    The columns listed in the global `target` are the labels (and the stratification
    key); every other column is a feature. The split is seeded (random_state=0).
    """

    def _features_and_labels(frame):
        # x = every column except the target, y = the target column(s)
        return frame.drop(target, axis=1).values, frame[target].values

    train_part, val_part = train_test_split(
        data,
        test_size=val_size,
        random_state=0,
        shuffle=True,
        stratify=data[target],
    )

    x_train, y_train = _features_and_labels(train_part)
    x_val, y_val = _features_and_labels(val_part)

    return x_train, y_train, x_val, y_val


x_train, y_train, x_val, y_val = validation_split(data, val_size=0.2)

#### One-hot encode the output

In [None]:
import keras


def one_hot_output(y_train, y_val):
    """
    One-hot encode the training and validation labels with keras.utils.to_categorical.

    The number of classes is the count of distinct values found in `y_train`; the same
    count is used for `y_val`. Returns (y_train, y_val) encoded.
    """
    n_labels = len(np.unique(y_train))
    encoded = [keras.utils.to_categorical(labels, n_labels) for labels in (y_train, y_val)]
    return encoded[0], encoded[1]


y_train, y_val = one_hot_output(y_train, y_val)

In [None]:
print("train size \t X:{} \t Y:{}".format(x_train.shape, y_train.shape))
print("val size \t X:{} \t Y:{}".format(x_val.shape, y_val.shape))

### Build the Neural Network

In [None]:
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Dropout
from keras import regularizers


def build_nn_binary_classification(input_size, output_size, summary=False):
    """
    Build and compile a small fully connected classifier.

    input_size: number of input features
    output_size: number of units of the softmax output layer
    summary: if True, print the Keras model summary

    Returns the compiled Keras model.
    """

    # The hidden layer has one unit per 8 input features, but never fewer than one:
    # `input_size // 8` alone is 0 when there are fewer than 8 features.
    input_nodes = max(1, input_size // 8)

    model = Sequential()
    # Hidden layer (no activation argument is passed here), L2-regularized
    model.add(Dense(input_nodes, input_dim=input_size, kernel_regularizer=regularizers.l2(0.001)))

    # Output layer: softmax over the output units, L2-regularized
    model.add(Dense(output_size, activation="softmax", kernel_regularizer=regularizers.l2(0.001)))

    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    if summary:
        model.summary()

    return model


build_nn = build_nn_binary_classification

### Train the Neural Network

In [None]:
import os
from time import time


def train_nn(model, x_train, y_train, validation_data=None, path=False, show=True):
    """
    Train the neural network model. If no validation_data is provided, the last 20% of
    the training data is held out for validation (Keras `validation_split`).

    model: compiled Keras model; it is trained in place
    x_train, y_train: training features and targets
    validation_data: optional (x_val, y_val) tuple forwarded to model.fit
    path: if truthy, the trained model is saved to this path
    show: if True, print progress/timing and pass the history to helper_ds.show_training

    Returns the history object returned by model.fit.
    """

    if show:
        print("Training ....")

    # Early stopping monitors val_loss, so some validation data must always exist:
    # use the caller's set when given, otherwise let Keras hold out part of the training set.
    if validation_data is None:
        validation_args = {"validation_split": 0.2}
    else:
        validation_args = {"validation_data": validation_data}

    # patience=0: stop at the first epoch in which val_loss does not improve
    callbacks = [keras.callbacks.EarlyStopping(monitor="val_loss", patience=0, verbose=0)]
    t0 = time()

    history = model.fit(
        x_train,
        y_train,
        epochs=200,
        batch_size=64,
        verbose=0,
        # class_weight=cw, # worse results
        callbacks=callbacks,
        **validation_args,
    )

    if show:
        print("time: \t {:.1f} s".format(time() - t0))
        helper_ds.show_training(history)

    if path:
        model.save(path)
        print("\nModel saved at", path)

    return history


model = None
model = build_nn_binary_classification(x_train.shape[1], y_train.shape[1], summary=True)
train_nn(model, x_train, y_train, validation_data=(x_val, y_val));

### Evaluate the Model

In [None]:
# Run the test set through the same helper_ds steps as the training data, reusing the
# parameters obtained there (scale_param, dict_dummies) instead of recomputing them.
data_test = helper_ds.sort_columns_by_type(df_test, target, categorical=cat)
data_test, _ = helper_ds.scale(data_test, scale_param)
data_test, _ = helper_ds.replace_by_dummies(data_test, target, dict_dummies)
data_test = data_test[model_features + target]  # sort columns to match training features order
# Separate features (x) and target (y)
x_test, y_test = data_test.drop(target, axis=1).values, data_test[target].values
# One-hot encode the labels with 2 classes, matching the network's one-hot training targets
y_test = keras.utils.to_categorical(y_test, 2)

In [None]:
from sklearn.metrics import roc_auc_score

# score[1] is the "accuracy" metric the model was compiled with (score[0] is the loss)
score = model.evaluate(x_test, y_test, verbose=0)
print("\nNeural Network Accuracy: {:.3f}\n".format(score[1]))

# Softmax outputs of the network for the test set
y_pred = model.predict(x_test)

# ROC AUC computed on the one-hot labels against the predicted class scores
print("Neural Network ROC AUC:  {:.3f} \n".format(roc_auc_score(y_test, y_pred)))

## A3. Compare with non-neural network models

In [None]:
# Keep only column 1 of the one-hot labels, i.e. a 1-D vector that is 1 for class 1
# and 0 otherwise, which is what helper_ds.ml_classification is given below.
# NOTE: not idempotent - re-running this cell on the already flattened arrays fails.
y_train = y_train[:, 1]
y_test = y_test[:, 1]

In [None]:
# from sklearn.utils import class_weight
# y_plain = np.ravel(y_train)
# cw = class_weight.compute_class_weight('balanced', np.unique(y_plain), y_plain)
# cw = {idx : value for idx, value in enumerate(cw)}

In [None]:
helper_ds.ml_classification(x_train, y_train, x_test, y_test, cross_validation=False)

# B. Future Connection Prediction

## B1. Data Processing

In [None]:
del df_train, df_test, df_pred

In [None]:
import ast

# The first column is used as the index and is parsed from text into Python objects
# (presumably (node, node) tuples - they are indexed as pair[0], pair[1] later on).
# ast.literal_eval only accepts Python literals, so, unlike eval, it cannot execute
# arbitrary code that might be embedded in the file.
df = pd.read_csv("data/net_future_connections.csv", index_col=0, converters={0: ast.literal_eval})
df.head(6)

#### Extract edge-based attributes from the above graph

In [None]:
# Each index entry is a pair of nodes; every feature below is computed per pair on `graph`.

# Number of neighbours shared by the two nodes of the pair
df["Common Neighbors"] = df.index.map(lambda pair: len(list(nx.common_neighbors(graph, pair[0], pair[1]))))

# The NetworkX link-prediction functions yield (u, v, score) triples for the given
# pairs; only the score (position 2) is stored.
df["Jaccard Coefficient"] = [i[2] for i in nx.jaccard_coefficient(graph, df.index)]
# Column label fixed: it previously read "ResourceWarningurce Allocation"
df["Resource Allocation"] = [i[2] for i in nx.resource_allocation_index(graph, df.index)]
df["Adamic-Adar Index"] = [i[2] for i in nx.adamic_adar_index(graph, df.index)]
df["Preferential Attachment"] = [i[2] for i in nx.preferential_attachment(graph, df.index)]
df.head()

#### Explore the target and separate the prediction set

In [None]:
target = ["Future Connection"]
features = [col for col in df if col not in target]

df["Future Connection"].value_counts(dropna=False)

In [None]:
n_rows_original = df.shape[0]

# Rows without a label form the prediction set; rows labelled 0 or 1 are kept for modelling
df_pred = df[df["Future Connection"].isnull()]
df = df[(df["Future Connection"] == 0) | (df["Future Connection"] == 1)]

# Sanity check: every original row must end up in exactly one of the two sets.
# (`assert a, b == c` would only test that `a` is truthy and use `b == c` as the message.)
assert df.shape[0] + df_pred.shape[0] == n_rows_original, "labelled + unlabelled rows do not add up"

#### Split data into training and test set

In [None]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.2, stratify=df[target], random_state=0)

del df

`df_pred`: prediction set (no labels) <br>
`df_train`: training_set  <br>
`df_test`: test_set  <br>

#### Classify features
Convert categorical variables to the 'category' dtype and sort the columns: numerical + categorical + target

In [None]:
import helper_ds

cat = ["Future Connection"]
num = features  # all the features are numerical here

df_train = helper_ds.sort_columns_by_type(df_train, target, categorical=cat)

pd.DataFrame(dict(df_train.dtypes), index=["Type"])[df_train.columns].head()

### Show training data

#### Numerical Features

In [None]:
df_train[num].describe(percentiles=[0.5])

In [None]:
helper_ds.show_numerical(df_train, kde=True)

In [None]:
helper_ds.show_target_vs_numerical(df_train, target, jitter=0.2, fit_reg=False, point_size=10)

In [None]:
helper_ds.correlation(df_train, target)

#### Missing values

In [None]:
high_missing = helper_ds.missing(df_train, limit=0.4)

In [None]:
copy_df = df_train.copy()  # checkpoint
del df_train

## B2. Neural Network

In [None]:
df_train = copy_df.copy()  # Restore checkpoint
data = df_train.copy()
# from now on use data instead of df

### Prepare data for Neural Network

#### Scale numerical variables

In [None]:
data, scale_param = helper_ds.scale(data)

model_features = [f for f in data if f not in target]

#### Split the data into training and validation sets

In [None]:
def validation_split(data, val_size=0.15):
    """
    Make a seeded, stratified train/validation split of `data` and return
    (x_train, y_train, x_val, y_val) as arrays.

    Labels are the columns named in the global `target`, which is also the
    stratification key; all remaining columns are features.
    """
    subsets = train_test_split(data, test_size=val_size, random_state=0, shuffle=True, stratify=data[target])

    # For each subset (train first, then validation) collect features, then labels
    arrays = []
    for subset in subsets:
        arrays.append(subset.drop(target, axis=1).values)
        arrays.append(subset[target].values)

    x_train, y_train, x_val, y_val = arrays
    return x_train, y_train, x_val, y_val


x_train, y_train, x_val, y_val = validation_split(data, val_size=0.2)

#### One-hot encode the output

In [None]:
import keras


def one_hot_output(y_train, y_val):
    """
    Return (y_train, y_val) one-hot encoded with keras.utils.to_categorical.

    Both are encoded with the same number of classes: the number of distinct values
    present in `y_train`.
    """
    class_count = len(np.unique(y_train))

    def encode(labels):
        return keras.utils.to_categorical(labels, class_count)

    return encode(y_train), encode(y_val)


y_train, y_val = one_hot_output(y_train, y_val)

In [None]:
print("train size \t X:{} \t Y:{}".format(x_train.shape, y_train.shape))
print("val size \t X:{} \t Y:{}".format(x_val.shape, y_val.shape))

### Build the Neural Network

In [None]:
def build_nn_binary_classification(input_size, output_size, summary=False):
    """
    Build and compile a two-layer fully connected classifier.

    input_size: number of input features (also the number of hidden units)
    output_size: number of units of the softmax output layer
    summary: if True, print the Keras model summary

    Returns the compiled Keras model.
    """
    model = Sequential()

    # Hidden layer as wide as the input (no activation argument is passed), L2-regularized
    hidden_layer = Dense(input_size, input_dim=input_size, kernel_regularizer=regularizers.l2(0.0001))
    # Softmax output layer, L2-regularized
    output_layer = Dense(output_size, activation="softmax", kernel_regularizer=regularizers.l2(0.0001))

    for layer in (hidden_layer, output_layer):
        model.add(layer)

    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    if summary:
        model.summary()

    return model


build_nn = build_nn_binary_classification

### Train the Neural Network

In [None]:
def train_nn(model, x_train, y_train, validation_data=None, path=False, show=True):
    """
    Train the neural network model. If no validation_data is provided, the last 20% of
    the training data is held out for validation (Keras `validation_split`).

    model: compiled Keras model; it is trained in place
    x_train, y_train: training features and targets
    validation_data: optional (x_val, y_val) tuple forwarded to model.fit
    path: if truthy, the trained model is saved to this path
    show: if True, print progress/timing and pass the history to helper_ds.show_training

    Returns the history object returned by model.fit.
    """

    if show:
        print("Training ....")

    # Early stopping monitors val_loss, so some validation data must always exist:
    # use the caller's set when given, otherwise let Keras hold out part of the training set.
    if validation_data is None:
        validation_args = {"validation_split": 0.2}
    else:
        validation_args = {"validation_data": validation_data}

    # patience=0: stop at the first epoch in which val_loss does not improve
    callbacks = [keras.callbacks.EarlyStopping(monitor="val_loss", patience=0, verbose=0)]
    t0 = time()

    history = model.fit(
        x_train,
        y_train,
        epochs=20,
        batch_size=1024,
        verbose=0,
        callbacks=callbacks,
        **validation_args,
    )

    if show:
        print("time: \t {:.1f} s".format(time() - t0))
        helper_ds.show_training(history)

    if path:
        model.save(path)
        print("\nModel saved at", path)

    return history


model = None
model = build_nn_binary_classification(x_train.shape[1], y_train.shape[1], summary=True)
train_nn(model, x_train, y_train, validation_data=(x_val, y_val));

### Evaluate the Model

In [None]:
# Run the test set through the same helper_ds steps as the training data, reusing the
# scale_param obtained there instead of recomputing it (no dummy features in this section).
data_test = helper_ds.sort_columns_by_type(df_test, target, categorical=cat)
data_test, _ = helper_ds.scale(data_test, scale_param)
data_test = data_test[model_features + target]  # sort columns to match training features order
# Separate features (x) and target (y)
x_test, y_test = data_test.drop(target, axis=1).values, data_test[target].values
# One-hot encode the labels with 2 classes, matching the network's one-hot training targets
y_test = keras.utils.to_categorical(y_test, 2)

In [None]:
from sklearn.metrics import roc_auc_score

# score[1] is the "accuracy" metric the model was compiled with (score[0] is the loss)
score = model.evaluate(x_test, y_test, verbose=0)
print("\nNeural Network Accuracy: {:.3f}\n".format(score[1]))

# Softmax outputs of the network for the test set
y_pred = model.predict(x_test)

# ROC AUC computed on the one-hot labels against the predicted class scores
print("Neural Network ROC AUC:  {:.3f} \n".format(roc_auc_score(y_test, y_pred)))

## B3. Compare with non-neural network models

In [None]:
# Keep only column 1 of the one-hot labels, i.e. a 1-D vector that is 1 for class 1
# and 0 otherwise, which is what helper_ds.ml_classification is given below.
# NOTE: not idempotent - re-running this cell on the already flattened arrays fails.
y_train = y_train[:, 1]
y_test = y_test[:, 1]

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# clf=None
# clf = RandomForestClassifier()
# clf.fit(x_train, np.ravel(y_train))
# print("\nRandom Forest Accuracy: {:.3f}\n".format(clf.score(x_train, y_train)))
# y_pred = clf.predict_proba(x_test)
# print('Random Forest ROC_AUC: {:.3f}'.format(roc_auc_score(y_test, y_pred[:,1])))

In [None]:
helper_ds.ml_classification(x_train, y_train, x_test, y_test, cross_validation=False)