# Machine learning on graphs

- prediction based on graph metrics

---

_You are currently looking at **version 1.2** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-social-network-analysis/resources/yPcBs) course resource._

---

In [None]:
%matplotlib inline

import ast
import pickle
from collections import Counter

import matplotlib as mpl
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd

In [None]:
!pip list | grep -i netw

---

## Part 1 - Random Graph Identification

For the first part of this assignment you will analyze randomly generated graphs and determine which algorithm created them.

In [None]:
# Load the pickled list of graphs for Part 1; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
with open('../_data/A4_graphs.dms', 'rb') as graphs_file:
    P1_Graphs = pickle.load(graphs_file)
P1_Graphs

<br>
`P1_Graphs` is a list containing 5 networkx graphs. Each of these graphs was generated by one of three possible algorithms:
* Preferential Attachment (`'PA'`)
* Small World with low probability of rewiring (`'SW_L'`)
* Small World with high probability of rewiring (`'SW_H'`)

Analyze each of the 5 graphs and determine which of the three algorithms generated the graph.

*The `graph_identification` function should return a list of length 5 where each element in the list is either `'PA'`, `'SW_L'`, or `'SW_H'`.*

__Real World:__

 - degrees are distributed as __Power Law__ (log/log degree distribution is straight line)
 - shortest path < 7.5
 - clustering < 0.1


__Small world:__

 - degrees are NOT distributed as __Power Law__ 
 - More nodes => 
   - higher average shortest path
   - lower average clustering
 - Higher rewiring p: lower clustering and lower shortest path
 
- __SW_L: Lattice - Small World:__
 - higher shortest paths > 7.5, higher clustering > 0.1
 - max clustering = 0.1
 - max shortest path = 7.5

- __SW_H: Small world - Random:__ 
 - lower shortest paths < 7.5, lower clustering > 0.02
 - max clustering = 0.02
 - max shortest path = 4.5
 

In [None]:
def graph_type(G):
    """Guess which generator produced `G` and describe the graph.

    Heuristic:
      * 'PA'   - the (up to) five most frequent degrees are also in
                 non-decreasing degree order, i.e. the more common a degree is,
                 the smaller it is (a monotonically decaying distribution).
      * 'SW_H' - otherwise, average shortest path < 7.5 or average
                 clustering < 0.25.
      * 'SW_L' - everything else.

    Parameters
    ----------
    G : networkx graph
        Must be connected; networkx raises when computing the average
        shortest path length of a disconnected graph.

    Returns
    -------
    str
        Two-line summary with the label, the average shortest path length and
        the average clustering coefficient.
    """
    degree_counts = Counter(dict(nx.degree(G)).values())
    shortest_path = nx.average_shortest_path_length(G)
    clustering = nx.average_clustering(G)

    # Degrees ordered from most to least frequent (at most five of them).
    top_degrees = [degree for degree, _ in degree_counts.most_common(5)]

    # Distribution is monotonically decreasing: frequency falls as degree grows.
    if all(low <= high for low, high in zip(top_degrees, top_degrees[1:])):
        label = 'PA'
    elif shortest_path < 7.5 or clustering < 0.25:
        label = 'SW_H'
    else:
        label = 'SW_L'

    return 'Graph type: {} \nShortest path length: {:.2f}, Clustering: {:.2f}'.format(
        label, shortest_path, clustering)

### Barabasi-Albert graph 

- Preferential attachment mode

https://networkx.github.io/documentation/networkx-1.9/reference/generators.html

In [None]:
# Degree distribution and layout of Barabasi-Albert (preferential attachment) graphs.
# `edges` is the number of edges to attach from a new node to existing nodes.
from collections import Counter
for edges in [1, 2, 5, 20]:
    # Seeded so that a re-run reproduces the same graphs and figures.
    G = nx.barabasi_albert_graph(100, edges, seed=0)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    ttl = nx.number_of_nodes(G)

    # Count how many nodes have each degree (computed once, used for both axes of the bar chart).
    degree_counts = Counter(dict(nx.degree(G)).values())
    degrees = list(degree_counts.keys())
    fraction = np.array(list(degree_counts.values())) / ttl

    _ = ax1.bar(degrees, fraction)
    _ = ax1.set_xlabel('Degree')
    _ = ax1.set_ylabel('Fraction of Nodes')
    _ = ax1.spines['top'].set_visible(False)
    _ = ax1.spines['right'].set_visible(False)
    # `width` is the edge line width (the original keyword `edge_with` is not a drawing option).
    _ = nx.draw_networkx(G, node_size=.7, width=.2, with_labels=False, alpha=.5, ax=ax2)
    _ = plt.suptitle('Barabasi-Albert graph, # connecting edges:{}\n{}'.format(edges, graph_type(G)))
    _ = ax2.axis('off')
    plt.show()

#### Play ground and sanity check graph data

In [None]:
# dict(nx.degree(G))                            # edges per node
# Counter(dict(nx.degree(G)))                   # edges per node
Counter(dict(nx.degree(G)).values())            # distribution of # edges as dict
Counter(dict(nx.degree(G)).values()).items()    # same as tuples

### Watts-Strogatz graph

- small-world model

In [None]:
# Watts-Strogatz small-world graph.
# n:     the number of nodes
# knn:   each node is connected to k nearest neighbors in ring topology
# proba: the probability of rewiring each edge

from collections import Counter

# Defined here so the cell does not depend on a later cell having been run first.
n = 100
for knn in [2, 5, 10]:
    for proba in [.1, .2, .4, .8, 1.]:
        try:
            G = nx.connected_watts_strogatz_graph(n, knn, proba, tries=1000, seed=0)
        except nx.NetworkXError as err:
            # No connected graph was found within `tries` attempts: report it and move on.
            print('Skipping k-nn={}, proba={}: {}'.format(knn, proba, err))
            continue

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
        ttl = nx.number_of_nodes(G)

        # Count how many nodes have each degree (computed once for the bar chart).
        degree_counts = Counter(dict(nx.degree(G)).values())
        degrees = list(degree_counts.keys())
        fraction = np.array(list(degree_counts.values())) / ttl

        _ = ax1.bar(degrees, fraction)
        _ = ax1.set_xlabel('Degree')
        _ = ax1.set_ylabel('Fraction of Nodes')
        _ = ax1.spines['top'].set_visible(False)
        _ = ax1.spines['right'].set_visible(False)
        # `width` is the edge line width (the original keyword `edge_with` is not a drawing option).
        _ = nx.draw_networkx(G, node_size=.7, width=.2, with_labels=False, alpha=.5, ax=ax2)
        _ = plt.suptitle('Connected Watts-Strogatz graph \nnodes: {}, k-nn: {}, proba: {}\n{}'.format(
            n, knn, proba, graph_type(G)))
        _ = ax2.axis('off')
        plt.show()

In [None]:
# Holme and Kim algorithm for growing graphs with powerlaw
# n:       the number of nodes
# n_edges: the number of random edges to add for each new node
# proba:   probability of adding a triangle after adding a random edge

n=100
from collections import Counter
for n_edges in [2, 4, 8]:
    for proba in [.1, .2, .4, .8, 1.]:
        try:
            G = nx.powerlaw_cluster_graph(n, n_edges, proba, seed=0)
        except nx.NetworkXError as err:
            # Invalid generator parameters: report them instead of silently skipping.
            print('Skipping random edges={}, proba={}: {}'.format(n_edges, proba, err))
            continue

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
        ttl = nx.number_of_nodes(G)

        # Count how many nodes have each degree (computed once for the bar chart).
        degree_counts = Counter(dict(nx.degree(G)).values())
        degrees = list(degree_counts.keys())
        fraction = np.array(list(degree_counts.values())) / ttl

        _ = ax1.bar(degrees, fraction)
        _ = ax1.set_xlabel('Degree')
        _ = ax1.set_ylabel('Fraction of Nodes')
        _ = ax1.spines['top'].set_visible(False)
        _ = ax1.spines['right'].set_visible(False)
        # `width` is the edge line width (the original keyword `edge_with` is not a drawing option).
        _ = nx.draw_networkx(G, node_size=.7, width=.2, with_labels=False, alpha=.5, ax=ax2)
        # The second value is the number of random edges per new node, not a k-nn setting.
        _ = plt.suptitle('Holme & Kim growing graph \nnodes: {}, random edges: {}, proba: {}\n{}'.format(
            n, n_edges, proba, graph_type(G)))
        _ = ax2.axis('off')
        plt.show()

---

## Part 2 - Company Emails

For the second part of this assignment you will be working with a company's email network where each node corresponds to a person at the company, and each edge indicates that at least one email has been sent between two people.

The network also contains the node attributes `Department` and `ManagementSalary`.

`Department` indicates the department in the company which the person belongs to, and `ManagementSalary` indicates whether that person is receiving a management position salary.

### Convert networkx 1.X pickle-file to 2.X

The Pickle protocol does not store class methods, only the data. So if you write a pickle file with v1 you should not expect to read it into a v2 Graph. If this happens to you, read it in with v1 installed and write a file with the node and edge information. You can read that into a config with v2 installed and then add those nodes and edges to a fresh graph. 

```python
# Open different environment:
!pip list | grep -i network
!pip install networkx==1.11
import networkx as nx

G = nx.read_gpickle('./email_prediction.txt')
print(nx.info(G))
```

```python
import pandas as pd
edges = pd.DataFrame(list(G.edges(data=True)))
nodes = pd.DataFrame(list(G.nodes(data=True)))

# Use a lambda to pull out the attributes from the attributes dictionary in column 1
nodes['Department'] = nodes.loc[:, 1].map(lambda x: x['Department'])
nodes['ManagementSalary'] = nodes.loc[:, 1].map(lambda x: x['ManagementSalary'])
del nodes[1]

edges.to_csv('email_edges.csv')
nodes.to_csv('email_nodes.csv')
```

### Import edges and nodes into networkx 2.X

In [None]:
# Load the node and edge tables (presumably the CSV files written by the export snippet above).
# The first CSV column becomes the index; the remaining columns are labelled through `names`.
nodes = pd.read_csv('../_data/email_nodes.csv', index_col=0, 
                    names=['node', 'Department', 'ManagementSalary'])
edges = pd.read_csv('../_data/email_edges.csv', index_col=0,
                   names=['n1', 'n2', 'attr'])
nodes.sample(3)  # not rendered: a notebook only displays the last expression of a cell
edges.sample(3)

In [None]:
# First create graph from edges, then add nodes
# Each row of `edges` becomes an edge n1-n2 carrying the 'attr' column as edge attribute.
# NOTE(review): `nx.from_pandas_dataframe` only exists up to networkx 2.0; later releases
# renamed it to `nx.from_pandas_edgelist` (same arguments) - confirm the installed version.
G = nx.from_pandas_dataframe(edges, 'n1', 'n2', edge_attr='attr')
print(nx.info(G))

In [None]:
# Attach Department / ManagementSalary to every node that is already part of G.
# Membership is tested on the node id itself; `node in G` is a constant-time lookup,
# so the node list is no longer rebuilt for every row.
for node, department, management_salary in zip(
        nodes['node'], nodes['Department'], nodes['ManagementSalary']):
    if node in G:
        G.add_node(node, Department=department, ManagementSalary=management_salary)

In [None]:
list(G.nodes(data=True))[:10]

***

### Part 2A - Salary Prediction

Using network `G`, identify the people in the network with missing values for the node attribute `ManagementSalary` and predict whether or not these individuals are receiving a management position salary.

To accomplish this, you will need to create a matrix of node features using networkx, train a sklearn classifier on nodes that have `ManagementSalary` data, and predict a probability of the node receiving a management salary for nodes where `ManagementSalary` is missing.



Your predictions will need to be given as the probability that the corresponding employee is receiving a management position salary.

The evaluation metric for this assignment is the Area Under the ROC Curve (AUC).

Your grade will be based on the AUC score computed for your classifier. A model with an AUC of 0.88 or higher will receive full points, and with an AUC of 0.82 or higher will pass (get 80% of the full points).

Using your trained classifier, return a series of length 252 with the data being the probability of receiving management salary, and the index being the node id.

    Example:
    
        1       1.0
        2       0.0
        5       0.8
        8       1.0
            ...
        996     0.7
        1000    0.5
        1001    0.0
        Length: 252, dtype: float64

### Network metrics

In [None]:
# Node-level feature table: one row per node of G, one column per attribute / network metric.
# Every column is built from a {node: value} mapping so pandas aligns the values by node id.
# (A plain list would be aligned by position 0..n-1 instead, which only matches the node ids
# when G happens to iterate its nodes in exactly that order.)
node_degree = pd.Series(dict(G.degree()))
hubs, authorities = nx.hits(G)  # computed once; returns (hubs, authorities)

df = pd.DataFrame(index=list(G.nodes()))
df['Department'] = pd.Series(nx.get_node_attributes(G, 'Department'))
df['ManagementSalary'] = pd.Series(nx.get_node_attributes(G, 'ManagementSalary'))
df['clustering'] = pd.Series(nx.clustering(G))
df['degree_0'] = node_degree
df['degree_1'] = node_degree  # same values as degree_0; kept so the feature columns are unchanged
df['degree_cent'] = pd.Series(nx.degree_centrality(G))
df['closeness'] = pd.Series(nx.closeness_centrality(G))
df['betweenness'] = pd.Series(nx.betweenness_centrality(G))
df['pagerank'] = pd.Series(nx.pagerank(G, alpha=0.80))
df['hub'] = pd.Series(hubs)
df['authority'] = pd.Series(authorities)
df.sample(10)

### Dummy vars

In [None]:
df.Department.value_counts()[:5]

In [None]:
pd.get_dummies(df.Department, prefix='dept').head()
df = pd.concat([df, pd.get_dummies(df.Department, prefix='dept', drop_first=True)], axis=1)

In [None]:
df.sample()

In [None]:
del df['Department']

### Split in train test set

In [None]:
# Split train test: rows without a ManagementSalary label are the ones to predict.
df_test_mask = pd.isnull(df.loc[:, 'ManagementSalary'])
# Explicit copies, so the in-place `pop` below never touches (or warns about) `df`.
df_train = df.loc[~df_test_mask].copy()
df_test = df.loc[df_test_mask].drop('ManagementSalary', axis=1)

# Train set X, y (pop removes the label column from df_train, leaving only features)
y_train = df_train.pop('ManagementSalary').astype('f').astype('i')
X_train = df_train
idx_train = df_train.index

# Test set X
X_test = df_test
idx_test = df_test.index

### Scaling numerical features

In [None]:
X_train.info()
num_types = ['int','float', 'uint8']
X_train.select_dtypes(num_types).sample()
X_test.select_dtypes(num_types).sample()
features = X_train.select_dtypes(num_types).columns
features

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_sc = scaler.fit_transform(X_train.select_dtypes(num_types))
X_test_sc = scaler.transform(X_test.select_dtypes(num_types))

In [None]:
X_train_sc.shape

### PCA transform X to principal components

In [None]:
# Number of principal components to keep; also used below to label the loading vectors.
n_components = 10
from sklearn.decomposition import PCA
# Pass the variable rather than a second literal so the labels and the model cannot drift apart.
pca = PCA(n_components=n_components)
pca.fit(X_train_sc)

### Loading Vectors - Eigen Vectors

In [None]:
pca.components_.shape

In [None]:
vectors = ['V'+str(x) for x in range(1, n_components+1)]
components = ['PC'+str(x) for x in range(1, n_components+1)]
pca_loadings = pd.DataFrame(pca.components_.T, index=features, columns=vectors)
pca_loadings

### Principal Components

In [None]:
X_train_pca = pd.DataFrame(pca.fit_transform(X_train_sc), index=idx_train, columns=components)
X_train_pca.sample(5)

In [None]:
# Project the test rows onto the components learned from the training data.
# (Refitting the PCA on the test set would place train and test in different bases.)
X_test_pca = pd.DataFrame(pca.transform(X_test_sc), index=idx_test, columns=components)
X_test_pca.sample(5)

***

### Learn Classifiers

In [None]:
# Import preprocessing, selection and metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier

In [None]:
classifiers = [
    GaussianNB(),
    DecisionTreeClassifier(random_state=0),
    GradientBoostingClassifier(random_state=0),
    RandomForestClassifier(n_estimators=100, random_state=0),
    AdaBoostClassifier(learning_rate=0.1, n_estimators=100, random_state=0),
    KNeighborsClassifier(),
]

In [None]:
def auc_scores(model, *args, k=5, threshold=0.50):
    """Print k-fold cross-validated ROC AUC scores for `model`.

    Two figures are printed: the AUC of the hard class predictions and the AUC
    of the predicted probability of the positive class (second column of
    `predict_proba`). The probabilities are scored as-is, because ROC AUC
    measures how well the scores rank the samples.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style classifier.
    *args : (X, y)
        Feature matrix and binary labels, exactly two positional values.
    k : int, default 5
        Number of cross-validation folds.
    threshold : float, default 0.50
        Accepted for backward compatibility; it is no longer applied, since
        thresholding the probabilities discards the ranking AUC is based on.

    Returns
    -------
    None
        The scores are printed. Errors raised by the model or the metric
        propagate instead of being silently discarded.
    """
    X, y = args

    predictions = cross_val_predict(model, X, y, cv=k, n_jobs=-1)
    print('AUC - Test predict  {:.2%}'.format(roc_auc_score(y, predictions)))

    # Models without probability estimates only get the hard-prediction score.
    if not hasattr(model, 'predict_proba'):
        print('AUC - Test probabil n/a (model has no predict_proba)')
        return

    probabilities = cross_val_predict(model, X, y, cv=k, method='predict_proba', n_jobs=-1)[:, 1]
    print('AUC - Test probabil {:.2%}'.format(roc_auc_score(y, probabilities)))

In [None]:
def salary_predictions(X_train, y_train, X_test, classifiers):
    """Fit each classifier, print its AUC scores and predict P(class 1) for X_test.

    For every classifier the function prints the training AUC (on hard
    predictions), calls `auc_scores` for the cross-validated scores, and then
    predicts the probability of label 1 for every row of `X_test`.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and binary labels (0 / 1).
    X_test : array-like or DataFrame
        Rows to predict. A DataFrame's own index labels the result; otherwise
        the notebook-level `idx_test` is used when its length matches, and a
        positional index as a last resort.
    classifiers : iterable of estimators
        Fitted in place, in order.

    Returns
    -------
    pandas.Series or None
        Probabilities of label 1 from the LAST classifier in `classifiers`.
        None when the list is empty or when that classifier cannot provide
        class-1 probabilities.
    """
    if isinstance(X_test, (pd.DataFrame, pd.Series)):
        test_index = X_test.index
    else:
        test_index = globals().get('idx_test')

    pred_series = None
    for clf in classifiers:
        print('-'*80)
        print(clf)

        # Training scores
        clf.fit(X_train, y_train)
        pred_train = clf.predict(X_train)
        print('AUC - Train pred    {:.2%}'.format(roc_auc_score(y_train, pred_train)))

        # CV scores
        auc_scores(clf, X_train, y_train)

        # predict_proba returns one column per entry of clf.classes_; keep the column of label 1.
        pred_series = None
        classes = list(getattr(clf, 'classes_', []))
        if not hasattr(clf, 'predict_proba') or 1 not in classes:
            print('No class-1 probabilities available for this classifier')
            continue

        proba = np.asarray(clf.predict_proba(X_test))[:, classes.index(1)]
        if test_index is not None and len(test_index) == len(proba):
            pred_series = pd.Series(proba, index=test_index, name=1).rename_axis('idx')
        else:
            pred_series = pd.Series(proba, name=1)

    return pred_series

#### Fit and evaluate classifiers based on scaled dataset

In [None]:
salary_predictions(X_train_sc, y_train, X_test_sc, classifiers)

#### Fit and evaluate classifiers based on PCA dataset

In [None]:
salary_predictions(X_train_pca, y_train, X_test_pca, classifiers)

### Tune best classifier with GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
# NOTE(review): newer scikit-learn releases renamed the 'deviance' loss to 'log_loss' -
# confirm against the installed version.
parameters = {
    'loss': ['deviance', 'exponential'],
    'learning_rate': [.05, .1, .2, .4, .8],
    'max_depth': [3, 4, 5, 6]}

# Select hyper-parameters on ROC AUC, the metric this task is evaluated on
# (without `scoring`, GridSearchCV ranks candidates by the estimator's default score, accuracy).
clf = GridSearchCV(GradientBoostingClassifier(random_state=0), parameters, scoring='roc_auc')
clf.fit(X_train_sc, y_train)

#### Metrics

In [None]:
sorted(clf.cv_results_.keys())
np.mean(clf.cv_results_['mean_test_score'])

#### Best estimator and parameters

In [None]:
clf.best_estimator_
clf.best_params_

#### Best CV score

In [None]:
clf.best_score_

#### Train on best parameters

In [None]:
gb = GradientBoostingClassifier(**clf.best_params_, random_state=0).fit(X_train_sc, y_train)

#### Predict probability on best model

In [None]:
# Label rows with the node ids being predicted and columns with the classes of the model that produced them.
pd.DataFrame(gb.predict_proba(X_test_sc), index=idx_test, columns=gb.classes_).sample(10)

***

### Part 2B - New Connections Prediction

For the last part of this assignment, you will predict future connections between employees of the network. The future connections information has been loaded into the variable `future_connections`. The index is a tuple indicating a pair of nodes that currently do not have a connection, and the `Future Connection` column indicates if an edge between those two nodes will exist in the future, where a value of 1.0 indicates a future connection.

In [None]:
import operator
# Import preprocessing, selection and metrics
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC

### Data

In [None]:
# The first column stores each node pair as text, e.g. "(107, 348)". ast.literal_eval turns
# it into a tuple while only accepting Python literals, unlike eval which runs any expression.
df = pd.read_csv('../_data/Future_Connections.csv', index_col=0, converters={0: ast.literal_eval})
df.sample(10)

#### Reduce dataset 

In [None]:
# df = df.sample(1000)
df.head()

In [None]:
df['Future Connection'].value_counts()
pd.isnull(df.loc[:, 'Future Connection']).sum()

#### Labels

In [None]:
df_testset = pd.isnull(df.loc[:, 'Future Connection'])

In [None]:
df_testset

In [None]:
y_train = df.loc[~df_testset, 'Future Connection'].values.astype('i')
# y_test = df.loc[df_testset, 'Future Connection'].values

In [None]:
y_train[:10]
# y_test[:10]

In [None]:
df.info()

#### Build data set of network metrics

Using network `G` and `future_connections`, identify the edges in `future_connections` with missing values and predict whether or not these edges will have a future connection.

To accomplish this, you will need to create a matrix of features for the edges found in `future_connections` using networkx, train a sklearn classifier on those edges in `future_connections` that have `Future Connection` data, and predict a probability of the edge being a future connection for those edges in `future_connections` where `Future Connection` is missing.



Your predictions will need to be given as the probability of the corresponding edge being a future connection.

The evaluation metric for this assignment is the Area Under the ROC Curve (AUC).

Your grade will be based on the AUC score computed for your classifier. A model with an AUC of 0.88 or higher will receive full points, and with an AUC of 0.82 or higher will pass (get 80% of the full points).

Using your trained classifier, return a series of length 122112 with the data being the probability of the edge being a future connection, and the index being the edge as represented by a tuple of nodes.

    Example:
    
        (107, 348)    0.35
        (542, 751)    0.40
        (20, 426)     0.55
        (50, 989)     0.35
                  ...
        (939, 940)    0.15
        (555, 905)    0.35
        (75, 101)     0.65
        Length: 122112, dtype: float64

In [None]:
print(nx.info(G))

In [None]:
# Measure 1: Common Neighbors (intersection of the two neighborhoods)
# The number of common neighbors of nodes 𝑋 and 𝑌
# L holds one (u, v, count) triple per node pair in df.index, in index order
L = [(e[0], e[1], len(list(nx.common_neighbors(G, e[0], e[1])))) for e in df.index]

# Keep only the counts; the list order matches the rows of df
df['common_nb'] = [p for u, v, p in L]

In [None]:
df.head()

In [None]:
# Measure 1: Common Neighbors (intercept)
# The number of common neighbors of nodes 𝑋 and 𝑌
# L = [(e[0], e[1], len(list(nx.common_neighbors(G, e[0], e[1]))))
#      for e in nx.non_edges(G)]

# df['pair'] = [(u, v) for u, v, p in L]
# df['common_nb'] = [p for u, v, p in L]

In [None]:
# Measure 2: Jaccard Coefficient (intersection over union)
# Number of common neighbors normalized by the total number of neighbors
# common_neighbors/total_neighbors
# Scores are produced for the pairs in df.index and stored positionally via .values
df['jaccard'] = pd.Series([p for u, v, p in nx.jaccard_coefficient(G, df.index)]).values

# Returns:
# piter – An iterator of 3-tuples in the form (u, v, p)
# where (u, v) is a pair of nodes and p is their Jaccard coefficient.

In [None]:
df.sample(3)

In [None]:
# Measure 3: Resource
# Fraction of a ”resource” that a node can send to another through their common neighbors
# sum(1/degree_common_neighbor)
df['resource'] = pd.Series([p for u, v, p in nx.resource_allocation_index(G, df.index)]).values

# Returns:
# piter – An iterator of 3-tuples in the form (u, v, p) 
# where (u, v) is a pair of nodes and p is their resource allocation index.

In [None]:
df.sample(3)

In [None]:
# Measure 4:
# Adamic Adar Index
# Similar to resource allocation index, but with log in the denominator
# sum(1/log(degree_common_neighbor))
df['adamic_adar'] = pd.Series([p for u, v, p in nx.adamic_adar_index(G, df.index)]).values

# Returns: 
# piter – An iterator of 3-tuples in the form (u, v, p) 
# where (u, v) is a pair of nodes and p is their Adamic-Adar index.

In [None]:
df.sample(3)

In [None]:
# Method 5:
# Preferential Attachment
# In the preferential attachment model, nodes with high degree get more neighbors
# degree_source * degree_target
df['pref_att'] = pd.Series([p for u, v, p in nx.preferential_attachment(G, df.index)]).values

# Returns:
# piter – An iterator of 3-tuples in the form (u, v, p) 
# where (u, v) is a pair of nodes and p is their preferential attachment score.

In [None]:
df.sample(3)

In [None]:
# Measure 6:
# Community Common Neighbors
# Number of common neighbors with bonus of 1 for each neighbor in same community
# f(w) = 1 if common neighbor w is in the same community as both nodes else 0
# |common_neighbors| + sum(f(w))
# Each node's Department is used as its community label. Iterating .items() pairs every
# node id with its own department (iterating the mapping alone yields only the node ids).
for node, dept in nx.get_node_attributes(G, 'Department').items():
    G.nodes[node]['community'] = dept

df['com_common_nb'] = pd.Series([p for u, v, p in nx.cn_soundarajan_hopcroft(G, df.index)]).values

In [None]:
df.sample(3)

In [None]:
# Measure 7:
# Community Resource Allocation
# Similar to resource allocation index, but only considering nodes in the same community
# f(u) = 1 if same community else 0
# sum(f(u)/degree)
df['com_resource'] = pd.Series([p for u, v, p in nx.ra_index_soundarajan_hopcroft(G, df.index)]).values

# Returns:
# piter – An iterator of 3-tuples in the form (u, v, p) 
# where (u, v) is a pair of nodes and p is their score.

In [None]:
df.sample(3)

In [None]:
# Measure 8: TODO
# Community Resource Allocation
# Similar to resource allocation index, but only considering nodes in the same community
# f(u) = 1 if same community else 0
# sum(f(u)/degree)
# NOTE(review): this calls nx.cn_soundarajan_hopcroft exactly like Measure 6, so
# 'cn_com_resource' repeats the values of 'com_common_nb', while the description above
# is the one from Measure 7. Presumably a different index was intended - confirm.
df['cn_com_resource'] = pd.Series([p for u, v, p in nx.cn_soundarajan_hopcroft(G, df.index)]).values

# Returns:
# piter – An iterator of 3-tuples in the form (u, v, p)
# where (u, v) is a pair of nodes and p is their score.

In [None]:
df.sample(3)

In [None]:
df.info()

### Train test split

In [None]:
# Pairs with a known label are the training features; pairs without one are to be predicted.
# The label column is removed from both feature tables.
X_train = df.loc[~df_testset].drop('Future Connection', axis=1)

X_test = df.loc[df_testset].drop('Future Connection', axis=1)

In [None]:
X_train.sample()
X_test.sample()

### Scaling numerical features

In [None]:
X_train.info()

In [None]:
num_types = ['int','float', 'uint8']
X_train.select_dtypes(num_types).sample()
X_test.select_dtypes(num_types).sample()
features = X_train.select_dtypes(num_types).columns
features

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_sc = scaler.fit_transform(X_train.select_dtypes(num_types))
X_test_sc = scaler.transform(X_test.select_dtypes(num_types))

In [None]:
X_train_sc.shape

In [None]:
classifiers = [
    GaussianNB(),
    DecisionTreeClassifier(random_state=0),
    GradientBoostingClassifier(random_state=0),
    RandomForestClassifier(n_estimators=100, random_state=0),
    AdaBoostClassifier(learning_rate=0.1, n_estimators=100, random_state=0),
    KNeighborsClassifier(),
#     LinearSVC(random_state=0)
    ]

### 

In [None]:
def auc_scores(model, *args, k=5, threshold=0.50):
    """Print the ROC AUC of k-fold cross-validated class predictions.

    `args` must be exactly `(X, y)`. `k` is the number of folds. `threshold`
    is accepted but not used by this version. Nothing is returned.

    Note: this redefinition replaces the Part 2A function of the same name.
    """
    features, labels = args
    fold_predictions = cross_val_predict(model, features, labels, cv=k, n_jobs=-1)
    score = roc_auc_score(labels, fold_predictions)
    print('AUC - Test predict  {:.2%}'.format(score))

In [None]:
def new_connections_predictions(X_train, y_train, X_test, classifiers):
    """Fit each classifier, print its AUC scores and predict P(class 1) for X_test.

    For every classifier the function prints the training AUC (on hard
    predictions), calls `auc_scores` for the cross-validated score, and then
    predicts the probability of label 1 (a future connection) for every row
    of `X_test`.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and binary labels (0 / 1).
    X_test : array-like or DataFrame
        Rows to predict. A DataFrame's own index labels the result; otherwise
        the notebook-level `idx_test` is used, but only when its length
        matches the number of predictions (a leftover index from another part
        of the notebook is ignored); a positional index is the last resort.
    classifiers : iterable of estimators
        Fitted in place, in order.

    Returns
    -------
    pandas.Series or None
        Probabilities of label 1 from the LAST classifier in `classifiers`.
        None when the list is empty or when that classifier cannot provide
        class-1 probabilities.
    """
    if isinstance(X_test, (pd.DataFrame, pd.Series)):
        test_index = X_test.index
    else:
        test_index = globals().get('idx_test')

    pred_series = None
    for clf in classifiers:
        print('-'*80)
        print(clf)

        # Training scores
        clf.fit(X_train, y_train)
        pred_train = clf.predict(X_train)
        print('AUC - Train pred    {:.2%}'.format(roc_auc_score(y_train, pred_train)))

        # CV scores
        auc_scores(clf, X_train, y_train)

        # predict_proba returns one column per entry of clf.classes_; keep the column of label 1.
        pred_series = None
        classes = list(getattr(clf, 'classes_', []))
        if not hasattr(clf, 'predict_proba') or 1 not in classes:
            print('No class-1 probabilities available for this classifier')
            continue

        proba = np.asarray(clf.predict_proba(X_test))[:, classes.index(1)]
        if test_index is not None and len(test_index) == len(proba):
            pred_series = pd.Series(proba, index=test_index, name=1).rename_axis('idx')
        else:
            pred_series = pd.Series(proba, name=1)

    return pred_series

#### Fit and evaluate classifiers based on scaled dataset

In [None]:
# The helper labels its output with the notebook-level `idx_test`, which at this point still
# holds the Part 2A node ids; point it at the node pairs that are being predicted here.
idx_test = X_test.index
new_connections_predictions(X_train_sc, y_train, X_test_sc, classifiers)

#### Fit and evaluate classifiers based on PCA dataset

### Tune best classifier with GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
# NOTE(review): newer scikit-learn releases renamed the 'deviance' loss to 'log_loss' -
# confirm against the installed version.
parameters = {
    'loss': ['deviance', 'exponential'],
    'learning_rate': [.05, .1, .2, .4, .8],
    'max_depth': [3, 4, 5, 6]}

# Select hyper-parameters on ROC AUC, the metric this task is evaluated on
# (without `scoring`, GridSearchCV ranks candidates by the estimator's default score, accuracy).
clf = GridSearchCV(GradientBoostingClassifier(random_state=0), parameters,
                   scoring='roc_auc', return_train_score=True)
clf.fit(X_train_sc, y_train)

#### Metrics

In [None]:
sorted(clf.cv_results_.keys())

In [None]:
np.mean(clf.cv_results_['mean_test_score']).round(3)

#### Best estimator and parameters

In [None]:
clf.best_estimator_
clf.best_params_

#### Best CV score

In [None]:
clf.best_score_

#### Train on best parameters

In [None]:
gb = GradientBoostingClassifier(**clf.best_params_, random_state=0).fit(X_train_sc, y_train)

In [None]:
# train_score_[i] already is the training loss after boosting stage i (not an accuracy-like
# score), so it is used directly rather than subtracted from 1.
loss = gb.train_score_
loss

#### Predict on best model

In [None]:
gb.predict(X_test_sc)

In [None]:
nb = GaussianNB(priors=None).fit(X_train_sc, y_train)

In [None]:
nb.predict(X_test_sc)

In [None]:
np.mean((nb.predict(X_test_sc) == gb.predict(X_test_sc)))

In [None]:
nb.predict_proba(X_test_sc)

In [None]:
df_pred = pd.DataFrame  # NOTE(review): binds the DataFrame class itself and is not used in the visible cells - confirm intent
# Rows are labelled with the node pairs being predicted; columns with each model's own classes.
df_naive_bayes = pd.DataFrame(nb.predict_proba(X_test_sc), index=X_test.index, columns=nb.classes_)
df_gradient_boost = pd.DataFrame(gb.predict_proba(X_test_sc), index=X_test.index, columns=gb.classes_)

In [None]:
df_naive_bayes