In [None]:
import os
import sys
import re
import pandas as pd
import numpy as np
from fuzzywuzzy import process
import fuzzywuzzy as fw

In [None]:
# Project root folder; the data and reference file locations used below are built from it.
path = r'P:\MyWork\product-recomender'

In [None]:
# Names of the raw workbooks to combine: every entry in <path>\data\raw whose
# name contains 'extract'. The separators are fully escaped ('\d' is not a
# valid escape sequence) and the list is sorted because os.listdir returns
# entries in arbitrary order.
files = sorted(x for x in os.listdir(path + '\\data\\raw') if 'extract' in x)
files

In [None]:
def combine_files(path, files):
    """Read the 'Main Page' sheet of each workbook and stack them into one frame.

    Parameters
    ----------
    path : str
        Folder prefix that is concatenated directly with each file name, so it
        must already end with a path separator.
    files : iterable of str
        Workbook file names located under ``path``.

    Returns
    -------
    pandas.DataFrame
        All sheets concatenated, sorted by 'Policy YOA' ascending, re-indexed
        from 0, with column names lower-cased and spaces replaced by '_'.

    Raises
    ------
    ValueError
        If ``files`` is empty (there is nothing to concatenate).
    """
    files = list(files)
    if not files:
        raise ValueError('No extract files were supplied to combine.')

    sheets = []
    # Read each Excel file and keep its data sheet.
    for file in files:
        print('\rReading "%s"' % file, end='')
        # `sheet_name` replaces the removed `sheetname` keyword; passing a
        # single name returns the DataFrame directly instead of a dict.
        sheets.append(
            pd.read_excel(path + file, sheet_name='Main Page', skiprows=4, index_col=None)
        )

    print('\n\t combining files...')
    # Concatenate the sheets into one frame ordered by year of account.
    df = (
        pd.concat(sheets)
        .sort_values(by='Policy YOA', ascending=True)
        .reset_index(drop=True)
    )

    print('\t standardising column names and values...')
    # Lower-case the column names and replace spaces with underscores;
    # str() guards against non-string headers.
    return df.rename(columns=lambda name: str(name).replace(' ', '_').lower())

In [None]:
# Combine every raw extract, then drop rows missing the insured party or the
# department. The row count is printed before and after the filter.
# The path separators are fully escaped ('\d' is not a valid escape sequence).
dat = combine_files(path + '\\data\\raw\\', files)
print(len(dat))
dat = dat.dropna(subset=['insured_party', 'department'])
print(len(dat))

In [None]:
# Persist the combined extract as a pipe-delimited UTF-8 text file.
# The path separators are fully escaped ('\d' is not a valid escape sequence).
dat.to_csv(path + '\\data\\processed\\eb-extracts.txt', sep='|', encoding='utf-8')

In [None]:
# Show the column names of the combined extract.
dat.columns

In [None]:
# Load the stop-word list: one entry per line, apostrophes removed,
# duplicates collapsed.
filename = r'P:\MyWork\product-recomender\data\processed\stopwords'

with open(filename, 'r') as f:
    stopwords = list({entry.rstrip("\n").replace("\'", "") for entry in f})

In [None]:
def stops(list_of_strings, stopwords):
    """Return the tokens that are not stop words, joined by single spaces.

    Parameters
    ----------
    list_of_strings : iterable of str
        Tokens to filter.
    stopwords : container of str
        Tokens to discard (membership is tested with ``in``).

    Returns
    -------
    str
        The surviving tokens, in their original order, joined with ' '.
    """
    return ' '.join(token for token in list_of_strings if token not in stopwords)

In [None]:
# Take an independent copy of the descriptive columns and preview the first three rows.
sam = dat.loc[
    :, ['insured_party', 'department', 'trifocus', 'coverage_name', 'class_of_business']
].copy()
sam.head(3)

In [None]:
# Rebuild the working copy and normalise every text column: lower-case it and
# strip every character that is not a letter, a digit or a space.
sam = dat[['insured_party','department', 'trifocus', 'coverage_name', 'class_of_business']].copy()
for col in sam.columns:
    # Assign the result back. Calling replace(..., inplace=True) on the
    # selected column is a chained in-place operation and does not update
    # `sam` when pandas Copy-on-Write is active.
    sam[col] = sam[col].str.lower().replace('[^0-9a-zA-Z ]', '', regex=True)
sam[:7]

In [None]:
# Derive a cleaned insured name: split on spaces, drop stop words, re-join and trim.
# The stop words are converted to a set once so each token lookup is O(1)
# rather than a scan of the whole list.
sam['insured'] = (
    sam['insured_party']
    .str.split(' ')
    .apply(stops, stopwords=set(stopwords))
    .str.strip()
)
sam['num'] = 1          # unit counter, summed when rows are grouped later
sam['idx'] = sam.index  # keep the original row label as a column
sam[:7]

In [None]:
# Display the project root path.
path

# Syndicate-COB

In [None]:
# Load the class-of-business reference table and add `tf`: the lower-cased
# text found between '{' and '}' in TriFocusGroup.
cobs = pd.read_csv(path+'\\references\\cobs.csv')
cobs = cobs.assign(
    tf=cobs['TriFocusGroup'].str.extract('(?<={)(.*)(?=})', expand=False).str.lower()
)
cobs.head(4)

In [None]:
# Count rows per (tf, SyndicateCOBName) pair, then flatten the grouped result
# into a plain frame whose COB-name column is called 'syn-tf'.
mapp = (
    cobs.groupby(['tf', 'SyndicateCOBName'])
    .agg({'tf': 'count'})
    .rename(columns={'tf': 'count'})
)
mapp = pd.DataFrame(mapp.to_records()).rename(columns={'SyndicateCOBName': 'syn-tf'})
mapp.head(5)

In [None]:
# Attach to each row the first 'syn-tf' value listed in `mapp` for its trifocus.
# A single vectorised map replaces the per-key loop, and avoids pre-filling a
# float NaN column that is then overwritten with strings (a deprecated dtype
# upcast in recent pandas). Rows whose trifocus has no match stay NaN.
first_cob_by_tf = mapp.drop_duplicates(subset='tf').set_index('tf')['syn-tf']
sam['syn-tf'] = sam['trifocus'].map(first_cob_by_tf)
    

### Remove duplicates

In [None]:
# Collapse to one row per (insured, syn-tf) pair, summing the unit counter,
# then discard rows whose cleaned insured name is empty.
dat = sam.groupby(['insured', 'syn-tf']).agg({'num':'sum'})
dat = pd.DataFrame(dat.to_records())
# Assign the result back. A chained replace(..., inplace=True) on the column
# does not update `dat` when pandas Copy-on-Write is active, which would let
# the empty names through.
dat['insured'] = dat['insured'].replace('', np.nan)
dat = dat.dropna(subset=['insured'])

# Network Analysis

In [None]:
import matplotlib.pyplot as plt
import networkx as nx
import community
% matplotlib inline

Reference (pandas DataFrame to adjacency matrix): https://stackoverflow.com/questions/49429594/pyhton-pandas-dataframe-to-adjacency-matrix

In [None]:
# Calculate the co-occurrence matrix: self-join on 'insured' so every pair of
# `col` values sharing an insured is counted, then cross-tabulate the pairs.
col = 'syn-tf'

tf = dat[[col]+['insured']]
df_merge = tf.merge(tf, on='insured')
results = pd.crosstab(df_merge[col+'_x'], df_merge[col+'_y'])

# Work on an explicit copy of the counts. `results.values` is not guaranteed
# to be a writable view (it is read-only under pandas Copy-on-Write), so
# filling its diagonal in place can fail or silently do nothing.
counts = results.to_numpy(copy=True)
node_weights = np.diagonal(counts).copy()  # self-pair counts, kept as node weights
np.fill_diagonal(counts, 0)                # remove self-loops from the matrix
results = pd.DataFrame(counts, index=results.index, columns=results.columns)

#results.to_csv('P:\MyWork\product-recomender\gephi\trifocus_matrix.csv')
results.loc[:'Cat',:'Cat']

In [None]:
# Write `results` to 'dat.csv'; the path is relative, so it resolves against the current working directory.
results.to_csv('dat.csv')

# Network

In [None]:
# Build an undirected weighted graph from the matrix, then rename the integer
# node ids to the matrix column labels.
adjacency = np.array(results)
# from_numpy_matrix was removed in networkx 3.0; from_numpy_array is the
# replacement and accepts the same 2-D array.
G = nx.from_numpy_array(adjacency, create_using=nx.Graph())  # create graph

labels = dict(enumerate(results.columns))  # position -> column label
G = nx.relabel_nodes(G, labels)  # relabel nodes

## Nodes

In [None]:
# Degree of every node, in the graph's node order.
d = G.degree()
node_degree = np.array([degree for _node, degree in d])

## Edges

In [None]:
# Edge weights, min-max scaled to [0, 1].
edges = G.edges()
edge_weights = np.array([G[u][v]['weight'] for u,v in edges])
# Guard the scaling: with no edges min()/max() would raise, and with all
# weights equal the range is zero and the division would produce NaN.
weight_range = edge_weights.max() - edge_weights.min() if edge_weights.size else 0
if weight_range > 0:
    edge_norm = (edge_weights - edge_weights.min()) / weight_range
else:
    # Equal weights (or no edges): use a uniform mid-scale value.
    edge_norm = np.full(edge_weights.shape, 0.5)

## Components

In [None]:
# One subgraph per connected component of G.
sg = list(map(G.subgraph, nx.connected_components(G)))

## Communities

In [None]:
# Partition the nodes into communities and list each node's community id in node order.
part = community.best_partition(G, resolution=1)
parts = list(map(part.get, G.nodes()))

## Plot

In [None]:
def plot_graph(graph, axis, node_sizes=None):
    """Draw `graph` on `axis` with community colours and weight-scaled edges.

    Parameters
    ----------
    graph : networkx graph
        Graph whose edges carry a 'weight' attribute.
    axis : matplotlib axes
        Axes to draw on.
    node_sizes : array-like or scalar, optional
        Marker size per node, in the graph's node order. When omitted, the
        module-level ``node_weights / 2`` is used (the original behaviour),
        which only lines up with the graph those weights were computed for.

    Returns
    -------
    None
    """
    if node_sizes is None:
        node_sizes = node_weights / 2

    # Freeze the edge order so widths and the drawn edge list stay aligned.
    edges = list(graph.edges())
    edge_weights = np.array([graph[u][v]['weight'] for u, v in edges], dtype=float)

    # Min-max scale the weights. With no edges min()/max() would raise, and
    # with all weights equal the range is zero and the division would be NaN.
    if edge_weights.size:
        spread = edge_weights.max() - edge_weights.min()
    else:
        spread = 0
    if spread > 0:
        edge_norm = (edge_weights - edge_weights.min()) / spread
    else:
        edge_norm = np.full(edge_weights.shape, 0.5)

    # Colour each node by the community it is assigned to.
    part = community.best_partition(graph, resolution=1)
    parts = [part.get(node) for node in graph.nodes()]

    nx.draw(graph, ax=axis,
            pos=nx.spring_layout(graph, center=(1,1), k=15),
            node_size=node_sizes, with_labels=True,
            cmap = plt.get_cmap('jet'), node_color = parts,
            edgelist=edges, width=edge_norm*30)

In [None]:
# Draw the full graph on a single 15x15 inch figure.
fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(15, 15))
plot_graph(graph=G, axis=axs)