In [None]:
import os
import sys
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Project root folder; later cells build input and output file paths by
# concatenating onto this string.
# NOTE(review): this is a raw string, so each doubled backslash stays as two
# literal backslash characters in the resulting path.
path = r'P:\\MyWork\\cass-property\\'

In [None]:
# Read the raw extract (commas removed in Excel - BI issue). The file has a
# two-row header; round-tripping through to_records gives a frame with a single
# level of string column labels (presumably stringified tuples - confirm).
dat = pd.DataFrame(
    pd.read_csv(path + r'data\\raw\\data_02.csv', header=[0, 1],
                encoding='latin-1', low_memory=False).to_records(index=False)
)

# Normalise every column label: remove the auto-generated header fragments,
# strip tuple punctuation, then snake_case and lower-case what is left.
tidy_labels = []
for label in dat.columns:
    label = re.sub("Unnamed: [0-9]|[0-9]_level_0", '', label)
    label = re.sub("[(),']", '', label)
    tidy_labels.append(label.replace(' ', '_').lower())
dat.columns = tidy_labels

len(dat)

In [None]:
#dat['er_cause_of_loss'].value_counts().to_csv('codes.csv')
#dat.columns

In [None]:
# Keep only rows that carry an exposure reference, discard columns that are
# entirely null, then append a sequential numeric id (0..n-1) as `uid`.
dat = (
    dat
    .dropna(subset=['er_exposure_reference'])
    .dropna(axis=1, how='all')
    .assign(uid=lambda frame: np.arange(len(frame)))
)
len(dat)

In [None]:
# Fraction of null values per column, largest first.
missing_share = pd.DataFrame(dat.isnull().sum()/len(dat)).sort_values(by=0, ascending=False)

# Horizontal bar chart, one bar per column; the x axis is relabelled from
# fractions (0-1) to percentages (0-100).
fig, axs = plt.subplots(1, 1, figsize=(12, 15))
missing_share.plot.barh(ax=axs, legend=False, fontsize=15)
axs.set_xlim(0, 1)
axs.set_xticks(np.arange(0, 1.1, .1))
axs.set_xticklabels(np.arange(0, 110, 10), fontsize=12)
axs.set_xlabel('% missing', fontsize=15)

# Save the figure under the project folder.
fig.tight_layout()
fig.savefig(path+'reports/figures/missing.png', dpi=600)

In [None]:
# Number of rows with no cause-of-loss label.
# (dat['er_loss_location_county'].value_counts() was an earlier ad-hoc check.)
cause_of_loss = dat['er_cause_of_loss']
cause_of_loss.isnull().sum()

In [None]:
# Number of rows that do have a cause-of-loss label (value_counts skips nulls).
label_frequencies = dat['er_cause_of_loss'].value_counts()
label_frequencies.sum()

# Variables

#### target
 - er_cause_of_loss. BUT this has known issues, as many records are mislabelled.

#### features
 - 'er_claim_lead_narrative'
 - 'er_claim_description'
 - 'er_loss_details'
 - 'er_exposure_description'
 - 'er_claim_narrative'
 
#### additional features

In [None]:
# Target column, kept as a one-element list so it can be concatenated with the
# feature list later on.
target = ['er_cause_of_loss']

# Bar chart of how often each cause-of-loss label occurs.
label_counts = dat[target[0]].value_counts()
label_counts.plot.bar(figsize=(15, 3))

In [None]:
# Free-text columns used as model input.
features = [
    'er_claim_lead_narrative',
    'er_claim_description',
    'er_loss_details',
    'er_exposure_description',
    'er_claim_narrative',
]

# Peek at the text of the first five claims labelled 'Rain'.
is_rain = dat['er_cause_of_loss'] == 'Rain'
dat.loc[is_rain, features].values[:5]

In [None]:
# Working frame for modelling: target, text features and the numeric id,
# deep-copied from `dat` and given a fresh 0..n-1 index.
model_cols = target + features + ['uid']
data = dat[model_cols].copy(deep=True).reset_index(drop=True)
len(data)

# Input Data Cleaning

In [None]:
# Concatenate all text features into a single space-separated `doc` field;
# missing values are treated as empty strings.
text_parts = data[features].fillna('')
data['doc'] = text_parts.apply(' '.join, axis=1)

In [None]:
# Collapse every run of non-letter characters to a single space, then
# lower-case and trim. (An earlier variant used '[^0-9a-zA-Z]+' to keep digits.)
letters_only = data['doc'].replace('[^a-zA-Z]+', ' ', regex=True)
data['doc'] = letters_only.str.lower().str.strip()
data['doc'].head(5)

In [None]:
# Remove recurring boilerplate phrases from the merged text, one at a time and
# in this order.
for boilerplate in ('block for all losses',
                    'block entry for all losses',
                    'damage noc'):
    data['doc'] = data['doc'].replace(boilerplate, '', regex=True)

In [None]:
# Drop documents that are too short to carry any signal.
# The threshold is a tunable choice (experiment to set length).
min_doc_len = 4

# Select with a boolean mask rather than feeding index labels back through
# `data.index[...]` as positions; the mask is correct for any index, not only a
# freshly reset one. Rows are dropped only when their length is strictly below
# the threshold.
too_short = data['doc'].str.len() < min_doc_len
data = data.loc[~too_short].reset_index(drop=True)
print(len(data))

In [None]:
# Delete stand-alone single word characters (a word character with a word
# boundary on each side); the surrounding whitespace is left untouched.
lone_char = re.compile(r'\b\w\b')
data['doc'] = data['doc'].map(lambda doc: lone_char.sub('', doc))

In [None]:
# stemming
# tokenize

# StopWords

In [None]:
#pd.Series(np.concatenate([x.split() for x in data.doc])).value_counts()

In [None]:
# Start from scikit-learn's built-in English stop-word collection; a later cell
# extends it with place names, nationalities, brokers and custom words.
# Background on extending the list:
# https://stackoverflow.com/questions/24386489/adding-words-to-scikit-learns-countvectorizers-stop-list
from sklearn.feature_extraction import text 
stop_words = text.ENGLISH_STOP_WORDS

In [None]:
# Country / nationality lookup file; four of its columns are read and renamed.
nations = pd.read_csv(r'P:\MyWork\ner_resources\country-nationality.txt', encoding='utf-8', sep=',',header=0,
                 usecols=[1,2,3,4],names=['c1','c2', 'nm', 'nt'])

# Flatten the lower-cased unique values of every column into one list.
# Missing values are not filtered out here.
nat = [
    value
    for col in nations.columns
    for value in nations[col].str.lower().unique()
]

In [None]:
# Lower-cased US city names; anything that is not a string (e.g. a missing
# value) is skipped.
city_column = pd.read_csv(r'P:\MyWork\ner_resources\us-cities.txt', encoding='utf-8')['city']
us_cities = [city for city in city_column.str.lower() if isinstance(city, str)]

In [None]:
# Load the custom stop-word file: one entry per line, trailing newline and
# apostrophes removed.
filename = r'P:\MyWork\ner_resources\beaz-stopwords'

stopwords = []
with open(filename, 'r') as f:
    for line in f:
        stopwords.append(line.rstrip("\n").replace("\'", ""))

# De-duplicate; the isinstance check mirrors the other stop-word lists.
beaz_stopwords = [word for word in set(stopwords) if isinstance(word, str)]

In [None]:
# Tab-separated US places file with no header row; columns 0, 3 and 4 are read
# and named 'c', 'state' and 'sc'.
us = pd.read_csv(r'P:\MyWork\ner_resources\US\US.txt', encoding='utf-8', sep='\t', header=None,index_col=None,
                 usecols=[0,3,4],names=['c','state', 'sc'])

# Lower-cased unique values of every column in one flat list, keeping strings
# only (missing values are dropped).
us_places = [
    place
    for col in us.columns
    for place in us[col].str.lower().unique()
    if isinstance(place, str)
]

In [None]:
# Hand-maintained list of names to treat as stop words (labelled brokers).
# NOTE(review): 'collis' is listed twice; the list is later merged into a set,
# so the duplicate has no effect.
brokers = ['risc', 'concannon', 'phelan', 'jensvold',
           'charles', 'windsor', 'acton', 'hall', 'wright',
           'crawford','collis', 'gab','collis', 'vericlaim','johnson', 'crump', 'wilcox', 'hiscox',
           'tapco', 'cunningham', 'quebec','proctor', 'heath', 'thompson', 'aol']

In [None]:
# Combine the built-in English stop words with place names, cities, custom
# words, broker names and nationalities.
stop_words = stop_words.union(us_places+us_cities+beaz_stopwords+brokers+nat)

# Keep letters and spaces only, and trim surrounding whitespace. A pattern such
# as r'\b foo \b' would delete the spaces around a match and glue neighbouring
# words together. Entries that end up empty are dropped, since r'\b\b' matches
# nothing useful.
cleaned = {re.sub(r'[^a-zA-Z ]+', '', x).strip() for x in stop_words if isinstance(x, str)}
cleaned.discard('')

# Fixed order, longest phrase first. Iterating a set gives an arbitrary order,
# so a short word such as "north" could be removed before the phrase
# "north carolina" was tried, and the outcome could differ between runs.
stops = sorted(cleaned, key=lambda word: (-len(word), word))
stops = ['\\b'+word+'\\b' for word in stops] # add breaks around word for whole word search

In [None]:
# Remove stop words and stop phrases from the merged text.
# Regex replacement is used instead of token filtering so that multi-word
# entries (e.g. "north carolina") are removed too; a split-and-filter approach
# only handles single-word stop words.
#
# The result is assigned back to the column. Calling replace(..., inplace=True)
# on a column selection is chained assignment and is not guaranteed to modify
# `data`.
doc_stop = data['doc'].replace(to_replace=stops, value='', regex=True)

# Collapse whitespace left behind by the removals and trim the ends.
doc_stop = doc_stop.replace(r'\s+', ' ', regex=True).str.strip()

# Documents with nothing left become NaN so that later steps can skip them.
data['doc_stop'] = doc_stop.replace('', np.nan)

# Spelling

In [None]:
# from autocorrect import spell
# data['doc_stop_spell'] = data['doc_stop'].copy()
# data['doc_stop_spell'] = 
# data['doc_stop_spell'].dropna().str.split().apply(lambda x: [spell(item) for item in x]).str.join(' ')
# data[['doc','doc_stop', 'doc_stop_spell']]

### Stemming

In [None]:
# NLTK supplies the stemmer used in the next cell.
# NOTE(review): this downloads the 'wordnet' corpus, but the only NLTK object
# used in the visible notebook is PorterStemmer - confirm the download is needed.
import nltk
nltk.download('wordnet')

In [None]:
# Porter-stem every token of each stop-word-free document. Documents that are
# NaN in `doc_stop` are left out of the computation and therefore stay NaN in
# `doc_stop_stem` once the result is aligned back on the index.
ps = nltk.stem.PorterStemmer()
token_lists = data['doc_stop'].dropna().str.split()
stemmed_lists = token_lists.apply(lambda tokens: [ps.stem(tok) for tok in tokens])
data['doc_stop_stem'] = stemmed_lists.str.join(' ')

In [None]:
# Bring the section and exposure references back onto the modelling frame,
# keeping every row of `data` (right join on the numeric id).
reference_cols = dat[['uid','sr_section_reference','er_exposure_reference']]
data = reference_cols.merge(data, how='right', left_on='uid', right_on='uid')

# Cluster

- https://stackoverflow.com/questions/27889873/clustering-text-documents-using-scikit-learn-kmeans-in-python
- http://brandonrose.org/clustering
- https://pythonprogramminglanguage.com/kmeans-text-clustering/
- https://stackoverflow.com/questions/28160335/plot-a-document-tfidf-2d-graph/28205420#28205420
- https://stackoverflow.com/questions/36946510/from-text-to-k-means-vectors-input
- http://scikit-learn.org/stable/auto_examples/text/document_clustering.html#
- http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html
- http://scikit-learn.org/stable/modules/clustering.html#k-means
- http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import adjusted_rand_score

from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import TfidfVectorizer
#http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

from sklearn.cluster import KMeans

In [None]:
# Share of all rows whose stemmed text contains `term`. Rows with no stemmed
# text count as not containing it.
term = 'storm'
has_term = data['doc_stop_stem'].str.contains(term) == True
print(has_term.sum() / len(data))

In [None]:
# TF-IDF over 1- to 3-grams, ignoring terms present in more than 30% of the
# documents.
#
# stop_words is None on purpose. `stops` holds regex patterns wrapped in
# \b...\b (built for Series.replace), and such strings can never equal a token
# produced by the vectorizer, so passing them filtered nothing. Stop words have
# already been stripped from the text in the cleaning step. Recent scikit-learn
# releases also reject a set for this parameter.
TV = TfidfVectorizer(stop_words=None, ngram_range=(1,3),
                     min_df=0.0, max_df=0.30)

In [None]:
# Fit the vectorizer on every non-null stemmed document and build the sparse
# TF-IDF matrix (one row per document, in the same order as those rows).
stemmed_docs = data['doc_stop_stem'].dropna()
X = TV.fit_transform(stemmed_docs)
X.shape

# determine number of clusters

In [None]:
# Sweep the number of clusters (k = 20, 30, ..., 110) and record each fitted
# model's score on X, for choosing k from the resulting curve.
#
# random_state is fixed so the sweep is reproducible (n_init=1 means a single
# random initialisation per k). The n_jobs argument is dropped: it was
# deprecated and then removed from KMeans, so passing it fails on current
# scikit-learn.
k_score = {}
for k in np.arange(20,120,10):
    KM = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=1, random_state=99)
    k_score[k] = KM.fit(X).score(X)

In [None]:
# Score versus number of clusters. The dict views are turned into lists before
# plotting so the call does not depend on matplotlib accepting dict views, and
# the axes are labelled so the figure stands alone.
ks = list(k_score.keys())
scores = list(k_score.values())
plt.plot(ks, scores)
plt.xlabel('number of clusters (k)')
plt.ylabel('KMeans score')

In [None]:
# Final model with 25 clusters.
# random_state makes the cluster assignment reproducible across runs. n_jobs is
# omitted because current scikit-learn's KMeans no longer accepts it.
KM = KMeans(n_clusters=25, init='k-means++', max_iter=300, n_init=1, random_state=99)
KM.fit(X)

In [None]:
# Silhouette scoring on a fixed-seed random sample of 10,000 documents.
from sklearn.metrics import silhouette_samples, silhouette_score
# KM.labels_ holds the cluster assigned to each row of X during fit
# (equivalent to predict(X)).
cluster_labels = KM.labels_
silhouette_score(X, cluster_labels, sample_size=10000, random_state=99)

# print key words

In [None]:
# For every cluster, feature indices sorted by centroid weight, heaviest first.
order_centroids = KM.cluster_centers_.argsort()[:, ::-1]

# Vocabulary terms indexed by feature position. Newer scikit-learn only offers
# get_feature_names_out(); the older get_feature_names() is kept as a fallback.
if hasattr(TV, 'get_feature_names_out'):
    terms = TV.get_feature_names_out()
else:
    terms = TV.get_feature_names()

In [None]:
# Print the ten highest-weighted terms of every cluster and keep them in
# `clusters` (cluster number -> list of terms).
# The loop runs over however many centroids exist instead of a hard-coded 25,
# so it stays correct if the number of clusters is changed.
clusters={}
for i in range(len(order_centroids)):
    ter = []
    print( "\nCluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print( '%d %s' %( ind,terms[ind]), end=',')
        ter.append(terms[ind])
    clusters[i] = ter

# Results

In [None]:
# Start with an all-NaN cluster column; rows that get no label keep NaN.
data['cluster']=np.nan

In [None]:
# Write the fitted labels onto the rows that have stemmed text.
# NOTE(review): this relies on KM.labels_ being in the same row order as the
# non-null 'doc_stop_stem' rows of `data`; X was built from exactly those rows.
data.loc[data['doc_stop_stem'].notnull(),'cluster'] = KM.labels_

In [None]:
# Export the labelled frame; the relative path means the file is written to the
# current working directory.
data.to_csv('output.csv')

# PCA / SVD Dimension reduction 

 - http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD.fit_transform

 - https://stackoverflow.com/questions/42882207/plot-k-means-clusters-after-truncatedsvd-python

#### n_components
 - https://chrisalbon.com/machine_learning/feature_engineering/select_best_number_of_components_in_tsvd/
 - https://stackoverflow.com/questions/48424084/number-of-components-trucated-svd
 - https://cstheory.stackexchange.com/questions/21487/when-to-use-the-johnson-lindenstrauss-lemma-over-svd/21489#21489


In [None]:
from sklearn.decomposition import PCA, TruncatedSVD

In [None]:
# Project the TF-IDF matrix onto two components for plotting.
# random_state is fixed so that repeated runs give the same projection.
SVD = TruncatedSVD(n_components=2, random_state=99)
data2D = SVD.fit_transform(X)

In [None]:
# Total share of variance captured by the fitted components.
SVD.explained_variance_ratio_.sum()

In [None]:
# NOTE(review): same expression as the previous cell - likely a leftover.
SVD.explained_variance_ratio_.sum()

In [None]:
# 2-D projection of the documents, coloured by assigned cluster.
# A colormap has no effect unless `c` is supplied. KM.labels_ has one entry per
# row of X, and therefore one per row of data2D.
plt.scatter(data2D[:,0], data2D[:,1], c=KM.labels_, cmap='jet');