In [1]:
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from tqdm import tqdm
# Lancaster stemmer instance; no other visible cell references `st`.
st = LancasterStemmer()
# NLTK English stop words, held in a set because clean_document tests
# membership once per token.
stop_words = set(stopwords.words('english'))

In [2]:
def clean_document(data, num):
    """Tokenise a document and drop punctuation and English stop words.

    Parameters
    ----------
    data : str
        Raw text to clean.
    num
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    str
        The surviving tokens, in their original order and casing, joined
        by single spaces.
    """
    tokens = word_tokenize(data)
    # Stop-word membership is tested on the lower-cased token so that
    # capitalised words such as "The" or "Of" are removed as well; the
    # stop-word list itself is lowercase.
    kept = [
        tok for tok in tokens
        if tok.isalnum() and tok.lower() not in stop_words
    ]
    return ' '.join(kept)

In [3]:
def cluster_name(cent, terms):
    """Build a short label for a cluster from its most frequent words.

    Parameters
    ----------
    cent : iterable of int
        Indices into ``terms`` selecting the terms that describe the cluster.
    terms : sequence of str
        Vocabulary of space-separated terms.

    Returns
    -------
    str
        The two most frequent words joined by ``'; '``. Ties are broken by
        first appearance. If only one distinct word remains, that word is
        returned alone; if none remain, an empty string is returned (the
        previous implementation raised ``IndexError`` in both cases).
    """
    # Every occurrence of the substring 'geothermal' is stripped out before
    # counting, so it can never become part of the label.
    text = ' '.join(terms[t] for t in cent).replace('geothermal', '')
    # split() with no argument discards the empty pieces left by the removal.
    counts = Counter(text.split())
    return '; '.join(word for word, _ in counts.most_common(2))

In [4]:
# Load the scraped literature table and build the cleaned title corpus.
MAX_DOCUMENTS = 10000  # only the first MAX_DOCUMENTS titles are kept

df = pd.read_csv('../scrape_literature_geothermal_extended.csv').drop_duplicates()
df = df[~df.title.isna()]

# Truncate *before* cleaning: the result is the same as cleaning every title
# and slicing afterwards, but titles that would be discarded are never
# tokenised.
document = [
    clean_document(title.strip(), 2)
    for title in list(df.title.values)[:MAX_DOCUMENTS]
]

In [5]:
# Bigram TF-IDF -> truncated SVD -> k-means over the reduced vectors.
k = 10
n_components = 200
SEED = 42  # fixed seed so that SVD and k-means give the same result on re-run

vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(2, 2))
vectors = vectorizer.fit_transform(document)

svd = TruncatedSVD(n_components=n_components, random_state=SEED)
X = svd.fit_transform(vectors)

# n_init=1 means a single initialisation, so without a seed the clustering
# would change from run to run.
model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1,
               random_state=SEED)
model.fit(X)
kmean_indices = model.predict(X)

# Map the centroids back to term space and, per cluster, sort the term
# indices by descending centroid weight.
order_centroids = svd.inverse_transform(model.cluster_centers_).argsort()[:, ::-1]

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

In [6]:
# Three-component projection of the TF-IDF matrix, used for the 3-D scatter.
# NOTE: this rebinds `n_components` and `svd` from the clustering cell.
n_components = 3
svd = TruncatedSVD(n_components=n_components, random_state=42)
# TruncatedSVD accepts the sparse TF-IDF matrix directly; converting it with
# .toarray() first only allocates a huge dense copy.
scatter_plot_points = svd.fit_transform(vectors)

In [7]:
colors = [
    "r", "b", "c", "y", "m",
    "g", "k", "orange", "purple", "grey",
]

# One display name per cluster, built from the first 10000 entries of that
# cluster's row in order_centroids.
labels = []
for cluster_idx in range(k):
    labels.append(cluster_name(order_centroids[cluster_idx, :10000], terms))

# Expand to one label per document by looking up each document's cluster index.
labels_long = [labels[assigned] for assigned in kmean_indices]

In [8]:
# Split the projected points into one list of coordinates per component.
x_axis, y_axis, z_axis = (
    [point[component] for point in scatter_plot_points]
    for component in range(3)
)

In [9]:
# x_axis = [o[0] for o in scatter_plot_points]
# y_axis = [o[1] for o in scatter_plot_points]

# for i in range(k):
#     indices = kmean_indices==i
#     plt.scatter([j for j, h in zip(x_axis, indices) if h == True], [j for j, h in zip(y_axis, indices) if h == True], label=f'{labels[i]}')

# plt.legend()
# plt.xlabel('SVD Comp #1')
# plt.ylabel('SVD Comp #2')

In [10]:
# Print each cluster's generated name followed by its ten leading terms.
for cluster_idx in range(k):
    ranked = order_centroids[cluster_idx]
    name = cluster_name(ranked[:10000], terms)
    header = f'\nCluster {cluster_idx+1}: ({name})'
    print(header)
    for term_idx in ranked[:10]:
        print(f'{terms[term_idx]}')


Cluster 1: (development; energy)
geothermal field
geothermal energy
geothermal resources
geothermal systems
geothermal reservoir
geothermal development
geothermal reservoirs
country update
new zealand
geothermal wells

Cluster 2: (exploration; analysis)
geothermal exploration
exploration development
case study
exploration case
exploration southern
exploration production
exploration using
new mexico
exploration indonesia
exploration central

Cluster 3: (using; energy)
volcanic zone
taupo volcanic
zone new
new zealand
field taupo
systems taupo
geothermal systems
geothermal field
magnetic anomalies
kopia geothermal

Cluster 4: (power; reservoir)
geothermal power
power plant
power plants
power generation
power station
power development
power projects
plant san
binary geothermal
gas removal

Cluster 5: (reservoir; new)
reservoir engineering
geothermal reservoir
engineering analysis
engineering approach
geothermal field
wairakei reservoir
engineering study
recent developments
engineering as

In [11]:
# Assemble the plotting table column by column. Stacking floats, ints and
# strings into one np.array first would coerce every value to a string, so the
# component columns would lose their numeric dtype.
# NOTE: this rebinds `df`, which earlier held the raw CSV.
df = pd.DataFrame({
    'Component #1': x_axis,
    'Component #2': y_axis,
    'Component #3': z_axis,
    'Indices': kmean_indices,
    'Clusters': labels_long,
})

In [12]:
# 3-D scatter of the documents, coloured by cluster label.
axis_columns = dict(x='Component #1', y='Component #2', z='Component #3')
fig = px.scatter_3d(df, color='Clusters', opacity=1, **axis_columns)

fig.show()

In [13]:
# Write the figure to a standalone HTML file and open it in the browser.
plotly.offline.plot(
    figure_or_data=fig,
    filename='Clusters.html',
    auto_open=True,
)

'Clusters.html'