In [1]:
from collections import defaultdict, Counter
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition.online_lda import LatentDirichletAllocation
from sklearn.decomposition import NMF 
import matplotlib.pyplot as plt
plt.style.use('seaborn')
%matplotlib inline

### Load in the collocation dataset created in feature_reduction.ipynb

In [2]:
# Read the collocation table; the first CSV column becomes the row index.
X_wild_soil_w_targets = pd.read_csv(
    filepath_or_buffer="collocation_df.csv",
    index_col=0,
)

# Preview the first five rows.
X_wild_soil_w_targets.head(n=5)

Unnamed: 0,soil_type,wild_soil
0,4,cache_la_poudre_wilderness_area gateview_famil...
1,4,cache_la_poudre_wilderness_area gateview_famil...
2,4,cache_la_poudre_wilderness_area gateview_famil...
3,4,cache_la_poudre_wilderness_area gateview_famil...
4,4,cache_la_poudre_wilderness_area gateview_famil...


### How many total features are there and what are their respective counts per soil type?

In [3]:
# Tally term frequencies separately for each of the seven soil types and keep
# the ten most frequent terms of each as one comma-separated string.
soil_CountVect = {}
for soil_type in range(1, 8):
    print("Soil Type {}:".format(soil_type))
    is_soil_type = X_wild_soil_w_targets["soil_type"] == soil_type
    term_counts = Counter()
    for document in X_wild_soil_w_targets.loc[is_soil_type, "wild_soil"]:
        # Tokens are obtained by splitting on a single space.
        term_counts.update(document.split(" "))
    top_10 = [term for term, _ in term_counts.most_common(10)]
    print("Unique words in Soil Type {} vocabulary: {}".format(soil_type, len(term_counts)))
    print("Top 10 words in Soil Type {}: {} \n".format(soil_type, top_10))
    # Keys keep the trailing colon; later cells look entries up with it.
    soil_CountVect["Soil Type {}:".format(soil_type)] = ", ".join(top_10)

Soil Type 1:
Unique words in Soil Type 1 vocabulary: 53
Top 10 words in Soil Type 1: ['climatic_zone_subalpine', 'geologic_zone_igneous_and_metamorphic', 'extremely_stony', 'rawah_wilderness_area', 'comanche_peak_wilderness_area', 'leighcan_family', 'geologic_zone_glacial', 'till_substratum', 'como', 'legault_families_complex'] 

Soil Type 2:
Unique words in Soil Type 2 vocabulary: 63
Top 10 words in Soil Type 2: ['geologic_zone_igneous_and_metamorphic', 'climatic_zone_subalpine', 'extremely_stony', 'rawah_wilderness_area', 'comanche_peak_wilderness_area', 'como', 'legault_families_complex', 'climatic_zone_montane', 'catamount_families', 'rock_outcrop_complex'] 

Soil Type 3:
Unique words in Soil Type 3 vocabulary: 36
Top 10 words in Soil Type 3: ['geologic_zone_igneous_and_metamorphic', 'rock_outcrop_complex', 'rubbly', 'climatic_zone_lower_montane', 'cache_la_poudre_wilderness_area', 'comanche_peak_wilderness_area', 'climatic_zone_montane', 'bullwark', 'catamount_families', 'vanet'] 

In [4]:
# Bag-of-words over the whole dataset (all soil types together).
# Only the text column is used; the soil_type targets are left behind.
X_wild_soil = X_wild_soil_w_targets["wild_soil"].copy()

min_df, max_df, max_features = 1, 0.95, 100
vectorizer = CountVectorizer(
    min_df=min_df,
    max_df=max_df,
    max_features=max_features,
)
vectorized = vectorizer.fit_transform(X_wild_soil)

# Shape of the document-term matrix and the vectorizer's `stop_words_` set.
(vectorized.shape, vectorizer.stop_words_)

((19229, 74), set())

In [5]:
# Feature names ordered by column index, read from the fitted vocabulary
# (term -> column index). This avoids `get_feature_names()`, which newer
# scikit-learn releases no longer provide, and yields the same list.
sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

['aquolis_complex',
 'borohemists_complex',
 'bross_family',
 'bullwark',
 'bullwark_family_complex',
 'cache_la_poudre_wilderness_area',
 'catamount_families',
 'catamount_families_complex',
 'catamount_family',
 'cathedral_family',
 'climatic_zone_alpine',
 'climatic_zone_lower_montane',
 'climatic_zone_montane',
 'climatic_zone_montane_and_subalpine',
 'climatic_zone_montane_dry',
 'climatic_zone_montane_dry_and_montane',
 'climatic_zone_subalpine',
 'comanche_peak_wilderness_area',
 'como',
 'como_family',
 'cryaquepts_complex',
 'cryaquolis',
 'cryaquolis_complex',
 'cryaquolls_complex',
 'cryoborolis_complex',
 'cryorthents',
 'cryorthents_complex',
 'cryumbrepts',
 'cryumbrepts_complex',
 'extremely_bouldery',
 'extremely_stony',
 'gateview_family',
 'geologic_zone_alluvium',
 'geologic_zone_glacial',
 'geologic_zone_igneous_and_metamorphic',
 'geologic_zone_mixed_sedimentary',
 'gothic_family',
 'granile',
 'haploborolis',
 'legault_families_complex',
 'legault_family',
 'legau

### A whopping total vocabulary of 74

In [6]:
# Vocabulary size: one entry per feature in the fitted term -> index mapping.
# Reading `vocabulary_` avoids `get_feature_names()`, which newer scikit-learn
# releases no longer provide.
len(vectorizer.vocabulary_)

74

# Topic Modeling
### NMF (Non-Negative Matrix Factorization) 

In [7]:
# Factorise the document-term count matrix into 7 topics:
#   W holds the per-document topic weights, H the per-topic term weights.
# The seed is pinned (same value as the LDA model below) so that re-running
# the notebook reproduces the same factorisation.
model = NMF(init="nndsvd",
            n_components=7,
            max_iter=200,
            random_state=42)

W = model.fit_transform(vectorized)
H = model.components_

W.shape, H.shape

((19229, 7), (7, 74))

In [8]:
# Invert the fitted vocabulary (term -> column index) into a list in which
# position i holds the term for column i of the count matrix.
terms = [""] * len(vectorizer.vocabulary_)
for term, column in vectorizer.vocabulary_.items():
    terms[column] = term

### Create 7 Topics from the dataset

In [9]:
# Show the ten highest-weighted terms of every NMF topic (one row of H each).
for topic_number, topic_weights in enumerate(H, start=1):
    top_indices = np.argsort(topic_weights)[::-1][:10]
    term_ranking = [terms[idx] for idx in top_indices]
    print("Soil Topic {}: {} \n".format(topic_number, ", ".join(term_ranking)))

Soil Topic 1: climatic_zone_montane, rubbly, catamount_families, bullwark, geologic_zone_igneous_and_metamorphic, rock_outcrop_complex, cache_la_poudre_wilderness_area, rock_land_complex, comanche_peak_wilderness_area, rock_land 

Soil Topic 2: comanche_peak_wilderness_area, geologic_zone_igneous_and_metamorphic, climatic_zone_subalpine, catamount_family, extremely_stony, leighcan, rock_outcrop, leighcan_family_complex, bullwark_family_complex, rubbly 

Soil Topic 3: climatic_zone_lower_montane, geologic_zone_igneous_and_metamorphic, rock_outcrop_complex, cache_la_poudre_wilderness_area, rubbly, vanet, haploborolis, stony, wetmore_families, ratake_family 

Soil Topic 4: extremely_stony, climatic_zone_subalpine, geologic_zone_igneous_and_metamorphic, rawah_wilderness_area, como, legault_families_complex, como_family, legault_family_complex, rock_land, leighcan 

Soil Topic 5: leighcan_family, geologic_zone_glacial, till_substratum, climatic_zone_subalpine, typic_cryaquolls_complex, rawa

### How do they compare to the actual classes/topics?
- In terms of the most commonly occurring words

In [10]:
# Print the ten most frequent terms recorded for each true soil type,
# with a blank line after every entry.
for k, v in soil_CountVect.items():
    print(k, v)
    print("")

Soil Type 1: climatic_zone_subalpine, geologic_zone_igneous_and_metamorphic, extremely_stony, rawah_wilderness_area, comanche_peak_wilderness_area, leighcan_family, geologic_zone_glacial, till_substratum, como, legault_families_complex

Soil Type 2: geologic_zone_igneous_and_metamorphic, climatic_zone_subalpine, extremely_stony, rawah_wilderness_area, comanche_peak_wilderness_area, como, legault_families_complex, climatic_zone_montane, catamount_families, rock_outcrop_complex

Soil Type 3: geologic_zone_igneous_and_metamorphic, rock_outcrop_complex, rubbly, climatic_zone_lower_montane, cache_la_poudre_wilderness_area, comanche_peak_wilderness_area, climatic_zone_montane, bullwark, catamount_families, vanet

Soil Type 4: cache_la_poudre_wilderness_area, geologic_zone_igneous_and_metamorphic, rock_outcrop_complex, climatic_zone_lower_montane, rubbly, haploborolis, geologic_zone_alluvium, climatic_zone_montane_and_subalpine, gateview_family, cryaquolis_complex

Soil Type 5: geologic_zone_

### Hard to tell from initial inspection

### For completeness, let's try a TfidfVectorizer (Term Frequency-Inverse Document Frequency Vectorizer)

In [11]:
# Same limits as the count vectorizer above, but with TF-IDF weighting.
min_df, max_df = 1, 0.95
max_features = 100  # cap is never reached: the fitted vocabulary has 74 terms

tfidf_vec = TfidfVectorizer(
    min_df=min_df,
    max_df=max_df,
    max_features=max_features,
)
tfidf_vecD = tfidf_vec.fit_transform(X_wild_soil)

# Shape of the TF-IDF matrix and the vectorizer's `stop_words_` set.
(tfidf_vecD.shape, tfidf_vec.stop_words_)

((19229, 74), set())

In [12]:
# Rebuild `terms` from the TF-IDF vocabulary (term -> column index) so that
# position i holds the term for column i of the TF-IDF matrix.
terms = [""] * len(tfidf_vec.vocabulary_)
for term, column in tfidf_vec.vocabulary_.items():
    terms[column] = term

In [13]:
# Factorise the TF-IDF matrix into 7 topics:
#   W_tfidf holds per-document topic weights, H_tfidf per-topic term weights.
# The seed is pinned (same value as the LDA model below) so that re-running
# the notebook reproduces the same factorisation.
model_tfidf = NMF(init="nndsvd",
                n_components=7,
                max_iter=200,
                random_state=42)

W_tfidf = model_tfidf.fit_transform(tfidf_vecD)
H_tfidf = model_tfidf.components_

W_tfidf.shape, H_tfidf.shape

((19229, 7), (7, 74))

In [14]:
# Show the ten highest-weighted terms of every NMF topic learned from the
# TF-IDF matrix. These are learned topics, not the true soil types, so they
# are labelled "Soil Topic" (as in the count-based NMF printout) to avoid
# confusing them with the "Soil Type N:" entries of soil_CountVect.
for topic_index in range(H_tfidf.shape[0]):
    top_indices = np.argsort(H_tfidf[topic_index, :])[::-1][0:10]
    term_ranking = [terms[i] for i in top_indices]
    print("Soil Topic {}: {}\n".format(topic_index+1, ", ".join(term_ranking)))

Soil Type 1: bullwark, catamount_families, climatic_zone_montane, rubbly, rock_outcrop_complex, cache_la_poudre_wilderness_area, geologic_zone_igneous_and_metamorphic, comanche_peak_wilderness_area, rock_land_complex, bullwark_family_complex

Soil Type 2: como, legault_families_complex, rawah_wilderness_area, climatic_zone_subalpine, extremely_stony, geologic_zone_igneous_and_metamorphic, leighcan, cathedral_family, catamount_families_complex, catamount_families

Soil Type 3: climatic_zone_lower_montane, rock_outcrop_complex, cache_la_poudre_wilderness_area, haploborolis, rubbly, geologic_zone_igneous_and_metamorphic, ratake_family, vanet, wetmore_families, stony

Soil Type 4: climatic_zone_alpine, extremely_stony, leighcan, comanche_peak_wilderness_area, cryorthents, moran_family, leighcan_family_complex, geologic_zone_igneous_and_metamorphic, moran_families, cryaquolls_complex

Soil Type 5: leighcan_family, till_substratum, geologic_zone_glacial, typic_cryaquolls_complex, climatic_zo

In [15]:
# Print the ten most frequent terms recorded for each true soil type,
# with a blank line after every entry.
for k, v in soil_CountVect.items():
    print(k, v)
    print("")

Soil Type 1: climatic_zone_subalpine, geologic_zone_igneous_and_metamorphic, extremely_stony, rawah_wilderness_area, comanche_peak_wilderness_area, leighcan_family, geologic_zone_glacial, till_substratum, como, legault_families_complex

Soil Type 2: geologic_zone_igneous_and_metamorphic, climatic_zone_subalpine, extremely_stony, rawah_wilderness_area, comanche_peak_wilderness_area, como, legault_families_complex, climatic_zone_montane, catamount_families, rock_outcrop_complex

Soil Type 3: geologic_zone_igneous_and_metamorphic, rock_outcrop_complex, rubbly, climatic_zone_lower_montane, cache_la_poudre_wilderness_area, comanche_peak_wilderness_area, climatic_zone_montane, bullwark, catamount_families, vanet

Soil Type 4: cache_la_poudre_wilderness_area, geologic_zone_igneous_and_metamorphic, rock_outcrop_complex, climatic_zone_lower_montane, rubbly, haploborolis, geologic_zone_alluvium, climatic_zone_montane_and_subalpine, gateview_family, cryaquolis_complex

Soil Type 5: geologic_zone_

### Not too much of a difference

### LDA (Latent Dirichlet Allocation) is another popular Topic Modeling approach

In [16]:
# Configure a 7-topic LDA model trained with the online learning method;
# the random seed is fixed at 42.
lda_params = dict(
    n_components=7,
    max_iter=5,
    learning_method='online',
    learning_offset=50,
    random_state=42,
)
lda = LatentDirichletAllocation(**lda_params)

In [17]:
# Fit the LDA model on the TF-IDF document-term matrix.
# NOTE(review): LDA is normally fit on raw term counts (here: `vectorized`)
# rather than TF-IDF weights — confirm this input is intended.
lda.fit(tfidf_vecD)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=7, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [18]:
# Feature names ordered by column index, derived from the fitted vocabulary
# (term -> column index). This is the same list `get_feature_names()` returns
# but does not depend on that method, which newer scikit-learn releases lack.
tf_feature_names = sorted(tfidf_vec.vocabulary_, key=tfidf_vec.vocabulary_.get)

# For every LDA topic keep the column indices of its ten highest-weighted
# terms (best first) and print the corresponding terms.
lda_topics = dict()
for topic_idx, topic in enumerate(lda.components_):
    # Rank once and reuse; stored as plain Python ints.
    top_indices = topic.argsort()[:-11:-1].tolist()
    lda_topics["Topic {}:".format(topic_idx+1)] = top_indices
    print("Topic {}:".format(topic_idx+1), ", ".join(tf_feature_names[i] for i in top_indices))
    print("")

Topic 1: extremely_stony, climatic_zone_subalpine, leighcan, rawah_wilderness_area, comanche_peak_wilderness_area, geologic_zone_igneous_and_metamorphic, leighcan_family_complex, como, legault_families_complex, leighcan_family

Topic 2: geologic_zone_alluvium, climatic_zone_montane_and_subalpine, cryaquolis_complex, gateview_family, cache_la_poudre_wilderness_area, comanche_peak_wilderness_area, till_substratum_complex, typic_cryaquolls, cryumbrepts_complex, bross_family

Topic 3: catamount_families, climatic_zone_montane, bullwark, rubbly, rock_outcrop_complex, geologic_zone_igneous_and_metamorphic, cache_la_poudre_wilderness_area, cryorthents, comanche_peak_wilderness_area, rock_land_complex

Topic 4: ratake_family, climatic_zone_lower_montane, rock_outcrop_complex, rubbly, comanche_peak_wilderness_area, cathedral_family, geologic_zone_igneous_and_metamorphic, cache_la_poudre_wilderness_area, extremely_stony, aquolis_complex

Topic 5: haploborolis, climatic_zone_lower_montane, rubbly

### Compared to our "test set" this is starting to look a bit more reasonable

In [19]:
# Echo the stored top-10 term string of each true soil type, each followed
# by a blank line.
for label, top_terms in soil_CountVect.items():
    print(label, top_terms, end="\n\n")

Soil Type 1: climatic_zone_subalpine, geologic_zone_igneous_and_metamorphic, extremely_stony, rawah_wilderness_area, comanche_peak_wilderness_area, leighcan_family, geologic_zone_glacial, till_substratum, como, legault_families_complex

Soil Type 2: geologic_zone_igneous_and_metamorphic, climatic_zone_subalpine, extremely_stony, rawah_wilderness_area, comanche_peak_wilderness_area, como, legault_families_complex, climatic_zone_montane, catamount_families, rock_outcrop_complex

Soil Type 3: geologic_zone_igneous_and_metamorphic, rock_outcrop_complex, rubbly, climatic_zone_lower_montane, cache_la_poudre_wilderness_area, comanche_peak_wilderness_area, climatic_zone_montane, bullwark, catamount_families, vanet

Soil Type 4: cache_la_poudre_wilderness_area, geologic_zone_igneous_and_metamorphic, rock_outcrop_complex, climatic_zone_lower_montane, rubbly, haploborolis, geologic_zone_alluvium, climatic_zone_montane_and_subalpine, gateview_family, cryaquolis_complex

Soil Type 5: geologic_zone_

### Comparing our LDA predictions for the 10 most common 'features' per topic 

In [20]:
# Topic label -> column indices of that topic's top-ranked terms.
lda_topics

{'Topic 1:': [30, 16, 42, 52, 17, 34, 44, 18, 39, 43],
 'Topic 2:': [32, 13, 22, 31, 5, 17, 63, 67, 28, 2],
 'Topic 3:': [6, 12, 3, 59, 56, 34, 5, 25, 17, 54],
 'Topic 4:': [51, 11, 56, 59, 17, 9, 34, 5, 30, 0],
 'Topic 5:': [38, 11, 59, 5, 56, 34, 57, 70, 17, 22],
 'Topic 6:': [69, 11, 60, 71, 73, 50, 5, 34, 56, 17],
 'Topic 7:': [10, 47, 46, 23, 48, 52, 29, 17, 34, 40]}

### With the true 10 most common features for each Soil Type

In [39]:
# Map every true soil type to the feature indices of its ten most frequent
# terms, so they can be compared with the index lists kept in lda_topics.
soil_types = {
    "Soil Type {}".format(number): [
        tf_feature_names.index(term)
        for term in soil_CountVect["Soil Type {}:".format(number)].split(", ")
    ]
    for number in range(1, 8)
}

soil_types

{'Soil Type 1': [16, 34, 30, 52, 17, 43, 33, 62, 18, 39],
 'Soil Type 2': [34, 16, 30, 52, 17, 18, 39, 12, 6, 56],
 'Soil Type 3': [34, 56, 59, 11, 5, 17, 12, 3, 6, 69],
 'Soil Type 4': [5, 34, 56, 11, 59, 38, 32, 13, 31, 22],
 'Soil Type 5': [34, 17, 16, 30, 52, 53, 59, 12, 19, 41],
 'Soil Type 6': [34, 56, 59, 6, 12, 5, 3, 17, 11, 69],
 'Soil Type 7': [34, 30, 10, 17, 25, 47, 42, 44, 46, 23]}

### Here's the catch with LDA and NMF. Both are predicting topics, however, 'Topic 1' does not necessarily match up with the true 'Topic' --> 'Soil Type 1'
- Making a huge assumption here (along with a number of others) when 'predicting' the accuracy of these models
- Additionally, I am only looking at the 10 most common 'features'

In [42]:
# Pair the feature indices of Soil Type 1's top terms with the indices kept
# for LDA Topic 1 and print them side by side.
#
# `y_true` / `y_pred` are deliberately left as full lists: the following
# cells display them and build sets from them. Using them as loop variables
# would leave a single string / int behind and break those cells on a
# top-to-bottom run.
pairs = [
    (tf_feature_names.index(term), topic_feature)
    for term, topic_feature in zip(soil_CountVect["Soil Type 1:"].split(", "),
                                   lda_topics["Topic 1:"])
]
y_true = [true_idx for true_idx, _ in pairs]
y_pred = [pred_idx for _, pred_idx in pairs]
test_true = list(y_true)
for true_idx, pred_idx in pairs:
    print(true_idx, pred_idx)

16 30
34 16
30 42
52 52
17 17
43 34
33 44
62 18
18 39
39 43


In [28]:
# Display the true vs. predicted feature-index lists being compared.
# NOTE(review): relies on `y_true` / `y_pred` already holding lists in the
# kernel; confirm the cells above leave them that way on a top-to-bottom run.
y_true, y_pred

([16, 34, 30, 52, 17, 43, 33, 62, 18, 39],
 [30, 16, 42, 52, 17, 34, 44, 18, 39, 43])

In [30]:
# Percentage of the true feature indices that also occur among the predicted
# ones; order within the lists is ignored.
inter = set(y_true) & set(y_pred)

len(inter) / len(y_true) * 100

80.0

In [33]:
# Crude per-topic score: pair LDA topic i with soil type i and measure what
# percentage of the soil type's top-term indices the topic also contains.
hack_accuracy = []
for number in range(1, 8):
    true_terms = soil_CountVect["Soil Type {}:".format(number)].split(", ")
    pairs = list(zip(true_terms, lda_topics["Topic {}:".format(number)]))
    y_true = [tf_feature_names.index(term) for term, _ in pairs]
    y_pred = [topic_feature for _, topic_feature in pairs]

    # Set overlap, so ranking order inside the top ten does not matter.
    inter = set(y_true) & set(y_pred)
    acc = len(inter)/len(y_true)*100
    hack_accuracy.append(acc)
    print("'Accuracy' for 'Topic {}': {}".format(number, acc))

'Accuracy' for 'Topic 1': 80.0
'Accuracy' for 'Topic 2': 10.0
'Accuracy' for 'Topic 3': 80.0
'Accuracy' for 'Topic 4': 50.0
'Accuracy' for 'Topic 5': 30.0
'Accuracy' for 'Topic 6': 60.0
'Accuracy' for 'Topic 7': 60.0


### Final 'Accuracy Metric' is 52.86%. That was ugly, let's try something a bit nicer. 

In [37]:
# Average of the per-topic percentages collected in hack_accuracy.
np.asarray(hack_accuracy).mean()

52.857142857142854

___