In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
# Load the Mineral Evolution Database (med) locality/mineral table.
# `sep=r'\s+'` is the documented replacement for the deprecated
# `delim_whitespace=True` and splits the file on runs of whitespace in the same way.
med = pd.read_csv('tbl_locality_age_cache.csv', sep=r'\s+')

# Columns that the matching steps in the following cells do not use.
unused_med_columns = [
    'at_locality', 'mindat_id', 'is_remote', 'age_type', 'is_legit_age',
    'max_age', 'max_age_excel_id', 'max_age_ref_id',
    'min_age', 'min_age_excel_id', 'min_age_ref_id',
    'dated_locality_mindat_id',
]

# Add two placeholder columns filled with -1; -1 is the "not matched yet"
# sentinel that later cells overwrite and filter on.
med = med.drop(columns=unused_med_columns).assign(mineral_names=-1, rj_id=-1)
med

Unnamed: 0,locality_id,mineral_id,mineral_names,rj_id
0,1,819,-1,-1
1,1,1289,-1,-1
2,1,1399,-1,-1
3,1,1478,-1,-1
4,1,1673,-1,-1
...,...,...,...,...
1763562,285668,2026,-1,-1
1763563,285668,2116,-1,-1
1763564,285668,3741,-1,-1
1763565,285669,1547,-1,-1


In [3]:
# Load the International Mineralogical Association (IMA) mineral names together with
# their mineral id #'s, discarding the 'Mineral Name' column right after reading.
IMA_names = pd.read_csv('Mindat_ID.csv').drop(columns=['Mineral Name'])
IMA_names

Unnamed: 0,Mineral Name (plain),Database ID
0,Abellaite,6482
1,Abelsonite,777
2,Abenakiite-(Ce),778
3,Abernathyite,779
4,Abhurite,780
...,...,...
5560,Zussmanite,4843
5561,Zvyaginite,6232
5562,Zvyagintsevite,4844
5563,Zwieselite,4845


In [4]:
# Combine the med database with the IMA name database by matching med['mineral_id']
# against the IMA database IDs.
#
# The previous guard `num in med['mineral_id']` tested the Series *index* labels, not the
# ids themselves, and the per-name `.loc` loop rescanned med once per IMA entry while
# writing strings into an integer column. A single dictionary lookup performs the same
# join in one pass.
#
# Columns are taken by position, exactly as the original row unpacking did:
# column 0 holds the mineral name, column 1 holds the database ID.
id_to_name = dict(zip(IMA_names.iloc[:, 1], IMA_names.iloc[:, 0]))

med = (
    med
    # ids without an IMA entry keep the -1 sentinel
    .assign(mineral_names=med['mineral_id'].map(id_to_name).fillna(-1))
    .drop(columns=['mineral_id'])
)
med

Unnamed: 0,locality_id,mineral_names,rj_id
0,1,Aktashite,-1
1,1,Calcite,-1
2,1,Chalcopyrite,-1
3,1,Cinnabar,-1
4,1,Dickite,-1
...,...,...,...
1763562,285668,Galena,-1
1763563,285668,Gold,-1
1763564,285668,Pyrite,-1
1763565,285669,Cooperite,-1


In [5]:
# Read the RockJock names and ID #'s; the 'RockJock Names' column is dropped on load.
rj = pd.read_csv('rj_matching.csv').drop(columns=['RockJock Names'])
rj

Unnamed: 0,Matching names,RJ number,Recommended Name
0,Alunite,62,Alunite
1,Actinolite,161,Amphibole (actinolite)
2,Ferro-tschermakite,54,Amphibole (ferrotschermakite)
3,Tremolite,151,Amphibole (tremolite)
4,Analcime,114,Analcime
...,...,...,...
169,Rossmanite,108,Tourmaline
170,Schorl,108,Tourmaline
171,Tsilaisite,108,Tourmaline
172,Vanadio-oxy-chromium-dravite,108,Tourmaline


### List of name-mismatched minerals between the RockJock database and the Mindat database:
#### hornblende, biotite, chert, boehmite, chlorite, clinoptilolite (Hector), glauconite, halloysite, illite, anorthoclase, andesine, bytownite, labradorite, oligoclase, psilomelane, hypersthene, serpentine, smectite, tourmaline

### List of the suggested names from Mindat:
#### biotite: hydrobiotite, boehmite: bohmite, chert: quartz, glauconite: celadonite (suggested, pre-1998), andesine: sanidine,  bytownite: anorthite, labradorite: anorthite, oligoclase: albite,  smectite (ferruginous): nontronite

### List of expanded mineral families from Rock Jock:
#### hornblende, barite, chlorite, illite, serpentine, tourmaline

### List of minerals not found (missing from Mindat or discredited by the IMA):
#### anorthoclase, psilomelane, hypersthene

In [6]:
# Check that every RockJock 'Matching names' entry is found in the IMA names database
# (Match = 1 when found, 0 when missing).
# Note, many of the RockJock names had to be altered or combined to be properly found
# in the IMA names database.
pd.options.display.max_rows = 500  # allow up to 500 rows to be displayed, so the full check table is visible

rj_names = list(rj['Matching names'])
# `isin` builds its lookup once instead of re-creating the full IMA name list for every
# RockJock name. Match is kept as integers; the previous numpy round-trip silently
# converted the 1/0 flags to strings.
match = rj['Matching names'].isin(IMA_names['Mineral Name (plain)']).astype(int).tolist()
df_matches = pd.DataFrame({'Names': rj_names, 'Match': match})
df_matches

Unnamed: 0,Names,Match
0,Alunite,1
1,Actinolite,1
2,Ferro-tschermakite,1
3,Tremolite,1
4,Analcime,1
5,Anatase,1
6,Andalusite,1
7,Anhydrite,1
8,Ankerite,1
9,Aragonite,1


In [7]:
# Match the RockJock minerals to the med database and remove all unnecessary entries
# (rows whose rj_id is still the -1 sentinel afterwards).
# Note, this method renders the rj_id column meaningless
#
# A dictionary lookup replaces the per-name loop, which converted the whole
# mineral_names column to an array and rescanned it for every RockJock entry.
# The first two rj columns are used by position, as before: name, then RockJock number.
name_to_rj_id = dict(zip(rj.iloc[:, 0], rj.iloc[:, 1]))
matched_ids = med['mineral_names'].map(name_to_rj_id)

# Rows whose mineral is not in the RockJock table keep their current rj_id value.
med['rj_id'] = matched_ids.fillna(med['rj_id']).astype(med['rj_id'].dtype)

df_cleaned = med[med['rj_id'] != -1]
df_cleaned

Unnamed: 0,locality_id,mineral_names,rj_id
1,1,Calcite,14
2,1,Chalcopyrite,159
3,1,Cinnabar,107
4,1,Dickite,27
5,1,Galena,115
...,...,...,...
1763557,285667,Quartz,1
1763559,285667,Talc,43
1763561,285668,Chalcopyrite,159
1763562,285668,Galena,115


In [8]:
# Match the mineral names back to the original Rock Jock names (where possible) and
# export the result to 'rj_med_localities.csv'.
#
# The lookup table goes from 'Matching names' to 'Recommended Name'. Mapping the whole
# column at once avoids first creating an integer placeholder column and then
# overwriting it with strings one RockJock entry at a time.
matching_to_recommended = dict(zip(rj['Matching names'], rj['Recommended Name']))
recommended_names = df_cleaned['mineral_names'].map(matching_to_recommended)

# Rows without a recommended name get the -1 sentinel.
df_cleaned = df_cleaned.assign(rj_names=recommended_names.fillna(-1))
df_cleaned.to_csv('rj_med_localities.csv')
df_cleaned

Unnamed: 0,locality_id,mineral_names,rj_id,rj_names
1,1,Calcite,14,Calcite
2,1,Chalcopyrite,159,Chalcopyrite
3,1,Cinnabar,107,Cinnabar
4,1,Dickite,27,Dickite
5,1,Galena,115,Galena
...,...,...,...,...
1763557,285667,Quartz,1,Quartz
1763559,285667,Talc,43,Talc
1763561,285668,Chalcopyrite,159,Chalcopyrite
1763562,285668,Galena,115,Galena


In [9]:
# Verify that only RockJock names remain in the database: select every row whose
# rj_names value is NOT one of the recommended RockJock names (df_check should be empty).
has_rockjock_name = df_cleaned['rj_names'].isin(rj['Recommended Name'])
df_check = df_cleaned[~has_rockjock_name]
df_check

Unnamed: 0,locality_id,mineral_names,rj_id,rj_names


In [None]:
# Create a list of lists of minerals from each locality, using the Rock Jock names.
localities = np.unique(df_cleaned['locality_id'])

# One groupby pass collects the names per locality; the previous loop re-filtered the
# entire frame once for every locality id. Groups are produced in ascending
# locality_id order, the same order np.unique yields, and row order inside each
# group is preserved.
minerals_by_locality = df_cleaned.groupby('locality_id')['rj_names'].agg(list)

# Localities with exactly one mineral entry are left out.
name_loc = [names for names in minerals_by_locality if len(names) != 1]

In [None]:
# Export the per-locality mineral lists (name_loc) to the pickle file 'min_loc'.
with open('min_loc', 'wb') as pickle_file:
    pickle.dump(name_loc, pickle_file)

In [None]:
# Determine the frequency (in percent) of each of the Rock Jock minerals to help guide
# the association rule process. Inputs are re-read from disk rather than reused from the
# cells above.
df_minerals = pd.read_csv('rj_med_localities.csv')
rj = pd.read_csv('rj_matching.csv')
# Caution: pickle.load can execute arbitrary code; only load a 'min_loc' file that this
# notebook produced.
with open('min_loc', 'rb') as f:
    name_loc = pickle.load(f)

# Count the rows per Rock Jock name once, then look each recommended name up in that
# table. This replaces a per-name filter whose result was read with `counts[0]`
# (a positional lookup on a label-indexed Series, which pandas has deprecated) and
# an array grown with np.append on every iteration.
name_counts = df_minerals['rj_names'].value_counts()
freq = rj['Recommended Name'].map(name_counts).fillna(0).to_numpy(dtype=float)

# NOTE(review): the counts come from every row of the exported table, while
# len(name_loc) only counts localities that were kept when name_loc was built
# (single-entry localities were skipped) -- confirm this denominator is intended.
freq = freq / len(name_loc)
freq = freq * 100

# Column 0: frequency in percent, column 1: recommended Rock Jock name.
min_freq = np.column_stack([freq, np.array(rj['Recommended Name'])])
min_freq

In [None]:
# Smallest value in the frequency column (column 0) of min_freq.
np.min(min_freq[:, 0])