In [135]:
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize
import pickle

### Cleaning Scraped Data from MountainProject.com

Importing data from the scraped files - you can substitute your own filenames

In [136]:
# File-name prefixes of the scraped batches to load. Each prefix is spliced
# into the CSV file names read by the loading cells below.
prefix_to_import = [
    'a-i', 'ca', 'co', 'gunks',
    'k-nj', 'nm-s', 't-v', 'w',
]

In [137]:
# Read every batch of scraped route text and stack them into one frame.
# All files are read first and concatenated once: calling pd.concat inside the
# loop re-copies the accumulated rows on every iteration.
# Row labels are not reset, so index values repeat across batches.
df = pd.concat(
    [pd.read_csv('Text_Data/' + prefix + '_routes_text.csv')
     for prefix in prefix_to_import]
)


In [138]:
# Read every batch of scraped route data and stack them into one frame.
# All files are read first and concatenated once: calling pd.concat inside the
# loop re-copies the accumulated rows on every iteration.
# Row labels are not reset, so index values repeat across batches.
df_d = pd.concat(
    [pd.read_csv('Scraped_Data/' + prefix + '_route_data.csv')
     for prefix in prefix_to_import]
)


In [139]:
# Read every batch of route URLs and stack them into one frame.
# All files are read first and concatenated once: calling pd.concat inside the
# loop re-copies the accumulated rows on every iteration.
# Row labels are not reset, so index values repeat across batches.
df_url = pd.concat(
    [pd.read_csv('Route_URL/' + prefix + '_routes.csv')
     for prefix in prefix_to_import]
)


### Cleaning up .csv nonsense

In [140]:
# Remove the 'Unnamed: 0' column in place (presumably a row index written out
# by an earlier to_csv call - confirm). Raises KeyError if the column is
# already gone, so this cell is not re-runnable without reloading df_url.
df_url.drop(columns=['Unnamed: 0'], inplace=True)

In [141]:
# Remove the 'Unnamed: 0' and 'index' columns in place (presumably leftover
# row indexes from earlier CSV round-trips - confirm). Raises KeyError if a
# column is already gone, so this cell is not re-runnable without reloading df_d.
df_d.drop(columns=['Unnamed: 0', 'index'], inplace=True)

### Remove duplicates

In [142]:
# Collapse repeated route ids to one row per id in each of the three tables.
# GroupBy.first() takes the first non-null value of every column within a
# group, so a surviving row can combine values from several duplicate rows.
df, df_d, df_url = (
    table.groupby('id', as_index=False).first()
    for table in (df, df_d, df_url)
)

### Combine urls & route data

In [143]:
# Attach the URL columns to the route data. This is an inner join on 'id',
# so routes missing from either table are dropped.
df_d = pd.merge(df_d, df_url, on='id', how='inner')

### Clean up rating to follow standard format (5.XXa)
* Some fun with Regex - catching all of the scenarios is a little tricky!

In [144]:
def clean_rating(messy_rating):
    """Normalize a scraped rating string to the ``5.XX`` / ``5.XXa`` form.

    The first ``5.<digits><lowercase letter>`` occurrence in the text is used;
    when there is none, the first ``5.<digits>`` occurrence is used instead.
    A single-digit grade is zero-padded to two digits, so ``5.9`` becomes
    ``5.09`` and ``5.9a`` becomes ``5.09a``.

    Args:
        messy_rating: Raw rating text. Non-string values (such as the NaN
            pandas produces for an empty CSV cell) are treated as having
            no rating.

    Returns:
        The normalized rating string, or ``'0'`` when no ``5.x`` grade is
        found.
    """
    import re

    if not isinstance(messy_rating, str):
        return '0'

    # A letter-suffixed grade anywhere in the text wins over a bare grade,
    # even when the bare grade appears earlier in the string.
    match = re.search(r'5\.([0-9]+)([a-z])', messy_rating)
    if match is None:
        match = re.search(r'5\.([0-9]+)', messy_rating)
    if match is None:
        return '0'

    digits = match.group(1)
    letter = match.group(2) if match.lastindex == 2 else ''
    # zfill pads only single-digit grades; two or more digits pass through.
    return '5.' + digits.zfill(2) + letter

In [145]:
# Normalize every scraped rating; only the 'rating' column is read, so apply
# clean_rating to that column directly instead of iterating over whole rows.
df_d['rating'] = df_d['rating'].apply(clean_rating)

In [146]:
# Save the merged route table (with normalized ratings) for later use.
with open('full_dataset.pkl', mode='wb') as picklefile:
    pickle.dump(df_d, picklefile)

### Cleaning up text data
* Remove newline characters (`\n`)
* Remove leading/ending spaces
* Combine descriptions and comments for one long document (not used in final model)

In [8]:
# Flatten each free-text field: coerce the value to str, delete embedded
# newlines, then trim surrounding whitespace.
# Note: str() turns a missing (NaN) cell into the literal text 'nan'.
text_cols = ['Comment', 'Loc', 'Desc', 'Intro', 'Prot', 'Descent', 'Escape']
for col in text_cols:
    df[col] = df[col].map(lambda raw: str(raw).replace('\n', '').strip())


In [9]:
# Join description and comment text into one document per route.
# The two strings are concatenated directly, with no separator between them.
df['Desc_Comment'] = df['Desc'].add(df['Comment'])

### Sentence tokenize each document for improved accuracy
* Need to keep track of routes associated with each sentence
* Use dictionary to build this, then convert to DF for readability

In [97]:
# Split every route description into sentences, keeping the route id that
# each sentence came from (sentence_list[k] belongs to route id_num[k]).
desc_sentences = df['Desc'].map(sent_tokenize).tolist()
id_list = df['id'].tolist()

sentence_list = []
id_num = []
for route_id, sentences in zip(id_list, desc_sentences):
    for sent in sentences:
        # Skip only the 'nan' placeholder left by missing descriptions.
        # A substring test ('nan' in sent) would also discard real sentences
        # containing words such as "dominant" or "banana".
        if sent.strip() != 'nan':
            sentence_list.append(sent)
            id_num.append(route_id)

In [98]:
# One row per sentence: the route id alongside the sentence text.
sent_dict = {'id': id_num, 'sentence': sentence_list}

df_sent = pd.DataFrame(sent_dict)

### Create training dataset using sampled routes

In [10]:
# Draw 20,000 routes whose sentences will form the training set.
# A fixed random_state makes the draw repeatable across kernel restarts;
# without it the training set changes on every run.
sample_ids = df.sample(n=20000, random_state=42)['id'].tolist()

In [99]:
# Keep only the sentences that belong to one of the sampled routes.
df_train = df_sent.loc[df_sent['id'].isin(sample_ids)]

### Output data for use in final model

In [100]:
# Save the training subset of sentences.
with open('df_train.pkl', mode='wb') as picklefile:
    pickle.dump(df_train, picklefile)

In [101]:
# Save the full sentence table (all routes, not only the sampled ones).
with open('df_sent.pkl', mode='wb') as picklefile:
    pickle.dump(df_sent, picklefile)