# Analyzing

### Read the Mountain Project CSV into a pandas DataFrame

In [51]:
# Importing modules
import pandas as pd

# Location of the Mountain Project routes CSV, relative to the notebook
file_path = "dataset/mp_routes.csv"

# Load the routes into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
df.head(5)

Unnamed: 0.1,Unnamed: 0,Route,Location,URL,Avg Stars,Route Type,Rating,Pitches,Length,Area Latitude,Area Longitude,desc,protection,num_votes
0,0,Access Denied,El Mirador > El Potrero Chico > Nuevo Leon > N...,https://www.mountainproject.com/route/11014983...,2.9,Sport,5.10b/c,4,350.0,25.95044,-100.47755,This is a really great route~ with awesome exp...,12 draws + 60m Rope Take 22 draws if you wan...,22
1,1,Agave Nectar,Sugar Shack > Cougar Canyon (Creek) - CONSTRUC...,https://www.mountainproject.com/route/11091386...,2.0,Sport,5.10b/c,1,,51.09642,-115.31767,from tabvar: Cool fins to roof~ thin holds...,4 bolts to anchor,1
2,2,Ant & Bee do Yoga,The Hen House > Kamloops > British Columbia > ...,https://www.mountainproject.com/route/11240652...,2.7,Trad,5.10b/c,1,,50.57212,-120.13874,A safe mixed route with a bit of run out up to...,"mixed~ gear to 4""",3
3,3,Besame Fuerte,Pilon De Lolita > Loreto Area > Baja Californi...,https://www.mountainproject.com/route/11608640...,2.0,Sport,5.10b/c,1,80.0,26.01097,-111.34166,Start on a slab under a left leaning arched ro...,bolts,1
4,4,Big Momma's Rock,The Courtyard > Mamquam FSR > Squamish > Briti...,https://www.mountainproject.com/route/11445772...,3.0,Sport,5.10b/c,1,60.0,49.71393,-123.09943,Fun technical climbing. Tricky right off the bat.,bolts,3


In [52]:
# Show which columns were loaded
print(df.columns)

# Report how many routes (rows) the frame holds
print("Number of routes:", df.shape[0])

Index(['Unnamed: 0', 'Route', 'Location', 'URL', 'Avg Stars', 'Route Type',
       'Rating', 'Pitches', 'Length', 'Area Latitude', 'Area Longitude',
       ' desc', ' protection', ' num_votes'],
      dtype='object')
Number of routes: 116700


In [53]:
# Drop the listed columns from the frame
# (`columns=` already selects the column axis, so no `axis` argument is needed)
df = df.drop(
    columns=[
        'Unnamed: 0',
        'URL',
        'Route Type',
        'Pitches',
        'Length',
        'Area Latitude',
        'Area Longitude',
        ' protection',
        ' num_votes',
    ]
)

# Preview the first five rows of the trimmed frame
df.head(5)

Unnamed: 0,Route,Location,Avg Stars,Rating,desc
0,Access Denied,El Mirador > El Potrero Chico > Nuevo Leon > N...,2.9,5.10b/c,This is a really great route~ with awesome exp...
1,Agave Nectar,Sugar Shack > Cougar Canyon (Creek) - CONSTRUC...,2.0,5.10b/c,from tabvar: Cool fins to roof~ thin holds...
2,Ant & Bee do Yoga,The Hen House > Kamloops > British Columbia > ...,2.7,5.10b/c,A safe mixed route with a bit of run out up to...
3,Besame Fuerte,Pilon De Lolita > Loreto Area > Baja Californi...,2.0,5.10b/c,Start on a slab under a left leaning arched ro...
4,Big Momma's Rock,The Courtyard > Mamquam FSR > Squamish > Briti...,3.0,5.10b/c,Fun technical climbing. Tricky right off the bat.


In [54]:
# Confirm which columns remain after the drop
print(df.columns)

Index(['Route', 'Location', 'Avg Stars', 'Rating', ' desc'], dtype='object')


### Remove rows whose descriptions are empty or longer than 2000 words

In [55]:
# Print the number of rows before removal
print("Number of rows before removal:", len(df))

# Count whitespace-separated words once per description instead of
# re-tokenising the whole column for each side of the range check.
# Missing descriptions yield NaN, which fails the check below and is dropped.
desc_word_counts = df[' desc'].str.split().str.len()

# Keep rows whose description has between 1 and 2000 words (both inclusive)
df = df[desc_word_counts.between(1, 2000)]

# Print the number of rows after removal
print("Number of rows after removal:", len(df))

Number of rows before removal: 116700
Number of rows after removal: 116430


In [56]:
# Preview the first few descriptions (note the column name has a leading space)
df[' desc'].head()

0    This is a really great route~ with awesome exp...
1    from tabvar:     Cool fins to roof~ thin holds...
2    A safe mixed route with a bit of run out up to...
3    Start on a slab under a left leaning arched ro...
4    Fun technical climbing. Tricky right off the bat.
Name:  desc, dtype: object

### Remove any routes with non-English descriptions

In [60]:
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect's algorithm is randomised; fixing the seed makes short or
# ambiguous descriptions classify the same way on every run, so the
# filtered row count is reproducible.
DetectorFactory.seed = 0


def is_english(text):
    """Return True when langdetect classifies `text` as English.

    Args:
        text: The description to classify.

    Returns:
        True if the detected language code is 'en'. False for non-string
        values and for text langdetect cannot classify.
    """
    if not isinstance(text, str):
        return False
    try:
        return detect(text) == 'en'
    except LangDetectException:
        # Only detection failures are treated as "not English"; a bare
        # `except` here would also swallow KeyboardInterrupt and make this
        # long-running cell impossible to interrupt.
        return False


# Remove rows where the description is not in English
df = df[df[' desc'].apply(is_english)]

# Print the number of rows after removal of non-English
print("Number of rows after removal:", len(df))

### Remove punctuation and lowercase all descriptions

In [None]:
import string

# Translation table that deletes every character in string.punctuation
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Strip ASCII punctuation from `text` and return the result lowercased.

    Only the characters in `string.punctuation` are removed. They are deleted
    rather than replaced by a space, so "5.10b/c" becomes "510bc".

    Args:
        text: The string to normalise.

    Returns:
        The lowercased string with punctuation characters removed.
    """
    # One pass through the precomputed table replaces the per-character
    # membership test against string.punctuation.
    return text.translate(_PUNCTUATION_TABLE).lower()

# Replace each description with its punctuation-free, lowercased form
df[' desc'] = df[' desc'].map(clean_text)

# Preview the first five rows after cleaning
df.head(5)