# Challenge 1: Natural Language Processing Overview

In [1]:
# imports

import nltk


## Test NLTK Installation

In [5]:
# Download the dataset (corpus) named 'brown'

nltk.download('brown')

[nltk_data] Downloading package brown to /home/alberto/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [8]:
from nltk.corpus import brown

In [None]:
brown.words()[0:10]

In [None]:
# First ten tagged words from the Brown corpus

brown.tagged_words()[0:10]

In [23]:
text = 'Ironhack is a Global Tech School ranked num 2 worldwide.   Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do. This ideology is reflected in our teaching practices, which consist of a nine-weeks immersive programming, UX/UI design or Data Analytics course as well as a one-week hiring fair aimed at helping our students change their career and get a job straight after the course. We are present in 8 countries and have campuses in 9 locations - Madrid, Barcelona, Miami, Paris, Mexico City,  Berlin, Amsterdam, Sao Paulo and Lisbon.'

In [24]:
text

'Ironhack is a Global Tech School ranked num 2 worldwide. \u2028\u2028Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do. This ideology is reflected in our teaching practices, which consist of a nine-weeks immersive programming, UX/UI design or Data Analytics course as well as a one-week hiring fair aimed at helping our students change their career and get a job straight after the course. We are present in 8 countries and have campuses in 9 locations - Madrid, Barcelona, Miami, Paris, Mexico City,  Berlin, Amsterdam, Sao Paulo and Lisbon.'

In [25]:
from nltk import sent_tokenize, word_tokenize
# Fetch the 'punkt' package; presumably the data the tokenizers above rely on (confirm against the NLTK docs)
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/alberto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [26]:
sent_tokenize(text)

['Ironhack is a Global Tech School ranked num 2 worldwide.',
 'Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do.',
 'This ideology is reflected in our teaching practices, which consist of a nine-weeks immersive programming, UX/UI design or Data Analytics course as well as a one-week hiring fair aimed at helping our students change their career and get a job straight after the course.',
 'We are present in 8 countries and have campuses in 9 locations - Madrid, Barcelona, Miami, Paris, Mexico City,  Berlin, Amsterdam, Sao Paulo and Lisbon.']

In [27]:
word_tokenize(text)

['Ironhack',
 'is',
 'a',
 'Global',
 'Tech',
 'School',
 'ranked',
 'num',
 '2',
 'worldwide',
 '.',
 'Our',
 'mission',
 'is',
 'to',
 'help',
 'people',
 'transform',
 'their',
 'careers',
 'and',
 'join',
 'a',
 'thriving',
 'community',
 'of',
 'tech',
 'professionals',
 'that',
 'love',
 'what',
 'they',
 'do',
 '.',
 'This',
 'ideology',
 'is',
 'reflected',
 'in',
 'our',
 'teaching',
 'practices',
 ',',
 'which',
 'consist',
 'of',
 'a',
 'nine-weeks',
 'immersive',
 'programming',
 ',',
 'UX/UI',
 'design',
 'or',
 'Data',
 'Analytics',
 'course',
 'as',
 'well',
 'as',
 'a',
 'one-week',
 'hiring',
 'fair',
 'aimed',
 'at',
 'helping',
 'our',
 'students',
 'change',
 'their',
 'career',
 'and',
 'get',
 'a',
 'job',
 'straight',
 'after',
 'the',
 'course',
 '.',
 'We',
 'are',
 'present',
 'in',
 '8',
 'countries',
 'and',
 'have',
 'campuses',
 'in',
 '9',
 'locations',
 '-',
 'Madrid',
 ',',
 'Barcelona',
 ',',
 'Miami',
 ',',
 'Paris',
 ',',
 'Mexico',
 'City',
 ',',
 '

# Challenge 2: Preparing Text Data For Analysis

## Text Cleaning

In [72]:
text = '''@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]")'''

In [73]:
def remove_url(sentence):
    """
    Remove every space-separated word that contains a URL scheme.

    A word is dropped when it contains 'http://' or 'https://'. The sentence
    is split on single spaces, so the spacing between the remaining words is
    preserved exactly as in the input.

    Args:
        sentence: The string to strip URLs from.

    Returns:
        The sentence with all URL-bearing words removed.
    """
    # Build a new list instead of popping from the list being iterated:
    # popping shifts the following items left, so the word right after a
    # removed URL was never inspected (consecutive URLs slipped through).
    kept = [
        word
        for word in sentence.split(' ')
        if 'http://' not in word and 'https://' not in word
    ]
    return ' '.join(kept)
 
def remove_digits(sentence):
    """
    Drop every digit character from a string.

    Args:
        sentence: The string to filter.

    Returns:
        The input with all characters for which ``str.isdigit`` is true removed.
    """
    non_digits = filter(lambda ch: not ch.isdigit(), sentence)
    return ''.join(non_digits)

def remove_non_alpha(sentence):
    """
    Replace every non-alphabetic character with a single space.

    Args:
        sentence: The string to filter.

    Returns:
        A string of the same length in which each character that fails
        ``str.isalpha`` has been swapped for ' '.
    """
    pieces = []
    for ch in sentence:
        pieces.append(ch if ch.isalpha() else ' ')
    return ''.join(pieces)

def clean_up(s):
    """
    Normalise a raw string by stripping URLs, digits and special characters.

    The steps run in a fixed order: URL-bearing words are removed first, then
    digits are dropped, and finally every remaining non-alphabetic character
    is replaced by a space. The result is lower-cased and stripped of leading
    and trailing whitespace; runs of interior spaces are left untouched.

    Args:
        s: The string to be cleaned up.

    Returns:
        The cleaned, lower-cased string.
    """
    without_urls = remove_url(s)
    without_digits = remove_digits(without_urls)
    letters_only = remove_non_alpha(without_digits)
    return letters_only.lower().strip()

In [74]:
clean_up(text)

'ironhack s  q website  is'

## Tokenization

In [70]:
def tokenize(s):
    """
    Split a string into word tokens.

    Thin wrapper that delegates to ``word_tokenize``.

    Args:
        s: String to be tokenized.

    Returns:
        Whatever ``word_tokenize`` produces for ``s`` (a list of tokens).
    """
    tokens = word_tokenize(s)
    return tokens

In [71]:
tokenize('ironhack s  q website  is')

['ironhack', 's', 'q', 'website', 'is']

## Stemming and Lemmatization

In [79]:
# Fetch the 'wordnet' corpus; presumably the data WordNetLemmatizer looks words up in (confirm against the NLTK docs)
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /home/alberto/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [80]:
lemmatizer = WordNetLemmatizer()

In [81]:
# Called without a `pos` argument: the result is 'wa', not the verb lemma 'be'
# (presumably because the word is then treated as a noun; confirm against the NLTK docs)
lemmatizer.lemmatize('was')
# 'wa'

'wa'

In [82]:
# With pos='v' the word is lemmatized as a verb
lemmatizer.lemmatize('runs', pos='v')
# 'run'

'run'

In [90]:
def stem_and_lemmatize(l_words):
    """
    Lemmatize each word in a list with a WordNetLemmatizer.

    Despite the name, no stemmer is applied: every word is only passed
    through ``WordNetLemmatizer.lemmatize`` with no ``pos`` argument.

    Args:
        l_words: A list of strings.

    Returns:
        A new list with the lemmatized form of each input word, in the
        same order.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    return list(map(wordnet_lemmatizer.lemmatize, l_words))

In [91]:
stem_and_lemmatize(['ironhack', 'was', 'q', 'website', 'is'])

['ironhack', 'wa', 'q', 'website', 'is']