# Natural Language Processing
One of my objectives was to predict from the description how well the movie was going to be received.
To do that I need to work with the ``description`` column as input, and change the ``rating`` column into a boolean one where True is over 5 of valoration.

In [34]:
import pandas as pd
import numpy as np

In [35]:
data = pd.read_csv('data\imdb_processed.csv')

In [36]:
sample = data.sample(100, random_state=42)

## Target column

In [None]:
def approved(rating):
    if rating >= 5.0:
        return True
    else:
        return False

In [None]:
rating_bool = data.rating.map(approved)
rating_bool.value_counts()

True     33941
False     4917
Name: rating, dtype: int64

## Processing text

In [37]:
# text = data[['genres', 'description']]
descriptions = sample.description


In [41]:
from nltk.tokenize import word_tokenize
import nltk
# nltk.download('punkt')

from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet

from nltk.corpus import stopwords

import re



In [46]:
def clean_up(s):
    """
    Cleans up numbers, URLs, and special characters from a string.

    Args:
        s: The string to be cleaned up.

    Returns:
        A string that has been cleaned up.
    """
    reg_url = '(?:(?:https?|ftp):\/\/)?[\w\/\-?=%.]+\.[\w\/\-&?=%.]+'
    reg_sp  = '[^A-Za-z ]'

    s = re.sub(reg_url,'',s) 

    s = re.sub(reg_sp,' ',s) 

    return s

def tokenize(s):
    """
    Tokenize a string.

    Args:
        s: String to be tokenized.

    Returns:
        A list of words as the result of tokenization.
    """
    s = word_tokenize(s)

    return s

def stem_and_lemmatize(l):
    """
    Perform stemming and lemmatization on a list of words.

    Args:
        l: A list of strings.

    Returns:
        A list of strings after being stemmed and lemmatized.
    """
    ps = PorterStemmer() #I'm not convinced these go here
    lemmatizer = WordNetLemmatizer() 

    list = [lemmatizer.lemmatize(ps.stem(word)) for word in l]

    for word in l:
        word = ps.stem(word)
        word = lemmatizer.lemmatize(word)

    return list

def remove_stopwords(l):
    """
    Remove English stopwords from a list of strings.

    Args:
        l: A list of strings.

    Returns:
        A list of strings after stop words are removed.
    """
    clean_list = [word for word in l if not word in stopwords.words()]

    return clean_list

def full_process(s):
    '''
    Args:
        s: the string to process

    Returns:
        The list of words after removing the stopwords
    '''
    # s = clean_up(s)
    l = tokenize(s)
    l = stem_and_lemmatize(l)
    clean_list = remove_stopwords(l)

    return clean_list

In [43]:
tokens = descriptions.apply(full_process) #full
tokens[:15]

19686    [six, youth, crimin, chosen, particip, social,...
953      [privat, detect, hire, three, kidnap, wealthi,...
27625    [ireland, fli, saucer, full, alien, land, farm...
1416     [armi, send, andi, thoma, pose, renegad, find,...
38319    [anxieti, attack, new, encount, forc, mari, re...
1731     [sli, busi, manag, wacki, friend, two, opera, ...
33109    [loaf, countri, boy, arriv, athen, studi, inst...
7954     [mani, version, shakespear, masterpiec, none, ...
18924    [th, centuri, ukrain, polish, overlord, ukrain...
21617    [show, set, circu, backdrop, focus, littlechap...
35369    [housewif, despis, societi, health, look, susp...
12639    [life, career, heavyweight, champion, joe, lou...
3168     [dr, molnac, hi, music, troup, begg, manag, mi...
16111    [militari, school, cadet, boon, win, date, fre...
26766    [cotter, qv, sioux, indian, whose, life, tragi...
Name: description, dtype: object

In [44]:
tokens = descriptions.apply(full_process) # no stopwords
tokens[:15]

19686    [six, youth, crimin, chosen, particip, social,...
953      [privat, detect, hire, three, kidnap, wealthi,...
27625    [ireland, fli, saucer, full, alien, land, farm...
1416     [armi, send, andi, thoma, pose, renegad, find,...
38319    [anxieti, attack, new, encount, forc, mari, re...
1731     [sli, busi, manag, wacki, friend, two, opera, ...
33109    [loaf, countri, boy, arriv, athen, studi, inst...
7954     [mani, version, shakespear, masterpiec, none, ...
18924    [th, centuri, ukrain, polish, overlord, ukrain...
21617    [show, set, circu, backdrop, focus, littlechap...
35369    [housewif, despis, societi, health, look, susp...
12639    [life, career, heavyweight, champion, joe, lou...
3168     [dr, molnac, hi, music, troup, begg, manag, mi...
16111    [militari, school, cadet, boon, win, date, fre...
26766    [cotter, qv, sioux, indian, whose, life, tragi...
Name: description, dtype: object

In [47]:
tokens = descriptions.apply(full_process) # no regex
tokens[:15]

19686    [six, youth, crimin, chosen, particip, social,...
953      [privat, detect, hire, three, kidnap, wealthi,...
27625    [ireland, fli, saucer, full, alien, land, farm...
1416     [armi, send, andi, thoma, pose, renegad, find,...
38319    [anxieti, attack, new, encount, forc, mari, re...
1731     [sli, busi, manag, wacki, friend, two, opera, ...
33109    [loaf, countri, boy, arriv, athen, studi, inst...
7954     [mani, version, shakespear, masterpiec, none, ...
18924    [th, centuri, ukrain, polish, overlord, ukrain...
21617    [show, set, circu, backdrop, focus, littlechap...
35369    [housewif, despis, societi, health, look, susp...
12639    [life, career, heavyweight, champion, joe, lou...
3168     [dr, molnac, hi, music, troup, begg, manag, mi...
16111    [militari, school, cadet, boon, win, date, fre...
26766    [cotter, qv, sioux, indian, whose, life, tragi...
Name: description, dtype: object

In [45]:
data.shape

(38858, 8)