# Natural Language Processing Project - Part 1

## Import Dependencies

In [None]:
import pandas as pd
import spacy
import re
import nltk
from bs4 import BeautifulSoup

In [None]:
# Fetch the NLTK English stop-word corpus that pre_processing reads from
# nltk.corpus.stopwords; the call reports and skips packages already present.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Read Dataset

In [None]:
# Load the raw reviews; the previews in this notebook show a text column
# 'review' and a label column 'sentiment'.
# NOTE(review): the absolute path looks like a Colab Google Drive mount, but no
# mount step is visible in this notebook - confirm before a fresh run.
imdb = pd.read_csv("/content/drive/My Drive/NLP/IMDB Dataset.csv")

In [None]:
# Preview the first five raw rows (note the HTML <br /> tags left in the text).
imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Pre Processing


In [None]:
# Load the English spaCy pipeline once; it is reused for every review.
# The bare 'en' shortcut only exists in spaCy 2.x, so try the full package
# name first and fall back to the shortcut on older installations.
try:
  spc_en = spacy.load('en_core_web_sm')
except OSError:
  spc_en = spacy.load('en')

# Build the stop-word set once. Re-reading the corpus on every call is wasted
# work, and set membership is O(1) where the original list lookup was O(n).
STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))

def pre_processing(text):
  """Clean one raw review and return it as a single space-joined string.

  Steps, in order:
    1. strip HTML markup,
    2. lowercase,
    3. keep only runs of the letters a-z (digits and punctuation are dropped),
    4. drop NLTK English stop words,
    5. replace tokens that spaCy tags as VERB with their lemma.

  Args:
    text: raw review text, possibly containing HTML tags.

  Returns:
    The cleaned tokens joined by single spaces (empty string if nothing is
    left after cleaning).
  """
  # Name the parser explicitly so the result does not depend on which optional
  # parsers are installed (and no "guessed parser" warning is raised). The
  # separator keeps words on either side of a tag such as <br /> apart.
  plain_text = BeautifulSoup(text, 'html.parser').get_text(separator=' ')

  # Lowercase first so the a-z pattern also captures capitalised words.
  words = re.findall(r'[a-z]+', plain_text.lower())

  # Remove stop words.
  meaningful_words = [word for word in words if word not in STOP_WORDS]

  # Run the spaCy pipeline on the cleaned text to obtain POS tags and lemmas.
  doc = spc_en(' '.join(meaningful_words))

  # Lemmatization is applied to verbs only; every other token keeps its text.
  tokens = [token.lemma_ if token.pos_ == 'VERB' else token.text
            for token in doc]

  return ' '.join(tokens)

In [None]:
# Clean every review. This overwrites the raw 'review' column in place, so
# re-running the cell feeds already-cleaned text through pre_processing again;
# reload the CSV first if the raw text is needed.
imdb['review'] = imdb['review'].apply(pre_processing)

In [None]:
# Preview the cleaned reviews.
# NOTE(review): the recorded output shows lists of tokens, while pre_processing
# returns one joined string per review - the output appears to predate the
# current function; re-run to refresh it.
imdb.head()

Unnamed: 0,review,sentiment
0,"[one, reviewers, mention, watch, oz, episode, ...",positive
1,"[wonderful, little, production, filming, techn...",positive
2,"[think, wonderful, way, spend, time, hot, summ...",positive
3,"[basically, family, little, boy, jake, think, ...",negative
4,"[petter, mattei, love, time, money, visually, ...",positive


In [None]:
# Work on a copy so the frame produced by the pre-processing step is untouched.
new_imdb = imdb.copy()
# pre_processing already returns one space-joined string per review, and
# " ".join on a str would insert a space between every character. Join only
# values that are still token lists; pass strings through unchanged.
new_imdb["review"] = imdb["review"].apply(
    lambda tokens: tokens if isinstance(tokens, str) else " ".join(tokens)
)

In [None]:
# Encode the label as 1 for 'positive' and 0 for anything else. An explicit
# comparison yields an integer column on every pandas version, whereas
# get_dummies returns booleans by default on pandas >= 2.0.
new_imdb['sentiment'] = new_imdb['sentiment'].eq('positive').astype(int)

In [None]:
# Preview the model-ready frame: cleaned text plus the numeric sentiment label.
new_imdb.head()

Unnamed: 0,review,sentiment
0,one reviewers mention watch oz episode hook ri...,1
1,wonderful little production filming technique ...,1
2,think wonderful way spend time hot summer week...,1
3,basically family little boy jake think zombie ...,0
4,petter mattei love time money visually stunnin...,1


In [None]:
# Persist the cleaned dataset; index=False keeps the row index out of the file.
new_imdb.to_csv("/content/drive/My Drive/NLP/IMDB_cleaned.csv", index=False)

## Bag of words

In [None]:
# Reload the cleaned dataset written by the pre-processing section
# (presumably so this section can start here without repeating the cleaning).
new_imdb = pd.read_csv("/content/drive/My Drive/NLP/IMDB_cleaned.csv")

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# binary=True records whether a word occurs in a review rather than how often;
# max_features=5000 keeps only the 5000 most frequent terms in the corpus.
vectorizer = CountVectorizer(binary=True, max_features=5000)
# A review left empty by cleaning is read back from CSV as NaN, which
# CountVectorizer rejects, so missing text is replaced with an empty string.
X = vectorizer.fit_transform(new_imdb['review'].fillna(''))

In [None]:
# Show the document-term matrix as a dense array.
# NOTE(review): fit_transform returns a sparse matrix; toarray() materialises
# every cell (rows x 5000 columns) just for display - consider inspecting
# X.shape or a small slice instead.
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the sparse document-term matrix directly instead of densifying it.
# train_test_split and both estimators used below accept SciPy sparse input,
# and the mostly-zero bag-of-words matrix is far smaller in sparse form.
# 33% of the rows are held out for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, new_imdb['sentiment'],
    test_size=0.33, random_state=42
)

## Model

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Shallow random forest: max_depth=3 caps the depth of every tree, and
# random_state=0 fixes the seed so the fit is repeatable.
random_forest = RandomForestClassifier(max_depth=3, random_state=0)
# fit() returns the estimator, so the cell displays its full configuration.
random_forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [None]:
# Predict sentiment for the held-out reviews with the random forest.
# NOTE(review): y_hat is reassigned by the logistic-regression cell before
# validation() is ever called, so these predictions are never evaluated.
y_hat = random_forest.predict(X_test)

In [None]:
def validation(y_test, y_hat):
  """Print accuracy, precision and recall for binary predictions.

  Assumes the labels are encoded as 0 (negative) and 1 (positive).

  Args:
    y_test: true labels.
    y_hat: predicted labels, aligned with y_test.

  Returns:
    None; the three metrics are printed.
  """
  acc = accuracy_score(y_test, y_hat)

  # Pin the label order so the matrix is always 2x2. Without it, data that
  # contains a single class yields a 1x1 matrix and the unpacking fails.
  tn, fp, fn, tp = confusion_matrix(y_test, y_hat, labels=[0, 1]).ravel()

  # Guard the denominators: with no predicted (or no actual) positives the
  # ratio is undefined, so report 0.0 instead of dividing by zero.
  predicted_positives = tp + fp
  actual_positives = tp + fn
  precision = tp / predicted_positives if predicted_positives else 0.0
  recall = tp / actual_positives if actual_positives else 0.0

  print("Acuracia: ", acc)
  print("Precision: ", precision)
  print("Recall: ", recall)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# With the default max_iter=100 the solver stopped at its iteration limit
# before converging on this data, so allow more iterations.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
# Predictions for the held-out reviews; this replaces the earlier y_hat.
y_hat = log_reg.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
# Report accuracy, precision and recall for the logistic-regression
# predictions (y_hat was last assigned from log_reg.predict).
validation(y_test, y_hat)