***

## Natural Language Processing

***

### Data Preprocessing

#### Importing libraries

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

#### Importing dataset

In [2]:
df = pd.read_csv("C:/Users/tze/OneDrive/ML_BOOTCAMP/Machine Learning A-Z (Codes and Datasets)/Part 7 - Natural Language Processing/Section 36 - Natural Language Processing/Python/Restaurant_Reviews.tsv", delimiter = '\t', quoting = 3)
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


#### Text Cleaning

In [3]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
#reviews list which is yet empty
corpus = []
for i in range(0, len(df)):
    review = re.sub('^a-zA-Z', ' ', df['Review'][i]) # Replace others symbols by a space in the reviews columns
    review = review.lower()                          # lowering the reviews after substitution 
    review = review.split()                          # And then split them into piece of words
    ps = PorterStemmer()                             # Stemmer that uses root of words to identify them
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    review = " ".join(review)                        # Join each piece of word but separates them with a space
    corpus.append(review)                            # Adding all pieces of word 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tze\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Bag Of Words model

#### Feature Extraction

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
tokenizer = CountVectorizer(max_features = 1500)
X = tokenizer.fit_transform(corpus).toarray()
y = df.iloc[:, -1].values

#### Splitting the dataset into training and testing sets

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.2, random_state = 42)

#### Fitting Naive Bayes Model & Test Set Prediction

In [6]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

#### Classification Metrics

In [7]:
from sklearn.metrics import confusion_matrix, accuracy_score
c_matrix = confusion_matrix(y_test, y_pred)
score = accuracy_score(y_test, y_pred)
c_matrix, score

(array([[47, 49],
        [16, 88]], dtype=int64),
 0.675)

### Predicting good review 👍🏾

In [8]:
_review ='I love this restaurant so much'
_review = re.sub('^a-zA-Z', ' ', _review)
_review = _review.lower()
_review = _review.split()
_review = [ps.stem(word) for word in _review if not word in set(all_stopwords)]
_review = ' '.join(_review)
corpus_ = [_review]
X_ = tokenizer.transform(corpus_).toarray()
y_ = classifier.predict(X_)
y_

array([1], dtype=int64)

### Predicting good review 👎🏾

In [9]:
review_ = 'I hate this restaurant so much'
review_ = re.sub('^a-zA-Z', ' ', review_)
review_ = review_.lower()
review_ = review_.split()
review_ = [ps.stem(word) for word in review_ if not word in set(all_stopwords)]
review_ = ' '.join(review_)
corpus__ = [review_]
x = tokenizer.transform(corpus__).toarray()
_y = classifier.predict(x)
_y

array([0], dtype=int64)

***

### Fitting other classification models 

***

#### Loading necessaries

In [26]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

#### Models building & performances evaluation

#### - Performance metrics

In [35]:
scores = {'accuracy': make_scorer(accuracy_score),
          'precision': make_scorer(precision_score),
          'recall': make_scorer(recall_score),
          'f1_score': make_scorer(f1_score)}

#### - Model building

In [32]:
knn_ = KNeighborsClassifier()
svc_ = SVC()
dtc_ = DecisionTreeClassifier()
rfc_ = RandomForestClassifier()
ada_ = AdaBoostClassifier()
nbc_ = GaussianNB()
log_ = LogisticRegression(max_iter = 10000)

#### - Models evaluation function

In [37]:
def evaluate(X, y, folds):
    #cross-validations
    knn = cross_validate(knn_, X_train, y_train, cv = folds, scoring = scores)
    svc = cross_validate(svc_, X_train, y_train, cv = folds, scoring = scores)
    dtc = cross_validate(dtc_, X_train, y_train, cv = folds, scoring = scores)
    rfc = cross_validate(rfc_, X_train, y_train, cv = folds, scoring = scores)
    ada = cross_validate(ada_, X_train, y_train, cv = folds, scoring = scores)
    nbc = cross_validate(nbc_, X_train, y_train, cv = folds, scoring = scores)
    log = cross_validate(log_, X_train, y_train, cv = folds, scoring = scores)
    #model scores dataframe
    score_table = pd.DataFrame({'K-NN': [knn['test_accuracy'].mean(),
                                         knn['test_precision'].mean(),
                                         knn['test_recall'].mean(),
                                         knn['test_f1_score'].mean()],
                               
                                'Support Vector Classification': [svc['test_accuracy'].mean(),
                                                                  svc['test_precision'].mean(),
                                                                  svc['test_recall'].mean(),
                                                                  svc['test_f1_score'].mean()],
                               
                               'Decision Tree Classification': [dtc['test_accuracy'].mean(),
                                                                dtc['test_precision'].mean(),
                                                                dtc['test_recall'].mean(),
                                                                dtc['test_f1_score'].mean()],
                               
                               'Random Forest Classification': [rfc['test_accuracy'].mean(),
                                                                rfc['test_precision'].mean(),
                                                                rfc['test_recall'].mean(),
                                                                rfc['test_f1_score'].mean()],
                               
                               'AdaBoost Classification': [ada['test_accuracy'].mean(),
                                                           ada['test_precision'].mean(),
                                                           ada['test_recall'].mean(),
                                                           ada['test_f1_score'].mean()],
                               
                               'Naive Bayes Classification': [nbc['test_accuracy'].mean(),
                                                              nbc['test_precision'].mean(),
                                                              nbc['test_recall'].mean(),
                                                              nbc['test_f1_score'].mean()],
                               
                               'Logistic Regression': [log['test_accuracy'].mean(),
                                                       log['test_precision'].mean(),
                                                       log['test_recall'].mean(),
                                                       log['test_f1_score'].mean()]
                               },
                              index = ['Accuracy', 'Precision', 'Recall', 'F1 Score'])
    #Adding Best-score collumn
    score_table['Best-Score'] = score_table.idxmax(axis = 1)
    return score_table
#Running the function
evaluate(X_train, y_train, 10)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Unnamed: 0,K-NN,Support Vector Classification,Decision Tree Classification,Random Forest Classification,AdaBoost Classification,Naive Bayes Classification,Logistic Regression,Best-Score
Accuracy,0.65625,0.79125,0.72375,0.785,0.74625,0.70875,0.82,Logistic Regression
Precision,0.82612,0.882302,0.742129,0.874786,0.895027,0.653965,0.839859,AdaBoost Classification
Recall,0.394167,0.669679,0.676987,0.661987,0.556282,0.880962,0.787949,Naive Bayes Classification
F1 Score,0.53114,0.758904,0.706146,0.750266,0.680546,0.749783,0.812064,Logistic Regression
