In [None]:
!pip install xlwt

In [None]:
from sklearnex import patch_sklearn  #This is a library to accelerate existing scikit-learn code
patch_sklearn() #For more info: https://github.com/intel/scikit-learn-intelex

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import xgboost as xgb
from mlxtend.classifier import StackingCVClassifier

from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score, ConfusionMatrixDisplay, f1_score, precision_score, recall_score, accuracy_score, average_precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, KFold, StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.over_sampling import ADASYN
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from wordcloud import WordCloud
import re
import spacy
from nltk.corpus import stopwords
import warnings

# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Seed NumPy's global random number generator.
np.random.seed(0)

In [None]:
# Read the tab-separated reviews file from the Kaggle input directory and preview the first rows.
df = pd.read_csv("/kaggle/input/reviews/Restaurant_Reviews.tsv", sep="\t")
df.head()

Let's see if there are any null values. (There aren't.)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Print the frame summary (column dtypes and non-null counts).
df.info()

The `Liked` column needs to be categorical.

In [None]:
# Store the target column with pandas' categorical dtype.
df["Liked"]=df["Liked"].astype("category")

In [None]:
def get_axes_list(length, column_number=2):
    """
    Create a subplot grid and hand back only its axes.

    Thin wrapper around ``get_fig_and_axes_list`` that discards the figure object.

    Args:
    - length: int, the number of subplots to create
    - column_number: int, the number of columns in the subplot grid. Default value is 2.

    Returns:
    - list of matplotlib Axes objects, one per subplot
    """
    _, axes = get_fig_and_axes_list(length, column_number=column_number)
    return axes

In [None]:
def get_fig_and_axes_list(plot_count, column_number=2):
    """
    Build a constrained-layout figure holding ``plot_count`` subplots arranged row by row
    in a grid that is ``column_number`` cells wide. Cells of the last row that are not
    needed are simply left empty.

    Args:
    - plot_count: int, the number of subplots to be plotted
    - column_number: int, the number of columns in the subplot grid. Default value is 2.

    Returns:
    - fig: matplotlib Figure object, the generated figure
    - axes_list: list of matplotlib Axes objects, in row-major order
    """
    full_rows, leftover = divmod(plot_count, column_number)
    # A partially filled last row still needs a grid row of its own.
    row_total = full_rows + (leftover > 0)
    # A lone plot gets a slightly smaller cell than plots that share a grid.
    cell_size = 3.5 if plot_count == 1 else 4.2
    fig = plt.figure(figsize=(cell_size * column_number, cell_size * row_total),
                     layout="constrained")
    grid = fig.add_gridspec(row_total, column_number)
    # divmod maps the running plot index onto its (row, column) cell.
    axes_list = [fig.add_subplot(grid[divmod(position, column_number)])
                 for position in range(plot_count)]
    return fig, axes_list

In [None]:
# Number of reviews per target class.
df["Liked"].value_counts()

In [None]:
# Bar chart of the number of reviews per target class.
ax = get_axes_list(1)[0]
# NOTE(review): the palette is set after the axes already exist, so the bar colour may
# differ between the first run and a re-run of this cell — confirm the intended order.
sns.set_palette("PuRd_r")
bar = sns.countplot(x=df["Liked"],ax=ax)
# Replace the category tick labels with readable names (presumably 0 -> 'Not Liked', 1 -> 'Liked').
ax.set_xticklabels(['Not Liked', 'Liked'])
ax.set_xlabel(None)
ax.set_ylabel("Count")
# `bar` is the object returned by countplot for the axes passed in above.
bar.set_title('Distribution of Restaurant Reviews')
plt.show()

In [None]:
# Lower-case the raw review text in place.
df['Review'] = df['Review'].str.lower()

In [None]:
# Display the frame after lower-casing (pandas truncates long frames in the rendered output).
df

# Stopwords

A stopword is a commonly used word in a language that is usually removed from text data because it is not considered useful for natural language processing tasks such as text classification or sentiment analysis. Examples of stopwords in English include "a," "an," "the," "and," "or," "but," and "is". Since these words are so common and do not carry much meaning on their own, they are often removed from text data to improve the efficiency and accuracy of natural language processing algorithms. In this project, however, I will be using a custom stopwords list that does not contain negative words, because we need them. If we removed them, the sentences "It is not good" and "It is good" would become identical.

In [None]:
# Bind the NLTK list to its own name: assigning it back to `stopwords` would shadow the
# imported `nltk.corpus.stopwords` object and make this cell raise AttributeError when re-run.
english_stopwords = stopwords.words("english")

In [None]:
# Hand-picked stopword list. Negations such as "not" and "no" are deliberately absent,
# so they survive the filtering done later in the notebook.
custom_stopwords = [
    # personal and possessive pronouns
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves',
    # interrogatives and demonstratives
    'what', 'which', 'who', 'whom',
    'this', 'that', "that'll", 'these', 'those',
    # auxiliary verbs
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing',
    # articles and conjunctions
    'a', 'an', 'the',
    'and', 'if', 'or', 'because', 'as', 'until', 'while',
    # prepositions
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below',
    'to', 'from', 'up', 'down', 'in', 'out', 'on', 'over', 'under',
    # adverbs and quantifiers
    'again', 'further', 'then', 'once',
    'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'own', 'same', 'so', 'than', 'too',
    # modal verbs and contraction fragments
    's', 'can', 'will', 'just', 'should', "should've", 'now',
    'd', 'll', 'm', 'o', 're', 've', 'y', 'ma',
]

In [None]:
# Load the en_core_web_sm model using its package name.
# The resulting `nlp` object is used below for lemmatization and for its stopword list.
nlp = spacy.load("en_core_web_sm")

# Lemmatization 

Lemmatization is a process of reducing words to their base or dictionary form, which is called a lemma. The main purpose of lemmatization is to group together different inflected forms of a word, such as "run," "runs," and "running," so that they can be analyzed as a single item. This is often used in natural language processing to improve the accuracy of text analysis and machine learning models.

Unlike stemming, which simply chops off the end of words to get to their base form, lemmatization uses a vocabulary and morphological analysis of words to reduce them to their base form. This means that the resulting lemmas are actual words that can be found in a dictionary, rather than just truncated versions of the original words. For example, the lemma of "I'm" is "I" and "am," and the lemma of "went" is "go." If the word "I'm" is used in stemming, only the word "I" will be extracted, which is not desirable.

In [None]:
# Small demonstration: run one sentence through the spaCy pipeline and print each token's lemma.
string = "Hello! I don't know what I'm doing here."
print("Original sentence: ",string)
doc = nlp(string)
lemmas = [token.lemma_ for token in doc]
print("After lemmatization: ",lemmas)

In [None]:
# Step 1: replace every review by the space-joined lemmas of its tokens, lower-cased.
df['Review_tokenized'] = df['Review'].apply(lambda x: " ".join([token.lemma_ for token in nlp(x)])).str.lower()
# Build the lookup set once; the original rebuilt it from the list for every single word.
custom_stopword_set = set(custom_stopwords)
# Step 2: turn every non-letter character into a space, split on whitespace and drop
# the words that appear in the custom stopword list.
df['Review_tokenized'] = df['Review_tokenized'].apply(
    lambda x: " ".join([word for word in re.sub("[^a-zA-Z]"," ",x).split() if word not in custom_stopword_set]))

Let's have a look at our DataFrame's tokenized version.

In [None]:
# Display settings: number of rows shown for a truncated frame, and the maximum width of a cell.
pd.options.display.min_rows = 15
pd.options.display.max_colwidth = 50
# Show the cleaned text next to the target.
df[['Review_tokenized',"Liked"]] 

# Wordcloud

A word cloud is a visual representation of text data, where the most frequently occurring words in a given set of text are displayed in a way that highlights their prominence. Word clouds can be useful for quickly identifying the key themes or topics in a large body of text.

In [None]:
# For the word cloud only, additionally remove spaCy's own English stopword collection.
spacy_stopwords = nlp.Defaults.stop_words
# Build the lookup set once; the original rebuilt it for every single word.
spacy_stopword_set = set(spacy_stopwords)
df['Review_tokenized_restrict_stopwords'] = df['Review_tokenized'].apply(
    lambda x: " ".join([word for word in re.sub("[^a-zA-Z]"," ",x).split() if word not in spacy_stopword_set]))
# Concatenate all filtered reviews into one text for WordCloud.generate.
text = " ".join(df['Review_tokenized_restrict_stopwords'])
wordcloud = WordCloud(max_font_size=100,
                      max_words=200,
                      background_color="black").generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Model input: the cleaned review text. Target: the categorical `Liked` column.
X = df["Review_tokenized"]
y = df["Liked"]

In [None]:
# Bag-of-words counts over unigrams and bigrams.
# NOTE(review): the vocabulary is fitted on every review, before the train/test split that
# happens in a later cell, so held-out text shapes the features — consider fitting on the
# training rows only.
vectorizer = CountVectorizer(ngram_range=(1,2))
X_count = vectorizer.fit_transform(X)

In [None]:
# TF-IDF weights over unigrams and bigrams, fitted on every review.
# NOTE(review): X_tf_idf is not referenced by any later cell visible here — confirm whether it is still needed.
tf_idf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_tf_idf = tf_idf_vectorizer.fit_transform(X)

In [None]:
# Seed shared by the split, the CV splitter and several models below.
rand_state=42
# Stratified 80/20 split of the count features; y_train / y_test keep the original row labels of df.
X_train, X_test, y_train, y_test = train_test_split(X_count, y, test_size=0.2,random_state=rand_state, stratify=y)

In [None]:
#Our results DataFrame
results = pd.DataFrame(columns=["Function Name", "Accuracy", "Precision", "Recall", "F1",
                               "TN", "FP", "FN", "TP", "ROC-AUC Score", "Precision-Recall Score", "Train CV Score"])
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=rand_state)
kf_for_test = StratifiedKFold(n_splits=6, shuffle=True, random_state=0) #We use another one for testing different cross validations

In [None]:
def cross_val_score_for_sampling(model, X_train, y_train, cv=kf_for_test, 
                                 results=results, undersample=False, 
                                 oversample=False, stack=False, other=False):  
    """
    Cross-validate a model by hand, optionally resampling inside every training fold.

    For each split produced by ``cv`` a pipeline (optional ADASYN oversampler, optional
    NearMiss undersampler, then the model) is fitted on the training part and scored on
    the held-out part. Resampling therefore never touches the held-out fold.

    Parameters:
    -----------
    model : estimator
        Classifier exposing ``predict_proba``. A clone is fitted, so the caller's
        estimator is left untouched.
    X_train : array-like or sparse matrix, shape (n_samples, n_features)
        Training features; must support positional row indexing with an index array.
    y_train : array-like, shape (n_samples,)
        Binary training targets. A pandas Series is accepted regardless of its index.
    cv : cross-validation splitter, default=kf_for_test
        Object whose ``split(X, y)`` yields positional train/test indices.
    results : pandas.DataFrame, default=results
        Unused; kept for signature compatibility.
    undersample : bool, default=False
        Whether to put NearMiss (version 3) in front of the model.
    oversample : bool, default=False
        Whether to put ADASYN in front of the model (before the undersampler, if both are set).
    stack, other : bool, default=False
        Unused; kept for signature compatibility.

    Returns:
    --------
    dict
        Maps 'accuracy', 'precision', 'recall', 'f1', 'roc_auc' and 'average_precision'
        to the list of per-fold scores.
    """
    from sklearn.base import clone  # local import: the file's import cell does not provide it

    scoring_metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'average_precision']
    scores_dict = {metric: [] for metric in scoring_metrics}

    # Same step order as before: oversampler first, then undersampler, then the model.
    samplers = []
    if oversample:
        samplers.append(ADASYN(random_state=rand_state))
    if undersample:
        samplers.append(NearMiss(sampling_strategy='majority', n_jobs=-1, version=3))
    sample_pipeline = make_pipeline(*samplers, clone(model))

    # The splitter yields positions, but `series[int_array]` on a pandas Series looks up
    # index labels. After a shuffled train/test split the labels are no longer 0..n-1, so
    # the targets are converted to a plain array that is indexed positionally.
    y_values = np.asarray(y_train)

    for train_idx, test_idx in cv.split(X_train, y_values):
        fold_model = sample_pipeline.fit(X_train[train_idx], y_values[train_idx])
        # Probability of the second class; rounding turns it into a hard 0/1 prediction.
        fold_proba = fold_model.predict_proba(X_train[test_idx])[:, 1]
        fold_prediction = np.round(fold_proba)
        y_fold = y_values[test_idx]
        scores_dict["accuracy"].append(accuracy_score(y_fold, fold_prediction))
        scores_dict["precision"].append(precision_score(y_fold, fold_prediction))
        scores_dict["recall"].append(recall_score(y_fold, fold_prediction))
        scores_dict["f1"].append(f1_score(y_fold, fold_prediction))
        scores_dict["roc_auc"].append(roc_auc_score(y_fold, fold_proba))
        scores_dict["average_precision"].append(average_precision_score(y_fold, fold_proba))
    return scores_dict

In [None]:
def get_scoring_name(scoring):
    """
    Translate a scoring key into the label shown in plot titles.

    Parameters:
    -----------
    scoring : str
        One of 'accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'average_precision'.

    Returns:
    --------
    str
        Human-readable name of the metric.

    Raises:
    -------
    KeyError
        If ``scoring`` is not one of the keys listed above.
    """
    equal = {"accuracy": "Accuracy", 
             "precision": "Precision",  # was misspelled "precisione", which broke scoring="precision"
             "recall": "Recall", 
             "f1": "F1", 
             "roc_auc": "ROC AUC", 
             "average_precision": "AP AUC"} 
    return equal[scoring]

In [None]:
def get_training_cross_validated_score(model, X_train, y_train, undersample=False, oversample=False, stack=False, other=False, scoring="Accuracy"):
    """
    Return the mean cross-validated score of a model on the training data.

    Parameters:
    -----------
    model : estimator
        The model to cross-validate.
    X_train, y_train : array-like
        Training features and targets.
    undersample, oversample, stack : bool, default=False
        If any of them is set, the manual fold loop of ``cross_val_score_for_sampling``
        is used and the mean of its per-fold scores for ``scoring`` is returned.
    other : bool, default=False
        If set (and none of the three flags above is), cross-validation is skipped.
    scoring : str, default="Accuracy"
        Metric name. Strings are matched case-insensitively, because the keys returned
        by ``cross_val_score_for_sampling`` are lower-case.

    Returns:
    --------
    float or None
        The mean score; None when cross-validation is skipped or fails.
    """
    # Normalise the name so that the default "Accuracy" resolves to the "accuracy" key.
    scoring_key = scoring.lower() if isinstance(scoring, str) else scoring
    try:
        if undersample or oversample or stack:
            fold_scores = cross_val_score_for_sampling(
                model, X_train, y_train, undersample=undersample, oversample=oversample,
                stack=stack, other=other)[scoring_key]
            return np.mean(fold_scores)
        if other:
            return None
        return cross_val_score(
            model, X_train, y=y_train, cv=kf, scoring=scoring_key, n_jobs=-1).mean()
    except Exception as e:
        # Stay best-effort (the caller treats None as "no score"), but report the reason
        # instead of hiding it. print is used because the notebook silences warnings.
        print(f"Cross-validation failed for {model.__class__.__name__}: {e!r}")
        return None

In [None]:
def print_scores(models, X_train=X_train, X_test=X_test, y_train=y_train, 
                 y_test=y_test, result_prefix="",results=results, undersample=False, 
                 oversample=False, stack=False, other=False, scoring="roc_auc"):
    """
    Fit every model, plot its test confusion matrix and record its metrics.

    Each model is fitted on the training data and evaluated on the test data. One
    confusion matrix per model is drawn, titled with the test value of ``scoring`` and,
    when available, the cross-validated training value. A row per model is written to
    ``results`` (an existing row with the same function name is overwritten).

    Parameters:
    -----------
    models : dict
        A dictionary of scikit-learn compatible models to evaluate.
    X_train : array-like, shape (n_samples, n_features), default=X_train
        The training input samples.
    X_test : array-like, shape (n_samples, n_features), default=X_test
        The testing input samples.
    y_train : array-like, shape (n_samples,), default=y_train
        The target values of the training input samples.
    y_test : array-like, shape (n_samples,), default=y_test
        The target values of the testing input samples.
    result_prefix : str, default=""
        Text appended to the model class name to form the "Function Name" in ``results``.
    results : pandas.DataFrame, default=results
        A DataFrame to store the results; modified in place.
    undersample : bool, default=False
        Whether to perform undersampling during cross-validation on training set.
    oversample : bool, default=False
        Whether to perform oversampling during cross-validation on training set.
    stack : bool, default=False
        Whether Stacking Classifier will be used during cross-validation on training set.
    other : bool, default=False
        Forwarded to ``get_training_cross_validated_score``, where it skips the training
        cross-validation unless one of the three flags above is set.
    scoring : str, default="roc_auc"
        The metric shown in the title and used for cross-validation. One of 'accuracy',
        'precision', 'recall', 'f1', 'roc_auc', 'average_precision'.

    Returns:
    --------
    None
    """
    ax_list = get_axes_list(len(models))
    # The label does not depend on the model, so resolve it once (and fail before any fitting).
    scoring_name = get_scoring_name(scoring)
    for ax_of_model, model in zip(ax_list, models.values()):
        model_name = model.__class__.__name__
        model.fit(X_train, y_train)
        y_pred = np.round(model.predict(X_test))
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        if hasattr(model, "predict_proba"):
            predicted_prob = model.predict_proba(X_test)[:, 1]
            roc_auc = roc_auc_score(y_test, predicted_prob)
            average_precision = average_precision_score(y_test, predicted_prob)
        else:
            # Probability-based metrics are unavailable without predict_proba.
            roc_auc = None
            average_precision = None
        training_cross_validated_score = get_training_cross_validated_score(
            model, X_train, y_train, undersample=undersample, oversample=oversample, stack=stack, other=other, scoring=scoring)
        # Explicit lookup of the test metric selected by `scoring` (replaces locals()[scoring]).
        test_scores = {"accuracy": accuracy,
                       "precision": precision,
                       "recall": recall,
                       "f1": f1,
                       "roc_auc": roc_auc,
                       "average_precision": average_precision}
        scoring_variable = test_scores[scoring]
        # Compare with None so that a legitimate score of 0.0 is still displayed.
        if training_cross_validated_score is not None and scoring_variable is not None:
            # The second number is the cross-validated training score, not the test score again.
            ax_of_model.set_title(f"{model_name} \n ({scoring_name}: {scoring_variable:3.2f} || Train {scoring_name}: {training_cross_validated_score:3.2f})",
                                  fontdict={'fontsize':10})
        elif scoring_variable is not None:
            ax_of_model.set_title(f"{model_name} \n ({scoring_name} score: {scoring_variable:3.2f})")
        else:
            ax_of_model.set_title(f"{model_name}")
        function_name = f"{model_name}{result_prefix}"    
        # Reuse the row of an earlier run with the same name, otherwise add a row at the end.
        function_location = results[results["Function Name"] == function_name].index
        index_to_insert=(len(results.index) if function_location.empty else function_location[0])
        results.loc[index_to_insert] = [function_name, accuracy,
                                        precision, recall, f1, tn, fp, fn, tp, roc_auc, 
                                        average_precision, training_cross_validated_score] 
        # Class names of this dataset's target (the previous labels said 'No Fraud' / 'Fraud').
        labels_ = ['Not Liked', 'Liked']
        ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=labels_, 
                                                ax=ax_of_model, cmap='Blues', xticks_rotation="vertical")
        #print(classification_report(y_test, y_pred)) #If you want to get classification report.
    plt.show()

In [None]:
# Baseline classifiers evaluated on the count features.
# ExtraTreesClassifier now receives the shared seed as well, like the random forest,
# so its result no longer depends on the state of the global RNG.
models = {"Random Forest":RandomForestClassifier(n_jobs=-1, random_state=rand_state),
          "BernoulliNB":BernoulliNB(),
          "MultinomialNB":MultinomialNB(),
          "Extra":ExtraTreesClassifier(n_jobs=-1, random_state=rand_state),}
print_scores(models)

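#### And try a Voting / Stacking Classifier.

A voting classifier is a machine learning technique that combines the predictions of multiple models to make a final prediction. This ensemble approach can improve the accuracy and stability of predictions, especially when individual models are prone to error or have biases. The code below uses a closely related technique, stacking (`StackingCVClassifier`): instead of a simple vote, a meta-classifier is trained to combine the outputs of the base models.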

In [None]:
# Meta-classifier that combines the base models' outputs.
clf_meta=RandomForestClassifier(n_jobs=-1, random_state=rand_state)
# Base models of the stack. Every estimator that accepts a seed gets the shared one, so the
# stack is reproducible independently of the global RNG state.
classifiers_stack=[ExtraTreesClassifier(n_jobs=-1, random_state=rand_state),
                   RandomForestClassifier(n_jobs=-1, random_state=rand_state),
                   xgb.XGBClassifier(eval_metric=average_precision_score, n_jobs=-1, 
                                     tree_method="hist", random_state=rand_state),
                   xgb.XGBRFClassifier(objective="binary:logistic", eval_metric=average_precision_score,
                                       n_jobs=-1, tree_method="hist", random_state=rand_state),
                   # n_jobs is dropped here: scikit-learn ignores it for the liblinear solver.
                   LogisticRegression(solver="liblinear",max_iter=int(1e9), class_weight="balanced"),
                   KNeighborsClassifier(n_jobs=-1),
                   SGDClassifier(loss="modified_huber", n_jobs=-1,max_iter=int(1e9), random_state=rand_state)]
# use_probas=True feeds class probabilities (not hard labels) to the meta-classifier, and
# use_features_in_secondary=True passes it the original features as well.
clf_stack = StackingCVClassifier(classifiers=classifiers_stack,
                                 meta_classifier=clf_meta,
                                 cv=kf,
                                 verbose=False,
                                 use_probas=True,
                                 use_features_in_secondary=True)

In [None]:
# Evaluate the stacked ensemble; stack=True selects the manual cross-validation loop
# for the training score.
vc = dict(Stack=clf_stack)
print_scores(vc, result_prefix="_voting_stack", stack=True)

# Bonus: TPOT

Last but not least, let's try TPOT. It is a Python Automated Machine Learning tool that optimizes machine learning pipelines using genetic programming.

In [None]:
# TPOT is optional: the notebook keeps running when it cannot be imported or when the
# search fails, but the two situations are now reported separately.
try: 
    from tpot import TPOTClassifier
except Exception as e:
    # Importing can fail with more than ImportError when installed package versions clash.
    print(e,"Packages have conflict.")
else:
    try:
        # If you have time, then don't hesitate increasing generations ,offspring_size and population_size
        TPOT_model_us = TPOTClassifier(generations=30,offspring_size=20, scoring='accuracy', cv=kf, 
                                         population_size=20, verbosity=0, n_jobs=-1, random_state=rand_state)
        models = {"TPOT": TPOT_model_us}
        # Dense arrays are passed to TPOT; other=True skips the extra training cross-validation.
        print_scores(models, X_train=X_train.toarray(), X_test=X_test.toarray(), y_train=y_train, 
                     y_test=y_test, other=True, result_prefix="_us_TPOT") 
    except Exception as e:
        print(f"TPOT run failed: {e!r}")

In [None]:
# Show the pipeline TPOT settled on. Evaluates to None (instead of raising) when the
# TPOT cell did not define the model or the model was never fitted.
getattr(globals().get("TPOT_model_us"), "fitted_pipeline_", None)

In [None]:
# Rank the models by test ROC-AUC, show three decimals and mark the best value of each
# headline metric in green.
(
    results
    .sort_values(by="ROC-AUC Score", ascending=False)
    .style
    .format(precision=3)
    .highlight_max(
        subset=["Accuracy", "Precision-Recall Score", "ROC-AUC Score", "Train CV Score"],
        color='lightgreen',
    )
)

The best results are highlighted. I will be waiting for your feedback :)

In [None]:
# Write the metrics table to a legacy Excel (.xls) file in the working directory, without the index column.
# NOTE(review): writing .xls depends on the xlwt engine, which pandas 2.0 removed — confirm the
# pandas version in use, or switch to .xlsx / .csv.
results.to_excel('results_1.1.xls', index=False)