# Final Project - Alaa Sweed - 318462959

# Final Project: Part 1

In [57]:
#import spaCy and load the small English pipeline once; the cleaning cell below reuses it through `nlp`
import spacy
SPACY_MODEL = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL)

In [58]:
#open the raw review file and read every data row into restaurant_reviews (one string per line)
with open("Restaurant_Reviews.tsv", "r", encoding="utf-8") as file:
    #skip the column header line; the None default keeps an empty file from raising StopIteration
    next(file, None)
    #read the rest of the data in the file
    restaurant_reviews = file.readlines()

In [59]:
#clean and lemmatize every raw review; each surviving row is stored in
#restaurant_reviews_updated as a ready-to-write "review<TAB>liked" line
import re

#anything that is not an English letter, a digit or a newline is replaced by a space
non_alnum_pattern = re.compile('[^a-zA-Z0-9\n]')
#look the stop-word set up once instead of on every token
stop_words = nlp.Defaults.stop_words
#NOTE(review): the spaCy stop-word list is applied as-is; if it contains negations,
#dropping them may hurt a sentiment model - confirm this is intended

restaurant_reviews_updated = []
for raw_line in restaurant_reviews:
    raw_line = raw_line.strip()
    #a blank line (e.g. a trailing empty line) has no fields to unpack, so skip it
    if not raw_line:
        continue
    #split on the LAST tab only: the label is the final column, so a review that
    #itself contains a tab no longer breaks the two-value unpacking
    fields = raw_line.rsplit('\t', 1)
    if len(fields) != 2:
        #row without a label column - it cannot be used for training, skip it
        continue
    review, liked = fields
    liked = liked.strip()
    #remove special characters, then convert the text to lower case
    review = non_alnum_pattern.sub(' ', review).lower()
    #make a doc object from the review
    doc = nlp(review)
    #keep the lemma of every token that is neither a stop word nor pure whitespace
    #(the substitution above leaves runs of spaces, which spaCy returns as whitespace tokens)
    lemma_review = [
        token.lemma_
        for token in doc
        if not token.is_space and token.lemma_ not in stop_words
    ]
    #convert the lemma list back to a string
    lemma_review_str = " ".join(lemma_review)
    #drop rows whose review or label is empty after cleaning
    if not lemma_review_str.strip() or not liked:
        continue
    #append the updated review and liked value to the list in .tsv format
    restaurant_reviews_updated.append(f"{lemma_review_str}\t{liked}\n")

In [60]:
#persist the cleaned reviews as Restaurant_Reviews_Updated.tsv:
#the column header row goes first, followed by one line per cleaned review
with open("Restaurant_Reviews_Updated.tsv", "w", encoding="utf-8") as file:
    file.writelines(["Review\tLiked\n", *restaurant_reviews_updated])

# Final Project: Part 2

In [61]:
import pandas as pd
#load the cleaned dataset (Restaurant_Reviews_Updated.tsv) into a pandas DataFrame
#the file has a 'Review' and a 'Liked' column, separated by a tab (.tsv format)
#keep_default_na=False stops pandas from turning a review whose whole text is a word
#such as "nan", "null" or "na" into a missing value, which TfidfVectorizer cannot process
df = pd.read_csv('Restaurant_Reviews_Updated.tsv', sep='\t', keep_default_na=False)

In [62]:
from sklearn.model_selection import train_test_split
#features are the cleaned review texts, the target is the 'Liked' column
X, y = df['Review'], df['Liked']
#hold out 33% of the rows for testing and train on the remaining 67%;
#the rows are shuffled randomly, and the fixed random_state makes the shuffle repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [63]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
#results_table collects one row per algorithm: [name, accuracy, precision, recall, f1]
#NOTE: re-running this cell resets the table, so the other algorithm cells must be re-run after it
results_table=[]
#Decision Tree algorithm: TF-IDF features feed a decision tree
#random_state is fixed because the tree randomly permutes the features when it searches for splits,
#so without a seed the scores change from run to run and the results table cannot be reproduced
text_clf_DT=Pipeline([('tfidf', TfidfVectorizer()), ('DTclf', DecisionTreeClassifier(random_state=42))])
#feed the training data through the pipeline
text_clf_DT.fit(X_train, y_train)
#get a prediction set for the held-out test reviews
predictions = text_clf_DT.predict(X_test)
#get the results of the Decision Tree algorithm
#precision, recall and F1 are computed per class and averaged, weighted by class size
#NOTE: with average='weighted' the recall is always equal to the accuracy
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')
#add the results of the Decision Tree algorithm to the results table list
results_table.append(['Decision Tree', accuracy, precision, recall, f1])

In [64]:
from sklearn.ensemble import RandomForestClassifier
#Random Forest algorithm: TF-IDF features feed a random forest
#random_state is fixed because the forest draws random bootstrap samples and random feature subsets,
#so without a seed the scores change from run to run and the results table cannot be reproduced
text_clf_RF=Pipeline([('tfidf', TfidfVectorizer()), ('RFclf', RandomForestClassifier(random_state=42))])
#feed the training data through the pipeline
text_clf_RF.fit(X_train, y_train)
#get a prediction set for the held-out test reviews
predictions = text_clf_RF.predict(X_test)
#get the results of the Random Forest algorithm
#precision, recall and F1 are computed per class and averaged, weighted by class size
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')
#add the results of the Random Forest algorithm to the results table list
results_table.append(['Random Forest', accuracy, precision, recall, f1])

In [65]:
from sklearn.svm import SVC
#Support Vector Machines algorithm: TF-IDF features feed an SVC with its default settings
text_clf_SV = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('SVclf', SVC()),
])
#train the pipeline on the training reviews, then label the held-out test reviews
predictions = text_clf_SV.fit(X_train, y_train).predict(X_test)
#score the predictions: plain accuracy plus class-size-weighted precision, recall and F1
accuracy = accuracy_score(y_test, predictions)
precision, recall, f1 = (
    metric(y_test, predictions, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
#record the Support Vector Machines row in the results table list
results_table.append(['Support Vector Machines', accuracy, precision, recall, f1])

In [66]:
from sklearn.linear_model import LogisticRegression
#Logistic Regression algorithm: TF-IDF features feed a LogisticRegression with its default settings
text_clf_LR = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('LRclf', LogisticRegression()),
])
#train the pipeline on the training reviews, then label the held-out test reviews
predictions = text_clf_LR.fit(X_train, y_train).predict(X_test)
#score the predictions: plain accuracy plus class-size-weighted precision, recall and F1
accuracy = accuracy_score(y_test, predictions)
precision, recall, f1 = (
    metric(y_test, predictions, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
#record the Logistic Regression row in the results table list
results_table.append(['Logistic Regression', accuracy, precision, recall, f1])

In [67]:
from sklearn.ensemble import GradientBoostingClassifier
#Gradient Boosting algorithm: TF-IDF features feed a gradient-boosted tree ensemble
#random_state is fixed because it seeds the trees built at each boosting iteration,
#so without a seed the scores can change from run to run and the results table cannot be reproduced
text_clf_GB=Pipeline([('tfidf', TfidfVectorizer()), ('GBclf', GradientBoostingClassifier(random_state=42))])
#feed the training data through the pipeline
text_clf_GB.fit(X_train, y_train)
#get a prediction set for the held-out test reviews
predictions = text_clf_GB.predict(X_test)
#get the results of the Gradient Boosting algorithm
#precision, recall and F1 are computed per class and averaged, weighted by class size
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')
#add the results of the Gradient Boosting algorithm to the results table list
results_table.append(['Gradient Boosting', accuracy, precision, recall, f1])

In [68]:
#turn the collected rows into a table: one row per algorithm, one column per score,
#in the order the algorithms were evaluated, and print it
result_columns = ['Algorithm', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
results_table_df = pd.DataFrame(data=results_table, columns=result_columns)
print(results_table_df)

                 Algorithm  Accuracy  Precision    Recall  F1 Score
0            Decision Tree  0.704268   0.705379  0.704268  0.704293
1            Random Forest  0.728659   0.737460  0.728659  0.727264
2  Support Vector Machines  0.780488   0.783111  0.780488  0.780374
3      Logistic Regression  0.774390   0.776366  0.774390  0.774340
4        Gradient Boosting  0.762195   0.771254  0.762195  0.761078


In [69]:
#Based on the results table printed above, my conclusion is:
#The Support Vector Machines (SVM) algorithm has the highest accuracy (0.780), so it makes the most correct predictions overall.
#SVM also has the highest precision (0.783), so its predicted labels are the most reliable; Random Forest's precision (0.737) is only the second lowest.
#SVM has the highest recall (0.780), so it is the best at finding the reviews of each class; because the scores use average='weighted', recall equals accuracy for every algorithm.
#SVM also achieves the highest F1 score (0.780), so it shows the best balance between precision and recall; Logistic Regression (0.774) and Gradient Boosting (0.761) come next.

In [70]:
#Based on the results, the insights into the challenges each algorithm faced are:
#Decision Tree: it has the lowest accuracy, precision, recall, and F1 score of the five algorithms (all about 0.70).
#Random Forest: it improves on the single Decision Tree but still struggles to reach high accuracy (0.729) and misclassifies more reviews than SVM, Logistic Regression, and Gradient Boosting.
#Support Vector Machines (SVM): it is the most accurate algorithm, but its precision and recall (about 0.78) still leave roughly one in five test reviews misclassified.
#Logistic Regression: it is almost as accurate as SVM (0.774 vs. 0.780), with slightly lower precision and recall.
#Gradient Boosting: its precision (0.771) is a little higher than its recall (0.762), and its overall accuracy (0.762) is below SVM and Logistic Regression.