# Clean
Group 10 Olga Vyrvich, Jill Henry, and Brendan Dugan

## Requirements

In [157]:
import pandas as pd
import numpy as np
import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
from nltk.corpus import stopwords
import re
import string
from textblob import TextBlob
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import metrics
import pickle

# variables used during normalization
ps = nltk.porter.PorterStemmer()
wnl = nltk.stem.WordNetLemmatizer()
# creating object for nltk stopwords
stop_words = nltk.corpus.stopwords.words("english")
stop_words.remove("no")
stop_words.remove("but")
stop_words.remove("not")
stop_words.remove("very")
stop_words.append("mcdonalds")
stop_words.append("mcdonald's")
stop_words.append("mcdonald")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Importing dataset

In [158]:
data = pd.read_csv("/content/McDonald_s_Reviews.csv", encoding="Latin-1"
                   ).drop( # drop columns we won't use or that are invariate
                          ["store_name", "category", "rating_count", "review_time"],
                           axis = 1)
# get state abbreviations
states = pd.read_csv(
        "https://www2.census.gov/geo/docs/reference/codes2020/national_state2020.txt",
        sep = "|"
    ).drop(columns=["STATEFP", "STATENS"]
    ).rename(
        columns = {"STATE":"state_abb", "STATE_NAME":"state_name"}
    )
# create a pattern of state abbs to find match, "AL|AK|...|WY"
data["state_abb"] = [re.findall("|".join(states.state_abb),
                               data.store_address[i]) for i in range(0, len(data))]
data["state_abb"] = ["".join(data.state_abb[i]) for i in range(0, len(data))]
data.head()

Unnamed: 0,reviewer_id,store_address,latitude,longitude,review,rating,state_abb
0,1,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,Why does it look like someone spit on my food?...,1 star,TX
1,2,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,It'd McDonalds. It is what it is as far as the...,4 stars,TX
2,3,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,Made a mobile order got to the speaker and che...,1 star,TX
3,4,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,5 stars,TX
4,5,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,"I repeat my order 3 times in the drive thru, a...",1 star,TX


In [159]:
# breaking rating up into sentiment: positive: 4 or 5 stars, negative: 2 stars or below, neutral: 3 stars
sentiments = []
for rating in data["rating"]:
  if rating == "4 stars" or rating == "5 stars":
    sentiments.append("positive")
  elif rating == "3 stars":
    sentiments.append("neutral")
  else:
    sentiments.append("negative")

data["sentiment"] = sentiments

# clean text
data["review_clean"] = [ps.stem(i) for i in data["review"]] # lowers and stems
data["review_clean"] = [re.sub(r"[^A-z\s\d!\?\.]", "", i, re.I|re.A) for i in data["review_clean"]] # removes special characters
data["review_clean"] = [wnl.lemmatize(i) for i in data["review_clean"]] #
data["review_clean"] = [i.split(" ") for i in data["review_clean"]] # tokenize along spaces to...
data["review_clean"] = [i for i in data["review_clean"] if i not in stop_words] # ...remove stopwords
data["review_clean"] = [" ".join(i) for i in data["review_clean"]] # rejoin string

# add sentiment polarity score (-1 to 1 where -1 is very cold and 1 is very warm),
# categorical version (pos/neg), and convert the store rating from categorical into numeric
# https://textblob.readthedocs.io/en/dev/quickstart.html#sentiment-analysis
data["polarity"] = [TextBlob(data.review_clean[i]).sentiment.polarity for i in range(0, len(data))]
data["polarity_sentiment"] = ['positive' if data.polarity[i] > 0 else 'negative' for i in range(0, len(data))]
data["rating_num"] = [int(re.sub("\\s|[A-z]", "", data.rating[i])) for i in range(0, len(data))]
data.head()

Unnamed: 0,reviewer_id,store_address,latitude,longitude,review,rating,state_abb,sentiment,review_clean,polarity,polarity_sentiment,rating_num
0,1,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,Why does it look like someone spit on my food?...,1 star,TX,negative,why does it look like someone spit on my food?...,0.216667,positive,1
1,2,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,It'd McDonalds. It is what it is as far as the...,4 stars,TX,positive,itd mcdonalds. it is what it is as far as the ...,0.32619,positive,4
2,3,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,Made a mobile order got to the speaker and che...,1 star,TX,negative,made a mobile order got to the speaker and che...,-0.075,negative,1
3,4,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,5 stars,TX,positive,my mc. crispy chicken sandwich was customer s...,-0.133333,negative,5
4,5,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,"I repeat my order 3 times in the drive thru, a...",1 star,TX,negative,i repeat my order 3 times in the drive thru an...,-0.041071,negative,1


In [160]:
data[["polarity_sentiment", "sentiment"]].value_counts(normalize = True)

polarity_sentiment  sentiment
positive            positive     0.415110
negative            negative     0.286262
positive            negative     0.088544
negative            neutral      0.078273
positive            neutral      0.065996
negative            positive     0.065816
dtype: float64

In [161]:
# impute GPS for San Bernardino, CA, location of 1st McDonald's for those missing or with invalide addresses
# though looking at some reviews it appears to be several locations in Hawai'i
# https://www.gps-latitude-longitude.com/gps-coordinates-of-san-bernardino-ca
data = data.fillna({"latitude":34.1083449, "longitude":-117.2897652, "state_abb":"CA"})
data.loc[data.state_abb == "", "state_abb"] = "CA"


In [162]:
# creating X and y values from the review and sentiment, respectively
x_train, x_test, y_train, y_test = train_test_split(
    data["review_clean"], data["sentiment"],
    random_state=10)

x_train = x_train.to_frame()
y_train = y_train.to_frame()
x_test = x_test.to_frame()
y_test = y_test.to_frame()

## Visualizing sentiments by state

In [163]:
#for address in data['state_abb'].value_counts().index:
#    plt.figure()
#    data[data['state_abb'] == address]['sentiment'].value_counts().plot(kind='barh',color='teal')
#    plt.title(address)

## Model Training, Prediction, and Performance Evaluation

In [164]:
vector_transformer = Pipeline(
    # tuple(name, fun(args))
    steps = [("vectorize", TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0,
                                           ngram_range=(1,2), sublinear_tf=True)),
           ("tfidf", TfidfTransformer())]
)

preprocessor = ColumnTransformer(
    transformers = [
        # tuple(name, transformer_pipeline, [var1, var2,...])
        ("text", vector_transformer, "review_clean")
    ]
)
classify_mnnb = Pipeline(
    steps = [("preprocessor", preprocessor),
             ("classifier", MultinomialNB())]
)
classify_lg = Pipeline(
    steps = [("preprocessor", preprocessor),
             ("classifier", LogisticRegression(penalty = "l2", max_iter = 1000,
                                               C = 1))]
)
#classify_svm = Pipeline(
#    steps = [("preprocessor", preprocessor),
#             "classifier", SGDClassifier(loss = "hinge", max_iter = 100)]
#)
classify_lg.fit(x_train, y_train)
#classify_svm.fit(x_train, y_train)
classify_mnnb.fit(x_train, y_train)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [165]:
print("Logistic classifier model score: %.4f" % classify_lg.score(x_test, y_test))
#print("SGD classifier model score: %.4f" % classify_svm.score(x_test, y_test))
print("Multinomial Naive Bayes classifier model score: %.4f" % classify_mnnb.score(x_test, y_test))

Logistic classifier model score: 0.8367
Multinomial Naive Bayes classifier model score: 0.8214


In [166]:
test_mnnb = x_test.join(y_test, how = "inner"
).reset_index(
).join(pd.DataFrame(
    classify_mnnb.predict(x_test)
    ).rename(columns={0:"pred"})
)
print("Multinomial Naive Bayes\n",
      metrics.classification_report(test_mnnb.sentiment, test_mnnb.pred))

Multinomial Naive Bayes
               precision    recall  f1-score   support

    negative       0.79      0.94      0.86      3130
     neutral       0.99      0.20      0.34      1211
    positive       0.84      0.92      0.88      4008

    accuracy                           0.82      8349
   macro avg       0.87      0.69      0.69      8349
weighted avg       0.84      0.82      0.79      8349



In [167]:
pd.DataFrame(
    metrics.confusion_matrix(test_mnnb.sentiment, test_mnnb.pred)
).rename(
    columns = {0:"Pred. Negative", 1:"Pred. Neutral", 2:"Pred. Positive"},
).rename(
    index = {0:"Actual Negative", 1:"Actual Neutral", 2:"Actual Positive"})

Unnamed: 0,Pred. Negative,Pred. Neutral,Pred. Positive
Actual Negative,2934,1,195
Actual Neutral,441,246,524
Actual Positive,329,1,3678


In [168]:
test_lg = x_test.join(y_test, how = "inner"
).reset_index(
).join(pd.DataFrame(
    classify_lg.predict(x_test)
    ).rename(columns={0:"pred"})
)
print("Logistic Regression\n",
      metrics.classification_report(test_lg.sentiment, test_lg.pred))

Logistic Regression
               precision    recall  f1-score   support

    negative       0.81      0.94      0.87      3130
     neutral       0.81      0.32      0.46      1211
    positive       0.86      0.91      0.89      4008

    accuracy                           0.84      8349
   macro avg       0.83      0.72      0.74      8349
weighted avg       0.84      0.84      0.82      8349



In [169]:
pd.DataFrame(
    metrics.confusion_matrix(test_lg.sentiment, test_lg.pred)
).rename(
    columns = {0:"Pred. Negative", 1:"Pred. Neutral", 2:"Pred. Positive"},
).rename(
    index = {0:"Actual Negative", 1:"Actual Neutral", 2:"Actual Positive"})

Unnamed: 0,Pred. Negative,Pred. Neutral,Pred. Positive
Actual Negative,2940,33,157
Actual Neutral,406,385,420
Actual Positive,291,56,3661


## Export

In [170]:
data.head()

Unnamed: 0,reviewer_id,store_address,latitude,longitude,review,rating,state_abb,sentiment,review_clean,polarity,polarity_sentiment,rating_num
0,1,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,Why does it look like someone spit on my food?...,1 star,TX,negative,why does it look like someone spit on my food?...,0.216667,positive,1
1,2,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,It'd McDonalds. It is what it is as far as the...,4 stars,TX,positive,itd mcdonalds. it is what it is as far as the ...,0.32619,positive,4
2,3,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,Made a mobile order got to the speaker and che...,1 star,TX,negative,made a mobile order got to the speaker and che...,-0.075,negative,1
3,4,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,5 stars,TX,positive,my mc. crispy chicken sandwich was customer s...,-0.133333,negative,5
4,5,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,"I repeat my order 3 times in the drive thru, a...",1 star,TX,negative,i repeat my order 3 times in the drive thru an...,-0.041071,negative,1


In [171]:
# add predicted sentiments to data and save
data = data.join(
    pd.DataFrame(classify_lg.predict(data)).rename(columns = {0:"pred_sentiment"}),
    how = "left"
    ).drop(columns = ["store_address", "review"])

# create numeric score for easier plotting
# this groups neg and neutral, but we could instead make it -1:neg 0:neutral 1:pos
# similar to polarity score
data["sentiment_num"] = [1 if i == "positive" else 0 for i in data["pred_sentiment"]]

data.to_csv("/content/mcdonalds_reviews.csv", index = False)

# save the model
with open("pipe_lg.pkl", "wb") as f:
  pickle.dump(classify_lg, f)