In [10]:
import pandas as pd

from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

In [3]:
# Load the tab-separated review file and discard every row that has a missing value.
df = pd.read_csv("yelp_review_sentiment_2classes.tsv", sep="\t").dropna()

# Machine Learning Model Experiments with 4 Classes

In [6]:
# Stop words: NLTK's English list plus two extra tokens that are dropped as well
# (presumably lemmatizer pronoun placeholders -- confirm against the preprocessing step).
en_stopwords = [*stopwords.words('english'), "-pron-", "pron"]

# Both vectorizers share one configuration: uni- to tri-grams, a term must occur
# in at least 10 documents, and the stop-word list above is removed.
vectorizer_params = dict(ngram_range=(1, 3), min_df=10, stop_words=en_stopwords)
cnt_vec = CountVectorizer(**vectorizer_params)
tfidf_vec = TfidfVectorizer(**vectorizer_params)

# Label helpers and a row normalizer, all created with library defaults.
label_enc, label_bin = LabelEncoder(), LabelBinarizer()
normalizer = Normalizer()

In [7]:
# Draw a working sample and hold out 10% of it as a test split.
# Both random steps are seeded (42, matching the models defined later) so that
# re-running the notebook selects the same rows and produces the same split.
# The sample size is capped at len(df) because DataFrame.sample raises when
# asked for more rows than exist (sampling is without replacement).
sample_data = df.sample(n=min(220000, len(df)), random_state=42)
train, test = train_test_split(sample_data, test_size=0.1, shuffle=True, random_state=42)

# Vocabulary / IDF statistics are learned from the training text only.
cnt_vec.fit(train["text"])
tfidf_vec.fit(train["text"])
# The encoder is fitted on every label value present in the full frame, not
# just the sampled rows, so the label-to-integer mapping does not depend on the sample.
label_enc.fit(df["sentiment"].unique())



((198000, 230884), (198000, 230884))

In [56]:
# Candidate classifiers; the stochastic ones are seeded for repeatability.
logistic = LogisticRegression(random_state=42, max_iter=100000)
mnb = MultinomialNB()
rf = RandomForestClassifier(n_jobs=-1, random_state=42)

# Each experiment is a (model, featurizer, optional scaler) triple.
# NOTE(review): TfidfVectorizer is built with its default `norm`, which is
# documented as L2 row normalisation, and Normalizer() also defaults to L2.
# The (tfidf_vec, normalizer) entries are therefore expected to reproduce the
# (tfidf_vec, None) ones -- confirm before spending compute on them.
combinations = [
    (logistic, cnt_vec, None),
    (logistic, tfidf_vec, None),
    (logistic, cnt_vec, normalizer),
    (logistic, tfidf_vec, normalizer),
    (mnb, cnt_vec, None),
    (rf, cnt_vec, None),
    (rf, tfidf_vec, None),
    (rf, cnt_vec, normalizer),
    (rf, tfidf_vec, normalizer),
]

# The encoded labels are the same for every combination, so compute them once
# instead of on every loop iteration.
y_train = label_enc.transform(train["sentiment"])
y_test = label_enc.transform(test["sentiment"])

# Maps "<Model>-<Featurizer>-<Scaler or None>" to the dict returned by cross_validate.
res = {}
for model, featurizer, scaler in combinations:
    model_name = type(model).__name__
    featurizer_name = type(featurizer).__name__

    print("Prepare data to train..")
    X_train = featurizer.transform(train["text"])
    # X_test / y_test are not consumed by cross_validate below; they are kept
    # so the held-out split remains available after the loop.
    X_test = featurizer.transform(test["text"])

    normalizer_name = "None"
    # Explicit None check (rather than truthiness of an estimator object), and a
    # loop-local name so the notebook-level `normalizer` is not rebound here.
    if scaler is not None:
        normalizer_name = type(scaler).__name__
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    metadata = model_name + "-" + featurizer_name + "-" + normalizer_name

    print("Cross-validate {}".format(metadata))
    # NOTE(review): the featurizers were fitted on all of `train` in an earlier
    # cell, so every validation fold has already contributed to the vocabulary /
    # IDF statistics; fold scores may be slightly optimistic. Putting the
    # featurizer and model in one Pipeline passed to cross_validate would avoid that.
    # NOTE(review): for single-label targets micro-averaged precision, recall and
    # F1 coincide, so the three scorers report the same number; a macro-averaged
    # scorer would add information if classes are imbalanced.
    score = cross_validate(model, X_train, y_train,
                           scoring=["f1_micro", "precision_micro", "recall_micro"],
                           cv=5, n_jobs=-1, return_train_score=True)
    res[metadata] = score
    print(metadata, score)

Prepare data to train..
Cross-validate LogisticRegression-CountVectorizer-None
LogisticRegression-CountVectorizer-None {'fit_time': array([2447.32094288, 2435.27109909, 2447.24594998, 2434.61051106,
       2369.61255503]), 'score_time': array([0.204386  , 0.3321619 , 0.20633531, 0.47214317, 0.40806508]), 'test_f1_micro': array([0.65      , 0.64941919, 0.65234848, 0.65017677, 0.64891414]), 'train_f1_micro': array([0.99838384, 0.99828283, 0.99840909, 0.99835859, 0.99834596]), 'test_precision_micro': array([0.65      , 0.64941919, 0.65234848, 0.65017677, 0.64891414]), 'train_precision_micro': array([0.99838384, 0.99828283, 0.99840909, 0.99835859, 0.99834596]), 'test_recall_micro': array([0.65      , 0.64941919, 0.65234848, 0.65017677, 0.64891414]), 'train_recall_micro': array([0.99838384, 0.99828283, 0.99840909, 0.99835859, 0.99834596])}
Prepare data to train..
Cross-validate LogisticRegression-TfidfVectorizer-None
LogisticRegression-TfidfVectorizer-None {'fit_time': array([423.11136198, 

In [65]:
import pickle

# Persist the cross-validation results collected in `res`.
# The context manager guarantees the file is flushed and closed even if
# pickling fails; the previous bare open() left the handle open.
with open("4class_result.pkl", "wb") as result_file:
    pickle.dump(res, result_file)

# Machine Learning Model Experiments with 2 Classes

In [11]:
# Stop words: NLTK's English list plus four extra tokens that are dropped as well
# (presumably lemmatizer pronoun placeholders in both cases -- confirm against
# the preprocessing step).
en_stopwords = [*stopwords.words('english'), "-PRON-", "-pron-", "PRON", "pron"]

# Both vectorizers share one configuration: uni- to tri-grams, a term must occur
# in at least 10 documents, and the stop-word list above is removed.
vectorizer_params = dict(ngram_range=(1, 3), min_df=10, stop_words=en_stopwords)
cnt_vec = CountVectorizer(**vectorizer_params)
tfidf_vec = TfidfVectorizer(**vectorizer_params)

# Label helpers and a row normalizer, all created with library defaults.
label_enc, label_bin = LabelEncoder(), LabelBinarizer()
normalizer = Normalizer()

In [13]:
# NOTE(review): the commented-out block below is disabled code that would merge
# "quite good" into "good" and "kind of bad" into "bad" and sub-sample the frame.
# Presumably the loaded 2-class file already carries merged labels -- confirm,
# then delete this block.
# def replace_sentiment(sentiment):
#     if sentiment == "quite good":
#         return "good"
#     elif sentiment == "kind of bad":
#         return "bad"
#     return sentiment

# sample_data = df.sample(220000)
# df["sentiment"] = df["sentiment"].map(replace_sentiment)

# Hold out 10% of the full frame. The split is seeded (42, matching the models
# defined later) so that re-running the notebook produces the same train/test rows.
train, test = train_test_split(df, test_size=0.1, shuffle=True, random_state=42)

# Vocabulary / IDF statistics are learned from the training text only.
cnt_vec.fit(train["text"])
tfidf_vec.fit(train["text"])
# The encoder is fitted on every label value present in the full frame.
label_enc.fit(df["sentiment"].unique())

KeyboardInterrupt: 

In [None]:
# Candidate classifiers; the stochastic ones are seeded for repeatability.
logistic = LogisticRegression(random_state=42, max_iter=100000)
mnb = MultinomialNB()
rf = RandomForestClassifier(n_jobs=-1, random_state=42)

# Each experiment is a (model, featurizer, optional scaler) triple.
# NOTE(review): TfidfVectorizer is built with its default `norm`, which is
# documented as L2 row normalisation, and Normalizer() also defaults to L2.
# The (tfidf_vec, normalizer) entries are therefore expected to reproduce the
# (tfidf_vec, None) ones -- confirm before spending compute on them.
combinations = [
    (logistic, cnt_vec, None),
    (logistic, tfidf_vec, None),
    (logistic, cnt_vec, normalizer),
    (logistic, tfidf_vec, normalizer),
    (mnb, cnt_vec, None),
    (rf, cnt_vec, None),
    (rf, tfidf_vec, None),
    (rf, cnt_vec, normalizer),
    (rf, tfidf_vec, normalizer),
]

# The encoded labels are the same for every combination, so compute them once
# instead of on every loop iteration.
y_train = label_enc.transform(train["sentiment"])
y_test = label_enc.transform(test["sentiment"])

# Maps "<Model>-<Featurizer>-<Scaler or None>" to the dict returned by cross_validate.
res2 = {}
for model, featurizer, scaler in combinations:
    model_name = type(model).__name__
    featurizer_name = type(featurizer).__name__

    print("Prepare data to train..")
    X_train = featurizer.transform(train["text"])
    # X_test / y_test are not consumed by cross_validate below; they are kept
    # so the held-out split remains available after the loop.
    X_test = featurizer.transform(test["text"])

    normalizer_name = "None"
    # Explicit None check (rather than truthiness of an estimator object), and a
    # loop-local name so the notebook-level `normalizer` is not rebound here.
    if scaler is not None:
        normalizer_name = type(scaler).__name__
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    metadata = model_name + "-" + featurizer_name + "-" + normalizer_name

    print("Cross-validate {}".format(metadata))
    # NOTE(review): the featurizers were fitted on all of `train` in an earlier
    # cell, so every validation fold has already contributed to the vocabulary /
    # IDF statistics; fold scores may be slightly optimistic. Putting the
    # featurizer and model in one Pipeline passed to cross_validate would avoid that.
    # NOTE(review): for single-label targets micro-averaged precision, recall and
    # F1 coincide, so the three scorers report the same number; a macro-averaged
    # scorer would add information if classes are imbalanced.
    score = cross_validate(model, X_train, y_train,
                           scoring=["f1_micro", "precision_micro", "recall_micro"],
                           cv=5, n_jobs=-1, return_train_score=True)
    res2[metadata] = score
    print(metadata, score)
    print()