## Scikit Learn pipeline

In [199]:
## Setting to ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [200]:
## Imports
import pandas as pd

## Transformers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
## Classifiers
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
## Model selection and Pipeline utils
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report


## Text processing libraries
import re
from nltk.corpus import stopwords
import emoji
import contractions ## from here: https://github.com/kootenpv/contractions

from pprint import pprint
from time import time
import logging

### Read Data
1. Drop duplicate reviews
2. Drop unused columns
3. Create column for review length
4. Drop unusually long reviews

In [201]:
## Read data and copy
spotify = pd.read_csv("../data/raw/spotify_review_kaggle.csv")
data_in = spotify.copy()
data_in = data_in.drop_duplicates(subset="Review")
data_in.drop(["Time_submitted", "Total_thumbsup", "Reply"], axis=1, inplace=True)
data_in["Length"] = data_in["Review"].str.split(" ").str.len()
data_in = data_in[data_in.Length < 150]
data = data_in

### Code Sentiment
* Ratings 1, 2 and 3 as _negative_
* Ratings 4 and 5 as _positive_

In [202]:
## Code sentiment from rating (1 or 2 == bad, 3 == neutral, 4 or 5 == good)
def get_sentiment(rating):
    if rating == 1 or rating == 2:
        return "negative"
    # if rating == 3:
    #     return "neutral"
    if rating == 3 or rating == 4 or rating == 5:
        return "positive"

data["Sentiment"] = data["Rating"].apply(get_sentiment)


### Define Stopwords
Here we grab the NLTK stopwords but want to keep "not" 

In [203]:
our_stop_words = set(stopwords.words('english'))
our_stop_words.remove("not")

### Translate emojis to text
We use this function here to translate emojis to text. We will need this function later on to include in our prediction pipeline, so that new data will be transformed in the same way.

In [279]:
## define a function to translate emojis text
def translate_emoji(sentence):
  return emoji.demojize(sentence)

In [205]:
data["Review"] = data["Review"].apply(translate_emoji)

### Translate contractions

Change contractions like _I'm_, _they're_ etc. to _I am_, _they are_ etc. We'll use this again later in our prediction pipeline.

In [206]:
def translate_contractions(sentence):
    return contractions.fix(sentence)

In [207]:
data["Review"] = data["Review"].apply(translate_contractions)

### Remove strange fonts
There are some strange unreadable fonts like 𝚝𝚑𝚒𝚜 in the reviews. Here we remove them. Again, we'll use this function later in our make_prediction function.

In [282]:
def remove_strange_fonts(sentence):
    return re.sub(r'[^\x00-\x7f]', r'', sentence)


data['Review'] = data['Review'].apply(lambda sentence: list(map(remove_strange_fonts, sentence)))
data["Review"] = data["Review"].apply(lambda sentence: "".join(sentence))

In [283]:
data["Review"]

0        Great music service, the audio is high quality...
1        Please ignore previous negative rating. This a...
2        This pop-up "Get the best Spotify experience o...
3          Really buggy and terrible to use as of recently
4        Dear Spotify why do I get songs that I did not...
                               ...                        
61589    Even though it was communicated that lyrics fe...
61590    Use to be sooo good back when I had it, and wh...
61591    This app would be good if not for it taking ov...
61592    The app is good hard to navigate and will not ...
61593    Its good but sometimes it does not load the mu...
Name: Review, Length: 61346, dtype: object

### Create features and target and split into training, development and test set

In [210]:
X = data["Review"]
y = data["Sentiment"]
print("Shape of X and y: ", X.shape, y.shape)

Shape of X and y:  (61346,) (61346,)


In [211]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, stratify = y, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(36807,) (24539,) (36807,) (24539,)


In [212]:
X_dev, X_test, y_dev, y_test = train_test_split(X_test, y_test, test_size=0.5, stratify = y_test, random_state=42)
print(X_dev.shape, X_test.shape, y_dev.shape, y_test.shape)

(12269,) (12270,) (12269,) (12270,)


### Create Pipeline for hyperparameter selection

In [272]:
## define parameters for the pipelines

## Parameters for Feature extraction
transformer_parameters = {
    'vect__max_features': (None, 5000, 10000),
    "vect__ngram_range": ((1, 1), (1, 2), (1,3)),  # unigrams or bigrams
    'tfidf__norm': ('l1','l2')
}

## Parameters for classifiers
logistic_regression_parameters = {
        'clf': (LogisticRegression(
            fit_intercept=True,
            class_weight=None,
            solver="lbfgs",
            max_iter=1000,
            random_state=42),
            ),
        'clf__penalty': ("none","l1","l2","elasticnet"),
        'clf__C': (0.01, 0.1, 1, 10, 100),
        "clf__l1_ratio": (0.01, 0.1,1)
    }

svc_parameters = {
        'clf': (SVC(
            probability=True,
            tol=1e-3,
            class_weight=None,
            max_iter=1000,
            random_state=42),
            ),
        'clf__kernel': ("linear","poly","rbf","sigmoid"),
        'clf__C': (0.001, 0.01, 0.1, 1, 10, 100),
        "clf__gamma": (0.001, 0.01, 0.1, 1, 10, 100, "scale"),
        "clf__shrinking": (True, False),
        "clf__decision_function_shape": ("ovr", "ovo")
    }

rf_parameters = {
        'clf': (RandomForestClassifier(
            criterion="gini",
            min_samples_leaf=5,
            oob_score=True,
            class_weight=None,
            random_state=42),
            ),
        'clf__n_estimators': (10, 100, 1000),
        'clf__max_depth': (3, 5, 10, 50, 100, None),
        #"clf__min_samples_split": (2, 5, 10),
        #"clf__max_features": ("sqrt", "log2"),
        #"clf__bootstrap": (True, False)
    }

In [273]:
## Define Grid Search Params
logistic_parameters = [
    transformer_parameters,
    logistic_regression_parameters
]

svc_parameters = [
    transformer_parameters,
    svc_parameters
]

random_forest_parameters = [
    transformer_parameters,
    rf_parameters
]

In [285]:
all_parameter_sets = [logistic_parameters]
classifiers = [LogisticRegression()]

for parameter_sets, classifier in zip(all_parameter_sets, classifiers):
    #define pipeline
    pipeline = Pipeline(
        [   
            ("vect", CountVectorizer(stop_words=our_stop_words)),
            ("tfidf", TfidfTransformer()),
            ("clf", classifier)
        ]
    )
    ## Perform grid search CV
    print("Fitting ", str(classifier)[:-2]+" Classifier")
    gs = GridSearchCV(pipeline, parameter_sets, n_jobs=-1, verbose=1, scoring="accuracy", cv = 3) ## -1 means all processors

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameter_sets)

    gs.fit(X_train, y_train)
    
    print("Best score: %0.3f" % gs.best_score_)
    print("Best parameters set:")

    best_parameters = gs.best_estimator_.get_params()

    for parameter_set in parameter_sets:
        for param_name in sorted(parameter_set.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))

    print("----------------------")

Fitting  LogisticRegression Classifier
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
[{'tfidf__norm': ('l1', 'l2'),
  'vect__max_features': (None, 5000, 10000),
  'vect__ngram_range': ((1, 1), (1, 2), (1, 3))},
 {'clf': (LogisticRegression(C=0.1, max_iter=1000, random_state=42),),
  'clf__l1_ratio': (0.01, 0.1, 1),
  'clf__penalty': ('none', 'l1', 'l2', 'elasticnet')}]
Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best score: 0.835
Best parameters set:
	tfidf__norm: 'l2'
	vect__max_features: 10000
	vect__ngram_range: (1, 2)
	clf: LogisticRegression()
	clf__l1_ratio: None
	clf__penalty: 'l2'
----------------------


In [286]:
our_classifier =  gs.best_estimator_

In [287]:
predictions = our_classifier.predict(X_dev)

In [288]:
print("Accuracy on training set: %0.3f" % our_classifier.score(X_train, y_train))
print("Accuracy on development set: %0.3f" % our_classifier.score(X_dev, y_dev))

Accuracy on training set: 0.874
Accuracy on development set: 0.836


In [289]:
confusion_matrix(y_dev, predictions, labels=["positive", "negative"])

array([[6201, 1126],
       [ 885, 4057]], dtype=int64)

In [247]:
print(classification_report(y_dev, predictions))

              precision    recall  f1-score   support

    positive       0.88      0.85      0.86      7327
    negative       0.78      0.82      0.80      4942

    accuracy                           0.84     12269
   macro avg       0.83      0.83      0.83     12269
weighted avg       0.84      0.84      0.84     12269



In [197]:
def make_prediction(classifier, sentence):
    temp = translate_emoji(sentence)
    temp = translate_contractions(temp)
    temp = remove_strange_fonts(temp)
    return classifier.predict([temp])


In [297]:
review = "I ❤️ Spotify"

#review = "Well, it is Spotify, what can one say, never works as it should"

make_prediction(our_classifier, review)

array(['positive'], dtype=object)

In [294]:
review = "I hate Spotify"

make_prediction(our_classifier, review)

array(['negative'], dtype=object)