From https://towardsdatascience.com/building-a-sentiment-classifier-using-scikit-learn-54c8e7c5d2f0

## Load data

In [0]:
import pandas as pd
import re
from os import system, listdir
from os.path import isfile, join
from random import shuffle

system('wget "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"')
system('tar -xzf "aclImdb_v1.tar.gz"')

def create_data_frame(folder: str) -> pd.DataFrame:
    '''
    folder - the root folder of train or test dataset
    Returns: a DataFrame with the combined data from the input folder
    '''
    pos_folder = f'{folder}/pos' # positive reviews
    neg_folder = f'{folder}/neg' # negative reviews
    
    def get_files(fld: str) -> list:
        '''
        fld - positive or negative reviews folder
        Returns: a list with all files in input folder
        '''
        return [join(fld, f) for f in listdir(fld) if isfile(join(fld, f))]
    
    def append_files_data(data_list: list, files: list, label: int) -> None:
        '''
        Appends to 'data_list' tuples of form (file content, label)
        for each file in 'files' input list
        '''
        for file_path in files:
            with open(file_path, 'r') as f:
                text = f.read()
                data_list.append((text, label))
    
    pos_files = get_files(pos_folder)
    neg_files = get_files(neg_folder)
    
    data_list = []
    append_files_data(data_list, pos_files, 1)
    append_files_data(data_list, neg_files, 0)
    shuffle(data_list)
    
    text, label = tuple(zip(*data_list))
    # replacing line breaks with spaces
    text = list(map(lambda txt: re.sub('(<br\s*/?>)+', ' ', txt), text))
    
    return pd.DataFrame({'text': text, 'label': label})

imdb_train = create_data_frame('aclImdb/train')
imdb_test = create_data_frame('aclImdb/test')

# system("mkdir 'csv'")
# imdb_train.to_csv('csv/imdb_train.csv', index=False)
# imdb_test.to_csv('csv/imdb_test.csv', index=False)

# imdb_train = pd.read_csv('csv/imdb_train.csv')
# imdb_test = pd.read_csv('csv/imdb_test.csv')

In [2]:
imdb_train.head()

Unnamed: 0,text,label
0,"Utterly tactical, strange (watch for the kinky...",1
1,This is an astounding film. As well as showing...,1
2,I voted this a 10 out of 10 simply because it ...,1
3,"First, there is NO way the remake can be as go...",1
4,Ursula Andress' naked body is one of those thi...,0


## Text vectorization

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

### Unigram Counts 
[CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer)


In [0]:
unigram_vectorizer = CountVectorizer(ngram_range=(1, 1))
unigram_vectorizer.fit(imdb_train['text'].values)
X_train_unigram = unigram_vectorizer.transform(imdb_train['text'].values)

In [6]:
unigram_vectorizer.vocabulary_

{'utterly': 70583,
 'tactical': 65250,
 'strange': 63495,
 'watch': 72246,
 'for': 25450,
 'the': 66339,
 'kinky': 36663,
 'moment': 43526,
 'of': 46680,
 'drop': 20180,
 'dead': 16746,
 'gorgeous': 28173,
 'blonde': 7845,
 'acting': 1623,
 'as': 4465,
 'pull': 52487,
 'string': 63645,
 'doll': 19492,
 'some': 61617,
 'rich': 55713,
 'folks': 25346,
 'pointless': 50587,
 'but': 9881,
 'undoubtedly': 69471,
 'compelling': 13719,
 'late': 37855,
 'night': 45638,
 'feature': 24077,
 'this': 66562,
 'unhinged': 69692,
 'french': 26017,
 'production': 51858,
 'is': 34585,
 'stew': 63178,
 'perplexedly': 49340,
 'unfocused': 69598,
 'ideas': 32447,
 'and': 3258,
 'random': 53445,
 'plot': 50428,
 'illustrations': 32635,
 'centred': 11349,
 'on': 46916,
 'its': 34721,
 'very': 71159,
 'charismatic': 11663,
 'stars': 62896,
 'if': 32517,
 'somewhat': 61646,
 'anti': 3606,
 'heroes': 30722,
 'alain': 2462,
 'delon': 17371,
 'charles': 11675,
 'bronson': 9257,
 'really': 53839,
 'they': 66474,
 

### Unigram Tf-Idf

In [0]:
unigram_tf_idf_transformer = TfidfTransformer()
unigram_tf_idf_transformer.fit(X_train_unigram)
X_train_unigram_tf_idf = unigram_tf_idf_transformer.transform(X_train_unigram)

### Bigram Counts

In [0]:
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
bigram_vectorizer.fit(imdb_train['text'].values)
X_train_bigram = bigram_vectorizer.transform(imdb_train['text'].values)

### Bigram Tf-Idf

In [0]:
bigram_tf_idf_transformer = TfidfTransformer()
bigram_tf_idf_transformer.fit(X_train_bigram)
X_train_bigram_tf_idf = bigram_tf_idf_transformer.transform(X_train_bigram)

## Train classifiers

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [0]:
y_train = imdb_train['label'].values

def train_and_show_scores(clf, X, y, title: str) -> None:
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size=0.75, stratify=y
    )

    
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    valid_score = clf.score(X_valid, y_valid)
    print(f'{title}\nTrain score: {round(train_score, 4)} ; Validation score: {round(valid_score, 4)}\n')

In [15]:
clf = MultinomialNB()
train_and_show_scores(clf, X_train_unigram, y_train, 'Unigram Counts')
train_and_show_scores(clf, X_train_unigram_tf_idf, y_train, 'Unigram Tf-Idf')
train_and_show_scores(clf, X_train_bigram, y_train, 'Bigram Counts')
train_and_show_scores(clf, X_train_bigram_tf_idf, y_train, 'Bigram Tf-Idf')

Unigram Counts
Train score: 0.9048 ; Validation score: 0.8578

Unigram Tf-Idf
Train score: 0.9151 ; Validation score: 0.8574

Bigram Counts
Train score: 0.9921 ; Validation score: 0.8757

Bigram Tf-Idf
Train score: 0.9743 ; Validation score: 0.8864



### Stochastic Gradient Descent

In [0]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

In [19]:
X_train = X_train_bigram_tf_idf


# Phase 1: loss, learning rate and initial learning rate

clf = SGDClassifier()

distributions = dict(
    loss=['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    learning_rate=['optimal', 'invscaling', 'adaptive'],
    eta0=uniform(loc=1e-7, scale=1e-2)
)

random_search_cv = RandomizedSearchCV(
    estimator=clf,
    param_distributions=distributions,
    cv=5,
    n_iter=5
)
random_search_cv.fit(X_train, y_train)
print(f'Best params: {random_search_cv.best_params_}')
print(f'Best score: {random_search_cv.best_score_}')

Best params: {'eta0': 0.003519777732693537, 'learning_rate': 'optimal', 'loss': 'squared_hinge'}
Best score: 0.9036000000000002


In [20]:
# Phase 2: penalty and alpha

clf = SGDClassifier()

distributions = dict(
    penalty=['l1', 'l2', 'elasticnet'],
    alpha=uniform(loc=1e-6, scale=1e-4)
)

random_search_cv = RandomizedSearchCV(
    estimator=clf,
    param_distributions=distributions,
    cv=5,
    n_iter=5
)
random_search_cv.fit(X_train, y_train)
print(f'Best params: {random_search_cv.best_params_}')
print(f'Best score: {random_search_cv.best_score_}')

Best params: {'alpha': 1.104600758582921e-05, 'penalty': 'l2'}
Best score: 0.90876


## Testing model

In [21]:
sgd_classifier = random_search_cv.best_estimator_
X_test = bigram_vectorizer.transform(imdb_test['text'].values)
X_test = bigram_tf_idf_transformer.transform(X_test)
y_test = imdb_test['label'].values

score = sgd_classifier.score(X_test, y_test)
print(score)

0.9026
