In [1]:
import warnings

import pandas as pd
import numpy as np
from nltk.corpus import movie_reviews, stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC 
from sklearn.pipeline import Pipeline
from operator import itemgetter

In [2]:
warnings.filterwarnings('ignore')

In [3]:
# File ids of the NLTK movie_reviews corpus, one list per sentiment category.
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

# One token sequence per review, in the same order as the ids above.
negfeats = [movie_reviews.words(fileids=[fid]) for fid in negids]
posfeats = [movie_reviews.words(fileids=[fid]) for fid in posids]

# Re-join the tokens of each review with single spaces and attach the label:
# all negative reviews (positive=0) come first, then all positive ones (positive=1).
records = [
    {'text': ' '.join(tokens), 'positive': label}
    for label, reviews in ((0, negfeats), (1, posfeats))
    for tokens in reviews
]

data = pd.DataFrame(records)

In [4]:
def _text_pipeline(vectorizer, classifier):
    """Chain a text vectorizer and a classifier into a two-step Pipeline.

    The step names ('vectorizer', 'classificator') are the same for every
    pipeline built in this cell.
    """
    return Pipeline([
        ('vectorizer', vectorizer),
        ('classificator', classifier),
    ])


# Raw term counts with three different linear classifiers.
cv_pipeline = _text_pipeline(CountVectorizer(), LogisticRegression())

cv_pipeline_svc = _text_pipeline(CountVectorizer(), LinearSVC())

# SGD classifier with every hyper-parameter spelled out explicitly.
sgd_classifier = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=0.0001,
    l1_ratio=0.15,
    fit_intercept=True,
    max_iter=5,
    shuffle=True,
    verbose=0,
    epsilon=0.1,
    n_jobs=1,
    random_state=42,
    learning_rate='optimal',
    eta0=0.0,
    power_t=0.5,
    class_weight=None,
    warm_start=False,
    average=False,
)
cv_pipeline_sgd = _text_pipeline(CountVectorizer(), sgd_classifier)

# Raw term counts restricted to terms that occur in at least 10 / 50 documents.
cv_pipeline10 = _text_pipeline(CountVectorizer(min_df=10), LogisticRegression())

cv_pipeline50 = _text_pipeline(CountVectorizer(min_df=50), LogisticRegression())

# TF-IDF weighted features with logistic regression.
tf_pipeline_log = _text_pipeline(TfidfVectorizer(), LogisticRegression())

In [5]:
# Per-fold scores (5 folds) of the CountVectorizer + LogisticRegression pipeline.
cv_result = cross_val_score(
    estimator=cv_pipeline,
    X=data['text'],
    y=data['positive'],
    cv=5,
)
cv_result

array([0.805 , 0.845 , 0.84  , 0.8725, 0.85  ])

In [6]:
# Per-fold scores (5 folds) of the TfidfVectorizer + LogisticRegression pipeline.
tf_log_result = cross_val_score(
    estimator=tf_pipeline_log,
    X=data['text'],
    y=data['positive'],
    cv=5,
)
tf_log_result

array([0.82  , 0.825 , 0.825 , 0.815 , 0.8175])

In [7]:
# Per-fold scores (5 folds) of the CountVectorizer(min_df=10) pipeline.
cv10_result = cross_val_score(
    estimator=cv_pipeline10,
    X=data['text'],
    y=data['positive'],
    cv=5,
)
cv10_result

array([0.815 , 0.8525, 0.835 , 0.855 , 0.84  ])

In [8]:
# Per-fold scores (5 folds) of the CountVectorizer(min_df=50) pipeline.
cv50_result = cross_val_score(
    estimator=cv_pipeline50,
    X=data['text'],
    y=data['positive'],
    cv=5,
)
cv50_result

array([0.7875, 0.825 , 0.8125, 0.82  , 0.8225])

In [9]:
# Per-fold scores (5 folds) of the CountVectorizer + LinearSVC pipeline.
cv_svc_result = cross_val_score(
    estimator=cv_pipeline_svc,
    X=data['text'],
    y=data['positive'],
    cv=5,
)
cv_svc_result

array([0.8025, 0.84  , 0.83  , 0.85  , 0.84  ])

In [10]:
# Per-fold scores (5 folds) of the CountVectorizer + SGDClassifier pipeline.
cv_sgd_result = cross_val_score(
    estimator=cv_pipeline_sgd,
    X=data['text'],
    y=data['positive'],
    cv=5,
)
cv_sgd_result

array([0.7175, 0.8075, 0.8375, 0.725 , 0.83  ])

In [11]:
# Count features with NLTK's English stop-word list removed, scored over 5 folds.
nltk_stop_words_pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(stop_words=stopwords.words('english'))),
    ('estimator', LogisticRegression()),
])
stop_words_nltk_result = cross_val_score(
    nltk_stop_words_pipeline, data['text'], data['positive'], cv=5
)
stop_words_nltk_result

array([0.82  , 0.85  , 0.835 , 0.8475, 0.8475])

In [12]:
# Count features with scikit-learn's built-in 'english' stop-word list removed,
# scored over 5 folds.
sklearn_stop_words_pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(stop_words='english')),
    ('estimator', LogisticRegression()),
])
stop_words_default_result = cross_val_score(
    sklearn_stop_words_pipeline, data['text'], data['positive'], cv=5
)
stop_words_default_result

array([0.81  , 0.84  , 0.8425, 0.8475, 0.8425])

In [13]:
# Count features over word unigrams and bigrams, scored over 5 folds.
word_ngram_pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('estimator', LogisticRegression()),
])
ngram_word_result = cross_val_score(
    word_ngram_pipeline, data['text'], data['positive'], cv=5
)
ngram_word_result

array([0.82  , 0.8575, 0.845 , 0.865 , 0.885 ])

In [14]:
# Count features over character n-grams of length 3 to 5 ('char_wb' analyzer),
# scored over 5 folds.
char_ngram_pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(3, 5), analyzer='char_wb')),
    ('estimator', LogisticRegression()),
])
ngram_letter_result = cross_val_score(
    char_ngram_pipeline, data['text'], data['positive'], cv=5
)
ngram_letter_result

array([0.8175, 0.84  , 0.8225, 0.8275, 0.8175])

In [15]:
# ans1.txt: mean and std of the CountVectorizer scores followed by mean and std
# of the TfidfVectorizer scores, each rounded to 4 decimals, space-separated.
with open('ans1.txt', 'w') as fp:
    ans1_values = (
        np.mean(cv_result),
        np.std(cv_result),
        np.mean(tf_log_result),
        np.std(tf_log_result),
    )
    fp.write(' '.join(f'{round(value, 4)}' for value in ans1_values))

In [16]:
# ans2.txt: mean scores for min_df=10 and min_df=50, rounded to 4 decimals.
with open('ans2.txt', 'w') as fp:
    mean_min_df_10 = round(np.mean(cv10_result), 4)
    mean_min_df_50 = round(np.mean(cv50_result), 4)
    fp.write('{} {}'.format(mean_min_df_10, mean_min_df_50))

In [17]:
# Mean cross-validation score per classifier, ordered from lowest to highest.
classifier_results = sorted(
    [
        ('log', np.mean(cv_result)),
        ('svc', np.mean(cv_svc_result)),
        ('sgd', np.mean(cv_sgd_result)),
    ],
    key=lambda entry: entry[1],
)
classifier_results

[('sgd', 0.7835), ('svc', 0.8325000000000001), ('log', 0.8424999999999999)]

In [18]:
# ans3.txt: the lowest mean score among the compared classifiers.
# classifier_results is sorted ascending by score, so index 0 holds the minimum.
# The value is rounded to 4 decimals, like the other answer files, so that
# floating-point noise (e.g. 0.8325000000000001) never ends up in the answer.
with open('ans3.txt', 'w') as fp:
    fp.write(f'{round(classifier_results[0][1], 4)}')

In [19]:
# ans4.txt: mean scores with the NLTK stop-word list and with scikit-learn's
# built-in list, rounded to 4 decimals.
with open('ans4.txt', 'w') as fp:
    mean_nltk_stop_words = round(np.mean(stop_words_nltk_result), 4)
    mean_sklearn_stop_words = round(np.mean(stop_words_default_result), 4)
    fp.write('{} {}'.format(mean_nltk_stop_words, mean_sklearn_stop_words))

In [20]:
# ans5.txt: mean scores for word n-grams and for character n-grams,
# rounded to 4 decimals.
with open('ans5.txt', 'w') as fp:
    ngram_means = (np.mean(ngram_word_result), np.mean(ngram_letter_result))
    fp.write(' '.join(f'{round(value, 4)}' for value in ngram_means))