In [1]:
from functools import lru_cache

import numpy as np
import pandas as pd

In [5]:
# Directory that holds the three input CSV files. Kept in one constant so the
# notebook can be pointed at another location without editing every path.
DATA_DIR = '/content'

# index_col=False: do not use the first CSV column as the row index.
X_train = pd.read_csv(f'{DATA_DIR}/imdb_train.csv', index_col=False)
X_test = pd.read_csv(f'{DATA_DIR}/imdb_test.csv', index_col=False)
Y_train = pd.read_csv(f'{DATA_DIR}/y_imdb_train.csv', index_col=False)

## Пример

In [6]:
# Display the review stored at position 2 as a sample of the raw text
X_train['review'].iloc[2]

"This fake documentary is flawed on a lot of points, it's badly made, has uninteresting characters but the biggest problem I have with it is the basic premise.<br /><br />This film uses the idea that H.P. Lovecraft has traveled to Italy and that some of his work is based on real supernatural events that he witnessed. I'm willing to go along with the notion that he traveled to Italy (only for suspension of disbelieve) but that some of his work is based on reality and that Insmouth exist is total nonsense.<br /><br />First of all, Lovecraft didn't believe in the supernatural, in his letters he clearly states that he considered himself a mechanical materialist, his monsters where there to show that humans weren't so special after all. Another myth used in this film is that Lovecraft was an expert on the occult, he wasn't, all his knowledge on the subject came from the most basic sources.<br /><br />So we end up with a film about people jelling at each other a lot and when we finally see t

# Отфильтруем лишнее

In [7]:
# Stack the test rows on top of the train rows so that both receive exactly
# the same preprocessing. ignore_index=True gives every row a unique label;
# without it each frame keeps its own 0..n-1 labels and the combined index
# contains duplicates, which makes label-aligned operations ambiguous.
XX = pd.concat([X_test, X_train], axis=0, ignore_index=True)

In [8]:
# Sanity check: (rows, columns) of the combined test + train frame
XX.shape

(50000, 2)

In [9]:
import re

In [None]:
# Patterns are compiled once and written as raw strings; the previous '\.'
# inside a plain string literal was an invalid escape sequence.
_HTML_TAG_RE = re.compile(r'<.*?>')
_PUNCTUATION_RE = re.compile(r'[,.!?:()"]')
# Double quotes are already removed by _PUNCTUATION_RE, so they need no
# special treatment here.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z]')


def clean_review(text):
    """Normalise one raw review string.

    Steps, in order:
      1. replace HTML tags such as ``<br />`` with a space;
      2. delete the punctuation characters ``, . ! ? : ( ) "``;
      3. replace every remaining non-letter character with a space;
      4. lower-case the text and trim surrounding whitespace.

    Trimming is done last: step 3 can put spaces at either end of the text,
    so trimming any earlier would not leave the result trimmed.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Lower-case text made only of the letters a-z and spaces.
    """
    text = _HTML_TAG_RE.sub(' ', text)
    text = _PUNCTUATION_RE.sub('', text)
    text = _NON_LETTER_RE.sub(' ', text)
    return text.lower().strip()


# One pass over the column instead of five separate ones
XX['review'] = XX['review'].apply(clean_review)

In [12]:
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate

Чтобы не писать собственный токенизатор, разобьём тексты на слова функцией `text_to_word_sequence` из Keras:

In [13]:
from keras.preprocessing.text import text_to_word_sequence

In [14]:
# Split every cleaned review into a list of word tokens
review_texts = XX['review']
words = review_texts.apply(text_to_word_sequence)

Убрал стоп-слова

In [15]:
# NLTK's English stop-word list, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))


def _drop_stop_words(tokens):
    """Return the tokens that are not in ``stop_words``, order preserved."""
    return [token for token in tokens if token not in stop_words]


# Filter out the stop words
filtered_words = words.apply(_drop_stop_words)

# Write the remaining tokens back to the column as space-separated text
XX['review'] = filtered_words.apply(" ".join)

## Сделаем копию - потом пригодится

In [16]:
# Independent snapshot of the cleaned frame; the lemmatisation section
# further down starts again from this copy.
XX_for_next_usage = XX.copy(deep=True)

In [81]:
# Rows whose id is NaN become the training set (id column dropped);
# rows that carry an id become the test set.
is_train_row = XX['id'].isna()
train = XX.loc[is_train_row].drop(columns='id').copy()
test = XX.loc[~is_train_row].copy()

## TF-IDF

In [82]:
# TF-IDF vectoriser with default settings; it is refitted several times below
tfidf = TfidfVectorizer()
from sklearn.model_selection import train_test_split

In [83]:
# Hold out 30% of the labelled reviews for validation.
# The fixed seed keeps the split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    train,
    Y_train,
    test_size=0.3,
    random_state=14,
)

In [84]:
# Fit the vectoriser on the training split only and vectorise that split.
# NOTE: x_train is rebound from a DataFrame to the TF-IDF matrix, so this
# cell cannot be re-run without re-running the split cell first.
x_train = tfidf.fit_transform(x_train['review'])

In [85]:
# Vectorise the validation split with the already-fitted vectoriser (no refit).
# NOTE: x_test is rebound from a DataFrame to the TF-IDF matrix here.
x_test = tfidf.transform(x_test['review'])

In [86]:
# Vectorise the submission set from the cleaned `test` frame. The raw
# X_test.review used previously never went through the tag / punctuation /
# stop-word cleaning, so it was preprocessed differently from the text the
# vectoriser was fitted on.
X_test_v1 = tfidf.transform(test.review)

In [87]:
# Shapes of the vectorised training and validation splits
x_train.shape, x_test.shape

((28000, 80212), (12000, 80212))

## Обучим модель

In [116]:
from sklearn.linear_model import LogisticRegression
# L1-regularised logistic regression. random_state is fixed because the
# liblinear solver shuffles the data, so an unseeded fit is not guaranteed
# to be repeatable between runs.
clf_lr = LogisticRegression(solver='liblinear', penalty='l1', C=1.5, random_state=14)

In [117]:
# Fit on the training split; the target is the `sentiment` column
clf_lr.fit(x_train, y_train['sentiment'])

LogisticRegression(C=1.5, penalty='l1', solver='liblinear')

In [118]:
# Class probabilities for the validation split; the scoring cells below take
# column 1 of the same predict_proba output.
predictions_v1 = clf_lr.predict_proba(x_test)

## На обучающей выборке

In [119]:
from sklearn.metrics import roc_auc_score, roc_curve,accuracy_score

In [121]:
# ROC AUC on the split the model was fitted on (an optimistic, in-sample
# figure). The label column is passed explicitly, as in the fit cell, rather
# than the whole label DataFrame.
roc_auc_score(y_train.sentiment, clf_lr.predict_proba(x_train)[:, 1])

0.9708419636906522

## На тестовой выборке

In [122]:
# ROC AUC on the held-out split. The label column is passed explicitly, as in
# the fit cell, rather than the whole label DataFrame.
roc_auc_score(y_test.sentiment, clf_lr.predict_proba(x_test)[:, 1])

0.9532903509405247

# Обучим на всей выборке


In [123]:
# Refit the vectoriser on every labelled review.
x_train = tfidf.fit_transform(train.review)
# Vectorise the submission set from the cleaned `test` frame. The raw
# X_test.review used previously skipped the cleaning applied to the text the
# vectoriser is fitted on.
X_test_v1 = tfidf.transform(test.review)

In [124]:
# Refit the classifier on the full labelled set
clf_lr.fit(x_train, Y_train['sentiment'])

LogisticRegression(C=1.5, penalty='l1', solver='liblinear')

In [125]:
# In-sample scores: the model is evaluated on the same rows it was fitted on,
# so these numbers are optimistic and are not an estimate of test quality.
# The label column is passed explicitly, consistent with the fit cell.
train_labels = Y_train.sentiment
print('ROC AUC: ',roc_auc_score(train_labels, clf_lr.predict_proba(x_train)[:,1]))
print('accuracy: ',accuracy_score(train_labels, clf_lr.predict(x_train)))

ROC AUC:  0.9730552435921705
accuracy:  0.9168


# Добавим лемматизацию

In [36]:
from nltk.stem import WordNetLemmatizer

In [48]:
from nltk.corpus import wordnet

@lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for ``word``.

    The word is tagged on its own (without sentence context) by
    ``nltk.pos_tag``; the first letter of the resulting tag is mapped to the
    constant that ``lemmatizer.lemmatize`` takes as its second argument.
    Tags other than adjective (J), noun (N), verb (V) and adverb (R) fall
    back to ``wordnet.NOUN``.

    The result depends only on ``word``, so it is memoised: each distinct
    word is tagged once instead of once per occurrence in the corpus.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    One of ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB``, ``wordnet.ADV``.
    """
    # pos_tag returns [(word, tag)]; keep only the first letter of the tag
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


# Single shared lemmatizer instance, used by process_words below
lemmatizer = WordNetLemmatizer()

In [49]:
# Start again from the cleaned snapshot: rows whose id is NaN become the
# training set (id column dropped), rows that carry an id become the test set.
has_no_id = XX_for_next_usage['id'].isna()
train = XX_for_next_usage.loc[has_no_id].drop(columns='id').copy()
test = XX_for_next_usage.loc[~has_no_id].copy()

## Приводим слова "к нормальной форме"

In [51]:
def process_words(sentence):
    """Lemmatise every token of ``sentence`` and return them joined by spaces.

    The sentence is tokenised with ``nltk.word_tokenize``; each token is
    lemmatised using the part of speech reported by ``get_wordnet_pos``.
    """
    lemmas = []
    for token in nltk.word_tokenize(sentence):
        part_of_speech = get_wordnet_pos(token)
        lemmas.append(lemmatizer.lemmatize(token, part_of_speech))
    return " ".join(lemmas)

In [None]:
# Lemmatise every review in the test set
test['review'] = test['review'].map(process_words)

In [127]:
# Lemmatise every review in the training set
train['review'] = train['review'].map(process_words)

In [128]:
# Refit the vectoriser on the lemmatised training text, then vectorise the
# lemmatised test text with the same fitted vectoriser.
x_train = tfidf.fit_transform(train['review'])
x_test = tfidf.transform(test['review'])

In [129]:
# Fresh classifier with the same hyper-parameters as before. random_state is
# fixed because the liblinear solver shuffles the data, so an unseeded fit is
# not guaranteed to be repeatable between runs.
clf_lr = LogisticRegression(solver='liblinear', penalty='l1', C=1.5, random_state=14)

In [130]:
# Fit on the lemmatised training features; the target is the `sentiment` column
clf_lr.fit(x_train, Y_train['sentiment'])

LogisticRegression(C=1.5, penalty='l1', solver='liblinear')

In [131]:
# In-sample scores: the model is evaluated on the same rows it was fitted on,
# so these numbers are optimistic and are not an estimate of test quality.
# The label column is passed explicitly, consistent with the fit cell.
train_labels = Y_train.sentiment
print('ROC AUC: ',roc_auc_score(train_labels, clf_lr.predict_proba(x_train)[:,1]))
print('accuracy: ',accuracy_score(train_labels, clf_lr.predict(x_train)))

ROC AUC:  0.9718951027189575
accuracy:  0.91375


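Не очень-то помогло: метрики на обучающей выборке практически не изменились (ROC AUC 0.9719 против 0.9731, accuracy 0.9138 против 0.9168 без лемматизации).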

In [132]:
# Predicted class labels for the test set, wrapped in a DataFrame
test_labels = clf_lr.predict(x_test)
my_answer = pd.DataFrame(test_labels)

In [133]:
# Name the single column of the submission frame
my_answer.columns = pd.Index(['prediction'])

In [134]:
# Use the ids that came with the test file instead of assuming there are
# exactly 10000 rows numbered 0..9999. `test` keeps the row order of X_test,
# so the predictions line up with these ids position by position; .values
# drops the index so no label alignment takes place.
my_answer['id'] = X_test['id'].values

In [135]:
# Write the submission file; index=False leaves the row index out of the CSV
my_answer.to_csv('prediction_linclass.csv', index=False)

Финальный accuracy score на тестовой выборке > 88%

