In [23]:
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [45]:
import pandas as pd
import numpy as np

import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

import catboost
import pickle

import time
import string

In [2]:
train = pd.read_csv('games/reviews_train.csv')
train.head()

Unnamed: 0,id,review,like
0,0,This is the kind of game you gift to your frie...,-1
1,1,Early Access Review,1
2,2,"Favourite game ever, must play.",1
3,3,'State of Decay' puts the 'survival' in the zo...,1
4,4,Really good game but you must buy all dlc to r...,1


In [3]:
# train[['review', 'like']]

In [3]:
train = train[['review', 'like']]
train

Unnamed: 0,review,like
0,This is the kind of game you gift to your frie...,-1
1,Early Access Review,1
2,"Favourite game ever, must play.",1
3,'State of Decay' puts the 'survival' in the zo...,1
4,Really good game but you must buy all dlc to r...,1
...,...,...
563126,&gt;joins DarkRP server &gt;guy tells me to fo...,1
563127,Fantastic game that ended very quickly. The ch...,1
563128,it's fun try itr out,1
563129,Early Access Review,-1


In [15]:
train = train[train.review != train.review[1]]

In [18]:
train[train.like == 1]

Unnamed: 0,review,like
2,"Favourite game ever, must play.",1
3,'State of Decay' puts the 'survival' in the zo...,1
4,Really good game but you must buy all dlc to r...,1
8,10/10 The Best Game Ever.No Rage And Very Funn...,1
9,I don't... How did you... Bethesda. Id. My ...,1
...,...,...
563124,If you like Goats This is the Game for you. Gr...,1
563125,if you have already played a game from SUDA51 ...,1
563126,&gt;joins DarkRP server &gt;guy tells me to fo...,1
563127,Fantastic game that ended very quickly. The ch...,1


In [19]:
train.like.value_counts(normalize=True)

 1    0.653277
-1    0.346723
Name: like, dtype: float64

In [20]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 471399 entries, 0 to 563128
Data columns (total 2 columns):
review    470844 non-null object
like      471399 non-null int64
dtypes: int64(1), object(1)
memory usage: 10.8+ MB


In [21]:
# Видно, в данных есть небольшой дисбаланс

In [22]:

# Пайплайны обработки:

# 1.
# токенизация
# удаление стоп-слов
# удаление знаков пунктуации
# лемматизация
# tf-idf + logreg


# 2.
# токенизация
# эмбеддинг берта
# logreg

In [23]:
train.like.value_counts()

 1    307954
-1    163445
Name: like, dtype: int64

# 1

In [26]:
# train.iloc[563126, 0]

In [27]:
nlp = spacy.load('en_core_web_sm')
doc = nlp(train.iloc[3, 0])

print('text', 'lemma_', 'is_alpha', 'is_stop', sep='\t')
for token in doc:
    print(token.text, token.lemma_, token.is_alpha, token.is_stop, sep='\t')

text	lemma_	is_alpha	is_stop
Really	really	True	True
good	good	True	False
game	game	True	False
but	but	True	True
you	you	True	True
must	must	True	True
buy	you	True	False
all	all	True	True
dlc	dlc	True	False
to	to	True	True
really	really	True	True
enjoy	enjoy	True	False
it	it	True	True
.	.	False	False


In [28]:
str.lower('Sp')

'sp'

In [29]:
def tokenizer(text):
    return nlp(str.lower(str(text)))


def preprocessing(text):
    tokens = tokenizer(text)
    data = ''
    for token in tokens:
        if token.is_alpha and not token.is_stop:
            data += ' ' + token.lemma_
    return data

In [49]:
def preprocessing(text):
    text = str(text).translate(str.maketrans('', '', string.punctuation))
    return text

In [48]:
s = 'dsffs, jfiroe j,m 8*'
s.translate(str.maketrans('', '', string.punctuation))

'dsffs jfiroe jm 8'

In [30]:
# train.review = train.review.apply

In [31]:
len(train) // 200

2356

In [32]:
train['token'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['token'] = 0


In [33]:
train.head()

Unnamed: 0,review,like,token
0,This is the kind of game you gift to your frie...,-1,0
2,"Favourite game ever, must play.",1,0
3,'State of Decay' puts the 'survival' in the zo...,1,0
4,Really good game but you must buy all dlc to r...,1,0
8,10/10 The Best Game Ever.No Rage And Very Funn...,1,0


In [34]:
train.iloc[:5, 0]

0    This is the kind of game you gift to your frie...
2                      Favourite game ever, must play.
3    'State of Decay' puts the 'survival' in the zo...
4    Really good game but you must buy all dlc to r...
8    10/10 The Best Game Ever.No Rage And Very Funn...
Name: review, dtype: object

In [39]:
learn1 = train[train.like == -1].sample(20_000)
learn2 = train[train.like == 1].sample(20_000)

learn = pd.concat([learn1, learn2], axis=0)

In [44]:
# learn.token = learn.review.apply(preprocessing)

In [None]:
learn

In [None]:

# batch = len(train) // 200
# for i in range(200):
#     train.iloc[i*batch:i*batch+batch, 2] = train.iloc[i*batch:i*batch+batch, 0].apply(tokenizer)
#     time.sleep(1)



# train[['tokens', 'like']].to_csv('preprocess.csv')

In [None]:
train.head()

In [36]:
train.review.dtypes

dtype('O')

In [51]:
train.review

0         This is the kind of game you gift to your frie...
2                           Favourite game ever, must play.
3         'State of Decay' puts the 'survival' in the zo...
4         Really good game but you must buy all dlc to r...
8         10/10 The Best Game Ever.No Rage And Very Funn...
                                ...                        
563124    If you like Goats This is the Game for you. Gr...
563125    if you have already played a game from SUDA51 ...
563126    &gt;joins DarkRP server &gt;guy tells me to fo...
563127    Fantastic game that ended very quickly. The ch...
563128                                 it's fun try itr out
Name: review, Length: 471399, dtype: object

In [52]:
train.review.str.replace('[^\w\s]', '')

0         This is the kind of game you gift to your frie...
2                             Favourite game ever must play
3         State of Decay puts the survival in the zombie...
4         Really good game but you must buy all dlc to r...
8         1010 The Best Game EverNo Rage And Very FunnyG...
                                ...                        
563124    If you like Goats This is the Game for you Gre...
563125    if you have already played a game from SUDA51 ...
563126    gtjoins DarkRP server gtguy tells me to follow...
563127    Fantastic game that ended very quickly The cho...
563128                                  its fun try itr out
Name: review, Length: 471399, dtype: object

In [53]:
train.review = train.review.str.replace('\d+', '') # for digits
train.review = train.review.str.replace(r'(\b\w{1,2}\b)', '') # for words
train.review = train.review.str.replace('[^\w\s]', '') # for punctuation 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [58]:
train.review = train.review.apply(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [60]:
train.review

0         This  the kind  game you gift  your friends  a...
2                             Favourite game ever must play
3         State  Decay puts the survival  the zombie sur...
4         Really good game but you must buy all dlc  rea...
8          The Best Game Ever Rage And Very FunnyGoals A...
                                ...                        
563124     you like Goats This  the Game for you Great f...
563125     you have already played  game from SUDA then ...
563126    joins DarkRP server guy tells   follow him lea...
563127    Fantastic game that ended very quickly The cho...
563128                                      fun try itr out
Name: review, Length: 471399, dtype: object

In [63]:
X_train, X_valid, y_train, y_valid = train_test_split(train['review'], train['like'], random_state=2021, test_size=0.35)

tf_idf = TfidfVectorizer(ngram_range=(1, 1), lowercase=True, stop_words='english')
tf_idf.fit(X_train)

X_train = tf_idf.transform(X_train)
X_valid = tf_idf.transform(X_valid)

In [64]:
model = catboost.CatBoostClassifier(save_snapshot=True, auto_class_weights='SqrtBalanced', eval_metric='F1')
model.fit(X_train, y_train, eval_set=(X_valid, y_valid), plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.130324
0:	learn: 0.7542358	test: 0.7536879	best: 0.7536879 (0)	total: 1.14s	remaining: 18m 59s
1:	learn: 0.7615130	test: 0.7610324	best: 0.7610324 (1)	total: 2.23s	remaining: 18m 33s
2:	learn: 0.7669521	test: 0.7666688	best: 0.7666688 (2)	total: 3.3s	remaining: 18m 18s
3:	learn: 0.7711910	test: 0.7708216	best: 0.7708216 (3)	total: 4.34s	remaining: 18m 1s
4:	learn: 0.7768606	test: 0.7769335	best: 0.7769335 (4)	total: 5.39s	remaining: 17m 53s
5:	learn: 0.7794210	test: 0.7792901	best: 0.7792901 (5)	total: 6.47s	remaining: 17m 52s
6:	learn: 0.7791189	test: 0.7789541	best: 0.7792901 (5)	total: 7.54s	remaining: 17m 49s
7:	learn: 0.7835357	test: 0.7832637	best: 0.7832637 (7)	total: 8.56s	remaining: 17m 41s
8:	learn: 0.7839822	test: 0.7836988	best: 0.7836988 (8)	total: 9.58s	remaining: 17m 34s
9:	learn: 0.7837706	test: 0.7835457	best: 0.7836988 (8)	total: 10.6s	remaining: 17m 29s
10:	learn: 0.7860642	test: 0.7858209	best: 0.7858209 (10)	total: 11.7s	remaining: 17m 30s
11

92:	learn: 0.8275992	test: 0.8272142	best: 0.8272142 (92)	total: 1m 33s	remaining: 15m 7s
93:	learn: 0.8279117	test: 0.8274835	best: 0.8274835 (93)	total: 1m 33s	remaining: 15m 5s
94:	learn: 0.8280921	test: 0.8276477	best: 0.8276477 (94)	total: 1m 34s	remaining: 15m 4s
95:	learn: 0.8283409	test: 0.8278348	best: 0.8278348 (95)	total: 1m 35s	remaining: 15m 3s
96:	learn: 0.8285732	test: 0.8280905	best: 0.8280905 (96)	total: 1m 36s	remaining: 15m 2s
97:	learn: 0.8288822	test: 0.8283155	best: 0.8283155 (97)	total: 1m 37s	remaining: 15m 1s
98:	learn: 0.8291515	test: 0.8285691	best: 0.8285691 (98)	total: 1m 38s	remaining: 14m 59s
99:	learn: 0.8294114	test: 0.8288828	best: 0.8288828 (99)	total: 1m 39s	remaining: 14m 58s
100:	learn: 0.8295411	test: 0.8289496	best: 0.8289496 (100)	total: 1m 40s	remaining: 14m 57s
101:	learn: 0.8297138	test: 0.8291109	best: 0.8291109 (101)	total: 1m 41s	remaining: 14m 56s
102:	learn: 0.8297317	test: 0.8292675	best: 0.8292675 (102)	total: 1m 42s	remaining: 14m 56s

181:	learn: 0.8417517	test: 0.8395547	best: 0.8395547 (181)	total: 3m 1s	remaining: 13m 36s
182:	learn: 0.8418188	test: 0.8395893	best: 0.8395893 (182)	total: 3m 2s	remaining: 13m 36s
183:	learn: 0.8419665	test: 0.8397689	best: 0.8397689 (183)	total: 3m 3s	remaining: 13m 35s
184:	learn: 0.8420734	test: 0.8398378	best: 0.8398378 (184)	total: 3m 4s	remaining: 13m 34s
185:	learn: 0.8421968	test: 0.8399867	best: 0.8399867 (185)	total: 3m 5s	remaining: 13m 33s
186:	learn: 0.8422477	test: 0.8400262	best: 0.8400262 (186)	total: 3m 6s	remaining: 13m 32s
187:	learn: 0.8423335	test: 0.8401653	best: 0.8401653 (187)	total: 3m 7s	remaining: 13m 31s
188:	learn: 0.8423844	test: 0.8401746	best: 0.8401746 (188)	total: 3m 8s	remaining: 13m 30s
189:	learn: 0.8425750	test: 0.8403940	best: 0.8403940 (189)	total: 3m 9s	remaining: 13m 29s
190:	learn: 0.8427441	test: 0.8406119	best: 0.8406119 (190)	total: 3m 10s	remaining: 13m 28s
191:	learn: 0.8428489	test: 0.8407712	best: 0.8407712 (191)	total: 3m 11s	remai

270:	learn: 0.8499211	test: 0.8461771	best: 0.8461771 (270)	total: 4m 30s	remaining: 12m 7s
271:	learn: 0.8498142	test: 0.8461412	best: 0.8461771 (270)	total: 4m 31s	remaining: 12m 6s
272:	learn: 0.8498696	test: 0.8461871	best: 0.8461871 (272)	total: 4m 32s	remaining: 12m 5s
273:	learn: 0.8499135	test: 0.8462684	best: 0.8462684 (273)	total: 4m 33s	remaining: 12m 4s
274:	learn: 0.8499739	test: 0.8463138	best: 0.8463138 (274)	total: 4m 34s	remaining: 12m 4s
275:	learn: 0.8500738	test: 0.8464548	best: 0.8464548 (275)	total: 4m 35s	remaining: 12m 3s
276:	learn: 0.8502784	test: 0.8466227	best: 0.8466227 (276)	total: 4m 36s	remaining: 12m 2s
277:	learn: 0.8503350	test: 0.8466329	best: 0.8466329 (277)	total: 4m 37s	remaining: 12m 1s
278:	learn: 0.8504085	test: 0.8467233	best: 0.8467233 (278)	total: 4m 38s	remaining: 12m
279:	learn: 0.8505070	test: 0.8468446	best: 0.8468446 (279)	total: 4m 39s	remaining: 11m 59s
280:	learn: 0.8505778	test: 0.8469151	best: 0.8469151 (280)	total: 4m 40s	remainin

359:	learn: 0.8557092	test: 0.8505920	best: 0.8505920 (359)	total: 5m 59s	remaining: 10m 38s
360:	learn: 0.8557446	test: 0.8506238	best: 0.8506238 (360)	total: 6m	remaining: 10m 37s
361:	learn: 0.8558350	test: 0.8506745	best: 0.8506745 (361)	total: 6m 1s	remaining: 10m 36s
362:	learn: 0.8559359	test: 0.8507145	best: 0.8507145 (362)	total: 6m 2s	remaining: 10m 35s
363:	learn: 0.8559914	test: 0.8507352	best: 0.8507352 (363)	total: 6m 3s	remaining: 10m 34s
364:	learn: 0.8560628	test: 0.8508056	best: 0.8508056 (364)	total: 6m 4s	remaining: 10m 33s
365:	learn: 0.8560072	test: 0.8507107	best: 0.8508056 (364)	total: 6m 5s	remaining: 10m 32s
366:	learn: 0.8560758	test: 0.8507765	best: 0.8508056 (364)	total: 6m 6s	remaining: 10m 31s
367:	learn: 0.8561525	test: 0.8508272	best: 0.8508272 (367)	total: 6m 7s	remaining: 10m 30s
368:	learn: 0.8562294	test: 0.8509235	best: 0.8509235 (368)	total: 6m 8s	remaining: 10m 29s
369:	learn: 0.8562271	test: 0.8508935	best: 0.8509235 (368)	total: 6m 9s	remaining

449:	learn: 0.8597861	test: 0.8533318	best: 0.8533620 (445)	total: 7m 27s	remaining: 9m 7s
450:	learn: 0.8598028	test: 0.8534192	best: 0.8534192 (450)	total: 7m 28s	remaining: 9m 6s
451:	learn: 0.8598228	test: 0.8533834	best: 0.8534192 (450)	total: 7m 29s	remaining: 9m 5s
452:	learn: 0.8599077	test: 0.8535010	best: 0.8535010 (452)	total: 7m 30s	remaining: 9m 4s
453:	learn: 0.8599329	test: 0.8534913	best: 0.8535010 (452)	total: 7m 31s	remaining: 9m 3s
454:	learn: 0.8600063	test: 0.8535420	best: 0.8535420 (454)	total: 7m 32s	remaining: 9m 2s
455:	learn: 0.8601019	test: 0.8535173	best: 0.8535420 (454)	total: 7m 33s	remaining: 9m 1s
456:	learn: 0.8601433	test: 0.8535537	best: 0.8535537 (456)	total: 7m 34s	remaining: 9m
457:	learn: 0.8602091	test: 0.8536040	best: 0.8536040 (457)	total: 7m 35s	remaining: 8m 59s
458:	learn: 0.8602590	test: 0.8536656	best: 0.8536656 (458)	total: 7m 36s	remaining: 8m 58s
459:	learn: 0.8602812	test: 0.8536606	best: 0.8536656 (458)	total: 7m 37s	remaining: 8m 57s

539:	learn: 0.8636494	test: 0.8556296	best: 0.8556482 (535)	total: 8m 55s	remaining: 7m 36s
540:	learn: 0.8636602	test: 0.8556656	best: 0.8556656 (540)	total: 8m 56s	remaining: 7m 35s
541:	learn: 0.8637079	test: 0.8556964	best: 0.8556964 (541)	total: 8m 57s	remaining: 7m 34s
542:	learn: 0.8637218	test: 0.8556919	best: 0.8556964 (541)	total: 8m 58s	remaining: 7m 33s
543:	learn: 0.8637670	test: 0.8557281	best: 0.8557281 (543)	total: 8m 59s	remaining: 7m 32s
544:	learn: 0.8637731	test: 0.8557139	best: 0.8557281 (543)	total: 9m	remaining: 7m 30s
545:	learn: 0.8638264	test: 0.8557811	best: 0.8557811 (545)	total: 9m 1s	remaining: 7m 29s
546:	learn: 0.8638396	test: 0.8558075	best: 0.8558075 (546)	total: 9m 2s	remaining: 7m 28s
547:	learn: 0.8638473	test: 0.8558132	best: 0.8558132 (547)	total: 9m 3s	remaining: 7m 27s
548:	learn: 0.8638774	test: 0.8558389	best: 0.8558389 (548)	total: 9m 3s	remaining: 7m 26s
549:	learn: 0.8639000	test: 0.8558437	best: 0.8558437 (549)	total: 9m 4s	remaining: 7m 2

629:	learn: 0.8665385	test: 0.8571506	best: 0.8571803 (625)	total: 10m 21s	remaining: 6m 5s
630:	learn: 0.8665692	test: 0.8571714	best: 0.8571803 (625)	total: 10m 22s	remaining: 6m 4s
631:	learn: 0.8666419	test: 0.8572025	best: 0.8572025 (631)	total: 10m 23s	remaining: 6m 3s
632:	learn: 0.8666887	test: 0.8572085	best: 0.8572085 (632)	total: 10m 24s	remaining: 6m 2s
633:	learn: 0.8667029	test: 0.8572809	best: 0.8572809 (633)	total: 10m 25s	remaining: 6m 1s
634:	learn: 0.8667381	test: 0.8573073	best: 0.8573073 (634)	total: 10m 26s	remaining: 6m
635:	learn: 0.8667792	test: 0.8573684	best: 0.8573684 (635)	total: 10m 27s	remaining: 5m 59s
636:	learn: 0.8668430	test: 0.8574240	best: 0.8574240 (636)	total: 10m 28s	remaining: 5m 58s
637:	learn: 0.8668940	test: 0.8574289	best: 0.8574289 (637)	total: 10m 29s	remaining: 5m 57s
638:	learn: 0.8669439	test: 0.8574655	best: 0.8574655 (638)	total: 10m 30s	remaining: 5m 56s
639:	learn: 0.8669363	test: 0.8575267	best: 0.8575267 (639)	total: 10m 31s	rema

718:	learn: 0.8694826	test: 0.8590565	best: 0.8590565 (718)	total: 11m 46s	remaining: 4m 36s
719:	learn: 0.8695002	test: 0.8590414	best: 0.8590565 (718)	total: 11m 47s	remaining: 4m 35s
720:	learn: 0.8695226	test: 0.8590520	best: 0.8590565 (718)	total: 11m 48s	remaining: 4m 34s
721:	learn: 0.8694895	test: 0.8590269	best: 0.8590565 (718)	total: 11m 49s	remaining: 4m 33s
722:	learn: 0.8695290	test: 0.8590477	best: 0.8590565 (718)	total: 11m 50s	remaining: 4m 32s
723:	learn: 0.8695407	test: 0.8590368	best: 0.8590565 (718)	total: 11m 51s	remaining: 4m 31s
724:	learn: 0.8695901	test: 0.8590991	best: 0.8590991 (724)	total: 11m 52s	remaining: 4m 30s
725:	learn: 0.8696136	test: 0.8591724	best: 0.8591724 (725)	total: 11m 53s	remaining: 4m 29s
726:	learn: 0.8696416	test: 0.8592289	best: 0.8592289 (726)	total: 11m 54s	remaining: 4m 28s
727:	learn: 0.8696836	test: 0.8591936	best: 0.8592289 (726)	total: 11m 55s	remaining: 4m 27s
728:	learn: 0.8696897	test: 0.8592659	best: 0.8592659 (728)	total: 11m

807:	learn: 0.8717634	test: 0.8605585	best: 0.8605585 (807)	total: 13m 11s	remaining: 3m 8s
808:	learn: 0.8717848	test: 0.8606211	best: 0.8606211 (808)	total: 13m 12s	remaining: 3m 7s
809:	learn: 0.8718245	test: 0.8606420	best: 0.8606420 (809)	total: 13m 13s	remaining: 3m 6s
810:	learn: 0.8718713	test: 0.8606586	best: 0.8606586 (810)	total: 13m 14s	remaining: 3m 5s
811:	learn: 0.8718965	test: 0.8606795	best: 0.8606795 (811)	total: 13m 15s	remaining: 3m 4s
812:	learn: 0.8718916	test: 0.8606599	best: 0.8606795 (811)	total: 13m 16s	remaining: 3m 3s
813:	learn: 0.8719364	test: 0.8606863	best: 0.8606863 (813)	total: 13m 16s	remaining: 3m 2s
814:	learn: 0.8719540	test: 0.8607171	best: 0.8607171 (814)	total: 13m 17s	remaining: 3m 1s
815:	learn: 0.8719569	test: 0.8606915	best: 0.8607171 (814)	total: 13m 18s	remaining: 3m
816:	learn: 0.8719882	test: 0.8606919	best: 0.8607171 (814)	total: 13m 19s	remaining: 2m 59s
817:	learn: 0.8720109	test: 0.8607122	best: 0.8607171 (814)	total: 13m 20s	remaini

896:	learn: 0.8738272	test: 0.8614484	best: 0.8616229 (884)	total: 14m 36s	remaining: 1m 40s
897:	learn: 0.8738151	test: 0.8614375	best: 0.8616229 (884)	total: 14m 37s	remaining: 1m 39s
898:	learn: 0.8738265	test: 0.8614322	best: 0.8616229 (884)	total: 14m 38s	remaining: 1m 38s
899:	learn: 0.8738556	test: 0.8614126	best: 0.8616229 (884)	total: 14m 39s	remaining: 1m 37s
900:	learn: 0.8739074	test: 0.8613976	best: 0.8616229 (884)	total: 14m 40s	remaining: 1m 36s
901:	learn: 0.8739044	test: 0.8614030	best: 0.8616229 (884)	total: 14m 41s	remaining: 1m 35s
902:	learn: 0.8739306	test: 0.8614030	best: 0.8616229 (884)	total: 14m 42s	remaining: 1m 34s
903:	learn: 0.8739389	test: 0.8613812	best: 0.8616229 (884)	total: 14m 43s	remaining: 1m 33s
904:	learn: 0.8739450	test: 0.8613814	best: 0.8616229 (884)	total: 14m 44s	remaining: 1m 32s
905:	learn: 0.8739804	test: 0.8614020	best: 0.8616229 (884)	total: 14m 45s	remaining: 1m 31s
906:	learn: 0.8739945	test: 0.8613923	best: 0.8616229 (884)	total: 14m

986:	learn: 0.8759342	test: 0.8624847	best: 0.8624847 (986)	total: 16m 2s	remaining: 12.7s
987:	learn: 0.8759232	test: 0.8624796	best: 0.8624847 (986)	total: 16m 3s	remaining: 11.7s
988:	learn: 0.8759374	test: 0.8624754	best: 0.8624847 (986)	total: 16m 4s	remaining: 10.7s
989:	learn: 0.8759865	test: 0.8624657	best: 0.8624847 (986)	total: 16m 5s	remaining: 9.76s
990:	learn: 0.8760061	test: 0.8624657	best: 0.8624847 (986)	total: 16m 6s	remaining: 8.78s
991:	learn: 0.8760235	test: 0.8624910	best: 0.8624910 (991)	total: 16m 7s	remaining: 7.8s
992:	learn: 0.8760631	test: 0.8625074	best: 0.8625074 (992)	total: 16m 8s	remaining: 6.83s
993:	learn: 0.8760660	test: 0.8625074	best: 0.8625074 (992)	total: 16m 9s	remaining: 5.85s
994:	learn: 0.8760928	test: 0.8624928	best: 0.8625074 (992)	total: 16m 10s	remaining: 4.88s
995:	learn: 0.8760477	test: 0.8624783	best: 0.8625074 (992)	total: 16m 11s	remaining: 3.9s
996:	learn: 0.8760752	test: 0.8624589	best: 0.8625074 (992)	total: 16m 12s	remaining: 2.93

<catboost.core.CatBoostClassifier at 0x7fb73e38fa90>

In [66]:
10 * f1_score(model.predict(X_valid), y_valid)

8.838873794197559

# Predict

In [67]:
test = pd.read_csv('games/reviews_test.csv')

In [70]:
test

Unnamed: 0.1,Unnamed: 0,review
0,638820,DISCLAIMER received review key from the dev ...
1,644537,Early Access Review
2,604237,First all love dual stick shooters and this ...
3,670182,Love the game far Thanks Steam for giving ...
4,681107,Early Access Review
...,...,...
44995,576936,Everyone else says Broken mechanics same clas...
44996,621717,Whoever gives this game negative review becau...
44997,637230,most played game steam moment
44998,689339,review infinite well VERY VERY good game ...


In [69]:


test.review = test.review.str.replace('\d+', '') # for digits
test.review = test.review.str.replace(r'(\b\w{1,2}\b)', '') # for words
test.review = test.review.str.replace('[^\w\s]', '') # for punctuation
test.review = test.review.apply(str) 

X_test = tf_idf.transform(test.review)

In [74]:
pd.DataFrame(model.predict(X_test), columns=['like']).to_csv('submission.csv', index=False)