## Loading dataset
The RuTweetCorp dataset was presented in Y. Rubtsova, "Constructing a Corpus for Sentiment Classification Training", Software & Systems, vol. 109, no. 1, pp. 72-78, 2015 and is available at http://study.mokoron.com.

In [1]:
!wget https://www.dropbox.com/s/r6u59ljhhjdg6j0/negative.csv
!wget https://www.dropbox.com/s/fnpq3z4bcnoktiv/positive.csv

--2020-05-31 20:42:00--  https://www.dropbox.com/s/r6u59ljhhjdg6j0/negative.csv
Resolving www.dropbox.com (www.dropbox.com)... 162.125.80.1, 2620:100:6030:1::a27d:5001
Connecting to www.dropbox.com (www.dropbox.com)|162.125.80.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/r6u59ljhhjdg6j0/negative.csv [following]
--2020-05-31 20:42:01--  https://www.dropbox.com/s/raw/r6u59ljhhjdg6j0/negative.csv
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc3e8b35c480988f437db44d83df.dl.dropboxusercontent.com/cd/0/inline/A4wumffcHqTga7w2m9tGUOXk0Fmj36ukIkLsdld8ZDzBeIc7TdwKB9CheXV_fLPvAJc-Szg6x1LIBinFlYXz1KLSgf9IZRuauWEYlQNCC6uNXw/file# [following]
--2020-05-31 20:42:01--  https://uc3e8b35c480988f437db44d83df.dl.dropboxusercontent.com/cd/0/inline/A4wumffcHqTga7w2m9tGUOXk0Fmj36ukIkLsdld8ZDzBeIc7TdwKB9CheXV_fLPvAJc-Szg6x1LIBinFlYXz1KLSgf9IZRuauWEYlQNCC6uNXw/file
Resolving uc3e8b

In [0]:
import numpy as np
import pandas as pd

In [3]:
df = pd.concat((pd.read_csv(f, sep=';', header=None) for f in ['positive.csv', 'negative.csv']), ignore_index=True)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,408906692374446080,1386325927,pleease_shut_up,"@first_timee хоть я и школота, но поверь, у на...",1,0,0,0,7569,62,61,0
1,408906692693221377,1386325927,alinakirpicheva,"Да, все-таки он немного похож на него. Но мой ...",1,0,0,0,11825,59,31,2
2,408906695083954177,1386325927,EvgeshaRe,RT @KatiaCheh: Ну ты идиотка) я испугалась за ...,1,0,1,0,1273,26,27,0
3,408906695356973056,1386325927,ikonnikova_21,"RT @digger2912: ""Кто то в углу сидит и погибае...",1,0,1,0,1549,19,17,0
4,408906761416867842,1386325943,JumpyAlex,@irina_dyshkant Вот что значит страшилка :D\nН...,1,0,0,0,597,16,23,1


In [4]:
df = df.drop(columns=[0,1,2,5,6,7,8,9,10,11])
df.columns=['text','target']
df.head(2), df.tail(2)

(                                                text  target
 0  @first_timee хоть я и школота, но поверь, у на...       1
 1  Да, все-таки он немного похож на него. Но мой ...       1,
                                                      text  target
 226832  RT @_Them__: @LisaBeroud Тауриэль, не грусти :...      -1
 226833  Такси везет меня на работу. Раздумываю приплат...      -1)

In [5]:
df.target = df.target.map({-1:0, 1:1})
y = df.target.values
len(df), df.groupby(['target']).target.count(), df.groupby(['target']).target.count()/len(df)

(226834, target
 0    111923
 1    114911
 Name: target, dtype: int64, target
 0    0.493414
 1    0.506586
 Name: target, dtype: float64)

## Baseline
TF-IDF + Logistic regression with default hyperparameters

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

np.random.seed(42)

In [7]:
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(df.text)
x.shape

(226834, 294600)

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
clf = LogisticRegression()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(f1_score(y_pred, y_test))

0.7690796714926763


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [9]:
f1_scores = []
for seed in np.random.randint(1000, size=(5)):
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=seed)
    clf = LogisticRegression(
                          random_state=42, 
                          solver = 'saga', 
                          max_iter = 1000,
                          verbose=0,
                          )
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print('  ', f1_score(y_pred, y_test))
    f1_scores += [f1_score(y_pred, y_test)]

print('TF-IDF LR baseline f1-score:', np.round(np.mean(f1_scores),3),'\xB1' ,np.round(np.std(f1_scores),3))

   0.7705563345778919
   0.7648230013835927
   0.7705309509695994
   0.7693922464872185
   0.7692567567567568
TF-IDF LR baseline f1-score: 0.769 ± 0.002


##Regularization

In [10]:
scores = []
for C in [0.01,0.1,0.5,1,5,10,100]:
    f1_scores = []
    print('C:', C)
    for seed in np.random.randint(1000, size=(5)):
        x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=seed)
        clf = LogisticRegression(
                              C=C,
                              random_state=42, 
                              solver = 'saga', 
                              max_iter = 1000,
                              verbose=0,
                              )
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        print('  ', f1_score(y_pred, y_test))
        f1_scores += [f1_score(y_pred, y_test)]
    print('F1 score:', np.round(np.mean(f1_scores),3),'\xB1' ,np.round(np.std(f1_scores),3))
    scores += [(C, np.mean(f1_scores),np.std(f1_scores))]

C: 0.01
   0.693670388683323
   0.6945687634223513
   0.6945929107781094
   0.6949291256097161
   0.6937564541773895
F1 score: 0.694 ± 0.0
C: 0.1
   0.7368280512024565
   0.7316740652069097
   0.7323295482967336
   0.7365732316065375
   0.7368438588015792
F1 score: 0.735 ± 0.002
C: 0.5
   0.7642816410049982
   0.7632759027613877
   0.7641210495431302
   0.763571620532115
   0.76195242384813
F1 score: 0.763 ± 0.001
C: 1
   0.7668255416962445
   0.7666972570612829
   0.7727103939506464
   0.7679401965261218
   0.7708055785089124
F1 score: 0.769 ± 0.002
C: 5
   0.7752841737102885
   0.7718871094955592
   0.7740799617113651
   0.7752830028087496
   0.7698640004116003
F1 score: 0.773 ± 0.002
C: 10
   0.7686231411203077
   0.7683050179150024
   0.7705259533442369
   0.765165663692329
   0.7654180855088915
F1 score: 0.768 ± 0.002
C: 100
   0.7503626671327449
   0.7488362398388105
   0.7453335668009105
   0.7475110909281448
   0.749751269832958
F1 score: 0.748 ± 0.002


In [11]:
for i in sorted(scores, key=lambda x: x[1])[::-1]:
    print(i[:2])
best_C = sorted(scores, key=lambda x: x[1])[-1][0]
print('Best C LR hyperparemeter:', best_C)

(5, 0.7732796496275125)
(1, 0.7689957935486416)
(10, 0.7676075723161534)
(0.5, 0.7634405275379523)
(100, 0.7483589669067138)
(0.1, 0.7348497510228433)
(0.01, 0.6943035285341779)
Best C LR hyperparemeter: 5


## Word N-grams

In [12]:
for word_ngram in [1,2,3]:
    print('Number of word N-grams:', word_ngram)
    f1_scores = []
    vectorizer = TfidfVectorizer(
        ngram_range=(1,word_ngram),
    )
    x = vectorizer.fit_transform(df.text)

    for seed in np.random.randint(1000, size=(5)):
        x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=seed)
        clf = LogisticRegression(
                              C=best_C,
                              random_state=42, 
                              solver = 'saga', 
                              max_iter = 1000,
                              verbose=0,
                              )
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        print(' ', f1_score(y_pred, y_test))
        f1_scores += [f1_score(y_pred, y_test)]
    print('F1 score:', np.round(np.mean(f1_scores),3),'\xB1' ,np.round(np.std(f1_scores),3))

Number of word N-grams: 1
  0.7759486321254141
  0.7740300654042394
  0.7736249443816955
  0.7734346910473059
  0.771822200897906
F1 score: 0.774 ± 0.001
Number of word N-grams: 2
  0.7871075166508089
  0.7875730127825277
  0.7898236092265943
  0.7885468522469438
  0.7888347031340585
F1 score: 0.788 ± 0.001
Number of word N-grams: 3
  0.786294750713815
  0.7880454031809213
  0.7906647807637907
  0.7850919487726065
  0.7864902365562286
F1 score: 0.787 ± 0.002


## EDA

In [13]:
np.random.choice(df.text.values,30)

array(['@bulygin499 действительно неуч. Алексей Булыгин внес больший вклад в изучение истории, чем профессор Йельского университета)',
       'Всего одну ночь не было столько всего пропустила(((',
       'Оказывается, даже в государственных учереждениях есть добрые понимающие люди)',
       '@_nat_borisova_ у вас тоже пофоткали его с ректором и повесят во всех корпусах лет на 20?)',
       'И кто теперь первый в заветном списке?\nА вот Евгеша теперь первая :D\n:З',
       '@owl_bazzinga ну мне тот больше выпуск нравится))особенно рок-н-рольщик!))ну он везде хорош)',
       'Не плаааачь \n"@ZazEAGLE: БРО УЕЗЖАЕТ:(((((((("',
       '@Maria__Way твой последний выпуск был просто великолепен, так что, да, я твоя поклонница:D http://t.co/OrTxAGEJtc',
       'Кстати. Если я поел — все, я не работник :( думать не але, писать код — не але. Могу читать ньюсы, фтыкать и играть во чт',
       'RT @ruslp: в этом весь Ваня)\n#NoizeMC #нойзерфолловьнойзера http://t.co/pbQ7as70T5',
       '@naaaaaaask

In [0]:
import re

replaces = [
 ['[:;][(]+',' <NSMILE> '],
 ['[:;]-[(]+',' <NSMILE> '],
 ['\([(]+',' <NSMILE> '], 
 ['[:;][)]+',' <PSMILE> '], 
 ['[:;]-[)]+',' <PSMILE> '], 
 ['\)[)]+',' <PSMILE> '], 
 ['@[_*:,А-Я,а-я,A-Z,a-z,0-9]+\s',' <UNAME> '],
 ['http:[\.,/,0-9,a-z,A-Z]+[\s|$]?', ' <HTTP> '] 
]

def add_tokens(s):
    for l, r in replaces:
        s = re.sub(l, r, s)
    return s 

In [15]:
df.iloc[813].text, add_tokens(df.iloc[813].text)

('Счастливый обладатель iphone 5c :-) http://t.co/YqZnfGr4yK',
 'Счастливый обладатель iphone 5c  <PSMILE>   <HTTP> ')

In [16]:
df.iloc[224032].text, add_tokens(df.iloc[224032].text)

('@meow_Julia_1D_  Всё равно, я переживаю :((',
 ' <UNAME>  Всё равно, я переживаю  <NSMILE> ')

In [0]:
df['text_with_tokens'] = df.text.apply(add_tokens)

In [18]:
for word_ngram in [1,2,3]:
    print('Number of word N-grams:', word_ngram)
    f1_scores = []
    vectorizer = TfidfVectorizer(
        ngram_range=(1,word_ngram),
    )
    x = vectorizer.fit_transform(df.text_with_tokens)

    for seed in np.random.randint(1000, size=(5)):
        x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=seed)
        clf = LogisticRegression(
                              C=best_C,
                              random_state=42, 
                              solver = 'saga', 
                              max_iter = 100,
                              verbose=0,
                              )
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        print(' ', f1_score(y_pred, y_test))
        f1_scores += [f1_score(y_pred, y_test)]
    print('F1 score:', np.round(np.mean(f1_scores),3),'\xB1' ,np.round(np.std(f1_scores),3))

Number of word N-grams: 1
  0.894570418843741
  0.8940512227475257
  0.8938711969948222
  0.8934290260333797
  0.891794470704254
F1 score: 0.894 ± 0.001
Number of word N-grams: 2
  0.9001922622862347
  0.9002960172228203
  0.9010826440723555
  0.9023351003153729
  0.901914520695699
F1 score: 0.901 ± 0.001
Number of word N-grams: 3
  0.9011509591326106
  0.8990746288529392
  0.9010912003203524
  0.9036575823663575
  0.9006014032743067
F1 score: 0.901 ± 0.001
