In [0]:
import numpy as np
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score 

In [0]:
!wget https://www.dropbox.com/s/r6u59ljhhjdg6j0/negative.csv
!wget https://www.dropbox.com/s/fnpq3z4bcnoktiv/positive.csv

In [0]:
df = pd.concat((pd.read_csv(f, sep=';', header=None) for f in ['positive.csv', 'negative.csv']), ignore_index=True)

In [0]:
df = df.drop(columns=[0,1,2,5,6,7,8,9,10,11])
df.columns=['text','target']
df.target = df.target.map({-1:0, 1:1})
y = df.target.values

In [0]:
replaces = [
 [':[(]+',' <NSMILE> '],
 [':-[(]+',' <NSMILE> '],
 ['\([(]+',' <NSMILE> '], 
 [':[)]+',' <PSMILE> '], 
 [':-[)]+',' <PSMILE> '], 
 ['\)[)]+',' <PSMILE> '], 
 ['@[_*,А-Я,а-я,A-Z,a-z,0-9]+\s',' <UNAME> '],
 ['http:[\.,/,0-9,a-z,A-Z]+[\s|$]?', ' <HTTP> '] 
]

def add_tokens(s):
    for l, r in replaces:
        s = re.sub(l, r, s)
    return s 

In [0]:
df['text_with_tokens'] = df.text.apply(add_tokens)

vectorizer = TfidfVectorizer(
    ngram_range=(1,1),
)
x = vectorizer.fit_transform(df.text_with_tokens)

In [7]:
np.random.seed(42)

f1_scores = []
for seed in np.random.randint(1000, size=(5)):
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=seed)
    clf = LogisticRegression(
                          C=1, random_state=42, 
                          solver = 'saga', 
                          max_iter = 100,
                          verbose=0,
                          )
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(f1_score(y_pred, y_test))
    f1_scores += [f1_score(y_pred, y_test)]

0.8896255330490406
0.8888851813540659
0.8903436988543372
0.8896438612933458
0.8902013781136859


In [8]:
print('F1 score:', np.round(np.mean(f1_scores),3),'\xB1' ,np.round(np.std(f1_scores),3))

F1 score: 0.89 ± 0.001
