

---

# Task 1
---



In [1]:
import spacy
import pandas as pd
from tqdm import tqdm
from spacy.tokens import DocBin

### Read the dataset

In [None]:
# Labelled data: discard the `idx` column and keep only the review text and
# its sentiment label, in that order.
train_df = pd.read_csv('train.csv').drop(columns='idx')[['Text', 'Score']]

# Data from test.csv: only the `idx` column is removed.
test_df = pd.read_csv('test.csv').drop(columns='idx')

### Load the model

In [None]:
!python -m spacy download ru_core_news_sm

nlp = spacy.load("ru_core_news_sm")

### Split dataset and create .spacy files

In [None]:
# Share of the labelled rows that goes into the training split; the remainder
# is held out.
TRAIN_FRACTION = 0.75

# Turn every DataFrame row into a plain tuple. Walking the underlying array
# once avoids a separate `.iloc[i]` lookup per row, which is slow on large frames.
data = [tuple(row) for row in train_df.values]
test = [tuple(row) for row in test_df.values]

# The split follows file order; rows are not shuffled here.
dividing_point = int(len(data) * TRAIN_FRACTION)

train_data = data[:dividing_point]
# NOTE: despite its name, `test_data` is the held-out tail of the labelled
# data (it is written to valid.spacy later). Rows from test.csv live in `test`.
test_data = data[dividing_point:]

In [7]:
def make_docs(data):
    """
    Convert (text, label) pairs into spaCy Docs carrying textcat labels.

    Uses the module-level `nlp` pipeline and wraps the iteration in `tqdm`
    for a progress bar.

    data: list of (text, label) tuples; `len(data)` is used as the
          progress-bar total, so it must be a sized collection.
    returns: list of Doc objects, in input order, each with
             doc.cats["Positive"] and doc.cats["Negative"] set to 1 when
             the label equals that name and 0 otherwise.
    """
    annotated = []
    # Batch processing through nlp.pipe is much faster than calling nlp(text)
    # one text at a time. With as_tuples=True the first tuple element is
    # parsed as text and the second is handed back untouched next to the Doc.
    parsed_pairs = nlp.pipe(data, as_tuples=True)
    for parsed, sentiment in tqdm(parsed_pairs, total = len(data)):
        # One 0/1 flag per category; a label matching neither leaves both at 0.
        parsed.cats.update({
            "Positive": int(sentiment == 'Positive'),
            "Negative": int(sentiment == 'Negative'),
        })
        annotated.append(parsed)
    return annotated

In [None]:
def _save_split(rows, path):
    """Build Docs from (text, label) rows, store them as a DocBin at `path`,
    and return (docs, doc_bin)."""
    docs = make_docs(rows)
    packed = DocBin(docs=docs)
    packed.to_disk(path)
    return docs, packed


# Training portion first, then the held-out portion (`test_data`), which is
# stored as the validation set.
train_docs, doc_bin = _save_split(train_data, "train.spacy")
valid_docs, doc_bin = _save_split(test_data, "valid.spacy")

### Make a config

In [3]:
# Expand base_config.cfg into a complete training config and write it to
# config.cfg. base_config.cfg is read from the current working directory,
# so it has to exist before this cell is run.
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


### Train our model :)

In [12]:
# Train the pipeline described by config.cfg on train.spacy, evaluating on
# valid.spacy, and save the results under ./output. `--gpu-id 0` requests
# GPU 0 for training.
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./valid.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'textcat'][0m
[38;5;4mℹ Initial learn rate: 0.0001[0m
E    #       LOSS TOK2VEC  LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ------------  ----------  ------
  0       0          0.00          0.25       33.85    0.34
  0     200          0.29         52.30       34.40    0.34
  0     400          0.67         52.35       47.91    0.48
  0     600          1.35         54.76       42.08    0.42
  0     800          3.01         53.81       59.53    0.60
  0    1000         11.32         55.71       55.43    0.55
  0    1200         10.46         59.86       60.23    0.60
  0    1400         22.63         63.18       54.90    0.55
  0    1600         22.51         59.02       56.58    0.57
  0    1800         22.49         56.78       68.90    0.69
  0    2000         54.02         50.72       6

### Inference

In [19]:
# NOTE(review): every line of this evaluation cell is commented out, so a
# top-to-bottom run of the notebook skips it. The "Current position" log
# stored under this cell presumably comes from an earlier run in which the
# code was still active -- confirm before relying on it.
# While the cell stays disabled, `nlp` is not rebound to output/model-best
# here and keeps whatever value the earlier cells assigned to it.
#
# What the disabled code does once re-enabled: load output/model-best into
# `nlp`, run it on every (text, label) pair in `test_data`, count a hit when
# the score of the true label ('Positive' or 'Negative') exceeds 0.5, and
# print hits / total.
#
# # load the best model from training
# nlp = spacy.load("output/model-best")
# 
# correct_answers = 0
# all_answers = len(test_data)
# 
# for i, string in enumerate(test_data):
#     
#     if i % 100 == 0:
#         print('Current position: {} / {}'.format(i, all_answers))
# 
#     output = nlp(string[0])
# 
#     if (output.cats['Positive'] > 0.5 and string[1] == 'Positive') or (output.cats['Negative'] > 0.5 and string[1] == 'Negative'):
#         correct_answers += 1
#         
# print(correct_answers / all_answers)

Current position: 0 / 3500
Current position: 100 / 3500
Current position: 200 / 3500
Current position: 300 / 3500
Current position: 400 / 3500
Current position: 500 / 3500
Current position: 600 / 3500
Current position: 700 / 3500
Current position: 800 / 3500
Current position: 900 / 3500
Current position: 1000 / 3500
Current position: 1100 / 3500
Current position: 1200 / 3500
Current position: 1300 / 3500
Current position: 1400 / 3500
Current position: 1500 / 3500
Current position: 1600 / 3500
Current position: 1700 / 3500
Current position: 1800 / 3500
Current position: 1900 / 3500
Current position: 2000 / 3500
Current position: 2100 / 3500
Current position: 2200 / 3500
Current position: 2300 / 3500
Current position: 2400 / 3500
Current position: 2500 / 3500
Current position: 2600 / 3500
Current position: 2700 / 3500
Current position: 2800 / 3500
Current position: 2900 / 3500
Current position: 3000 / 3500
Current position: 3100 / 3500
Current position: 3200 / 3500
Current position: 3300

In [None]:
# Load the trained classifier explicitly. The cell above that would rebind
# `nlp` to output/model-best is commented out, so `nlp` may still be the base
# ru_core_news_sm pipeline, whose docs carry no 'Positive'/'Negative' scores.
# A separate name is used so the `nlp` used by make_docs() is left untouched.
sentiment_nlp = spacy.load("output/model-best")

print("type : ‘quit’ to exit")
# predict the sentiment until someone writes quit
while True:
    text = input("Please enter example input: ")
    if text == "quit":
        # Leave before classifying the sentinel word itself.
        break
    doc = sentiment_nlp(text)
    # Show the raw category scores, then the thresholded verdict.
    print(doc.cats)
    if doc.cats['Positive'] > .5:
        print("the sentiment is positive")
    else:
        print("the sentiment is negative")