

---

# Task 1
---



In [1]:
import spacy
import pandas as pd
from tqdm import tqdm
from spacy.tokens import DocBin

### Read the dataset

In [2]:
train_df = pd.read_csv('train.csv')
train_df = train_df.drop('idx', axis=1)
train_df = train_df[['Text', 'Score']]

test_df = pd.read_csv('test.csv')
test_df = test_df.drop('idx', axis=1)

### Load the model

In [3]:
!python -m spacy download ru_core_news_sm

nlp = spacy.load("ru_core_news_sm")

Collecting ru-core-news-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.6.0/ru_core_news_sm-3.6.0-py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_sm')


### Split dataset and create .spacy files

In [4]:
data = [tuple(train_df.iloc[i].values) for i in range(train_df.shape[0])]
test = [tuple(test_df.iloc[i].values) for i in range(test_df.shape[0])]

dividing_point = int(len(data) * 0.75)

train_data = data[:dividing_point]
test_data = data[dividing_point:]

In [7]:
def make_docs(data):
    """
    this will take a list of texts and labels
    and transform them in spacy documents
    data: list(tuple(text, label))
    returns: List(spacy.Doc.doc)
    """
    docs = []
    # nlp.pipe([texts]) is way faster than running
    # nlp(text) for each text
    # as_tuples allows us to pass in a tuple,
    # the first one is treated as text
    # the second one will get returned as it is.
    # a = tqdm(nlp.pipe(data, as_tuples=True), total = len(data))
    for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total = len(data)):
        doc.cats["Positive"] = int(label == 'Positive')
        doc.cats["Negative"] = int(label == 'Negative')
        
        # we need to set the (text)cat(egory) for each document
        #doc.cats["positive"] = label
        # put them into a nice list
        docs.append(doc)
    return docs

In [None]:
train_docs = make_docs(train_data)
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("train.spacy") 

valid_docs = make_docs(test_data)
doc_bin = DocBin(docs=valid_docs)
doc_bin.to_disk("valid.spacy")

### Make a config

In [3]:
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


### Train our model :)

In [6]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./valid.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.25       33.07    0.33
  0     200         49.02       34.05    0.34
  0     400         44.90       64.82    0.65
  0     600         29.35       51.17    0.51
  0     800         29.25       88.11    0.88
  0    1000         25.54       86.00    0.86
  0    1200         24.72       89.31    0.89
  0    1400         20.83       79.74    0.80
  0    1600         25.73       86.86    0.87
  0    1800         19.67       90.75    0.91
  0    2000         17.80       89.69    0.90
  0    2200         14.68       90.72    0.91
  0    2400         15.64       90.82    0.91
  0    2600         17.52       91.51    0.92
  0    2800         14.86       91.97  

### Inference

In [7]:
# load the best model from training
nlp = spacy.load("output/model-best")

correct_answers = 0
all_answers = len(test_data)

for i, string in enumerate(test_data):

    if i % 100 == 0:
        print('Current position: {} / {}'.format(i, all_answers))

    output = nlp(string[0])

    if (output.cats['Positive'] > 0.5 and string[1] == 'Positive') or (output.cats['Negative'] > 0.5 and string[1] == 'Negative'):
        correct_answers += 1

print(correct_answers / all_answers)

Current position: 0 / 3500
Current position: 100 / 3500
Current position: 200 / 3500
Current position: 300 / 3500
Current position: 400 / 3500
Current position: 500 / 3500
Current position: 600 / 3500
Current position: 700 / 3500
Current position: 800 / 3500
Current position: 900 / 3500
Current position: 1000 / 3500
Current position: 1100 / 3500
Current position: 1200 / 3500
Current position: 1300 / 3500
Current position: 1400 / 3500
Current position: 1500 / 3500
Current position: 1600 / 3500
Current position: 1700 / 3500
Current position: 1800 / 3500
Current position: 1900 / 3500
Current position: 2000 / 3500
Current position: 2100 / 3500
Current position: 2200 / 3500
Current position: 2300 / 3500
Current position: 2400 / 3500
Current position: 2500 / 3500
Current position: 2600 / 3500
Current position: 2700 / 3500
Current position: 2800 / 3500
Current position: 2900 / 3500
Current position: 3000 / 3500
Current position: 3100 / 3500
Current position: 3200 / 3500
Current position: 3300

In [8]:
text = ""
print("type : ‘quit’ to exit")
# predict the sentiment until someone writes quit
while text != "quit":
    text = input("Please enter example input: ")
    doc = nlp(text)
    print(doc.cats)
    if doc.cats['Positive'] >.5:
        print(f"the sentiment is positive")
    else:
        print(f"the sentiment is negative")

type : ‘quit’ to exit
{'Positive': 0.011917437426745892, 'Negative': 0.9880826473236084}
the sentiment is negative
{'Positive': 0.9015288949012756, 'Negative': 0.09847111999988556}
the sentiment is positive
{'Positive': 0.0, 'Negative': 0.0}
the sentiment is negative
{'Positive': 0.0, 'Negative': 0.0}
the sentiment is negative
{'Positive': 0.0, 'Negative': 0.0}
the sentiment is negative
{'Positive': 1.0, 'Negative': 2.4505830522706374e-19}
the sentiment is positive
