In [None]:
!pip3 install sklearn
!pip3 install nltk
!pip3 install gensim
!pip3 install pandas

In [None]:
import pandas as pd
import re
import nltk
import numpy as np
nltk.download("stopwords")
from nltk.corpus import stopwords
from string import punctuation
from sklearn.model_selection import train_test_split

In [8]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,deputy,speech,fraction
0,Ю. Н. Афанасьев,уважаемый товарищ депутат призывать критично о...,Inter_regional_Deputies_Group
1,Ю. Н. Афанасьев,александр яковлевич много месяц назад группа н...,Inter_regional_Deputies_Group
2,Попов Г. X.,товарищ вчера полный демократический обстановк...,Inter_regional_Deputies_Group
3,Попов Г. X.,протестовать против три минутный выступление у...,Inter_regional_Deputies_Group
4,Попов Г. X.,товарищ прежде обвинение адрес михаил сергееви...,Inter_regional_Deputies_Group


In [9]:
russian_stopwords = stopwords.words("russian")

In [10]:
train['speech_clean'] = train['speech'].map(lambda x: [token for token in x.split(' ') if token.strip() not in russian_stopwords\
                                                                  and token != " " \
                                                                  and token.strip() not in punctuation])
train['speech_clean'] = train['speech_clean'].map(lambda x: ' '.join(x))

In [11]:
train.head()

Unnamed: 0,deputy,speech,fraction,speech_clean
0,Ю. Н. Афанасьев,уважаемый товарищ депутат призывать критично о...,Inter_regional_Deputies_Group,уважаемый товарищ депутат призывать критично о...
1,Ю. Н. Афанасьев,александр яковлевич много месяц назад группа н...,Inter_regional_Deputies_Group,александр яковлевич месяц назад группа народны...
2,Попов Г. X.,товарищ вчера полный демократический обстановк...,Inter_regional_Deputies_Group,товарищ вчера полный демократический обстановк...
3,Попов Г. X.,протестовать против три минутный выступление у...,Inter_regional_Deputies_Group,протестовать против минутный выступление утвер...
4,Попов Г. X.,товарищ прежде обвинение адрес михаил сергееви...,Inter_regional_Deputies_Group,товарищ прежде обвинение адрес михаил сергееви...


# 

In [12]:
X_train, X_test, y_train, y_test = train_test_split(train['speech'], train['fraction'], test_size=0.25, random_state=42)
print(X_train.shape, X_test.shape)

(132,) (44,)


# SDG

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [24]:
# without params
sgd_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode')),
    ('sgd_clf', SGDClassifier(random_state=42))
])
sgd_ppl_clf.fit(X_train, y_train)
predicted_sgd = sgd_ppl_clf.predict(X_test)
print(metrics.classification_report(predicted_sgd, y_test))

                               precision    recall  f1-score   support

               CSPU_Officials       0.10      1.00      0.18         1
Inter_regional_Deputies_Group       1.00      0.79      0.88        43

                     accuracy                           0.80        44
                    macro avg       0.55      0.90      0.53        44
                 weighted avg       0.98      0.80      0.87        44



In [15]:
# grid search
parameters = { 
                'sgd_clf__loss':['hinge', 'log', 'modified_huber', 'squared_hinge'],
                'sgd_clf__class_weight':[None, 'balanced'],
                'sgd_clf__penalty':['l2', 'l1', 'elasticnet'],
                'tfidf__ngram_range': ((1, 1), (1,3), (1,5), (2,2)),
                'tfidf__norm': ('l1', 'l2'),
              }
model = GridSearchCV(sgd_ppl_clf, parameters, n_jobs=-1, verbose=2)
model.fit(X_train, y_train)
print('Best score and parameter combination:')
print(model.best_score_, model.best_params_) 

predicted_sgd = model.predict(X_test)
print(metrics.classification_report(predicted_sgd, y_test))

Fitting 5 folds for each of 192 candidates, totalling 960 fits
Best score and parameter combination:
0.8492877492877492 {'sgd_clf__class_weight': None, 'sgd_clf__loss': 'hinge', 'sgd_clf__penalty': 'l1', 'tfidf__ngram_range': (1, 3), 'tfidf__norm': 'l2'}
                               precision    recall  f1-score   support

               CSPU_Officials       0.60      0.86      0.71         7
Inter_regional_Deputies_Group       0.97      0.89      0.93        37

                     accuracy                           0.89        44
                    macro avg       0.79      0.87      0.82        44
                 weighted avg       0.91      0.89      0.89        44



In [16]:
# best
sgd_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', ngram_range=(1, 3), norm='l2')),
    ('sgd_clf', SGDClassifier(random_state=42, class_weight=None, loss='hinge', penalty='l1'))
])
sgd_ppl_clf.fit(X_train, y_train)
predicted_sgd = sgd_ppl_clf.predict(X_test)
print(metrics.classification_report(predicted_sgd, y_test))

                               precision    recall  f1-score   support

               CSPU_Officials       0.60      0.86      0.71         7
Inter_regional_Deputies_Group       0.97      0.89      0.93        37

                     accuracy                           0.89        44
                    macro avg       0.79      0.87      0.82        44
                 weighted avg       0.91      0.89      0.89        44



# KNN

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [22]:
# without params
knb_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', norm='l2')),
    ('knb_clf', KNeighborsClassifier())
])
knb_ppl_clf.fit(X_train, y_train)
predicted_knn = knb_ppl_clf.predict(X_test)
print(metrics.classification_report(predicted_knn, y_test))

                               precision    recall  f1-score   support

               CSPU_Officials       0.40      0.67      0.50         6
Inter_regional_Deputies_Group       0.94      0.84      0.89        38

                     accuracy                           0.82        44
                    macro avg       0.67      0.75      0.69        44
                 weighted avg       0.87      0.82      0.84        44



In [19]:
# grid search
parameters = { 
                'knb_clf__n_neighbors':[5,7,10,15],
                'knb_clf__weights':['uniform', 'distance'],
               'knb_clf__algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
                'knb_clf__leaf_size':[25,29,30,31],
                'tfidf__ngram_range': ((1,3), (2,2)),
              }
model = GridSearchCV(knb_ppl_clf, parameters, n_jobs=-1, verbose=2)
model.fit(X_train, y_train)
print('Best score and parameter combination:')
print(model.best_score_, model.best_params_) 

predicted_knn = model.predict(X_test)
print(metrics.classification_report(predicted_knn, y_test))

Fitting 5 folds for each of 256 candidates, totalling 1280 fits
Best score and parameter combination:
0.8336182336182336 {'knb_clf__algorithm': 'auto', 'knb_clf__leaf_size': 25, 'knb_clf__n_neighbors': 10, 'knb_clf__weights': 'uniform', 'tfidf__ngram_range': (2, 2)}
                               precision    recall  f1-score   support

               CSPU_Officials       0.30      0.75      0.43         4
Inter_regional_Deputies_Group       0.97      0.82      0.89        40

                     accuracy                           0.82        44
                    macro avg       0.64      0.79      0.66        44
                 weighted avg       0.91      0.82      0.85        44



In [23]:
# best
knb_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', norm='l2')),
    ('knb_clf', KNeighborsClassifier())
])
knb_ppl_clf.fit(X_train, y_train)
predicted_knn = knb_ppl_clf.predict(X_test)
print(metrics.classification_report(predicted_knn, y_test))

                               precision    recall  f1-score   support

               CSPU_Officials       0.40      0.67      0.50         6
Inter_regional_Deputies_Group       0.94      0.84      0.89        38

                     accuracy                           0.82        44
                    macro avg       0.67      0.75      0.69        44
                 weighted avg       0.87      0.82      0.84        44



# Best model ML

In [25]:
# best
sgd_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', ngram_range=(1, 3), norm='l2')),
    ('sgd_clf', SGDClassifier(random_state=42, class_weight=None, loss='hinge', penalty='l1'))
])
sgd_ppl_clf.fit(X_train, y_train)
predicted_sgd = sgd_ppl_clf.predict(X_test)
print(metrics.classification_report(predicted_sgd, y_test))

                               precision    recall  f1-score   support

               CSPU_Officials       0.60      0.86      0.71         7
Inter_regional_Deputies_Group       0.97      0.89      0.93        37

                     accuracy                           0.89        44
                    macro avg       0.79      0.87      0.82        44
                 weighted avg       0.91      0.89      0.89        44



# Counting speeches and deputies by faction

In [28]:
#подсчет до исключения строчек "не представился", "голос с места" и председательствующий
(unique, counts) = np.unique(result, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print(frequencies)

[['CSPU_Officials' '181']
 ['Inter_regional_Deputies_Group' '709']]


In [29]:
all_data['fraction'] = result
all_data.head()

Unnamed: 0,deputy,speech,fraction
0,толпежников в. ф.,товарищ прежде начинать заседание просить почт...,Inter_regional_Deputies_Group
1,лукин в.,уважаемый товарищ известно организация подгото...,Inter_regional_Deputies_Group
2,назарбаев н. а.,уважаемый товарищ народный депутат вчера собра...,Inter_regional_Deputies_Group
3,сахаров а. д.,уважаемый депутат хотеть выступать защита два ...,Inter_regional_Deputies_Group
4,председательствующий.,минута товарищ думать заключительный этап дово...,CSPU_Officials


In [30]:
distinct = all_data.groupby(['deputy','fraction']).size().reset_index().rename(columns={0:'count'})

In [31]:
#подсчет до исключения строчек "не представился", "голос с места" и председательствующий
cspu = len(distinct[distinct['fraction']=='CSPU_Officials'])
inter_regional = len(distinct[distinct['fraction']=='Inter_regional_Deputies_Group'])
print(cspu, ' ', inter_regional)

108   327
