In [130]:
import pandas as pd
import spacy
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
from joblib import dump, load
from typing import List, Any, Dict
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier #, LogisticRegression
#from sklearn.preprocessing import Binarizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_selection import SelectFromModel



nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
df = pd.read_csv('./dataset_for_assignment.csv')
df.head()

Unnamed: 0,comment_text,target
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [3]:
groups = df.groupby(['target']).agg({'target': 'count'}).to_dict('index')
print(groups[1]['target'], groups[0]['target'])
groups[1]['target'] / groups[0]['target']

16225 143346


0.1131876717871444

In [4]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;\\n]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def loss_0_1(model, xs: List[str], ys: List[int]):
  # validation on class "0" samples that were not incuded in train process
  predicted = model.predict(xs)
  return np.mean(predicted == ys)
  #print("class 0 recall:", np.mean(predicted == ys))
  #print(metrics.classification_report(y_true=df_other_class_0.target.tolist(), y_pred=predicted))

def clean_text(text: str) -> str:
    '''Clean text'''
       
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text
    
df['comment_text'] = df['comment_text'].apply(clean_text)
df.head()


Unnamed: 0,comment_text,target
0,explanation edits made username hardcore metal...,0
1,daww matches background colour im seemingly st...,0
2,hey man im really trying edit war guy constant...,0
3,cant make real suggestions improvement wondere...,0
4,sir hero chance remember page thats,0


## Dataset Undersampling

---



In [0]:
indexies_class_0 = df[df.target == 0].index.tolist()
indexies_class_1 = df[df.target == 1].index.tolist()
random_indices_class_0 = np.random.choice(indexies_class_0, len(indexies_class_1), replace=False).tolist()

df_balanced = df.loc[random_indices_class_0 + indexies_class_1]
df_other_class_0 = df.loc[~df.index.isin(random_indices_class_0 + indexies_class_1)]
#df_balanced.groupby(['target']).agg({'target': 'count'})

df_train, df_test = train_test_split(df, shuffle=True, test_size=0.15)
df_train_b, df_test_b = train_test_split(df_balanced, shuffle=True, test_size=0.15)

X = df_train.comment_text.tolist()
Y = df_train.target.tolist()
Xb = df_train_b.comment_text.tolist()
Yb = df_train_b.target.tolist()


# def undersample(df: pd.DataFrame, y, enabled):
#   indexies_class_0 = df[df.target == 0].index.tolist()
#   indexies_class_1 = df[df.target == 1].index.tolist()
#   random_indices_class_0 = np.random.choice(indexies_class_0, len(indexies_class_1), replace=False).tolist()
#   df_balanced = df.loc[random_indices_class_0 + indexies_class_1]
#   print('=====df=df=====')
#   return df_balanced.comment_text.tolist()

# def undersample2(X):
#   print("===undersample2===")
# #   indexies_class_0 = df[df.target == 0].index.tolist()
# #   indexies_class_1 = df[df.target == 1].index.tolist()
# #   random_indices_class_0 = np.random.choice(indexies_class_0, len(indexies_class_1), replace=False).tolist()
# #   df_balanced = df.loc[random_indices_class_0 + indexies_class_1]
#   return X
  
def mk_clf_report(
    pipeline, 
    parameters, 
    df_train, 
    df_test, 
    df_other_class_0=None
  ):
  
  X = df_train.comment_text.tolist()
  Y = df_train.target.tolist()
  
  gs = GridSearchCV(pipeline, parameters, cv=5, iid=False, n_jobs=-1)
  gs = gs.fit(X, Y)
  
  # test
  predicted = gs.predict(df_test.comment_text.tolist())
  report = classification_report(
      y_true=df_test.target.tolist(), 
      y_pred=predicted
  )
  if df_other_class_0 is not None:
    loss = loss_0_1(
        gs,
        df_other_class_0.comment_text.tolist(), 
        df_other_class_0.target.tolist()
    )
  else:
    loss = None
    
  return gs, report, loss
  


# Multinomial Naive Bayes


---



In [103]:
ppl = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', MultinomialNB(fit_prior=False)),
])

parameters = {
  'tfidf__use_idf': [True, False],
  'clf__fit_prior': [True, False]
}


              precision    recall  f1-score   support

           0       0.94      0.99      0.96     21409
           1       0.84      0.45      0.59      2527

   micro avg       0.93      0.93      0.93     23936
   macro avg       0.89      0.72      0.78     23936
weighted avg       0.93      0.93      0.92     23936



###  Imbalanced dataset

In [0]:
gs, report, loss = mk_clf_report(ppl, parameters, df_train, df_test)
print(report)
print(f'Best params:', {gs.best_params_})

### Balanced dataset

In [108]:
gs, report, loss = mk_clf_report(ppl, parameters, df_train_b, df_test_b, df_other_class_0)
print(report)
print('Best params:', gs.best_params_)
print('0-1 loss for sample of only class "0":', loss)

              precision    recall  f1-score   support

           0       0.88      0.90      0.89      2489
           1       0.89      0.87      0.88      2379

   micro avg       0.88      0.88      0.88      4868
   macro avg       0.88      0.88      0.88      4868
weighted avg       0.88      0.88      0.88      4868

Best params: {'tfidf__use_idf': True}
0-1 loss for samples of class "0": 0.8949032811258565


# Bernuolli Naive Bayes
---

In [0]:
ppl = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', BernoulliNB(binarize=0.0, fit_prior=True)),
])

parameters = {
  'tfidf__use_idf': [True, False],
  'clf__fit_prior': [True, False]
}



###  Imbalanced dataset

In [110]:
gs, report, loss = mk_clf_report(ppl, parameters, df_train, df_test)
print(report)
print('Best params:', gs.best_params_)

              precision    recall  f1-score   support

           0       0.95      0.99      0.97     21399
           1       0.89      0.55      0.68      2537

   micro avg       0.94      0.94      0.94     23936
   macro avg       0.92      0.77      0.82     23936
weighted avg       0.94      0.94      0.94     23936



TypeError: ignored

###  Balanced dataset

In [111]:
gs, report, loss = mk_clf_report(ppl, parameters, df_train_b, df_test_b, df_other_class_0)
print(report)
print('Best params:', gs.best_params_)
print('0-1 loss for sample of only class "0":', loss)

              precision    recall  f1-score   support

           0       0.87      0.52      0.65      2489
           1       0.65      0.92      0.76      2379

   micro avg       0.72      0.72      0.72      4868
   macro avg       0.76      0.72      0.71      4868
weighted avg       0.76      0.72      0.71      4868

Best params: {'tfidf__use_idf': True}
0-1 loss for sample of only class "0": 0.5209052792221585


# SVM & Logistic Regression
---

In [0]:
ppl = Pipeline([
    ('l0', CountVectorizer()),
    ('l1', TfidfTransformer(use_idf=True)),
    ('l3', SGDClassifier(
        loss='hinge', 
        penalty='l2', 
        alpha=1e-5, 
        random_state=17, 
        max_iter=10, 
        tol=None,
        early_stopping=True,
        #validation_fraction=0.2
    )),
])
parameters = {
  'l1__use_idf': [True],
  'l3__alpha': [1e-5, 1e-4],
  'l3__loss': ['hinge', 'log'],
  'l3__penalty': ['l2', 'l1']
}


###  Imbalanced dataset

In [156]:
gs, report, loss = mk_clf_report(ppl, parameters, df_train, df_test)
print(report)
print('Best params:', gs.best_params_)
dump(gs.best_estimator_, 'SGDClassifier.joblib')



              precision    recall  f1-score   support

           0       0.96      0.99      0.98     21473
           1       0.91      0.66      0.77      2463

   micro avg       0.96      0.96      0.96     23936
   macro avg       0.94      0.83      0.87     23936
weighted avg       0.96      0.96      0.96     23936

Best params: {'l1__use_idf': True, 'l3__alpha': 1e-05, 'l3__loss': 'hinge', 'l3__penalty': 'l2'}


['SGDClassifier.joblib']

###  Balanced dataset

In [157]:
gs, report, loss = mk_clf_report(ppl, parameters, df_train_b, df_test_b, df_other_class_0)
print(report)
print('Best params:', gs.best_params_)
print('0-1 loss for sample of only class "0":', loss)

dump(gs.best_estimator_, 'SGDClassifier_balanced.joblib')




              precision    recall  f1-score   support

           0       0.89      0.93      0.91      2436
           1       0.93      0.88      0.90      2432

   micro avg       0.90      0.90      0.90      4868
   macro avg       0.91      0.90      0.90      4868
weighted avg       0.91      0.90      0.90      4868

Best params: {'l1__use_idf': True, 'l3__alpha': 1e-05, 'l3__loss': 'log', 'l3__penalty': 'l1'}
0-1 loss for sample of only class "0": 0.9271638832293642


['SGDClassifier_balanced.joblib']

In [139]:
model = load('SGDClassifier.joblib')
model.predict(['hello from planet earth'])


  (0, 160734)	0.4679595972134574
  (0, 98927)	0.297785570592945
  (0, 87851)	0.7215564558847756
  (0, 72885)	0.4143595664316861

   (0, 160734)	0.4679595972134574
  (0, 98927)	0.297785570592945
  (0, 87851)	0.7215564558847756
  (0, 72885)	0.4143595664316861


array([0])

In [152]:
model = load('SGDClassifier.joblib')
model.predict(['love me'])

array([0])