In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
with open('drive/MyDrive/Diplom/train3.csv') as f:
    train = pd.read_csv(f,sep='\t')
with open('drive/MyDrive/Diplom/test3.csv') as f:
    test = pd.read_csv(f,sep='\t')

In [7]:
with open('drive/MyDrive/Diplom/train2.csv') as f:
    train0 = pd.read_csv(f,sep='\t')
with open('drive/MyDrive/Diplom/test2.csv') as f:
    test0 = pd.read_csv(f,sep='\t')

In [8]:
import re

pattern = '[^a-zA-Z0-9\t\n\r\f\v]'

def num_punct_symbols(text):
  return(len(re.findall(pattern, text)))

pattern2 = ["<3", "♥",">:D", ":-D", ":D", "=-D", "=D", "X-D", "x-D", "XD", "xD", "8-D", ">:P", ":-P", ":P", ":-p", ":p", ":-b", ":b", ":c\)", ":o\)", ":^\)", ">:\)", ":-\)", ":\)", "=\)", "=]", ":]", ":}", ":>", ":3", "8\)", "8-\)", ">;]", ";-\)", ";\)", ";-]", ";]", ";D", ";^\)", "\*-\)", "\*\)", ">:o", ":-O", ":O", ":o", ":-o", "o_O", "o.O", "°O°", "°o°", ">:/",  ":-/", ":/", ":-.", ":-s", ":s", ":S", ":-S", ">.>", ">:\[", ":-\(", ":\(", "=\(", ":-\[", ":\[", ":{", ":-<", ":c", ":-c", "=/",":'\(", ":'''\(", ";'\(","\(\(\(","\)\)\)"]

def num_smiles(text):
  sum = 0
  for i in pattern2:
    sum += len(re.findall(i, text))
  return sum

train['punct_symbols_frac'] = train0['body'].apply(num_punct_symbols)/train0['body_length']
test['punct_symbols_frac'] = test0['body'].apply(num_punct_symbols)/test0['body_length']

train['smile'] = train0['body'].apply(num_smiles)
test['smile'] = test0['body'].apply(num_smiles)

In [13]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, balanced_accuracy_score, log_loss, roc_auc_score
from sklearn.pipeline import Pipeline

def metrics(y_test, y_pred, y_pred_proba):
  
  conf = confusion_matrix(y_test, y_pred)
  print(conf)
  f1_ma = round(f1_score(y_test, y_pred, average='macro'), 4)
  f1_mi = round(f1_score(y_test, y_pred, average='micro'), 4)
  f1_we = round(f1_score(y_test, y_pred, average='weighted'), 4)
  b_acc = round(balanced_accuracy_score(y_test, y_pred), 4)
  neg_ll = round(- log_loss(y_test, y_pred_proba), 4)
  roc_auc_ovr = round(roc_auc_score(y_test, y_pred_proba, average='weighted', multi_class = 'ovr'), 4)
  roc_auc_ovo = round(roc_auc_score(y_test, y_pred_proba, average='weighted', multi_class = 'ovo'), 4)

  print('f1_score_macro:', f1_ma)
  print('f1_score_micro:', f1_mi)
  print('f1_score_weighted:', f1_we)
  print('balanced_accuracy_score:', b_acc)
  print('neg_log_loss:', neg_ll)
  print('roc_auc_score ovr:', roc_auc_ovr)
  print('roc_auc_score ovo:', roc_auc_ovo)

  return pd.DataFrame({'features': ['+TF-IDF+pol_and_sub'],'f1_score_macro': [f1_ma], 'f1_score_micro': [f1_mi],'f1_score_weighted': [f1_we], 'balanced_accuracy_score': [b_acc], 'neg_log_loss': [neg_ll],'roc_auc_score ovr': [roc_auc_ovr], 'roc_auc_score ovo': [roc_auc_ovo]})


In [10]:
X_train = train.drop(columns = ['Y', 'Unnamed: 0'])
X_test = test.drop(columns = ['Y', 'Unnamed: 0'])
y_train = train.Y
y_test = test.Y

In [17]:
with open('drive/MyDrive/Diplom/results.csv') as f:
    results = pd.read_csv(f)

#RandomForest

Grid Search

In [None]:
pipel = Pipeline(steps = [('scaler', StandardScaler()), ('rf', RandomForestClassifier())])

param_grid = {
    "rf__max_depth": [5, 15, 30, 45, 60],
    "rf__criterion" : ('gini', 'entropy', 'log_loss'),
    "rf__min_samples_leaf": [5,10,20,50,100],
    "rf__max_features" :('sqrt', 'log2')
}

search = GridSearchCV(pipel, param_grid, n_jobs=2)

search.fit(X_train, y_train)

y_pred = search.predict(X_test)

print(metrics(y_test, y_pred))

print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

[[43675     0  2961]
 [  423     4  1525]
 [ 3295     1  9859]]
              precision    recall  f1-score   support

           0       0.92      0.94      0.93     46636
           1       0.80      0.00      0.00      1952
           2       0.69      0.75      0.72     13155

    accuracy                           0.87     61743
   macro avg       0.80      0.56      0.55     61743
weighted avg       0.87      0.87      0.85     61743

f1_score_macro: 0.5500249312868623
f1_score_micro: 0.8671104416695011
None
Best parameter (CV score=0.909):
{'rf__criterion': 'log_loss', 'rf__max_depth': 60, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 5}


Cross validation для Random forest с набором лучших параметров.

In [16]:
#from sklearn.model_selection import RepeatedStratifiedKFold
#from sklearn.model_selection import cross_val_score
#from sklearn.metrics import classification_report, f1_score, make_scorer
#from sklearn.metrics import balanced_accuracy_score


#def classification_rep(y_true, y_pred):
#    return balanced_accuracy_score(y_true, y_pred)

#my_scorer = make_scorer(classification_rep)

#steps = [('model',RandomForestClassifier(criterion= 'log_loss', max_depth= 60, max_features= 'sqrt', min_samples_leaf= 5))]
#pipeline = Pipeline(steps=steps)

#cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)
#scores = cross_val_score(pipeline, X_train, y_train, cv=cv,scoring = my_scorer, error_score='raise')
#scores.mean()

In [15]:
steps = [('model',RandomForestClassifier(criterion= 'log_loss', max_depth= 60, max_features= 'sqrt', min_samples_leaf= 5))]

pipel = Pipeline(steps = steps)

pipel.fit(X_train, y_train)

y_pred = pipel.predict(X_test)
y_pred_proba = pipel.predict_proba(X_test)

res = metrics(y_test, y_pred, y_pred_proba)

[[43526     0  3110]
 [  370     3  1579]
 [ 3145     0 10010]]
f1_score_macro: 0.5504
f1_score_micro: 0.8671
f1_score_weighted: 0.8551
balanced_accuracy_score: 0.5653
neg_log_loss: -0.316
roc_auc_score ovr: 0.9523
roc_auc_score ovo: 0.9201


In [22]:
results = pd.concat([results, res])

# LogReg

In [24]:
from sklearn.linear_model import LogisticRegression

In [None]:
pipel = Pipeline(steps = [('scaler', StandardScaler()), ('model',LogisticRegression(solver = 'newton-cg',class_weight = 'balanced'))])

C = np.logspace(-2, 2, 10)

param_grid = {
    "model__max_iter" : [5,25,50],
    "model__C" : C
}

search = GridSearchCV(pipel, param_grid, n_jobs=2)

search.fit(X_train, y_train)

y_pred = search.predict(X_test)

print(metrics(y_test, y_pred))
print(search.best_params_)

[[39270  3338  4028]
 [   52  1276   624]
 [ 1876  3953  7326]]
              precision    recall  f1-score   support

           0       0.95      0.84      0.89     46636
           1       0.15      0.65      0.24      1952
           2       0.61      0.56      0.58     13155

    accuracy                           0.78     61743
   macro avg       0.57      0.68      0.57     61743
weighted avg       0.85      0.78      0.81     61743

f1_score_macro: 0.5732579761563533
f1_score_micro: 0.7753429538571174
None
{'model__C': 4.6415888336127775, 'model__max_iter': 50}


In [25]:
pipel = Pipeline(steps = [('scaler', StandardScaler()), ('model',LogisticRegression(C = 4.6415888336127775, max_iter = 10000, solver = 'newton-cg', class_weight = 'balanced'))])

pipel.fit(X_train, y_train)

y_pred = pipel.predict(X_test)
y_pred_proba = pipel.predict_proba(X_test)

res = metrics(y_test, y_pred, y_pred_proba)

[[39270  3338  4028]
 [   52  1276   624]
 [ 1876  3953  7326]]
f1_score_macro: 0.5733
f1_score_micro: 0.7753
f1_score_weighted: 0.8073
balanced_accuracy_score: 0.6842
neg_log_loss: -0.6952
roc_auc_score ovr: 0.9001
roc_auc_score ovo: 0.8888


In [27]:
results = pd.concat([results, res])

In [None]:
#from sklearn.model_selection import RepeatedStratifiedKFold
#from sklearn.model_selection import cross_val_score
#from sklearn.metrics import classification_report, f1_score, make_scorer
#from sklearn.metrics import balanced_accuracy_score


#def classification_report_with_f1_score(y_true, y_pred):
#    return balanced_accuracy_score(y_true, y_pred)

#my_scorer = make_scorer(classification_report_with_f1_score)

#pipel = Pipeline(steps = [('scaler', StandardScaler()), ('model',LogisticRegression(C = 100.0, max_iter = 50, solver = 'newton-cg', class_weight = 'balanced'))])

#cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)
#scores = cross_val_score(pipel, X_train, y_train, cv=cv,scoring = my_scorer, error_score='raise')
#scores.mean()

In [28]:
results

Unnamed: 0,features,f1_score_macro,f1_score_micro,f1_score_weighted,balanced_accuracy_score,neg_log_loss,roc_auc_score ovr,roc_auc_score ovo
0,"ups,score,body_length,count",0.4429,0.6345,0.6987,0.6068,-1.0515,0.7838,0.7873
1,+TF-IDF+pol_and_sub,0.509,0.7135,0.7629,0.6574,-0.8013,0.8654,0.8577
2,+LDA,0.547,0.7514,0.7905,0.6674,-0.7302,0.8852,0.876
0,+TF-IDF+pol_and_sub,0.5504,0.8671,0.8551,0.5653,-0.316,0.9523,0.9201
0,+TF-IDF+pol_and_sub,0.5733,0.7753,0.8073,0.6842,-0.6952,0.9001,0.8888


In [31]:
train.drop(columns=['Unnamed: 0'], inplace = True)
test.drop(columns=['Unnamed: 0'], inplace = True)

In [34]:
train.to_csv('drive/MyDrive/Diplom/train4.csv', sep='\t')
test.to_csv('drive/MyDrive/Diplom/test4.csv', sep='\t')
results.to_csv('drive/MyDrive/Diplom/results.csv')