In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
train = pd.read_csv('feature_train.csv')
y_train = train['label']
X_train = train.drop('id', axis=1)
X_train = X_train.drop('label', axis=1)

In [3]:
test = pd.read_csv('feature_test.csv')
y_test = test['label']
X_test = test.drop('id', axis=1)
X_test = X_test.drop('label', axis=1)

In [4]:
lr0 = LogisticRegression(max_iter=5000, C=1, class_weight='balanced', multi_class='multinomial' , n_jobs=8, verbose=2, random_state=10)
lr0.fit(X_train, y_train)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   1 out of   1 | elapsed:  3.2min finished


LogisticRegression(C=1, class_weight='balanced', max_iter=5000,
                   multi_class='multinomial', n_jobs=8, random_state=10,
                   verbose=2)

In [5]:
y_pred = lr0.predict(X_test)

In [6]:
def eval(y_pred):
    cm = confusion_matrix(y_test, y_pred)
    accuracies = np.diag(cm) / cm.sum(axis=1)
    f1 = f1_score(y_test, y_pred, average='macro')
    for i, accuracy in enumerate(accuracies):
        print(f"Accuracy of class {i}: {accuracy:.2f}")
    print(f"Weighted F1 score: {f1:.2f}")

In [7]:
eval(y_pred)

Accuracy of class 0: 0.63
Accuracy of class 1: 0.33
Accuracy of class 2: 0.69
Accuracy of class 3: 0.80
Weighted F1 score: 0.60


### Some other parameters setting

In [14]:
lr = LogisticRegression(max_iter=5000, C=10, class_weight='balanced', multi_class='multinomial' , n_jobs=8, verbose=2, random_state=10)
lr.fit(X_train, y_train)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   1 out of   1 | elapsed:  3.5min finished


LogisticRegression(C=10, class_weight='balanced', max_iter=5000,
                   multi_class='multinomial', n_jobs=8, random_state=10,
                   verbose=2)

In [15]:
y_pred = lr.predict(X_test)
eval(y_pred)

Accuracy of class 0: 0.63
Accuracy of class 1: 0.32
Accuracy of class 2: 0.67
Accuracy of class 3: 0.80
Weighted F1 score: 0.59


In [16]:
lr = LogisticRegression(max_iter=5000, C=0.1, class_weight='balanced', multi_class='multinomial' , n_jobs=8, verbose=2, random_state=10)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
eval(y_pred)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   1 out of   1 | elapsed:  2.2min finished


Accuracy of class 0: 0.64
Accuracy of class 1: 0.33
Accuracy of class 2: 0.70
Accuracy of class 3: 0.79
Weighted F1 score: 0.60


### Evaluation on COVID dataset

In [8]:
data = pd.read_csv('covid_test.csv')

In [9]:
data.drop(['id'], axis=1, inplace=True)
replace_dict = {'Satire': 1, 'Hoax': 2, 'Propaganda': 3, 'Reliable News': 4}
data['label'] = data['label'].replace(replace_dict)

In [10]:
data

Unnamed: 0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,...,d144,d145,d146,d147,d148,d149,d150,d151,d152,label
0,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,0.0,...,0.000000,0.0,0.132239,0.0,0.000000,0.0,0.111579,79,0,4
1,0.0,0.000000,0.00000,0.225661,0.0,0.000000,0.00000,0.150991,0.000000,0.0,...,0.137182,0.0,0.000000,0.0,0.000000,0.0,0.000000,458,0,4
2,0.0,0.039538,0.09156,0.062778,0.0,0.000000,0.00000,0.000000,0.043025,0.0,...,0.038163,0.0,0.000000,0.0,0.060505,0.0,0.030551,484,0,4
3,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,12,0,4
4,0.0,0.000000,0.00000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,0.0,...,0.086459,0.0,0.000000,0.0,0.000000,0.0,0.000000,156,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,0.0,0.181346,0.00000,0.000000,0.0,0.193119,0.22575,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.0,0.277511,0.0,0.000000,91,0,1
116,0.0,0.332461,0.00000,0.175957,0.0,0.000000,0.00000,0.000000,0.000000,0.0,...,0.000000,0.0,0.101486,0.0,0.254379,0.0,0.000000,150,0,1
117,0.0,0.191076,0.00000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.0,0.146200,0.0,0.000000,95,1,1
118,0.0,0.000000,0.00000,0.077492,0.0,0.000000,0.00000,0.000000,0.000000,0.0,...,0.094216,0.0,0.000000,0.0,0.149372,0.0,0.000000,120,0,1


In [11]:
y = data['label']
x = data.drop('label', axis=1)

In [20]:
y_pred = lr0.predict(x)

In [21]:
y

0      4
1      4
2      4
3      4
4      4
      ..
115    1
116    1
117    1
118    1
119    1
Name: label, Length: 120, dtype: int64

In [24]:
cm = confusion_matrix(y, y_pred)
accuracies = np.diag(cm) / cm.sum(axis=1)
f1 = f1_score(y, y_pred, average='macro')
for i, accuracy in enumerate(accuracies):
    print(f"Accuracy of class {i}: {accuracy:.2f}")
print(f"F1 score: {f1:.2f}")
print(classification_report(y_pred, y))

Accuracy of class 0: 0.58
Accuracy of class 1: 0.07
Accuracy of class 2: 0.45
Accuracy of class 3: 0.13
F1 score: 0.29
              precision    recall  f1-score   support

           1       0.58      0.69      0.63        26
           2       0.07      0.40      0.11         5
           3       0.45      0.22      0.29        60
           4       0.13      0.14      0.14        29

    accuracy                           0.31       120
   macro avg       0.31      0.36      0.29       120
weighted avg       0.38      0.31      0.32       120



In [23]:
cm


array([[18,  2,  8,  3],
       [ 6,  2, 15,  7],
       [ 0,  1, 13, 15],
       [ 2,  0, 24,  4]], dtype=int64)