In [2]:
# Loading the data.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [3]:
train_data = pd.read_csv('jakub/DM2023_training_docs_and_labels.tsv', sep='\t', 
                         header=None, encoding='ISO-8859-1')
test_data = pd.read_csv('jakub/DM2023_test_docs.tsv', sep='\t', 
                        header=None, encoding='ISO-8859-1')

In [25]:
train_data[[1,2]]

Unnamed: 0,1,2
0,Soap Programming with Java (Transcend Techniqu...,"H.3.5,D.3.2,I.7.2"
1,Residue objects: a challenge to web browser se...,D.4.6
2,SimCon - a simulation and visualization enviro...,I.6.3
3,An on demand data integration model for biolog...,"H.2.8,J.3"
4,The Usability of Multimedia Interface Based on...,"I.2.0,H.5.1,H.5.2"
...,...,...
99995,Multimedia indexing and retrieval: ever great ...,"H.3.1,H.2.4,H.3.3"
99996,Error detection and concealment for video tran...,"E.4,I.4.2,I.6.6"
99997,Comparison of empirical testing and walkthroug...,"H.5.2,H.1.2"
99998,On computational efficiency of the iterative m...,G.1.5


In [22]:
test_data

Unnamed: 0,0,1,2
0,963168.txt,An Analysis of the Imagine PA Public Sector ER...,
1,1811004.txt,The tidy set: a minimal simplicial set for com...,
2,192631.txt,Towards usability guidelines for multimedia sy...,
3,1183872.txt,Relational Formalism for the Management of Spa...,
4,1280491.txt,Continuous parallel-iterated RKN-type PC metho...,
...,...,...,...
99995,1201771.txt,Node similarity in the citation graph Publishe...,
99996,1502692.txt,Intelligently creating and recommending reusab...,
99997,881912.txt,Computer-controlled orientation of multiple op...,
99998,789070.txt,Labelled Markov Processes: Stronger and Faster...,


In [5]:
train_data['labels'] = train_data[2].apply(lambda x: x.split(','))

In [19]:
from tqdm.notebook import tqdm

In [20]:
t = set()
for i in tqdm(train_data['labels']):
    for j in i:
        t.add(j)

  0%|          | 0/100000 [00:00<?, ?it/s]

In [21]:
t

{'A.0',
 'A.1',
 'A.2',
 'A.m',
 'B.0',
 'B.1',
 'B.1.0',
 'B.1.1',
 'B.1.2',
 'B.1.3',
 'B.1.4',
 'B.1.5',
 'B.1.m',
 'B.2',
 'B.2.0',
 'B.2.1',
 'B.2.2',
 'B.2.3',
 'B.2.4',
 'B.2.m',
 'B.3',
 'B.3.0',
 'B.3.1',
 'B.3.2',
 'B.3.3',
 'B.3.4',
 'B.3.m',
 'B.4',
 'B.4.0',
 'B.4.1',
 'B.4.2',
 'B.4.3',
 'B.4.4',
 'B.4.5',
 'B.4.m',
 'B.5',
 'B.5.0',
 'B.5.1',
 'B.5.2',
 'B.5.3',
 'B.5.m',
 'B.6',
 'B.6.0',
 'B.6.1',
 'B.6.2',
 'B.6.3',
 'B.6.m',
 'B.7',
 'B.7.0',
 'B.7.1',
 'B.7.2',
 'B.7.3',
 'B.7.m',
 'B.8',
 'B.8.0',
 'B.8.1',
 'B.8.2',
 'B.8.m',
 'B.m',
 'C.0',
 'C.1',
 'C.1.0',
 'C.1.1',
 'C.1.2',
 'C.1.3',
 'C.1.4',
 'C.1.m',
 'C.2',
 'C.2.0',
 'C.2.1',
 'C.2.2',
 'C.2.3',
 'C.2.4',
 'C.2.5',
 'C.2.6',
 'C.2.m',
 'C.3',
 'C.4',
 'C.5',
 'C.5.0',
 'C.5.1',
 'C.5.2',
 'C.5.3',
 'C.5.4',
 'C.5.5',
 'C.5.m',
 'C.m',
 'D.0',
 'D.1',
 'D.1.0',
 'D.1.1',
 'D.1.2',
 'D.1.3',
 'D.1.4',
 'D.1.5',
 'D.1.6',
 'D.1.7',
 'D.1.m',
 'D.2',
 'D.2.0',
 'D.2.1',
 'D.2.10',
 'D.2.11',
 'D.2.12',
 'D.2

In [None]:
sum(train_data['labels'],[])

In [None]:
test_data

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load the data
# data = pd.read_csv('DM2023_training_docs_and_labels.tsv', sep='\t', header=None, names=['id', 'abstract', 'labels'])

# Preprocess labels
train_data['labels'] = train_data[2].apply(lambda x: x.split(','))

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(train_data[1], train_data['labels'], test_size=0.2, random_state=42)

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Encode labels
mlb = MultiLabelBinarizer()
y_train_enc = mlb.fit_transform(y_train)
y_test_enc = mlb.transform(y_test)

# Train a classifier (example: Logistic Regression)
classifier = LogisticRegression()
classifier.fit(X_train_tfidf, y_train_enc)

# Predict on the test set
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the model
print(classification_report(y_test_enc, y_pred, target_names=mlb.classes_))


ValueError: y should be a 1d array, got an array of shape (80000, 358) instead.

In [24]:
y_train

75220                                [I.5.5]
48955                         [H.3.3, H.2.8]
44966    [D.1.3, C.2.5, H.3.3, H.3.4, I.2.8]
13568           [G.2.2, F.2.2, G.1.6, G.2.1]
92727                         [K.3.m, K.6.1]
                        ...                 
6265                          [I.5.4, H.5.2]
54886                                [H.5.2]
76820                  [G.1.3, B.7.1, B.8.2]
860                             [H.4.2, J.1]
15795                  [I.2.4, F.3.3, F.4.1]
Name: labels, Length: 80000, dtype: object