In [1]:
# We're mounting the drive.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Loading the data.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

try:
    train_data = pd.read_csv('/content/drive/My Drive/DM/bigtask2/DM2023_training_docs_and_labels.tsv', sep='\t', header=None, encoding='ISO-8859-1')
    test_data = pd.read_csv('/content/drive/My Drive/DM/bigtask2/DM2023_test_docs.tsv', sep='\t', header=None, encoding='ISO-8859-1')
except Exception as e:
    print("Error with ISO-8859-1 encoding:", e)



In [None]:
print(train_data.head())
print(test_data.head())

             0                                                  1  \
0   580106.txt  Soap Programming with Java (Transcend Techniqu...   
1  1755942.txt  Residue objects: a challenge to web browser se...   
2  1416298.txt  SimCon - a simulation and visualization enviro...   
3  1516665.txt  An on demand data integration model for biolog...   
4  1259693.txt  The Usability of Multimedia Interface Based on...   

                   2  
0  H.3.5,D.3.2,I.7.2  
1              D.4.6  
2              I.6.3  
3          H.2.8,J.3  
4  I.2.0,H.5.1,H.5.2  
             0                                                  1   2
0   963168.txt  An Analysis of the Imagine PA Public Sector ER... NaN
1  1811004.txt  The tidy set: a minimal simplicial set for com... NaN
2   192631.txt  Towards usability guidelines for multimedia sy... NaN
3  1183872.txt  Relational Formalism for the Management of Spa... NaN
4  1280491.txt  Continuous parallel-iterated RKN-type PC metho... NaN


In [3]:
# Preprocessing and vectorizing text.
vectorizer = TfidfVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(train_data[1])
X_test = vectorizer.transform(test_data[1])

In [None]:
print(X_train)

  (0, 3659)	0.044030021370828176
  (0, 4170)	0.05292680143163375
  (0, 9434)	0.04714812305649257
  (0, 8384)	0.061657282427684375
  (0, 9876)	0.0360121483412932
  (0, 4932)	0.0224946685283858
  (0, 8956)	0.03418530428547641
  (0, 6456)	0.040231538172770184
  (0, 5473)	0.09099024931496738
  (0, 9160)	0.042449937193888294
  (0, 6258)	0.02577938789857192
  (0, 4844)	0.05688707334875818
  (0, 576)	0.01723924911407796
  (0, 1195)	0.055614002356452294
  (0, 9848)	0.05001649635735849
  (0, 6165)	0.05435949312332808
  (0, 403)	0.05185089427885018
  (0, 526)	0.05369136089089549
  (0, 3833)	0.037469821146702297
  (0, 9898)	0.03923723459194303
  (0, 6899)	0.03545173758751881
  (0, 2252)	0.058489945888254354
  (0, 6830)	0.04594100193144336
  (0, 2621)	0.05282810197855327
  (0, 9075)	0.027218540564652362
  :	:
  (99999, 5984)	0.16868946009880786
  (99999, 9037)	0.0495287290087336
  (99999, 1313)	0.02247969321586242
  (99999, 6312)	0.06427199971881725
  (99999, 9060)	0.03210585612778624
  (99999, 98

In [4]:
# Preparing labels.
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_data[2].str.split(','))

In [5]:
from sklearn.metrics import accuracy_score, classification_report

x_train2 = X_train[:70000]
x_test = X_train[70000:]

y_train2 = y_train[:70000]
y_test = y_train[70000:]

# Initialize the model
model = OneVsRestClassifier(LogisticRegression(max_iter=1000))

# Train the model on the new training subset
model.fit(x_train2, y_train2)

# Predict on the testing subset
y_pred = model.predict(x_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)




Accuracy: 0.040633333333333334
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.06      0.10       144
           1       0.00      0.00      0.00       119
           2       0.00      0.00      0.00        58
           3       0.40      0.02      0.03       252
           4       0.00      0.00      0.00        38
           5       0.00      0.00      0.00         6
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00        14
           8       0.00      0.00      0.00        14
           9       0.00      0.00      0.00         3
          10       0.00      0.00      0.00        53
          11       0.00      0.00      0.00        22
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         9
          15       0.00      0.00      0.00        32
          16       0.00   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
print(y_train)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
# Training the model.
model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

In [None]:
# And finally predicting on test data.
y_pred = model.predict(X_test)

In [None]:
print(y_pred[15])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [None]:
y_pred_labels = mlb.inverse_transform(y_pred)

In [None]:
print(y_pred_labels)

[(), (), ('H.1.2', 'H.5.1', 'H.5.2'), ('H.2.8',), ('G.1.0',), ('C.3',), ('K.6.1',), (), ('I.2.7',), (), (), ('K.6.1',), ('I.2.3',), ('G.2.2',), ('G.1.1', 'I.3.5'), (), (), ('I.2.11',), ('I.6.5',), ('G.1.2',), (), ('K.6.1',), ('F.2.2', 'I.2.8'), (), ('F.2.2',), (), (), (), (), ('H.5.1',), ('B.3.1', 'B.7.1', 'B.8.2'), (), (), (), (), (), ('K.3.2',), (), (), (), (), (), (), (), ('J.3',), ('I.2.8',), ('H.5.2',), (), ('K.3.2',), ('I.4.8',), ('B.8.1',), (), ('H.3.3',), (), ('I.4.3',), (), (), (), ('H.5.2', 'K.8.0'), ('F.2.1',), ('D.4.2',), ('D.4.3',), ('C.2.1', 'C.4'), (), ('I.2.7',), ('F.2.2', 'G.2.2'), ('G.1.6',), ('I.5.2',), (), ('G.3',), ('I.2.6',), ('I.2.7',), (), (), ('I.3.5',), (), (), ('C.2.1', 'C.2.3', 'C.4', 'K.6.4'), ('G.1.0', 'G.1.6'), ('I.2.3',), ('D.2.4',), (), (), (), (), ('K.6.1',), ('I.2.6',), ('H.3.3',), ('K.3.2',), ('D.2.5', 'H.5.2'), ('K.3.1',), ('G.1.8',), (), ('I.5.2',), ('G.2.2',), ('C.2.1', 'C.2.2', 'C.4'), ('I.5.4',), ('C.2.4',), (), ('H.3.5', 'J.1', 'K.3.1'), ('D.2.

In [None]:
with open('/content/drive/My Drive/DM/bigtask2/submission.txt', 'w') as f:
    for labels in y_pred_labels:
        f.write(','.join(labels) + '\n')